diff -uprN linux-2.6.18/COPYING.SWsoft linux-2.6.18.ovz/COPYING.SWsoft --- linux-2.6.18/COPYING.SWsoft 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/COPYING.SWsoft 2007-06-13 06:55:04.000000000 -0400 @@ -0,0 +1,350 @@ + +Nothing in this license should be construed as a grant by SWsoft of any rights +beyond the rights specified in the GNU General Public License, and nothing in +this license should be construed as a waiver by SWsoft of its patent, copyright +and/or trademark rights, beyond the waiver required by the GNU General Public +License. This license is expressly inapplicable to any product that is not +within the scope of the GNU General Public License + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff -uprN linux-2.6.18/Documentation/dontdiff linux-2.6.18.ovz/Documentation/dontdiff --- linux-2.6.18/Documentation/dontdiff 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/Documentation/dontdiff 2007-06-13 06:55:04.000000000 -0400 @@ -135,6 +135,7 @@ tags times.h* tkparse trix_boot.h +utsrelease.h* version.h* vmlinux vmlinux-* diff -uprN linux-2.6.18/Documentation/filesystems/Locking linux-2.6.18.ovz/Documentation/filesystems/Locking --- linux-2.6.18/Documentation/filesystems/Locking 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/Documentation/filesystems/Locking 2007-06-13 06:55:04.000000000 -0400 @@ -171,6 +171,7 @@ prototypes: int (*releasepage) (struct page *, int); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); + int (*launder_page) (struct page *); locking rules: All except set_page_dirty may block @@ -188,6 +189,7 @@ bmap: yes invalidatepage: no yes releasepage: no yes direct_IO: no +launder_page: no yes ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -281,6 +283,12 @@ buffers from the page in preparation for indicate that the buffers are (or may be) freeable. If ->releasepage is zero, the kernel assumes that the fs has no private interest in the buffers. + ->launder_page() may be called prior to releasing a page if +it is still found to be dirty. It returns zero if the page was successfully +cleaned, or an error value if not. Note that in order to prevent the page +getting mapped back in and redirtied, it needs to be kept locked +across the entire operation. + Note: currently almost all instances of address_space methods are using BKL for internal serialization and that's one of the worst sources of contention. 
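As an illustration of the ->launder_page() contract described a few paragraphs above (a sketch only, not part of this patch; my_fs_writepage() is a hypothetical filesystem helper):

	static int my_fs_launder_page(struct page *page)
	{
		/* The caller holds the page lock and keeps it across the
		 * whole operation, so the page cannot be mapped back in
		 * and redirtied while it is being cleaned. */
		BUG_ON(!PageLocked(page));

		if (!PageDirty(page))
			return 0;	/* nothing to clean */

		/* Write the page back synchronously: return zero if it
		 * was successfully cleaned, or an error value if not. */
		return my_fs_writepage(page);
	}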
Normally they are calling library functions (in fs/buffer.c) diff -uprN linux-2.6.18/Documentation/scsi/libsas.txt linux-2.6.18.ovz/Documentation/scsi/libsas.txt --- linux-2.6.18/Documentation/scsi/libsas.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/Documentation/scsi/libsas.txt 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,484 @@ +SAS Layer +--------- + +The SAS Layer is a management infrastructure which manages +SAS LLDDs. It sits between SCSI Core and SAS LLDDs. The +layout is as follows: while SCSI Core is concerned with +SAM/SPC issues, and a SAS LLDD+sequencer is concerned with +phy/OOB/link management, the SAS layer is concerned with: + + * SAS Phy/Port/HA event management (LLDD generates, + SAS Layer processes), + * SAS Port management (creation/destruction), + * SAS Domain discovery and revalidation, + * SAS Domain device management, + * SCSI Host registration/unregistration, + * Device registration with SCSI Core (SAS) or libata + (SATA), and + * Expander management and exporting expander control + to user space. + +A SAS LLDD is a PCI device driver. It is concerned with +phy/OOB management, and vendor specific tasks and generates +events to the SAS layer. + +The SAS Layer does most SAS tasks as outlined in the SAS 1.1 +spec. + +The sas_ha_struct describes the SAS LLDD to the SAS layer. +Most of it is used by the SAS Layer but a few fields need to +be initialized by the LLDDs. + +After initializing your hardware, from the probe() function +you call sas_register_ha(). It will register your LLDD with +the SCSI subsystem, creating a SCSI host and it will +register your SAS driver with the sysfs SAS tree it creates. +It will then return. Then you enable your phys to actually +start OOB (at which point your driver will start calling the +notify_* event callbacks). + +Structure descriptions: + +struct sas_phy -------------------- +Normally this is statically embedded to your driver's +phy structure: + struct my_phy { + blah; + struct sas_phy sas_phy; + bleh; + }; +And then all the phys are an array of my_phy in your HA +struct (shown below). + +Then as you go along and initialize your phys you also +initialize the sas_phy struct, along with your own +phy structure. + +In general, the phys are managed by the LLDD and the ports +are managed by the SAS layer. So the phys are initialized +and updated by the LLDD and the ports are initialized and +updated by the SAS layer. + +There is a scheme where the LLDD can RW certain fields, +and the SAS layer can only read such ones, and vice versa. +The idea is to avoid unnecessary locking. + +enabled -- must be set (0/1) +id -- must be set [0,MAX_PHYS) +class, proto, type, role, oob_mode, linkrate -- must be set +oob_mode -- you set this when OOB has finished and then notify +the SAS Layer. + +sas_addr -- this normally points to an array holding the sas +address of the phy, possibly somewhere in your my_phy +struct. + +attached_sas_addr -- set this when you (LLDD) receive an +IDENTIFY frame or a FIS frame, _before_ notifying the SAS +layer. The idea is that sometimes the LLDD may want to fake +or provide a different SAS address on that phy/port and this +allows it to do this. At best you should copy the sas +address from the IDENTIFY frame or maybe generate a SAS +address for SATA directly attached devices. The Discover +process may later change this. + +frame_rcvd -- this is where you copy the IDENTIFY/FIS frame +when you get it; you lock, copy, set frame_rcvd_size and +unlock the lock, and then call the event. 
It is a pointer
+since there's no way to know your hw frame size _exactly_,
+so you define the actual array in your phy struct and let
+this pointer point to it. You copy the frame from your
+DMAable memory to that area holding the lock.
+
+sas_prim -- this is where primitives go when they're
+received. See sas.h. Grab the lock, set the primitive,
+release the lock, notify.
+
+port -- this points to the sas_port if the phy belongs
+to a port -- the LLDD only reads this. It points to the
+sas_port this phy is part of. Set by the SAS Layer.
+
+ha -- may be set; the SAS layer sets it anyway.
+
+lldd_phy -- you should set this to point to your phy so you
+can find your way around faster when the SAS layer calls one
+of your callbacks and passes you a phy. If the sas_phy is
+embedded you can also use container_of -- whatever you
+prefer.
+
+
+struct sas_port --------------------
+The LLDD doesn't set any fields of this struct -- it only
+reads them. They should be self-explanatory.
+
+phy_mask is 32 bit; this should be enough for now, as I
+haven't heard of an HA having more than 8 phys.
+
+lldd_port -- I haven't found a use for that -- maybe other
+LLDDs that wish to have an internal port representation can
+make use of this.
+
+
+struct sas_ha_struct --------------------
+It is normally statically declared in your own LLDD
+structure describing your adapter:
+struct my_sas_ha {
+	blah;
+	struct sas_ha_struct sas_ha;
+	struct my_phy phys[MAX_PHYS];
+	struct sas_port sas_ports[MAX_PHYS]; /* (1) */
+	bleh;
+};
+
+(1) If your LLDD doesn't have its own port representation.
+
+What needs to be initialized (a sample function is given below):
+
+pcidev
+sas_addr -- since the SAS layer doesn't want to mess with
+	memory allocation, etc, this points to a statically
+	allocated array somewhere (say in your host adapter
+	structure) and holds the SAS address of the host
+	adapter as given by you or the manufacturer, etc.
+sas_port
+sas_phy -- an array of pointers to structures. (see
+	note above on sas_addr).
+	These must be set. See more notes below.
+num_phys -- the number of phys present in the sas_phy array,
+	and the number of ports present in the sas_port
+	array. There can be a maximum of num_phys ports (one
+	per port), so we drop num_ports and only use
+	num_phys.
+
+The event interface:
+
+	/* LLDD calls these to notify the class of an event. */
+	void (*notify_ha_event)(struct sas_ha_struct *, enum ha_event);
+	void (*notify_port_event)(struct sas_phy *, enum port_event);
+	void (*notify_phy_event)(struct sas_phy *, enum phy_event);
+
+When sas_register_ha() returns, those are set and can be
+called by the LLDD to notify the SAS layer of such events.
+
+The port notification:
+
+	/* The class calls these to notify the LLDD of an event. */
+	void (*lldd_port_formed)(struct sas_phy *);
+	void (*lldd_port_deformed)(struct sas_phy *);
+
+If the LLDD wants notification when a port has been formed
+or deformed, it sets those to a function satisfying the type.
+
+A SAS LLDD should also implement at least one of the Task
+Management Functions (TMFs) described in SAM:
+
+	/* Task Management Functions. Must be called from process context.
*/ + int (*lldd_abort_task)(struct sas_task *); + int (*lldd_abort_task_set)(struct domain_device *, u8 *lun); + int (*lldd_clear_aca)(struct domain_device *, u8 *lun); + int (*lldd_clear_task_set)(struct domain_device *, u8 *lun); + int (*lldd_I_T_nexus_reset)(struct domain_device *); + int (*lldd_lu_reset)(struct domain_device *, u8 *lun); + int (*lldd_query_task)(struct sas_task *); + +For more information please read SAM from T10.org. + +Port and Adapter management: + + /* Port and Adapter management */ + int (*lldd_clear_nexus_port)(struct sas_port *); + int (*lldd_clear_nexus_ha)(struct sas_ha_struct *); + +A SAS LLDD should implement at least one of those. + +Phy management: + + /* Phy management */ + int (*lldd_control_phy)(struct sas_phy *, enum phy_func); + +lldd_ha -- set this to point to your HA struct. You can also +use container_of if you embedded it as shown above. + +A sample initialization and registration function +can look like this (called last thing from probe()) +*but* before you enable the phys to do OOB: + +static int register_sas_ha(struct my_sas_ha *my_ha) +{ + int i; + static struct sas_phy *sas_phys[MAX_PHYS]; + static struct sas_port *sas_ports[MAX_PHYS]; + + my_ha->sas_ha.sas_addr = &my_ha->sas_addr[0]; + + for (i = 0; i < MAX_PHYS; i++) { + sas_phys[i] = &my_ha->phys[i].sas_phy; + sas_ports[i] = &my_ha->sas_ports[i]; + } + + my_ha->sas_ha.sas_phy = sas_phys; + my_ha->sas_ha.sas_port = sas_ports; + my_ha->sas_ha.num_phys = MAX_PHYS; + + my_ha->sas_ha.lldd_port_formed = my_port_formed; + + my_ha->sas_ha.lldd_dev_found = my_dev_found; + my_ha->sas_ha.lldd_dev_gone = my_dev_gone; + + my_ha->sas_ha.lldd_max_execute_num = lldd_max_execute_num; (1) + + my_ha->sas_ha.lldd_queue_size = ha_can_queue; + my_ha->sas_ha.lldd_execute_task = my_execute_task; + + my_ha->sas_ha.lldd_abort_task = my_abort_task; + my_ha->sas_ha.lldd_abort_task_set = my_abort_task_set; + my_ha->sas_ha.lldd_clear_aca = my_clear_aca; + my_ha->sas_ha.lldd_clear_task_set = my_clear_task_set; + my_ha->sas_ha.lldd_I_T_nexus_reset= NULL; (2) + my_ha->sas_ha.lldd_lu_reset = my_lu_reset; + my_ha->sas_ha.lldd_query_task = my_query_task; + + my_ha->sas_ha.lldd_clear_nexus_port = my_clear_nexus_port; + my_ha->sas_ha.lldd_clear_nexus_ha = my_clear_nexus_ha; + + my_ha->sas_ha.lldd_control_phy = my_control_phy; + + return sas_register_ha(&my_ha->sas_ha); +} + +(1) This is normally a LLDD parameter, something of the +lines of a task collector. What it tells the SAS Layer is +whether the SAS layer should run in Direct Mode (default: +value 0 or 1) or Task Collector Mode (value greater than 1). + +In Direct Mode, the SAS Layer calls Execute Task as soon as +it has a command to send to the SDS, _and_ this is a single +command, i.e. not linked. + +Some hardware (e.g. aic94xx) has the capability to DMA more +than one task at a time (interrupt) from host memory. Task +Collector Mode is an optional feature for HAs which support +this in their hardware. (Again, it is completely optional +even if your hardware supports it.) + +In Task Collector Mode, the SAS Layer would do _natural_ +coalescing of tasks and at the appropriate moment it would +call your driver to DMA more than one task in a single HA +interrupt. DMBS may want to use this by insmod/modprobe +setting the lldd_max_execute_num to something greater than +1. + +(2) SAS 1.1 does not define I_T Nexus Reset TMF. + +Events +------ + +Events are _the only way_ a SAS LLDD notifies the SAS layer +of anything. 
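For example, an LLDD's interrupt path might implement the frame_rcvd protocol described earlier like this (a sketch only, under the structures assumed above; my_phy, my_sas_ha, phy->lock, phy->frame_buf, phy->dma_frame and MY_FRAME_SIZE are hypothetical LLDD-private names, while notify_port_event is the callback installed by sas_register_ha()):

	static void my_identify_rcvd(struct my_sas_ha *my_ha,
				     struct my_phy *phy)
	{
		unsigned long flags;

		/* lock, copy, set frame_rcvd_size and unlock the lock;
		 * sas_phy.frame_rcvd points at phy->frame_buf */
		spin_lock_irqsave(&phy->lock, flags);
		memcpy(phy->frame_buf, phy->dma_frame, MY_FRAME_SIZE);
		phy->sas_phy.frame_rcvd_size = MY_FRAME_SIZE;
		/* attached_sas_addr would also be set here, _before_
		 * notifying the SAS layer, as described above */
		spin_unlock_irqrestore(&phy->lock, flags);

		/* ...and then call the event */
		my_ha->sas_ha.notify_port_event(&phy->sas_phy,
						PORTE_BYTES_DMAED);
	}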
There is no other method or way for an LLDD to tell
+the SAS layer of anything happening internally or in the SAS
+domain.
+
+Phy events:
+	PHYE_LOSS_OF_SIGNAL, (C)
+	PHYE_OOB_DONE,
+	PHYE_OOB_ERROR, (C)
+	PHYE_SPINUP_HOLD.
+
+Port events, passed on a _phy_:
+	PORTE_BYTES_DMAED, (M)
+	PORTE_BROADCAST_RCVD, (E)
+	PORTE_LINK_RESET_ERR, (C)
+	PORTE_TIMER_EVENT, (C)
+	PORTE_HARD_RESET.
+
+Host Adapter event:
+	HAE_RESET
+
+A SAS LLDD should be able to generate
+	- at least one event from group C (choice),
+	- the event marked M (mandatory; there is only one),
+	- events marked E (expander) if it wants the SAS layer
+	  to handle domain revalidation (only one such).
+	- Unmarked events are optional.
+
+Meaning:
+
+HAE_RESET -- when your HA has had an internal error and was reset.
+
+PORTE_BYTES_DMAED -- on receiving an IDENTIFY/FIS frame
+PORTE_BROADCAST_RCVD -- on receiving a primitive
+PORTE_LINK_RESET_ERR -- timer expired, loss of signal, loss
+of DWS, etc. (*)
+PORTE_TIMER_EVENT -- DWS reset timeout timer expired (*)
+PORTE_HARD_RESET -- Hard Reset primitive received.
+
+PHYE_LOSS_OF_SIGNAL -- the device is gone (*)
+PHYE_OOB_DONE -- OOB went fine and oob_mode is valid
+PHYE_OOB_ERROR -- error while doing OOB; the device probably
+got disconnected. (*)
+PHYE_SPINUP_HOLD -- SATA is present, COMWAKE not sent.
+
+(*) should set/clear the appropriate fields in the phy,
+	or alternatively call the inlined sas_phy_disconnected(),
+	which is just a helper, from their tasklet.
+
+The Execute Command SCSI RPC:
+
+	int (*lldd_execute_task)(struct sas_task *, int num,
+				 unsigned long gfp_flags);
+
+Used to queue a task to the SAS LLDD. @task is the task to
+be executed. @num should be the number of tasks being
+queued at this function call (they are linked into a list via
+task::list), and @gfp_flags should be the gfp_mask defining
+the context of the caller.
+
+This function should implement the Execute Command SCSI RPC,
+or if you're sending a SCSI Task as linked commands, you
+should also use this function.
+
+That is, when lldd_execute_task() is called, the command(s)
+go out on the transport *immediately*. There is *no*
+queuing of any sort, at any level, in a SAS LLDD.
+
+The use of task::list is two-fold: one use is for linked
+commands, the other is discussed below.
+
+It is possible to queue up more than one task at a time, by
+initializing the list element of struct sas_task, and
+passing the number of tasks enlisted in this manner in num.
+
+Returns: -SAS_QUEUE_FULL, -ENOMEM, nothing was queued;
+	 0, the task(s) were queued.
+
+If you want to pass num > 1, then either
+A) you're the only caller of this function and keep track
+   of what you've queued to the LLDD, or
+B) you know what you're doing and have a strategy for
+   retrying.
+
+As opposed to queuing one task at a time (one per function
+call), batch queuing of tasks, by having num > 1, greatly
+simplifies LLDD code, sequencer code, and _hardware design_,
+and has some performance advantages in certain situations
+(DBMS).
+
+The LLDD advertises whether it can take more than one command
+at a time at lldd_execute_task(), by setting the
+lldd_max_execute_num parameter (controlled by the "collector"
+module parameter in the aic94xx SAS LLDD).
+
+You should leave this at the default 1, unless you know what
+you're doing.
+
+This is a function of the LLDD, to which the SAS layer caters.
+
+int lldd_queue_size
+	The host adapter's queue size.
This is the maximum
+number of commands the lldd can have pending to domain
+devices on behalf of all upper layers submitting through
+lldd_execute_task().
+
+You really want to set this to something (much) larger than
+1.
+
+This _really_ has absolutely nothing to do with queuing.
+There is no queuing in SAS LLDDs.
+
+struct sas_task {
+	dev -- the device this task is destined to
+	list -- must be initialized (INIT_LIST_HEAD)
+	task_proto -- _one_ of enum sas_proto
+	scatter -- pointer to scatter gather list array
+	num_scatter -- number of elements in scatter
+	total_xfer_len -- total number of bytes expected to be transferred
+	data_dir -- PCI_DMA_...
+	task_done -- callback when the task has finished execution
+};
+
+When an external entity, one other than the LLDD or the
+SAS Layer, wants to work with a struct domain_device, it
+_must_ call kobject_get() when getting a handle on the
+device and kobject_put() when it is done with the device.
+
+This does two things:
+	A) implements proper kfree() for the device;
+	B) increments/decrements the kref for all players:
+		domain_device
+		all domain_device's ... (if past an expander)
+		port
+		host adapter
+		pci device
+		and up the ladder, etc.
+
+DISCOVERY
+---------
+
+The sysfs tree has the following purposes:
+	a) It shows you the physical layout of the SAS domain at
+	   the current time, i.e. how the domain looks in the
+	   physical world right now.
+	b) Shows some device parameters _at_discovery_time_.
+
+This is a link to the tree(1) program, very useful for
+viewing the SAS domain:
+ftp://mama.indstate.edu/linux/tree/
+I expect user space applications to actually create a
+graphical interface for this.
+
+That is, the sysfs domain tree doesn't show or keep state if
+you, e.g., change the meaning of the READY LED MEANING
+setting, but it does show you the current connection status
+of the domain device.
+
+Keeping internal device state changes is the responsibility
+of upper layers (command set drivers) and user space.
+
+When a device or devices are unplugged from the domain, this
+is reflected in the sysfs tree immediately, and the device(s)
+are removed from the system.
+
+The structure domain_device describes any device in the SAS
+domain. It is completely managed by the SAS layer. A task
+points to a domain device; this is how the SAS LLDD knows
+where to send the task(s). A SAS LLDD only reads the
+contents of the domain_device structure, but it never creates
+or destroys one.
+
+Expander management from User Space
+-----------------------------------
+
+In each expander directory in sysfs, there is a file called
+"smp_portal". It is a binary sysfs attribute file, which
+implements an SMP portal (Note: this is *NOT* an SMP port),
+to which user space applications can send SMP requests and
+receive SMP responses.
+
+Functionality is deceptively simple:
+
+1. Build the SMP frame you want to send. The format and layout
+   are described in the SAS spec. Leave the CRC field equal to 0.
+2. open(2) the expander's SMP portal sysfs file in RW mode.
+3. write(2) the frame you built in 1.
+4. read(2) the amount of data you expect to receive for the frame
+   you built. If you receive a different amount of data than you
+   expected, then there was some kind of error.
+Then close(2) the portal file. All this process is shown in
+detail in the function do_smp_func() and its callers, in the
+file "expander_conf.c"; a sketch in C follows below.
+
+The kernel functionality is implemented in the file
+"sas_expander.c".
+
+The program "expander_conf.c" implements this.
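The sketch mentioned above: a hedged illustration of the open/write/read/close sequence only, not the actual do_smp_func() from expander_conf.c, with error handling kept minimal:

	#include <fcntl.h>
	#include <unistd.h>

	/* Send one SMP request frame through an expander's smp_portal
	 * sysfs file and read back the response.  Returns 0 on success. */
	static int smp_portal_rpc(const char *portal, const void *req,
				  size_t req_len, void *resp, size_t resp_len)
	{
		int fd = open(portal, O_RDWR);		/* step 2 */
		ssize_t n;

		if (fd < 0)
			return -1;
		if (write(fd, req, req_len) != (ssize_t)req_len) {  /* step 3 */
			close(fd);
			return -1;
		}
		n = read(fd, resp, resp_len);		/* step 4 */
		close(fd);
		/* reading a different amount than expected means error */
		return (n == (ssize_t)resp_len) ? 0 : -1;
	}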
It takes one +argument, the sysfs file name of the SMP portal to the +expander, and gives expander information, including routing +tables. + +The SMP portal gives you complete control of the expander, +so please be careful. diff -uprN linux-2.6.18/Documentation/sysctl/vm.txt linux-2.6.18.ovz/Documentation/sysctl/vm.txt --- linux-2.6.18/Documentation/sysctl/vm.txt 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/Documentation/sysctl/vm.txt 2007-06-13 06:55:04.000000000 -0400 @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/ - drop-caches - zone_reclaim_mode - min_unmapped_ratio +- min_slab_ratio - panic_on_oom ============================================================== @@ -138,7 +139,6 @@ This is value ORed together of 1 = Zone reclaim on 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages -8 = Also do a global slab reclaim pass zone_reclaim_mode is set during bootup to 1 if it is determined that pages from remote zones will cause a measurable performance reduction. The @@ -162,18 +162,13 @@ Allowing regular swap effectively restri node unless explicitly overridden by memory policies or cpuset configurations. -It may be advisable to allow slab reclaim if the system makes heavy -use of files and builds up large slab caches. However, the slab -shrink operation is global, may take a long time and free slabs -in all nodes of the system. - ============================================================= min_unmapped_ratio: This is available only on NUMA kernels. -A percentage of the file backed pages in each zone. Zone reclaim will only +A percentage of the total pages in each zone. Zone reclaim will only occur if more than this percentage of pages are file backed and unmapped. This is to insure that a minimal amount of local pages is still available for file I/O even if the node is overallocated. @@ -182,6 +177,24 @@ The default is 1 percent. ============================================================= +min_slab_ratio: + +This is available only on NUMA kernels. + +A percentage of the total pages in each zone. On Zone reclaim +(fallback from the local zone occurs) slabs will be reclaimed if more +than this percentage of pages in a zone are reclaimable slab pages. +This insures that the slab growth stays under control even in NUMA +systems that rarely perform global reclaim. + +The default is 5 percent. + +Note that slab reclaim is triggered in a per zone / node fashion. +The process of reclaiming slab memory is currently not node specific +and may not be fast. + +============================================================= + panic_on_oom This enables or disables panic on out-of-memory feature. If this is set to 1, diff -uprN linux-2.6.18/Documentation/vsched.txt linux-2.6.18.ovz/Documentation/vsched.txt --- linux-2.6.18/Documentation/vsched.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/Documentation/vsched.txt 2007-06-13 06:55:04.000000000 -0400 @@ -0,0 +1,83 @@ +Copyright (C) 2005 SWsoft. All rights reserved. +Licensing governed by "linux/COPYING.SWsoft" file. + +Hierarchical CPU schedulers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hierarchical CPU scheduler is a stack of CPU schedulers which allows +to organize different policies of scheduling in the system and/or between +groups of processes. + +Virtuozzo uses a hierarchical Fair CPU scheduler organized as a 2-stage +CPU scheduler, where the scheduling decisions are made in 2 steps: +1. On the first step Fair CPU scheduler selects a group of processes + which should get some CPU time. +2. 
Then the standard Linux scheduler chooses a process inside the group.
+Such a scheduler efficiently isolates one group of processes
+from another and still allows a group to use more than 1 CPU on SMP systems.
+
+This document describes a new middle layer of the Virtuozzo hierarchical CPU
+scheduler, called the VCPU scheduler, which makes decisions after the Fair
+scheduler but before the Linux scheduler.
+
+
+Where does the VCPU scheduler come from?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The existing hierarchical CPU scheduler uses isolated algorithms at each stage
+of decision making, i.e. every scheduler makes its decisions without
+taking into account the details of other schedulers. This can lead to a number
+of problems described below.
+
+On SMP systems, situations are possible in which the first CPU scheduler
+in the hierarchy (e.g. the Fair scheduler) wants to schedule some group of
+processes on the physical CPU, but the underlying process scheduler
+(e.g. the Linux O(1) CPU scheduler) is unable to schedule any processes
+on this physical CPU. Usually this happens because the Linux
+kernel scheduler uses per-physical-CPU runqueues.
+
+Another problem is that the Linux scheduler knows nothing about the
+Fair scheduler and can't balance efficiently without taking into account
+process-group statistics from the Fair scheduler. Without such
+statistics the Linux scheduler can concentrate all processes on one physical
+CPU, thus making CPU consumption highly inefficient.
+
+The VCPU scheduler solves these problems by adding a new layer between
+the Fair scheduler and the Linux scheduler.
+
+VCPU scheduler
+~~~~~~~~~~~~~~
+
+The VCPU scheduler is a CPU scheduler which splits the notion of
+physical and virtual CPUs (PCPU and VCPU). This means that tasks are
+running on virtual CPU runqueues, while VCPUs are running on PCPUs.
+
+The Virtuozzo hierarchical fair scheduler becomes a 3-stage CPU scheduler:
+1. First, the Fair CPU scheduler selects a group of processes.
+2. Then the VCPU scheduler selects a virtual CPU to run (this is actually
+   a runqueue).
+3. The standard Linux scheduler chooses a process from the runqueue.
+
+For example, in the picture below, PCPU0 executes tasks from the
+VCPU1 runqueue and PCPU1 is idle:
+
+      virtual       |       physical         |        virtual
+     idle CPUs      |         CPUs           |         CPUs
+--------------------|------------------------|--------------------------
+                    |                        |      -----------------
+                    |                        |      | virtual sched X |
+                    |                        |      |   -----------   |
+                    |                        |      |   |  VCPU0  |   |
+                    |                        |      |   -----------   |
+ ------------       |      -----------       |      |   -----------   |
+| idle VCPU0 |      |      |  PCPU0  | <---> |      |   |  VCPU1  |   |
+ ------------       |      -----------       |      |   -----------   |
+                    |                        |      -----------------
+                    |                        |
+                    |                        |      -----------------
+                    |                        |      | virtual sched Y |
+ ------------              -----------       |      |   -----------   |
+| idle VCPU1 | <--->       |  PCPU1  |       |      |   |  VCPU0  |   |
+ ------------              -----------       |      |   -----------   |
+                    |                        |      -----------------
+                    |                        |
diff -uprN linux-2.6.18/Makefile linux-2.6.18.ovz/Makefile
--- linux-2.6.18/Makefile	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/Makefile	2007-06-13 06:55:04.000000000 -0400
@@ -894,6 +894,9 @@ export INSTALL_HDR_PATH
 PHONY += headers_install
 headers_install: include/linux/version.h
+	@if [ !
-r include/asm-$(ARCH)/Kbuild ]; then \ + echo '*** Error: Headers not exportable for this architecture ($(ARCH))'; \ + exit 1 ; fi $(Q)unifdef -Ux /dev/null $(Q)rm -rf $(INSTALL_HDR_PATH)/include $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.headersinst obj=include @@ -1076,13 +1079,17 @@ help: @echo ' cscope - Generate cscope index' @echo ' kernelrelease - Output the release version string' @echo ' kernelversion - Output the version stored in Makefile' - @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH' + @if [ -r include/asm-$(ARCH)/Kbuild ]; then \ + echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \ + fi @echo ' (default: $(INSTALL_HDR_PATH))' @echo '' @echo 'Static analysers' @echo ' checkstack - Generate a list of stack hogs' @echo ' namespacecheck - Name space analysis on compiled kernel' - @echo ' headers_check - Sanity check on exported headers' + @if [ -r include/asm-$(ARCH)/Kbuild ]; then \ + echo ' headers_check - Sanity check on exported headers'; \ + fi @echo '' @echo 'Kernel packaging:' @$(MAKE) $(build)=$(package-dir) help diff -uprN linux-2.6.18/arch/alpha/Kconfig linux-2.6.18.ovz/arch/alpha/Kconfig --- linux-2.6.18/arch/alpha/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/alpha/Kconfig 2007-06-13 06:55:04.000000000 -0400 @@ -381,7 +381,7 @@ config ALPHA_EV56 config ALPHA_EV56 prompt "EV56 CPU (speed >= 333MHz)?" - depends on ALPHA_NORITAKE && ALPHA_PRIMO + depends on ALPHA_NORITAKE || ALPHA_PRIMO config ALPHA_EV56 prompt "EV56 CPU (speed >= 400MHz)?" diff -uprN linux-2.6.18/arch/alpha/kernel/init_task.c linux-2.6.18.ovz/arch/alpha/kernel/init_task.c --- linux-2.6.18/arch/alpha/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/alpha/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -13,6 +14,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/alpha/kernel/osf_sys.c linux-2.6.18.ovz/arch/alpha/kernel/osf_sys.c --- linux-2.6.18/arch/alpha/kernel/osf_sys.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/alpha/kernel/osf_sys.c 2007-06-13 06:55:04.000000000 -0400 @@ -402,15 +402,15 @@ osf_utsname(char __user *name) down_read(&uts_sem); error = -EFAULT; - if (copy_to_user(name + 0, system_utsname.sysname, 32)) + if (copy_to_user(name + 0, utsname()->sysname, 32)) goto out; - if (copy_to_user(name + 32, system_utsname.nodename, 32)) + if (copy_to_user(name + 32, utsname()->nodename, 32)) goto out; - if (copy_to_user(name + 64, system_utsname.release, 32)) + if (copy_to_user(name + 64, utsname()->release, 32)) goto out; - if (copy_to_user(name + 96, system_utsname.version, 32)) + if (copy_to_user(name + 96, utsname()->version, 32)) goto out; - if (copy_to_user(name + 128, system_utsname.machine, 32)) + if (copy_to_user(name + 128, utsname()->machine, 32)) goto out; error = 0; @@ -449,8 +449,8 @@ osf_getdomainname(char __user *name, int down_read(&uts_sem); for (i = 0; i < len; ++i) { - __put_user(system_utsname.domainname[i], name + i); - if (system_utsname.domainname[i] == '\0') + __put_user(utsname()->domainname[i], name + i); + if 
(utsname()->domainname[i] == '\0') break; } up_read(&uts_sem); @@ -607,12 +607,12 @@ osf_sigstack(struct sigstack __user *uss asmlinkage long osf_sysinfo(int command, char __user *buf, long count) { - static char * sysinfo_table[] = { - system_utsname.sysname, - system_utsname.nodename, - system_utsname.release, - system_utsname.version, - system_utsname.machine, + char *sysinfo_table[] = { + utsname()->sysname, + utsname()->nodename, + utsname()->release, + utsname()->version, + utsname()->machine, "alpha", /* instruction set architecture */ "dummy", /* hardware serial number */ "dummy", /* hardware manufacturer */ @@ -959,7 +959,7 @@ osf_utimes(char __user *filename, struct return -EFAULT; } - return do_utimes(AT_FDCWD, filename, tvs ? ktvs : NULL); + return do_utimes(AT_FDCWD, filename, tvs ? ktvs : NULL, 0); } #define MAX_SELECT_SECONDS \ diff -uprN linux-2.6.18/arch/arm/Kconfig linux-2.6.18.ovz/arch/arm/Kconfig --- linux-2.6.18/arch/arm/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm/Kconfig 2007-06-13 06:55:04.000000000 -0400 @@ -448,7 +448,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL + depends on SMP && HOTPLUG && EXPERIMENTAL && !SCHED_VCPU help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. diff -uprN linux-2.6.18/arch/arm/kernel/calls.S linux-2.6.18.ovz/arch/arm/kernel/calls.S --- linux-2.6.18/arch/arm/kernel/calls.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm/kernel/calls.S 2007-06-13 06:55:04.000000000 -0400 @@ -331,6 +331,19 @@ CALL(sys_mbind) /* 320 */ CALL(sys_get_mempolicy) CALL(sys_set_mempolicy) + CALL(sys_openat) + CALL(sys_mkdirat) + CALL(sys_mknodat) +/* 325 */ CALL(sys_fchownat) + CALL(sys_futimesat) + CALL(sys_fstatat64) + CALL(sys_unlinkat) + CALL(sys_renameat) +/* 330 */ CALL(sys_linkat) + CALL(sys_symlinkat) + CALL(sys_readlinkat) + CALL(sys_fchmodat) + CALL(sys_faccessat) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff -uprN linux-2.6.18/arch/arm/kernel/init_task.c linux-2.6.18.ovz/arch/arm/kernel/init_task.c --- linux-2.6.18/arch/arm/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/arm/kernel/setup.c linux-2.6.18.ovz/arch/arm/kernel/setup.c --- linux-2.6.18/arch/arm/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm/kernel/setup.c 2007-06-13 06:55:04.000000000 -0400 @@ -348,7 +348,7 @@ static void __init setup_processor(void) cpu_name, processor_id, (int)processor_id & 15, proc_arch[cpu_architecture()], cr_alignment); - sprintf(system_utsname.machine, "%s%c", list->arch_name, ENDIANNESS); + sprintf(init_utsname()->machine, "%s%c", list->arch_name, ENDIANNESS); sprintf(elf_platform, "%s%c", list->elf_name, ENDIANNESS); elf_hwcap = list->elf_hwcap; #ifndef CONFIG_ARM_THUMB diff -uprN linux-2.6.18/arch/arm/kernel/smp.c linux-2.6.18.ovz/arch/arm/kernel/smp.c 
--- linux-2.6.18/arch/arm/kernel/smp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm/kernel/smp.c 2007-06-13 06:55:04.000000000 -0400 @@ -196,7 +196,7 @@ int __cpuexit __cpu_disable(void) local_flush_tlb_all(); read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (p->mm) cpu_clear(cpu, p->mm->cpu_vm_mask); } diff -uprN linux-2.6.18/arch/arm26/kernel/init_task.c linux-2.6.18.ovz/arch/arm26/kernel/init_task.c --- linux-2.6.18/arch/arm26/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm26/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/arm26/kernel/setup.c linux-2.6.18.ovz/arch/arm26/kernel/setup.c --- linux-2.6.18/arch/arm26/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/arm26/kernel/setup.c 2007-06-13 06:55:04.000000000 -0400 @@ -143,7 +143,7 @@ static void __init setup_processor(void) dump_cpu_info(); - sprintf(system_utsname.machine, "%s", list->arch_name); + sprintf(init_utsname()->machine, "%s", list->arch_name); sprintf(elf_platform, "%s", list->elf_name); elf_hwcap = list->elf_hwcap; diff -uprN linux-2.6.18/arch/cris/kernel/setup.c linux-2.6.18.ovz/arch/cris/kernel/setup.c --- linux-2.6.18/arch/cris/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/cris/kernel/setup.c 2007-06-13 06:55:04.000000000 -0400 @@ -160,7 +160,7 @@ setup_arch(char **cmdline_p) show_etrax_copyright(); /* Setup utsname */ - strcpy(system_utsname.machine, cris_machine_name); + strcpy(init_utsname()->machine, cris_machine_name); } static void *c_start(struct seq_file *m, loff_t *pos) diff -uprN linux-2.6.18/arch/frv/kernel/init_task.c linux-2.6.18.ovz/arch/frv/kernel/init_task.c --- linux-2.6.18/arch/frv/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/frv/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/frv/mm/mmu-context.c linux-2.6.18.ovz/arch/frv/mm/mmu-context.c --- linux-2.6.18/arch/frv/mm/mmu-context.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/frv/mm/mmu-context.c 2007-06-13 06:55:04.000000000 -0400 @@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid) /* get a handle on the mm_struct */ read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_pid_ve(pid); if (tsk) { ret = -EINVAL; diff -uprN linux-2.6.18/arch/h8300/kernel/init_task.c linux-2.6.18.ovz/arch/h8300/kernel/init_task.c --- linux-2.6.18/arch/h8300/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/h8300/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ static 
struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/i386/Kconfig linux-2.6.18.ovz/arch/i386/Kconfig --- linux-2.6.18/arch/i386/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/Kconfig 2007-06-13 06:55:04.000000000 -0400 @@ -241,6 +241,8 @@ config NR_CPUS This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +source "kernel/Kconfig.fairsched" + config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on X86_HT @@ -785,7 +787,7 @@ config PHYSICAL_START config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER + depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER && !SCHED_VCPU ---help--- Say Y here to experiment with turning CPUs off and on, and to enable suspend on SMP systems. CPUs can be controlled through @@ -1142,12 +1144,16 @@ endmenu source "arch/i386/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" +source "kernel/ub/Kconfig" + # # Use the generic interrupt handling code in kernel/irq/: # diff -uprN linux-2.6.18/arch/i386/Kconfig.cpu linux-2.6.18.ovz/arch/i386/Kconfig.cpu --- linux-2.6.18/arch/i386/Kconfig.cpu 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/Kconfig.cpu 2007-06-13 06:55:04.000000000 -0400 @@ -7,6 +7,7 @@ choice config M386 bool "386" + depends on !UML ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. 
In order to compile a kernel that can run on @@ -301,7 +302,7 @@ config X86_USE_PPRO_CHECKSUM config X86_USE_3DNOW bool - depends on MCYRIXIII || MK7 || MGEODE_LX + depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML default y config X86_OOSTORE diff -uprN linux-2.6.18/arch/i386/kernel/alternative.c linux-2.6.18.ovz/arch/i386/kernel/alternative.c --- linux-2.6.18/arch/i386/kernel/alternative.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/alternative.c 2007-06-13 06:55:04.000000000 -0400 @@ -344,6 +344,7 @@ void alternatives_smp_switch(int smp) void __init alternative_instructions(void) { + unsigned long flags; if (no_replacement) { printk(KERN_INFO "(SMP-)alternatives turned off\n"); free_init_pages("SMP alternatives", @@ -351,6 +352,8 @@ void __init alternative_instructions(voi (unsigned long)__smp_alt_end); return; } + + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); /* switch to patch-once-at-boottime-only mode and free the @@ -386,4 +389,5 @@ void __init alternative_instructions(voi alternatives_smp_switch(0); } #endif + local_irq_restore(flags); } diff -uprN linux-2.6.18/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c --- linux-2.6.18/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c 2007-06-13 06:55:04.000000000 -0400 @@ -560,7 +560,6 @@ static struct cpufreq_driver acpi_cpufre .name = "acpi-cpufreq", .owner = THIS_MODULE, .attr = acpi_cpufreq_attr, - .flags = CPUFREQ_STICKY, }; @@ -571,7 +570,7 @@ acpi_cpufreq_init (void) acpi_cpufreq_early_init_acpi(); - return cpufreq_register_driver(&acpi_cpufreq_driver); + return cpufreq_register_driver(&acpi_cpufreq_driver); } diff -uprN linux-2.6.18/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c --- linux-2.6.18/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2007-06-13 06:55:04.000000000 -0400 @@ -63,7 +63,7 @@ static int cpufreq_p4_setdc(unsigned int if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) return -EINVAL; - rdmsr(MSR_IA32_THERM_STATUS, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); if (l & 0x01) dprintk("CPU#%d currently thermal throttled\n", cpu); @@ -71,10 +71,10 @@ static int cpufreq_p4_setdc(unsigned int if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) newstate = DC_38PT; - rdmsr(MSR_IA32_THERM_CONTROL, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); if (newstate == DC_DISABLE) { dprintk("CPU#%d disabling modulation\n", cpu); - wrmsr(MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); } else { dprintk("CPU#%d setting duty cycle to %d%%\n", cpu, ((125 * newstate) / 10)); @@ -85,7 +85,7 @@ static int cpufreq_p4_setdc(unsigned int */ l = (l & ~14); l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr(MSR_IA32_THERM_CONTROL, l, h); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); } return 0; @@ -112,7 +112,6 @@ static int cpufreq_p4_target(struct cpuf { unsigned int newstate = DC_RESV; struct cpufreq_freqs freqs; - cpumask_t cpus_allowed; int i; if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate)) @@ -133,17 +132,8 @@ static int cpufreq_p4_target(struct cpuf /* run on each logical CPU, see section 
13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - cpus_allowed = current->cpus_allowed; - - for_each_cpu_mask(i, policy->cpus) { - cpumask_t this_cpu = cpumask_of_cpu(i); - - set_cpus_allowed(current, this_cpu); - BUG_ON(smp_processor_id() != i); - + for_each_cpu_mask(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - } - set_cpus_allowed(current, cpus_allowed); /* notifiers */ for_each_cpu_mask(i, policy->cpus) { @@ -267,17 +257,9 @@ static int cpufreq_p4_cpu_exit(struct cp static unsigned int cpufreq_p4_get(unsigned int cpu) { - cpumask_t cpus_allowed; u32 l, h; - cpus_allowed = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - BUG_ON(smp_processor_id() != cpu); - - rdmsr(MSR_IA32_THERM_CONTROL, l, h); - - set_cpus_allowed(current, cpus_allowed); + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); if (l & 0x10) { l = l >> 1; diff -uprN linux-2.6.18/arch/i386/kernel/cpu/cpufreq/powernow-k8.c linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/powernow-k8.c --- linux-2.6.18/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/powernow-k8.c 2007-06-13 06:55:04.000000000 -0400 @@ -84,17 +84,17 @@ static u32 find_khz_freq_from_fiddid(u32 return 1000 * find_freq_from_fiddid(fid, did); } -static u32 find_fid_from_pstate(u32 pstate) +static u32 find_fid_from_pstate(unsigned int cpu, u32 pstate) { u32 hi, lo; - rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi); + rdmsr_on_cpu(cpu, MSR_PSTATE_DEF_BASE + pstate, &lo, &hi); return lo & HW_PSTATE_FID_MASK; } -static u32 find_did_from_pstate(u32 pstate) +static u32 find_did_from_pstate(unsigned int cpu, u32 pstate) { u32 hi, lo; - rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi); + rdmsr_on_cpu(cpu, MSR_PSTATE_DEF_BASE + pstate, &lo, &hi); return (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT; } @@ -116,14 +116,14 @@ static u32 convert_fid_to_vco_fid(u32 fi * Return 1 if the pending bit is set. Unless we just instructed the processor * to transition to a new state, seeing this bit set is really bad news. */ -static int pending_bit_stuck(void) +static int pending_bit_stuck(unsigned int cpu) { u32 lo, hi; if (cpu_family == CPU_HW_PSTATE) return 0; - rdmsr(MSR_FIDVID_STATUS, lo, hi); + rdmsr_on_cpu(cpu, MSR_FIDVID_STATUS, &lo, &hi); return lo & MSR_S_LO_CHANGE_PENDING ? 
1 : 0; } @@ -133,13 +133,14 @@ static int pending_bit_stuck(void) */ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) { + unsigned int cpu = data->cpu; u32 lo, hi; u32 i = 0; if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); + rdmsr_on_cpu(cpu, MSR_PSTATE_STATUS, &lo, &hi); i = lo & HW_PSTATE_MASK; - rdmsr(MSR_PSTATE_DEF_BASE + i, lo, hi); + rdmsr_on_cpu(cpu, MSR_PSTATE_DEF_BASE + i, &lo, &hi); data->currfid = lo & HW_PSTATE_FID_MASK; data->currdid = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT; return 0; @@ -149,7 +150,7 @@ static int query_current_values_with_pen dprintk("detected change pending stuck\n"); return 1; } - rdmsr(MSR_FIDVID_STATUS, lo, hi); + rdmsr_on_cpu(cpu, MSR_FIDVID_STATUS, &lo, &hi); } while (lo & MSR_S_LO_CHANGE_PENDING); data->currvid = hi & MSR_S_HI_CURRENT_VID; @@ -173,18 +174,18 @@ static void count_off_vst(struct powerno } /* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) +static void fidvid_msr_init(unsigned int cpu) { u32 lo, hi; u8 fid, vid; - rdmsr(MSR_FIDVID_STATUS, lo, hi); + rdmsr_on_cpu(cpu, MSR_FIDVID_STATUS, &lo, &hi); vid = hi & MSR_S_HI_CURRENT_VID; fid = lo & MSR_S_LO_CURRENT_FID; lo = fid | (vid << MSR_C_LO_VID_SHIFT); hi = MSR_C_HI_STP_GNT_BENIGN; dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); + wrmsr_on_cpu(cpu, MSR_FIDVID_CTL, lo, hi); } @@ -206,7 +207,7 @@ static int write_new_fid(struct powernow fid, lo, data->plllock * PLL_LOCK_CONVERSION); do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); + wrmsr_on_cpu(data->cpu, MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); if (i++ > 100) { printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n"); return 1; @@ -291,8 +292,8 @@ static int decrease_vid_code_by_step(str /* Change hardware pstate by single MSR write */ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) { - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currfid = find_fid_from_pstate(pstate); + wrmsr_on_cpu(data->cpu, MSR_PSTATE_CTRL, pstate, 0); + data->currfid = find_fid_from_pstate(data->cpu, pstate); return 0; } @@ -335,7 +336,7 @@ static int core_voltage_pre_transition(s smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); + rdmsr_on_cpu(data->cpu, MSR_FIDVID_STATUS, &lo, &maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); if (reqvid < maxvid) /* lower numbers are higher voltages */ @@ -499,22 +500,13 @@ static int core_voltage_post_transition( static int check_supported_cpu(unsigned int cpu) { - cpumask_t oldmask = CPU_MASK_ALL; u32 eax, ebx, ecx, edx; unsigned int rc = 0; - oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); - goto out; - } - - if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (cpu_data[cpu].x86_vendor != X86_VENDOR_AMD) goto out; - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + eax = cpuid_eax_on_cpu(cpu, CPUID_PROCESSOR_SIGNATURE); if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) goto out; @@ -526,20 +518,20 @@ static int check_supported_cpu(unsigned goto out; } - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); + eax = cpuid_eax_on_cpu(cpu, CPUID_GET_MAX_CAPABILITIES); if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { 
printk(KERN_INFO PFX "No frequency change capabilities detected\n"); goto out; } - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + cpuid_on_cpu(cpu, CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { printk(KERN_INFO PFX "Power state transitions not supported\n"); goto out; } } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + cpuid_on_cpu(cpu, CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) cpu_family = CPU_HW_PSTATE; else @@ -549,7 +541,6 @@ static int check_supported_cpu(unsigned rc = 1; out: - set_cpus_allowed(current, oldmask); return rc; } @@ -849,7 +840,7 @@ static int fill_powernow_table_pstate(st printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); + rdmsr_on_cpu(data->cpu, MSR_PSTATE_DEF_BASE + index, &lo, &hi); if (!(hi & HW_PSTATE_VALID_MASK)) { dprintk("invalid pstate %d, ignoring\n", index); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; @@ -1035,8 +1026,8 @@ static int transition_frequency_pstate(s } res = transition_pstate(data, pstate); - data->currfid = find_fid_from_pstate(pstate); - data->currdid = find_did_from_pstate(pstate); + data->currfid = find_fid_from_pstate(data->cpu, pstate); + data->currdid = find_did_from_pstate(data->cpu, pstate); freqs.new = find_khz_freq_from_fiddid(data->currfid, data->currdid); for_each_cpu_mask(i, *(data->available_cores)) { @@ -1049,7 +1040,6 @@ static int transition_frequency_pstate(s /* Driver entry point to switch to the target frequency */ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - cpumask_t oldmask = CPU_MASK_ALL; struct powernow_k8_data *data = powernow_data[pol->cpu]; u32 checkfid; u32 checkvid; @@ -1062,16 +1052,7 @@ static int powernowk8_target(struct cpuf checkfid = data->currfid; checkvid = data->currvid; - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { + if (pending_bit_stuck(pol->cpu)) { printk(KERN_ERR PFX "failing targ, change pending bit set\n"); goto err_out; } @@ -1122,7 +1103,6 @@ static int powernowk8_target(struct cpuf ret = 0; err_out: - set_cpus_allowed(current, oldmask); return ret; } @@ -1141,7 +1121,6 @@ static int powernowk8_verify(struct cpuf static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; - cpumask_t oldmask = CPU_MASK_ALL; int rc; if (!cpu_online(pol->cpu)) @@ -1180,16 +1159,7 @@ static int __cpuinit powernowk8_cpu_init } } - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { + if (pending_bit_stuck(pol->cpu)) { printk(KERN_ERR PFX "failing init, change pending bit set\n"); goto err_out; } @@ -1198,10 +1168,7 @@ static int __cpuinit powernowk8_cpu_init goto err_out; if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - /* run on any CPU again */ - set_cpus_allowed(current, oldmask); + 
fidvid_msr_init(pol->cpu); pol->governor = CPUFREQ_DEFAULT_GOVERNOR; if (cpu_family == CPU_HW_PSTATE) @@ -1244,7 +1211,6 @@ static int __cpuinit powernowk8_cpu_init return 0; err_out: - set_cpus_allowed(current, oldmask); powernow_k8_cpu_exit_acpi(data); kfree(data); @@ -1271,7 +1237,6 @@ static int __devexit powernowk8_cpu_exit static unsigned int powernowk8_get (unsigned int cpu) { struct powernow_k8_data *data; - cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; data = powernow_data[first_cpu(cpu_core_map[cpu])]; @@ -1279,20 +1244,12 @@ static unsigned int powernowk8_get (unsi if (!data) return -EINVAL; - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); - set_cpus_allowed(current, oldmask); - return 0; - } - if (query_current_values_with_pending_wait(data)) goto out; khz = find_khz_freq_from_fid(data->currfid); out: - set_cpus_allowed(current, oldmask); return khz; } diff -uprN linux-2.6.18/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c --- linux-2.6.18/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c 2007-06-13 06:55:04.000000000 -0400 @@ -17,7 +17,6 @@ #include #include #include -#include /* current */ #include #include @@ -318,14 +317,8 @@ static unsigned int get_cur_freq(unsigne { unsigned l, h; unsigned clock_freq; - cpumask_t saved_mask; - saved_mask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) - return 0; - - rdmsr(MSR_IA32_PERF_STATUS, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); clock_freq = extract_clock(l, cpu, 0); if (unlikely(clock_freq == 0)) { @@ -335,11 +328,10 @@ static unsigned int get_cur_freq(unsigne * P-state transition (like TM2). Get the last freq set * in PERF_CTL. */ - rdmsr(MSR_IA32_PERF_CTL, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); clock_freq = extract_clock(l, cpu, 1); } - set_cpus_allowed(current, saved_mask); return clock_freq; } @@ -548,15 +540,15 @@ static int centrino_cpu_init(struct cpuf /* Check to see if Enhanced SpeedStep is enabled, and try to enable it if not. 
*/ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); + rdmsr_on_cpu(policy->cpu, MSR_IA32_MISC_ENABLE, &l, &h); if (!(l & (1<<16))) { l |= (1<<16); dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr_on_cpu(policy->cpu, MSR_IA32_MISC_ENABLE, l, h); /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); + rdmsr_on_cpu(policy->cpu, MSR_IA32_MISC_ENABLE, &l, &h); if (!(l & (1<<16))) { printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; @@ -636,7 +628,6 @@ static int centrino_target (struct cpufr unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; - cpumask_t saved_mask; cpumask_t set_mask; cpumask_t covered_cpus; int retval = 0; @@ -660,7 +651,6 @@ static int centrino_target (struct cpufr online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; first_cpu = 1; cpus_clear(covered_cpus); for_each_cpu_mask(j, online_policy_cpus) { @@ -674,8 +664,7 @@ static int centrino_target (struct cpufr else cpu_set(j, set_mask); - set_cpus_allowed(current, set_mask); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(j, set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -688,7 +677,7 @@ static int centrino_target (struct cpufr msr = centrino_model[cpu]->op_points[newstate].index; if (first_cpu) { - rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); + rdmsr_on_cpu(j, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { dprintk("no change needed - msr was and needs " "to be %x\n", oldmsr); @@ -715,7 +704,7 @@ static int centrino_target (struct cpufr oldmsr |= msr; } - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) break; @@ -737,8 +726,7 @@ static int centrino_target (struct cpufr if (!cpus_empty(covered_cpus)) { for_each_cpu_mask(j, covered_cpus) { - set_cpus_allowed(current, cpumask_of_cpu(j)); - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); } } @@ -753,7 +741,6 @@ static int centrino_target (struct cpufr } migrate_end: - set_cpus_allowed(current, saved_mask); return 0; } diff -uprN linux-2.6.18/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.18.ovz/arch/i386/kernel/cpu/mtrr/if.c --- linux-2.6.18/arch/i386/kernel/cpu/mtrr/if.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/mtrr/if.c 2007-06-13 06:55:04.000000000 -0400 @@ -392,7 +392,7 @@ static int __init mtrr_if_init(void) return -ENODEV; proc_root_mtrr = - create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); + create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); if (proc_root_mtrr) { proc_root_mtrr->owner = THIS_MODULE; proc_root_mtrr->proc_fops = &mtrr_fops; diff -uprN linux-2.6.18/arch/i386/kernel/cpu/proc.c linux-2.6.18.ovz/arch/i386/kernel/cpu/proc.c --- linux-2.6.18/arch/i386/kernel/cpu/proc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/cpu/proc.c 2007-06-13 06:55:04.000000000 -0400 @@ -4,6 +4,7 @@ #include #include #include +#include /* * Get CPU information for use by the procfs. 
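The cpufreq driver conversions above (p4-clockmod, powernow-k8, speedstep-centrino) all apply the same transformation: instead of temporarily rebinding the current task with set_cpus_allowed() so that a plain rdmsr()/wrmsr() executes on the right processor, the drivers call the rdmsr_on_cpu()/wrmsr_on_cpu() helpers this patch adds under arch/i386/lib further below. A minimal sketch of the old and new idioms, using MSR_IA32_PERF_STATUS as an arbitrary example register; the wrapper names are hypothetical, not patch code:

	#include <linux/types.h>
	#include <linux/sched.h>
	#include <linux/cpumask.h>
	#include <asm/msr.h>

	static u32 read_perf_status_old(unsigned int cpu)
	{
		cpumask_t saved = current->cpus_allowed;
		u32 l, h;

		/* Old idiom: migrate ourselves to the target CPU, read,
		 * then migrate back.  Slow, and quietly wrong if the
		 * scheduler does not honor the mask immediately. */
		set_cpus_allowed(current, cpumask_of_cpu(cpu));
		rdmsr(MSR_IA32_PERF_STATUS, l, h);
		set_cpus_allowed(current, saved);
		return l;
	}

	static u32 read_perf_status_new(unsigned int cpu)
	{
		u32 l, h;

		/* New idiom: the helper performs the access on the target
		 * CPU via an IPI, or directly if we already run there. */
		rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
		return l;
	}

The helper form also stops relying on task affinity as a side channel, which presumably matters once this patch decouples virtual CPUs from physical ones (note the !SCHED_VCPU guards added to CPU hotplug).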
@@ -77,7 +78,7 @@ static int show_cpuinfo(struct seq_file int fpu_exception; #ifdef CONFIG_SMP - if (!cpu_online(n)) + if (!vcpu_online(n)) return 0; #endif seq_printf(m, "processor\t: %d\n" @@ -97,9 +98,13 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "stepping\t: unknown\n"); if ( cpu_has(c, X86_FEATURE_TSC) ) { +#ifndef CONFIG_FAIRSCHED unsigned int freq = cpufreq_quick_get(n); if (!freq) freq = cpu_khz; +#else + unsigned int freq = ve_scale_khz(cpu_khz); +#endif seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } diff -uprN linux-2.6.18/arch/i386/kernel/entry.S linux-2.6.18.ovz/arch/i386/kernel/entry.S --- linux-2.6.18/arch/i386/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/entry.S 2007-06-13 06:55:04.000000000 -0400 @@ -209,6 +209,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%ebp) popl %eax CFI_ADJUST_CFA_OFFSET -4 +ret_from_fork_tail: pushl $0x0202 # Reset kernel eflags CFI_ADJUST_CFA_OFFSET 4 popfl @@ -216,6 +217,25 @@ ENTRY(ret_from_fork) jmp syscall_exit CFI_ENDPROC +ENTRY(i386_ret_from_resume) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + movl (%esp),%eax + testl %eax,%eax + jz 1f + pushl %esp + call *%eax + addl $4,%esp +1: + addl $256,%esp + jmp ret_from_fork_tail + CFI_ENDPROC + /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to diff -uprN linux-2.6.18/arch/i386/kernel/i386_ksyms.c linux-2.6.18.ovz/arch/i386/kernel/i386_ksyms.c --- linux-2.6.18/arch/i386/kernel/i386_ksyms.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/i386_ksyms.c 2007-06-13 06:55:04.000000000 -0400 @@ -1,6 +1,7 @@ #include #include #include +#include EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); @@ -27,4 +28,5 @@ EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); #endif +EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(csum_partial); diff -uprN linux-2.6.18/arch/i386/kernel/init_task.c linux-2.6.18.ovz/arch/i386/kernel/init_task.c --- linux-2.6.18/arch/i386/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/init_task.c 2007-06-13 06:55:04.000000000 -0400 @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/i386/kernel/ldt.c linux-2.6.18.ovz/arch/i386/kernel/ldt.c --- linux-2.6.18/arch/i386/kernel/ldt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/ldt.c 2007-06-13 06:55:04.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,8 @@ #include #include +#include + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) { @@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, i oldsize = pc->size; mincount = (mincount+511)&(~511); if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); if 
(!newldt) return -ENOMEM; @@ -105,6 +108,7 @@ int init_new_context(struct task_struct } return retval; } +EXPORT_SYMBOL_GPL(init_new_context); /* * No need to lock the MM as we are the last user @@ -251,3 +255,5 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +EXPORT_SYMBOL_GPL(default_ldt); diff -uprN linux-2.6.18/arch/i386/kernel/microcode.c linux-2.6.18.ovz/arch/i386/kernel/microcode.c --- linux-2.6.18/arch/i386/kernel/microcode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/microcode.c 2007-06-13 06:55:04.000000000 -0400 @@ -250,14 +250,14 @@ static int find_matching_ucodes (void) } total_size = get_totalsize(&mc_header); - if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) { + if (cursor + total_size > user_buffer_size) { printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); error = -EINVAL; goto out; } data_size = get_datasize(&mc_header); - if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) { + if (data_size + MC_HEADER_SIZE > total_size) { printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); error = -EINVAL; goto out; @@ -460,11 +460,6 @@ static ssize_t microcode_write (struct f { ssize_t ret; - if (len < DEFAULT_UCODE_TOTALSIZE) { - printk(KERN_ERR "microcode: not enough data\n"); - return -EINVAL; - } - if ((len >> PAGE_SHIFT) > num_physpages) { printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); return -EINVAL; diff -uprN linux-2.6.18/arch/i386/kernel/nmi.c linux-2.6.18.ovz/arch/i386/kernel/nmi.c --- linux-2.6.18/arch/i386/kernel/nmi.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/nmi.c 2007-06-13 06:55:04.000000000 -0400 @@ -579,7 +579,22 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +void smp_show_regs(struct pt_regs *regs, void *info) +{ + static DEFINE_SPINLOCK(show_regs_lock); + + if (regs == NULL) + return; + + bust_spinlocks(1); + spin_lock(&show_regs_lock); + printk("----------- IPI show regs -----------"); + show_regs(regs); + spin_unlock(&show_regs_lock); + bust_spinlocks(0); +} + +void nmi_watchdog_tick(struct pt_regs *regs) { /* @@ -595,10 +610,10 @@ void nmi_watchdog_tick (struct pt_regs * if (last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... + * wait a few IRQs (30 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) + if (alert_counter[cpu] == 30*nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. 
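The nmi.c hunk above raises the lockup threshold from 5*nmi_hz to 30*nmi_hz ticks. Since the watchdog tick fires nmi_hz times per second and alert_counter is incremented once per tick while the interrupt count stays frozen, the comparison converts directly into seconds; a trivial sketch of the relation, with an illustrative function name:

	/* threshold_ticks / nmi_hz == seconds of apparent lockup;
	 * e.g. 30*nmi_hz ticks ~= 30 seconds.  Illustration only. */
	static unsigned int lockup_seconds(unsigned int threshold_ticks,
					   unsigned int nmi_hz)
	{
		return threshold_ticks / nmi_hz;
	}

The new smp_show_regs() handler above pairs with this: its static spinlock serializes dumps arriving from several CPUs at once so their output does not interleave.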
*/ diff -uprN linux-2.6.18/arch/i386/kernel/process.c linux-2.6.18.ovz/arch/i386/kernel/process.c --- linux-2.6.18/arch/i386/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/process.c 2007-06-13 06:55:04.000000000 -0400 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,8 @@ #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); +EXPORT_SYMBOL_GPL(i386_ret_from_resume); static int hlt_counter; @@ -287,18 +290,22 @@ __setup("idle=", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + extern int die_counter; printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + printk("Pid: %d, comm: %20s, oopses: %d\n", + current->pid, current->comm, die_counter); + printk("EIP: %04x:[<%08lx>] CPU: %d, VCPU: %d:%d\n",0xffff & regs->xcs,regs->eip, smp_processor_id(), + task_vsched_id(current), task_cpu(current)); + if (decode_call_traces) + print_symbol("EIP is at %s\n", regs->eip); if (user_mode_vm(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s %.*s)\n", - regs->eflags, print_tainted(), system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + regs->eflags, print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", @@ -312,6 +319,8 @@ void show_regs(struct pt_regs * regs) cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); show_trace(NULL, regs, ®s->esp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n",regs->eip); } /* @@ -320,8 +329,10 @@ void show_regs(struct pt_regs * regs) * the "args". */ extern void kernel_thread_helper(void); +EXPORT_SYMBOL(kernel_thread_helper); __asm__(".section .text\n" ".align 4\n" + ".global kernel_thread_helper\n" "kernel_thread_helper:\n\t" "movl %edx,%eax\n\t" "pushl %edx\n\t" @@ -337,6 +348,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.ebx = (unsigned long) fn; diff -uprN linux-2.6.18/arch/i386/kernel/ptrace.c linux-2.6.18.ovz/arch/i386/kernel/ptrace.c --- linux-2.6.18/arch/i386/kernel/ptrace.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/ptrace.c 2007-06-13 06:55:04.000000000 -0400 @@ -709,7 +709,9 @@ int do_syscall_trace(struct pt_regs *reg /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ /* Note that the debugger could change the result of test_thread_flag!*/ + set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY); ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 
0x80:0)); + clear_pn_state(current); /* * this isn't the same as continuing with a signal, but it will do diff -uprN linux-2.6.18/arch/i386/kernel/signal.c linux-2.6.18.ovz/arch/i386/kernel/signal.c --- linux-2.6.18/arch/i386/kernel/signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/signal.c 2007-06-13 06:55:04.000000000 -0400 @@ -583,6 +583,9 @@ static void fastcall do_signal(struct pt if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else @@ -611,6 +614,7 @@ static void fastcall do_signal(struct pt return; } +no_signal: /* Did we come from a system call? */ if (regs->orig_eax >= 0) { /* Restart the system call - no handlers present */ diff -uprN linux-2.6.18/arch/i386/kernel/smp.c linux-2.6.18.ovz/arch/i386/kernel/smp.c --- linux-2.6.18/arch/i386/kernel/smp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/smp.c 2007-06-13 06:55:04.000000000 -0400 @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -442,6 +443,8 @@ void flush_tlb_mm (struct mm_struct * mm preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); + void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; @@ -572,6 +575,89 @@ int smp_call_function (void (*func) (voi } EXPORT_SYMBOL(smp_call_function); +static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED; +static struct nmi_call_data_struct { + smp_nmi_function func; + void *info; + atomic_t started; + atomic_t finished; + cpumask_t cpus_called; + int wait; +} *nmi_call_data; + +static int smp_nmi_callback(struct pt_regs * regs, int cpu) +{ + smp_nmi_function func; + void *info; + int wait; + + func = nmi_call_data->func; + info = nmi_call_data->info; + wait = nmi_call_data->wait; + ack_APIC_irq(); + /* prevent from calling func() multiple times */ + if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) + return 0; + /* + * notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&nmi_call_data->started); + /* at this point the nmi_call_data structure is out of scope */ + irq_enter(); + func(regs, info); + irq_exit(); + if (wait) + atomic_inc(&nmi_call_data->finished); + + return 0; +} + +/* + * This function tries to call func(regs, info) on each cpu. + * Func must be fast and non-blocking. + * May be called with disabled interrupts and from any context. 
+ */ +int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + struct nmi_call_data_struct data; + int cpus; + + cpus = num_online_cpus() - 1; + if (!cpus) + return 0; + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.started, 0); + atomic_set(&data.finished, 0); + cpus_clear(data.cpus_called); + /* prevent this cpu from calling func if NMI happens */ + cpu_set(smp_processor_id(), data.cpus_called); + + if (!spin_trylock(&nmi_call_lock)) + return -1; + + nmi_call_data = &data; + set_nmi_ipi_callback(smp_nmi_callback); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(APIC_DM_NMI); + while (atomic_read(&data.started) != cpus) + barrier(); + + unset_nmi_ipi_callback(); + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&nmi_call_lock); + + return 0; +} + static void stop_this_cpu (void * dummy) { /* diff -uprN linux-2.6.18/arch/i386/kernel/smpboot.c linux-2.6.18.ovz/arch/i386/kernel/smpboot.c --- linux-2.6.18/arch/i386/kernel/smpboot.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/smpboot.c 2007-06-13 06:55:04.000000000 -0400 @@ -320,6 +320,10 @@ static void __init synchronize_tsc_bp(vo } if (!buggy) printk("passed.\n"); +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif } static void __init synchronize_tsc_ap(void) @@ -347,6 +351,10 @@ static void __init synchronize_tsc_ap(vo while (atomic_read(&tsc.count_stop) != num_booting_cpus()) cpu_relax(); } +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif } #undef NR_LOOPS @@ -642,9 +650,13 @@ static void map_cpu_to_logical_apicid(vo { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); + int node = apicid_to_node(apicid); + + if (!node_online(node)) + node = first_online_node; cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, apicid_to_node(apicid)); + map_cpu_to_node(cpu, node); } static void unmap_cpu_to_logical_apicid(int cpu) @@ -937,6 +949,13 @@ static int __devinit do_boot_cpu(int api if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); idle->thread.eip = (unsigned long) start_secondary; + +#ifdef CONFIG_VE + /* Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. */ + VE_TASK_INFO(idle)->sleep_time = 0; +#endif + /* start_eip had better be page-aligned! 
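smp_nmi_call_function(), added above, mirrors the started/finished handshake of the generic smp_call_function() but delivers the cross-call as an NMI, so it reaches CPUs that spin with interrupts disabled. A sketch of a caller under that contract, assuming the prototype matches the definition above; the handler and counter names are hypothetical:

	#include <linux/kernel.h>
	#include <linux/ptrace.h>
	#include <asm/atomic.h>

	static atomic_t responding_cpus = ATOMIC_INIT(0);

	/* Runs in NMI context on every other online CPU: must be fast
	 * and non-blocking, exactly as the comment above demands. */
	static void count_responders(struct pt_regs *regs, void *info)
	{
		atomic_inc(&responding_cpus);
	}

	static void probe_cpus_via_nmi(void)
	{
		atomic_set(&responding_cpus, 0);
		/* wait=1: spin until every other CPU ran the handler;
		 * a negative return means another NMI call is in flight. */
		if (smp_nmi_call_function(count_responders, NULL, 1) < 0)
			printk("NMI cross-call already in progress\n");
	}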
*/ start_eip = setup_trampoline(); diff -uprN linux-2.6.18/arch/i386/kernel/sys_i386.c linux-2.6.18.ovz/arch/i386/kernel/sys_i386.c --- linux-2.6.18/arch/i386/kernel/sys_i386.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/sys_i386.c 2007-06-13 06:55:04.000000000 -0400 @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -226,16 +226,21 @@ asmlinkage int sys_olduname(struct oldol down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - error |= __put_user(0,name->sysname+__OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - error |= __put_user(0,name->nodename+__OLD_UTS_LEN); - error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - error |= __put_user(0,name->release+__OLD_UTS_LEN); - error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - error |= __put_user(0,name->version+__OLD_UTS_LEN); - error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); - error |= __put_user(0,name->machine+__OLD_UTS_LEN); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); up_read(&uts_sem); diff -uprN linux-2.6.18/arch/i386/kernel/syscall_table.S linux-2.6.18.ovz/arch/i386/kernel/syscall_table.S --- linux-2.6.18/arch/i386/kernel/syscall_table.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/syscall_table.S 2007-06-13 06:55:04.000000000 -0400 @@ -317,3 +317,24 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + .long sys_fairsched_vcpus /* 505 */ + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_lchmod /* 516 */ + .long sys_lutime diff -uprN linux-2.6.18/arch/i386/kernel/traps.c linux-2.6.18.ovz/arch/i386/kernel/traps.c --- linux-2.6.18/arch/i386/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/kernel/traps.c 2007-06-13 06:55:04.000000000 -0400 @@ -125,7 +125,8 @@ static inline void print_addr_and_symbol { printk(" [<%08lx>] ", addr); - print_symbol("%s\n", addr); + if (decode_call_traces) + print_symbol("%s\n", addr); } static inline unsigned long print_context_stack(struct thread_info *tinfo, @@ -224,7 +225,10 @@ static void show_trace_log_lvl(struct ta stack = (unsigned 
long*)context->previous_esp; if (!stack) break; - printk("%s =======================\n", log_lvl); + if (decode_call_traces) + printk("%s =======================\n", log_lvl); + else + printk("%s == ", log_lvl); } } @@ -254,8 +258,13 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } - printk("\n%sCall Trace:\n", log_lvl); + if (decode_call_traces) + printk("\n%s Call Trace:\n", log_lvl); + else + printk("\n%s Call Trace: ", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + if (!decode_call_traces) + printk("\n"); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -272,6 +281,8 @@ void dump_stack(void) unsigned long stack; show_trace(current, NULL, &stack); + if (!decode_call_traces) + printk("\n"); } EXPORT_SYMBOL(dump_stack); @@ -291,12 +302,13 @@ void show_registers(struct pt_regs *regs ss = regs->xss & 0xffff; } print_modules(); - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" + printk(KERN_EMERG "CPU: %d, VCPU: %d.%d\nEIP: %04x:[<%08lx>] %s VLI\n" "EFLAGS: %08lx (%s %.*s) \n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, - print_tainted(), regs->eflags, system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + smp_processor_id(), task_vsched_id(current), task_cpu(current), + 0xffff & regs->xcs, regs->eip, + print_tainted(), regs->eflags, init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->eax, regs->ebx, regs->ecx, regs->edx); @@ -304,8 +316,9 @@ void show_registers(struct pt_regs *regs regs->esi, regs->edi, regs->ebp, esp); printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", regs->xds & 0xffff, regs->xes & 0xffff, ss); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, + VEID(VE_TASK_INFO(current)->owner_env), current_thread_info(), current, current->thread_info); /* * When in-kernel, we also print out the stack and code at the @@ -356,9 +369,9 @@ static void handle_BUG(struct pt_regs *r char *file; char c; - if (__get_user(line, (unsigned short __user *)(eip + 2))) + if (__get_user(line, (unsigned short __user *)(eip + 4))) break; - if (__get_user(file, (char * __user *)(eip + 4)) || + if (__get_user(file, (char * __user *)(eip + 7)) || (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) file = ""; @@ -369,6 +382,15 @@ static void handle_BUG(struct pt_regs *r printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); } +int die_counter = 0; + +static void inline check_kernel_csum_bug(void) +{ + if (kernel_text_csum_broken) + printk("Kernel code checksum mismatch detected %d times\n", + kernel_text_csum_broken); +} + /* This is gone through when something in the kernel * has done something bad and is about to be terminated. 
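The hunks above split uname reporting into two accessors: user-visible paths such as sys_uname()/sys_olduname() now go through utsname(), while kernel diagnostics such as show_registers() use init_utsname(). A sketch of the intended split, as inferred from those call sites; the printing function is hypothetical:

	#include <linux/kernel.h>
	#include <linux/sched.h>
	#include <linux/utsname.h>

	static void print_release_views(void)
	{
		down_read(&uts_sem);
		/* Per-container view: what a task inside a VE should see. */
		printk("container release: %s\n", utsname()->release);
		/* Host view: what oops/panic output should keep printing. */
		printk("host release:      %s\n", init_utsname()->release);
		up_read(&uts_sem);
	}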
*/ @@ -383,7 +405,6 @@ void die(const char * str, struct pt_reg .lock_owner = -1, .lock_owner_depth = 0 }; - static int die_counter; unsigned long flags; oops_enter(); @@ -443,6 +464,7 @@ void die(const char * str, struct pt_reg } else printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + check_kernel_csum_bug(); bust_spinlocks(0); die.lock_owner = -1; spin_unlock_irqrestore(&die.lock, flags); @@ -672,12 +694,27 @@ static void unknown_nmi_error(unsigned c printk("Do you have a strange power saving mode enabled?\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); +/* + * Voyager doesn't implement these + */ +void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) +{ +} + +#ifdef CONFIG_SMP +int __attribute__((weak)) +smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + return 0; +} +#endif void die_nmi (struct pt_regs *regs, const char *msg) { + static DEFINE_SPINLOCK(nmi_print_lock); + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + NOTIFY_STOP) return; spin_lock(&nmi_print_lock); @@ -690,7 +727,11 @@ void die_nmi (struct pt_regs *regs, cons printk(" on CPU%d, eip %08lx, registers:\n", smp_processor_id(), regs->eip); show_registers(regs); - printk(KERN_EMERG "console shuts up ...\n"); + smp_nmi_call_function(smp_show_regs, NULL, 1); + bust_spinlocks(1); + /* current CPU messages should go bottom */ + if (!decode_call_traces) + smp_show_regs(regs, NULL); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); @@ -706,6 +747,14 @@ void die_nmi (struct pt_regs *regs, cons do_exit(SIGSEGV); } +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -728,6 +777,9 @@ static void default_do_nmi(struct pt_reg return; } #endif + if (nmi_ipi_callback != dummy_nmi_callback) + return; + unknown_nmi_error(reason, regs); return; } @@ -744,13 +796,6 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - fastcall void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -764,9 +809,20 @@ fastcall void do_nmi(struct pt_regs * re if (!rcu_dereference(nmi_callback)(regs, cpu)) default_do_nmi(regs); + nmi_ipi_callback(regs, cpu); nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + void set_nmi_callback(nmi_callback_t callback) { vmalloc_sync_all(); diff -uprN linux-2.6.18/arch/i386/lib/Makefile linux-2.6.18.ovz/arch/i386/lib/Makefile --- linux-2.6.18/arch/i386/lib/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/lib/Makefile 2007-06-13 06:55:04.000000000 -0400 @@ -7,3 +7,5 @@ lib-y = checksum.o delay.o usercopy.o ge bitops.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o + +obj-$(CONFIG_SMP) += cpuid-on-cpu.o msr-on-cpu.o diff -uprN linux-2.6.18/arch/i386/lib/cpuid-on-cpu.c linux-2.6.18.ovz/arch/i386/lib/cpuid-on-cpu.c --- linux-2.6.18/arch/i386/lib/cpuid-on-cpu.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/arch/i386/lib/cpuid-on-cpu.c 2007-06-13 06:55:04.000000000 -0400 @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +struct cpuid_info { + unsigned int cpu; + u32 op; 
+ u32 eax, ebx, ecx, edx; +}; + +static void __cpuid_on_cpu(void *info) +{ + struct cpuid_info *rv = info; + + if (smp_processor_id() == rv->cpu) + cpuid(rv->op, &rv->eax, &rv->ebx, &rv->ecx, &rv->edx); +} + +void cpuid_on_cpu(unsigned int cpu, u32 op, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + preempt_disable(); + if (smp_processor_id() == cpu) + cpuid(op, eax, ebx, ecx, edx); + else { + struct cpuid_info rv; + + rv.cpu = cpu; + rv.op = op; + smp_call_function(__cpuid_on_cpu, &rv, 0, 1); + *eax = rv.eax; + *ebx = rv.ebx; + *ecx = rv.ecx; + *edx = rv.edx; + } + preempt_enable(); +} + +struct cpuid_eax_info { + unsigned int cpu; + u32 op; + u32 eax; +}; + +static void __cpuid_eax_on_cpu(void *info) +{ + struct cpuid_info *rv = info; + + if (smp_processor_id() == rv->cpu) + rv->eax = cpuid_eax(rv->op); +} + +u32 cpuid_eax_on_cpu(unsigned int cpu, u32 op) +{ + u32 ret; + + preempt_disable(); + if (smp_processor_id() == cpu) + ret = cpuid_eax(op); + else { + struct cpuid_eax_info rv; + + rv.cpu = cpu; + rv.op = op; + smp_call_function(__cpuid_eax_on_cpu, &rv, 0, 1); + ret = rv.eax; + } + preempt_enable(); + return ret; +} + +EXPORT_SYMBOL(cpuid_on_cpu); +EXPORT_SYMBOL(cpuid_eax_on_cpu); diff -uprN linux-2.6.18/arch/i386/lib/msr-on-cpu.c linux-2.6.18.ovz/arch/i386/lib/msr-on-cpu.c --- linux-2.6.18/arch/i386/lib/msr-on-cpu.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/arch/i386/lib/msr-on-cpu.c 2007-06-13 06:55:04.000000000 -0400 @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +struct msr_info { + unsigned int cpu; + u32 msr_no; + u32 l, h; +}; + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + + if (smp_processor_id() == rv->cpu) + rdmsr(rv->msr_no, rv->l, rv->h); +} + +void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + preempt_disable(); + if (smp_processor_id() == cpu) + rdmsr(msr_no, *l, *h); + else { + struct msr_info rv; + + rv.cpu = cpu; + rv.msr_no = msr_no; + smp_call_function(__rdmsr_on_cpu, &rv, 0, 1); + *l = rv.l; + *h = rv.h; + } + preempt_enable(); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + + if (smp_processor_id() == rv->cpu) + wrmsr(rv->msr_no, rv->l, rv->h); +} + +void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + preempt_disable(); + if (smp_processor_id() == cpu) + wrmsr(msr_no, l, h); + else { + struct msr_info rv; + + rv.cpu = cpu; + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + smp_call_function(__wrmsr_on_cpu, &rv, 0, 1); + } + preempt_enable(); +} + +EXPORT_SYMBOL(rdmsr_on_cpu); +EXPORT_SYMBOL(wrmsr_on_cpu); diff -uprN linux-2.6.18/arch/i386/mm/boot_ioremap.c linux-2.6.18.ovz/arch/i386/mm/boot_ioremap.c --- linux-2.6.18/arch/i386/mm/boot_ioremap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/mm/boot_ioremap.c 2007-06-13 06:55:04.000000000 -0400 @@ -29,8 +29,11 @@ */ #define BOOT_PTE_PTRS (PTRS_PER_PTE*2) -#define boot_pte_index(address) \ - (((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1)) + +static unsigned long boot_pte_index(unsigned long vaddr) +{ + return __pa(vaddr) >> PAGE_SHIFT; +} static inline boot_pte_t* boot_vaddr_to_pte(void *address) { diff -uprN linux-2.6.18/arch/i386/mm/fault.c linux-2.6.18.ovz/arch/i386/mm/fault.c --- linux-2.6.18/arch/i386/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -65,32 +65,6 @@ static inline int notify_page_fault(enum /* - * Unlock any spinlocks which will prevent us from getting the - * message out - */ 
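Both new files, cpuid-on-cpu.c and msr-on-cpu.c, share one pattern: pin the local CPU with preempt_disable(), run the operation directly if we already sit on the target CPU, otherwise broadcast through smp_call_function() with a callback that ignores every CPU except the requested one. A distilled sketch of that pattern with hypothetical names (struct remote_call, do_something_locally, run_on_cpu are illustrative, not patch code):

	#include <linux/smp.h>
	#include <linux/preempt.h>

	struct remote_call {
		unsigned int cpu;	/* target CPU */
		int result;
	};

	static int do_something_locally(void)
	{
		return smp_processor_id();	/* stand-in payload */
	}

	static void remote_callback(void *info)
	{
		struct remote_call *rc = info;

		/* The broadcast reaches all other CPUs; only the target acts. */
		if (smp_processor_id() == rc->cpu)
			rc->result = do_something_locally();
	}

	static int run_on_cpu(unsigned int cpu)
	{
		struct remote_call rc = { .cpu = cpu, .result = -1 };

		preempt_disable();		/* keep our own cpu id stable */
		if (smp_processor_id() == cpu)
			rc.result = do_something_locally();
		else
			/* 2.6.18 signature: (func, info, nonatomic, wait) */
			smp_call_function(remote_callback, &rc, 0, 1);
		preempt_enable();
		return rc.result;
	}

Broadcasting to reach a single CPU is wasteful, but it keeps the helpers free of any per-arch single-CPU IPI primitive, which 2.6.18 does not provide generically on i386.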
-void bust_spinlocks(int yes) -{ - int loglevel_save = console_loglevel; - - if (yes) { - oops_in_progress = 1; - return; - } -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; -} - -/* * Return EIP plus the CS segment base. The segment limit is also * adjusted, clamped to the kernel/user address space (whichever is * appropriate), and returned in *eip_limit. @@ -453,7 +427,6 @@ good_area: goto bad_area; } - survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -598,14 +571,14 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (tsk->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; + if (error_code & 4) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. + */ + force_sig(SIGKILL, tsk); + return; } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); goto no_context; do_sigbus: diff -uprN linux-2.6.18/arch/i386/mm/hugetlbpage.c linux-2.6.18.ovz/arch/i386/mm/hugetlbpage.c --- linux-2.6.18/arch/i386/mm/hugetlbpage.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/mm/hugetlbpage.c 2007-06-13 06:55:05.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); } +EXPORT_SYMBOL(pmd_huge); struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, diff -uprN linux-2.6.18/arch/i386/mm/init.c linux-2.6.18.ovz/arch/i386/mm/init.c --- linux-2.6.18/arch/i386/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -680,7 +680,7 @@ void __init pgtable_cache_init(void) pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), - 0, + SLAB_UBC, pmd_ctor, NULL); if (!pmd_cache) @@ -689,7 +689,7 @@ void __init pgtable_cache_init(void) pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), PTRS_PER_PGD*sizeof(pgd_t), - 0, + SLAB_UBC, pgd_ctor, PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); if (!pgd_cache) diff -uprN linux-2.6.18/arch/i386/mm/pgtable.c linux-2.6.18.ovz/arch/i386/mm/pgtable.c --- linux-2.6.18/arch/i386/mm/pgtable.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/mm/pgtable.c 2007-06-13 06:55:05.000000000 -0400 @@ -4,8 +4,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -64,6 +66,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages pagetables\n", global_page_state(NR_PAGETABLE)); } +EXPORT_SYMBOL(show_mem); /* * Associate a virtual page frame with a given physical page frame @@ -158,9 +161,11 @@ struct page *pte_alloc_one(struct mm_str struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM| + __GFP_REPEAT|__GFP_ZERO, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC| + __GFP_REPEAT|__GFP_ZERO, 0); #endif return pte; } diff -uprN linux-2.6.18/arch/i386/pci/irq.c linux-2.6.18.ovz/arch/i386/pci/irq.c --- linux-2.6.18/arch/i386/pci/irq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/i386/pci/irq.c 2007-06-13 06:55:05.000000000 -0400 @@ -255,13 +255,13 @@ static int pirq_via_set(struct pci_dev * */ static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } diff -uprN linux-2.6.18/arch/ia64/Kconfig linux-2.6.18.ovz/arch/ia64/Kconfig --- linux-2.6.18/arch/ia64/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -272,7 +272,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && EXPERIMENTAL + depends on SMP && EXPERIMENTAL && !SCHED_VCPU select HOTPLUG default n ---help--- @@ -322,6 +322,8 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +source "kernel/Kconfig.fairsched" + source "mm/Kconfig" config ARCH_SELECT_MEMORY_MODEL @@ -525,6 +527,10 @@ endmenu source "arch/ia64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" + +source "kernel/ub/Kconfig" diff -uprN linux-2.6.18/arch/ia64/ia32/binfmt_elf32.c linux-2.6.18.ovz/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.18/arch/ia64/ia32/binfmt_elf32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/ia32/binfmt_elf32.c 2007-06-13 06:55:05.000000000 -0400 @@ -17,6 +17,8 @@ #include #include +#include + #include "ia32priv.h" #include "elfcore32.h" @@ -138,6 +140,12 @@ ia64_elf32_init (struct pt_regs *regs) up_write(¤t->mm->mmap_sem); } + if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, + NULL, UB_SOFT)) + goto skip; + /* * Install LDT as anonymous memory. This gives us all-zero segment descriptors * until a task modifies them via modify_ldt(). 
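The beancounter hooks above follow a strict ordering: charge the resource before committing anything, and uncharge on every failure path that runs after the charge, which is exactly what the goto unwinding added to ia32_setup_arg_pages() implements. A condensed sketch of that discipline; map_charged_region() is hypothetical, the ub_memory_charge() arguments follow the hunk above, and the beancounter declarations come from the headers this patch adds (their names are not visible here):

	#include <linux/mm.h>
	#include <linux/slab.h>

	static int map_charged_region(struct mm_struct *mm, unsigned long size)
	{
		struct vm_area_struct *vma;

		/* Charge first: fail early if the container is over limit. */
		if (ub_memory_charge(mm, size, VM_READ | VM_WRITE |
				     VM_MAYREAD | VM_MAYWRITE, NULL, UB_SOFT))
			return -ENOMEM;

		vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!vma)
			goto err_uncharge;

		/* ... initialize vma and insert_vm_struct(mm, vma) here;
		 * any failure past the charge must unwind it as well ... */
		return 0;

	err_uncharge:
		ub_memory_uncharge(mm, size, VM_READ | VM_WRITE |
				   VM_MAYREAD | VM_MAYWRITE, NULL);
		return -ENOMEM;
	}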
@@ -159,7 +167,12 @@ ia64_elf32_init (struct pt_regs *regs) } } up_write(¤t->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); + +skip: ia64_psr(regs)->ac = 0; /* turn off alignment checking */ regs->loadrs = 0; @@ -214,9 +227,15 @@ ia32_setup_arg_pages (struct linux_binpr bprm->loader += stack_base; bprm->exec += stack_base; + ret = -ENOMEM; + if (ub_memory_charge(mm, IA32_STACK_TOP - + (PAGE_MASK & (unsigned long)bprm->p), + VM_STACK_FLAGS, NULL, UB_SOFT)) + goto err_charge; + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) - return -ENOMEM; + goto err_alloc; memset(mpnt, 0, sizeof(*mpnt)); @@ -233,11 +252,8 @@ ia32_setup_arg_pages (struct linux_binpr mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? PAGE_COPY_EXEC: PAGE_COPY; - if ((ret = insert_vm_struct(current->mm, mpnt))) { - up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } + if ((ret = insert_vm_struct(current->mm, mpnt))) + goto err_insert; current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt); } @@ -256,6 +272,16 @@ ia32_setup_arg_pages (struct linux_binpr current->thread.ppl = ia32_init_pp_list(); return 0; + +err_insert: + up_write(¤t->mm->mmap_sem); + kmem_cache_free(vm_area_cachep, mpnt); +err_alloc: + ub_memory_uncharge(mm, IA32_STACK_TOP - + (PAGE_MASK & (unsigned long)bprm->p), + VM_STACK_FLAGS, NULL); +err_charge: + return ret; } static void diff -uprN linux-2.6.18/arch/ia64/ia32/ia32_entry.S linux-2.6.18.ovz/arch/ia64/ia32/ia32_entry.S --- linux-2.6.18/arch/ia64/ia32/ia32_entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/ia32/ia32_entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -52,43 +52,6 @@ ENTRY(ia32_clone) br.ret.sptk.many rp END(ia32_clone) -ENTRY(sys32_rt_sigsuspend) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) - alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs - mov loc0=rp - mov out0=in0 // mask - mov out1=in1 // sigsetsize - mov out2=sp // out2 = &sigscratch - .fframe 16 - adds sp=-16,sp // allocate dummy "sigscratch" - ;; - .body - br.call.sptk.many rp=ia32_rt_sigsuspend -1: .restore sp - adds sp=16,sp - mov rp=loc0 - mov ar.pfs=loc1 - br.ret.sptk.many rp -END(sys32_rt_sigsuspend) - -ENTRY(sys32_sigsuspend) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) - alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs - mov loc0=rp - mov out0=in2 // mask (first two args are ignored) - ;; - mov out1=sp // out1 = &sigscratch - .fframe 16 - adds sp=-16,sp // allocate dummy "sigscratch" - .body - br.call.sptk.many rp=ia32_sigsuspend -1: .restore sp - adds sp=16,sp - mov rp=loc0 - mov ar.pfs=loc1 - br.ret.sptk.many rp -END(sys32_sigsuspend) - GLOBAL_ENTRY(ia32_ret_from_clone) PT_REGS_UNWIND_INFO(0) { /* @@ -341,7 +304,7 @@ ia32_syscall_table: data8 sys_ni_syscall /* init_module */ data8 sys_ni_syscall /* delete_module */ data8 sys_ni_syscall /* get_kernel_syms */ /* 130 */ - data8 sys_quotactl + data8 sys32_quotactl data8 sys_getpgid data8 sys_fchdir data8 sys_ni_syscall /* sys_bdflush */ @@ -389,7 +352,7 @@ ia32_syscall_table: data8 sys_rt_sigpending data8 compat_sys_rt_sigtimedwait data8 sys32_rt_sigqueueinfo - data8 sys32_rt_sigsuspend + data8 compat_sys_rt_sigsuspend data8 sys32_pread /* 180 */ data8 sys32_pwrite data8 sys_chown /* 16-bit version */ diff -uprN linux-2.6.18/arch/ia64/ia32/ia32_signal.c 
linux-2.6.18.ovz/arch/ia64/ia32/ia32_signal.c --- linux-2.6.18/arch/ia64/ia32/ia32_signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/ia32/ia32_signal.c 2007-06-13 06:55:05.000000000 -0400 @@ -452,59 +452,20 @@ sigact_set_handler (struct k_sigaction * sa->sa.sa_handler = (__sighandler_t) (((unsigned long) restorer << 32) | handler); } -long -__ia32_rt_sigsuspend (compat_sigset_t *sset, unsigned int sigsetsize, struct sigscratch *scr) +asmlinkage long +sys32_sigsuspend (int history0, int history1, old_sigset_t mask) { - extern long ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall); - sigset_t oldset, set; - - scr->scratch_unat = 0; /* avoid leaking kernel bits to user level */ - memset(&set, 0, sizeof(set)); - - memcpy(&set.sig, &sset->sig, sigsetsize); - - sigdelsetmask(&set, ~_BLOCKABLE); - + mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - { - oldset = current->blocked; - current->blocked = set; - recalc_sigpending(); - } + current->saved_sigmask = current->blocked; + siginitset(¤t->blocked, mask); + recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - /* - * The return below usually returns to the signal handler. We need to pre-set the - * correct error code here to ensure that the right values get saved in sigcontext - * by ia64_do_signal. - */ - scr->pt.r8 = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (ia64_do_signal(&oldset, scr, 1)) - return -EINTR; - } -} - -asmlinkage long -ia32_rt_sigsuspend (compat_sigset_t __user *uset, unsigned int sigsetsize, struct sigscratch *scr) -{ - compat_sigset_t set; - - if (sigsetsize > sizeof(compat_sigset_t)) - return -EINVAL; - - if (copy_from_user(&set.sig, &uset->sig, sigsetsize)) - return -EFAULT; - - return __ia32_rt_sigsuspend(&set, sigsetsize, scr); -} - -asmlinkage long -ia32_sigsuspend (unsigned int mask, struct sigscratch *scr) -{ - return __ia32_rt_sigsuspend((compat_sigset_t *) &mask, sizeof(mask), scr); + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; } asmlinkage long diff -uprN linux-2.6.18/arch/ia64/kernel/acpi.c linux-2.6.18.ovz/arch/ia64/kernel/acpi.c --- linux-2.6.18/arch/ia64/kernel/acpi.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/acpi.c 2007-06-13 06:55:05.000000000 -0400 @@ -771,16 +771,19 @@ int acpi_map_cpu2node(acpi_handle handle { #ifdef CONFIG_ACPI_NUMA int pxm_id; + int nid; pxm_id = acpi_get_pxm(handle); - /* - * Assuming that the container driver would have set the proximity - * domain and would have initialized pxm_to_node(pxm_id) && pxm_flag + * We don't have cpu-only-node hotadd. But if the system equips + * SRAT table, pxm is already found and node is ready. + * So, just pxm_to_nid(pxm) is OK. + * This code here is for the system which doesn't have full SRAT + * table for possible cpus. */ - node_cpuid[cpu].nid = (pxm_id < 0) ? 
0 : pxm_to_node(pxm_id); - + nid = acpi_map_pxm_to_node(pxm_id); node_cpuid[cpu].phys_id = physid; + node_cpuid[cpu].nid = nid; #endif return (0); } diff -uprN linux-2.6.18/arch/ia64/kernel/asm-offsets.c linux-2.6.18.ovz/arch/ia64/kernel/asm-offsets.c --- linux-2.6.18/arch/ia64/kernel/asm-offsets.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/asm-offsets.c 2007-06-13 06:55:05.000000000 -0400 @@ -43,11 +43,19 @@ void foo(void) DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); +#ifdef CONFIG_VE + DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, vpid)); +#else DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); +#endif DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); +#ifdef CONFIG_VE + DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, vtgid)); +#else DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); +#endif DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); diff -uprN linux-2.6.18/arch/ia64/kernel/entry.S linux-2.6.18.ovz/arch/ia64/kernel/entry.S --- linux-2.6.18/arch/ia64/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -504,6 +504,74 @@ GLOBAL_ENTRY(clone) br.ret.sptk.many rp END(clone) +GLOBAL_ENTRY(ia64_ret_from_resume) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + /* + * We need to call schedule_tail() to complete the scheduling process. + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the + * address of the previously executing task. + */ + br.call.sptk.many rp=ia64_invoke_schedule_tail +} + br.call.sptk.many rp=ia64_invoke_resume + ;; + adds sp=256,sp + ;; + /* Return from interrupt, we are all right. */ +(pNonSys) br ia64_leave_kernel + ;; + /* Tricky part follows. We must restore the correct syscall + * register frame before doing the normal syscall exit job. + * It would be most natural to keep sw->ar_pfs correct, + * then we would arrive here with the correct register frame. + * Unfortunately, IA64 has a quirk: registers live in the backing + * store after a context switch, and the first br.ret does _NOT_ fetch + * output registers. + * That is natural in itself: if the caller has output regs in its + * frame, they are expected to be consumed. If the callee does not have + * (enough) input/local registers (1 in this case), the situation is + * unusual; in practice they end up filled with random junk. + * The only case where this matters in the mainstream kernel is + * sys_clone(): the new process gets some kernel information in its + * register frame, which is a security problem, btw. + * + * So we set sw->ar_pfs to pretend the whole frame consists of local + * regs, and repartition the frame manually, using + * information from pt->cr_ifs (the register is invalid in this + * case, but it holds the correct pfm).
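The repartitioning the comment describes is compact enough to render in C. The bit positions below are exactly those implied by the extr.u and shl operands in the asm that follows (pfm taken from bits 36..0 of cr.ifs, ec placed at bit 52 of ar.pfs); make_pfs() is an illustrative name, not patch code:

	/* C rendering of the extr.u/shl/or sequence below, for reading only. */
	static unsigned long make_pfs(unsigned long cr_ifs, unsigned long ar_ec)
	{
		unsigned long pfm = cr_ifs & ((1UL << 37) - 1);	/* extr.u r2=r2,0,37 */
		unsigned long ec  = ar_ec  & ((1UL << 5) - 1);	/* extr.u r8=r8,0,5  */

		return pfm | (ec << 52);			/* shl, or, mov ar.pfs */
	}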
+ */ + adds r3=PT(CR_IFS)+16,sp + ;; + ld8 r2=[r3],-(PT(CR_IFS)-PT(R8)) + ;; + extr.u r2=r2,0,37 + mov r8=ar.ec + ;; + extr.u r8=r8,0,5 + ;; + shl r8=r8,52 + ;; + or r2=r2,r8 + ;; + mov ar.pfs=r2 + ;; + movl r2=ia64_leave_syscall + ;; + mov rp=r2 + /* Plus, we should fetch r8 and r10 from pt_regs. Something else? */ + ld8 r8=[r3],PT(R10)-PT(R8) + ;; + ld8 r10=[r3] + ;; + br.ret.sptk.many rp +END(ia64_ret_from_resume) + /* * Invoke a system call, but do some tracing before and after the call. * We MUST preserve the current register frame throughout this routine @@ -1170,6 +1238,34 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail) br.ret.sptk.many rp END(ia64_invoke_schedule_tail) +GLOBAL_ENTRY(ia64_invoke_resume) + alloc loc1=ar.pfs,0,3,1,0 + mov loc0=rp + adds out0=16,sp + ;; + ld8 r8=[out0] + ;; + cmp.eq p6,p0=r8,r0 + ;; +(p6) br.cond.sptk 1f + ;; + mov loc2=gp + ;; + ld8 r10=[r8],8 + ;; + ld8 gp=[r8] + ;; + mov b7=r10 + ;; + br.call.sptk.many rp=b7 + ;; + mov gp=loc2 +1: + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(ia64_invoke_resume) + /* * Setup stack and call do_notify_resume_user(). Note that pSys and pNonSys need to * be set up by the caller. We declare 8 input registers so the system call @@ -1202,32 +1298,6 @@ ENTRY(notify_resume_user) br.ret.sptk.many rp END(notify_resume_user) -GLOBAL_ENTRY(sys_rt_sigsuspend) - .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) - alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! - mov r9=ar.unat - mov loc0=rp // save return address - mov out0=in0 // mask - mov out1=in1 // sigsetsize - adds out2=8,sp // out2=&sigscratch->ar_pfs - ;; - .fframe 16 - .spillsp ar.unat, 16 - st8 [sp]=r9,-16 // allocate space for ar.unat and save it - st8 [out2]=loc1,-8 // save ar.pfs, out2=&sigscratch - .body - br.call.sptk.many rp=ia64_rt_sigsuspend -.ret17: .restore sp - adds sp=16,sp // pop scratch stack space - ;; - ld8 r9=[sp] // load new unat from sw->caller_unat - mov rp=loc0 - ;; - mov ar.unat=r9 - mov ar.pfs=loc1 - br.ret.sptk.many rp -END(sys_rt_sigsuspend) - ENTRY(sys_rt_sigreturn) PT_REGS_UNWIND_INFO(0) /* @@ -1601,8 +1671,8 @@ sys_call_table: data8 sys_readlinkat data8 sys_fchmodat data8 sys_faccessat - data8 sys_ni_syscall // reserved for pselect - data8 sys_ni_syscall // 1295 reserved for ppoll + data8 sys_pselect6 + data8 sys_ppoll data8 sys_unshare data8 sys_splice data8 sys_ni_syscall // reserved for set_robust_list @@ -1610,5 +1680,20 @@ sys_call_table: data8 sys_sync_file_range // 1300 data8 sys_tee data8 sys_vmsplice +.rept 1499-1303 + data8 sys_ni_syscall +.endr + data8 sys_fairsched_vcpus + data8 sys_fairsched_mknod // 1500 + data8 sys_fairsched_rmnod + data8 sys_fairsched_chwt + data8 sys_fairsched_mvpr + data8 sys_fairsched_rate + data8 sys_getluid // 1505 + data8 sys_setluid + data8 sys_setublimit + data8 sys_ubstat + data8 sys_lchmod + data8 sys_lutime // 1510 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff -uprN linux-2.6.18/arch/ia64/kernel/fsys.S linux-2.6.18.ovz/arch/ia64/kernel/fsys.S --- linux-2.6.18/arch/ia64/kernel/fsys.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/fsys.S 2007-06-13 06:55:05.000000000 -0400 @@ -72,6 +72,7 @@ ENTRY(fsys_getpid) FSYS_RETURN END(fsys_getpid) +#ifndef CONFIG_VE ENTRY(fsys_getppid) .prologue .altrp b6 @@ -118,6 +119,7 @@ ENTRY(fsys_getppid) #endif FSYS_RETURN END(fsys_getppid) +#endif ENTRY(fsys_set_tid_address) .prologue @@ -665,7 +667,11 @@ fsyscall_table: data8 0 // chown 
data8 0 // lseek // 1040 data8 fsys_getpid // getpid +#ifdef CONFIG_VE + data8 0 +#else data8 fsys_getppid // getppid +#endif data8 0 // mount data8 0 // umount data8 0 // setuid // 1045 diff -uprN linux-2.6.18/arch/ia64/kernel/ia64_ksyms.c linux-2.6.18.ovz/arch/ia64/kernel/ia64_ksyms.c --- linux-2.6.18/arch/ia64/kernel/ia64_ksyms.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/ia64_ksyms.c 2007-06-13 06:55:05.000000000 -0400 @@ -74,6 +74,8 @@ EXPORT_SYMBOL(xor_ia64_4); EXPORT_SYMBOL(xor_ia64_5); #endif +EXPORT_SYMBOL(empty_zero_page); + #include EXPORT_SYMBOL(ia64_pal_call_phys_stacked); EXPORT_SYMBOL(ia64_pal_call_phys_static); diff -uprN linux-2.6.18/arch/ia64/kernel/init_task.c linux-2.6.18.ovz/arch/ia64/kernel/init_task.c --- linux-2.6.18/arch/ia64/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -21,6 +22,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/ia64/kernel/mca.c linux-2.6.18.ovz/arch/ia64/kernel/mca.c --- linux-2.6.18/arch/ia64/kernel/mca.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/mca.c 2007-06-13 06:55:05.000000000 -0400 @@ -1320,10 +1320,10 @@ default_monarch_init_process(struct noti } printk("\n\n"); if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { + do_each_thread_all (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); show_stack(t, NULL); - } while_each_thread (g, t); + } while_each_thread_all (g, t); read_unlock(&tasklist_lock); } return NOTIFY_DONE; } diff -uprN linux-2.6.18/arch/ia64/kernel/numa.c linux-2.6.18.ovz/arch/ia64/kernel/numa.c --- linux-2.6.18/arch/ia64/kernel/numa.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/numa.c 2007-06-13 06:55:05.000000000 -0400 @@ -29,6 +29,36 @@ EXPORT_SYMBOL(cpu_to_node_map); cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; +void __cpuinit map_cpu_to_node(int cpu, int nid) +{ + int oldnid; + if (nid < 0) { /* just initialize to zero */ + cpu_to_node_map[cpu] = 0; + return; + } + /* sanity check first */ + oldnid = cpu_to_node_map[cpu]; + if (cpu_isset(cpu, node_to_cpu_mask[oldnid])) { + return; /* nothing to do */ + } + /* We don't have cpu-driven node hot add yet... In the usual case, the node is created from SRAT at boot time. */ + if (!node_online(nid)) + nid = first_online_node; + cpu_to_node_map[cpu] = nid; + cpu_set(cpu, node_to_cpu_mask[nid]); + return; +} + +void __cpuinit unmap_cpu_from_node(int cpu, int nid) +{ + WARN_ON(!cpu_isset(cpu, node_to_cpu_mask[nid])); + WARN_ON(cpu_to_node_map[cpu] != nid); + cpu_to_node_map[cpu] = 0; + cpu_clear(cpu, node_to_cpu_mask[nid]); +} + + /** * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays * @@ -49,8 +79,6 @@ void __init build_cpu_to_node_map(void) node = node_cpuid[i].nid; break; } - cpu_to_node_map[cpu] = (node >= 0) ?
node : 0; - if (node >= 0) - cpu_set(cpu, node_to_cpu_mask[node]); + map_cpu_to_node(cpu, node); } } diff -uprN linux-2.6.18/arch/ia64/kernel/perfmon.c linux-2.6.18.ovz/arch/ia64/kernel/perfmon.c --- linux-2.6.18/arch/ia64/kernel/perfmon.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/perfmon.c 2007-06-13 06:55:05.000000000 -0400 @@ -2623,7 +2623,7 @@ pfm_get_task(pfm_context_t *ctx, pid_t p read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); /* make sure task cannot go away while we operate on it */ if (p) get_task_struct(p); @@ -4187,12 +4187,12 @@ pfm_check_task_exist(pfm_context_t *ctx) read_lock(&tasklist_lock); - do_each_thread (g, t) { + do_each_thread_ve (g, t) { if (t->thread.pfm_context == ctx) { ret = 0; break; } - } while_each_thread (g, t); + } while_each_thread_ve (g, t); read_unlock(&tasklist_lock); diff -uprN linux-2.6.18/arch/ia64/kernel/process.c linux-2.6.18.ovz/arch/ia64/kernel/process.c --- linux-2.6.18/arch/ia64/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -107,7 +108,8 @@ show_regs (struct pt_regs *regs) unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; print_modules(); - printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); + printk("\nPid: %d, CPU %d, VCPU %d:%d, comm: %20s\n", current->pid, smp_processor_id(), + task_vsched_id(current), task_cpu(current), current->comm); printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); print_symbol("ip is at %s\n", ip); @@ -157,7 +159,7 @@ show_regs (struct pt_regs *regs) } void -do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +do_notify_resume_user (sigset_t *unused, struct sigscratch *scr, long in_syscall) { if (fsys_mode(current, &scr->pt)) { /* defer signal-handling etc. until we return to privilege-level 0. */ @@ -172,8 +174,8 @@ do_notify_resume_user (sigset_t *oldset, #endif /* deal with pending signal delivery */ - if (test_thread_flag(TIF_SIGPENDING)) - ia64_do_signal(oldset, scr, in_syscall); + if (test_thread_flag(TIF_SIGPENDING)||test_thread_flag(TIF_RESTORE_SIGMASK)) + ia64_do_signal(scr, in_syscall); } static int pal_halt = 1; @@ -356,6 +358,9 @@ ia64_load_extra (struct task_struct *tas #endif } +extern char ia64_ret_from_resume; +EXPORT_SYMBOL(ia64_ret_from_resume); + /* * Copy the state of an ia-64 thread. 
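Both this hunk and the signal.c changes further down replace the old open-coded sigsuspend loop with the TIF_RESTORE_SIGMASK protocol. A condensed sketch of the pattern (simplified from sys32_sigsuspend() above, not the literal patched code):

        /*
         * Save the old mask, install the temporary one, sleep, and let
         * signal delivery decide who restores the saved mask.
         */
        asmlinkage long sketch_sigsuspend(old_sigset_t mask)
        {
                mask &= _BLOCKABLE;
                spin_lock_irq(&current->sighand->siglock);
                current->saved_sigmask = current->blocked;   /* keep old mask */
                siginitset(&current->blocked, mask);         /* install new mask */
                recalc_sigpending();
                spin_unlock_irq(&current->sighand->siglock);

                current->state = TASK_INTERRUPTIBLE;
                schedule();                                  /* wait for a signal */
                set_thread_flag(TIF_RESTORE_SIGMASK);        /* restore is deferred */
                return -ERESTARTNOHAND;
        }

If a handler is delivered, the saved mask has already been copied into the signal frame (so sigreturn will restore it) and the flag is simply cleared; otherwise ia64_do_signal() ends with sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL), as the signal.c hunk below shows.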
* @@ -429,7 +434,6 @@ copy_thread (int nr, unsigned long clone child_ptregs->r12 = user_stack_base + user_stack_size - 16; child_ptregs->ar_bspstore = user_stack_base; child_ptregs->ar_rnat = 0; - child_ptregs->loadrs = 0; } } else { /* @@ -669,16 +673,26 @@ out: return error; } +extern void start_kernel_thread (void); +EXPORT_SYMBOL(start_kernel_thread); +EXPORT_SYMBOL(execve); + pid_t kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) { - extern void start_kernel_thread (void); unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; struct { struct switch_stack sw; struct pt_regs pt; } regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + memset(&regs, 0, sizeof(regs)); regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ regs.pt.r1 = helper_fptr[1]; /* set GP */ diff -uprN linux-2.6.18/arch/ia64/kernel/ptrace.c linux-2.6.18.ovz/arch/ia64/kernel/ptrace.c --- linux-2.6.18/arch/ia64/kernel/ptrace.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/ptrace.c 2007-06-13 06:55:05.000000000 -0400 @@ -7,6 +7,7 @@ * Derived from the x86 and Alpha versions. */ #include +#include #include #include #include @@ -100,6 +101,8 @@ ia64_get_scratch_nat_bits (struct pt_reg # undef GET_BITS } +EXPORT_SYMBOL(ia64_get_scratch_nat_bits); +EXPORT_SYMBOL(__ia64_save_fpu); /* * Set the NaT bits for the scratch registers according to NAT and @@ -456,6 +459,7 @@ ia64_peek (struct task_struct *child, st *val = ret; return 0; } +EXPORT_SYMBOL(ia64_peek); long ia64_poke (struct task_struct *child, struct switch_stack *child_stack, @@ -520,6 +524,7 @@ ia64_get_user_rbs_end (struct task_struc *cfmp = cfm; return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); } +EXPORT_SYMBOL(ia64_get_user_rbs_end); /* * Synchronize (i.e, write) the RSE backing store living in kernel @@ -757,20 +762,20 @@ access_nat_bits (struct task_struct *chi if (write_access) { nat_bits = *data; scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); - if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { - dprintk("ptrace: failed to set ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + *info->pri_unat_loc = scratch_unat; + else + info->sw->caller_unat = scratch_unat; for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); unw_set_gr(info, regnum, dummy, (nat_bits >> regnum) & 1); } } else { - if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { - dprintk("ptrace: failed to read ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + scratch_unat = *info->pri_unat_loc; + else + scratch_unat = info->sw->caller_unat; nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); @@ -1432,7 +1437,7 @@ sys_ptrace (long request, pid_t pid, uns ret = -ESRCH; read_lock(&tasklist_lock); { - child = find_task_by_pid(pid); + child = find_task_by_pid_ve(pid); if (child) { if (peek_or_poke) child = find_thread_for_addr(child, addr); @@ -1627,9 +1632,11 @@ syscall_trace_enter (long arg0, long arg long arg4, long arg5, long arg6, long arg7, struct pt_regs regs) { + set_pn_state(current, PN_STOP_ENTRY); if (test_thread_flag(TIF_SYSCALL_TRACE) && (current->ptrace & PT_PTRACED)) syscall_trace(); + clear_pn_state(current); if (unlikely(current->audit_context)) { long syscall; @@ -1664,7 +1671,9 @@ syscall_trace_leave (long arg0, long arg
audit_syscall_exit(success, result); } + set_pn_state(current, PN_STOP_LEAVE); if (test_thread_flag(TIF_SYSCALL_TRACE) && (current->ptrace & PT_PTRACED)) syscall_trace(); + clear_pn_state(current); } diff -uprN linux-2.6.18/arch/ia64/kernel/setup.c linux-2.6.18.ovz/arch/ia64/kernel/setup.c --- linux-2.6.18/arch/ia64/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/setup.c 2007-06-13 06:55:05.000000000 -0400 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -545,9 +546,13 @@ show_cpuinfo (struct seq_file *m, void * sprintf(cp, " 0x%lx", mask); } +#ifndef CONFIG_FAIRSCHED proc_freq = cpufreq_quick_get(cpunum); if (!proc_freq) proc_freq = c->proc_freq / 1000; +#else + proc_freq = ve_scale_khz(c->proc_freq) / 1000; +#endif seq_printf(m, "processor : %d\n" @@ -586,7 +591,7 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) + while (*pos < NR_CPUS && !vcpu_online(*pos)) ++*pos; #endif return *pos < NR_CPUS ? cpu_data(*pos) : NULL; } diff -uprN linux-2.6.18/arch/ia64/kernel/sigframe.h linux-2.6.18.ovz/arch/ia64/kernel/sigframe.h --- linux-2.6.18/arch/ia64/kernel/sigframe.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/sigframe.h 2007-06-13 06:55:05.000000000 -0400 @@ -22,4 +22,4 @@ struct sigframe { struct sigcontext sc; }; -extern long ia64_do_signal (sigset_t *, struct sigscratch *, long); +extern void ia64_do_signal (struct sigscratch *, long); diff -uprN linux-2.6.18/arch/ia64/kernel/signal.c linux-2.6.18.ovz/arch/ia64/kernel/signal.c --- linux-2.6.18/arch/ia64/kernel/signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/signal.c 2007-06-13 06:55:05.000000000 -0400 @@ -41,47 +41,6 @@ # define GET_SIGSET(k,u) __get_user((k)->sig[0], &(u)->sig[0]) #endif -long -ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr) -{ - sigset_t oldset, set; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (!access_ok(VERIFY_READ, uset, sigsetsize)) - return -EFAULT; - - if (GET_SIGSET(&set, uset)) - return -EFAULT; - - sigdelsetmask(&set, ~_BLOCKABLE); - - spin_lock_irq(&current->sighand->siglock); - { - oldset = current->blocked; - current->blocked = set; - recalc_sigpending(); - } - spin_unlock_irq(&current->sighand->siglock); - - /* - * The return below usually returns to the signal handler. We need to - * pre-set the correct error code here to ensure that the right values - * get saved in sigcontext by ia64_do_signal.
- */ - scr->pt.r8 = EINTR; - scr->pt.r10 = -1; - - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (ia64_do_signal(&oldset, scr, 1)) - return -EINTR; - } -} - asmlinkage long sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, long arg3, long arg4, long arg5, long arg6, long arg7, @@ -269,7 +228,7 @@ ia64_rt_sigreturn (struct sigscratch *sc si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; + si.si_pid = virt_pid(current); si.si_uid = current->uid; si.si_addr = sc; force_sig_info(SIGSEGV, &si, current); @@ -374,7 +333,7 @@ force_sigsegv_info (int sig, void __user si.si_signo = SIGSEGV; si.si_errno = 0; si.si_code = SI_KERNEL; - si.si_pid = current->pid; + si.si_pid = virt_pid(current); si.si_uid = current->uid; si.si_addr = addr; force_sig_info(SIGSEGV, &si, current); @@ -478,10 +437,11 @@ handle_signal (unsigned long sig, struct * Note that `init' is a special process: it doesn't get signals it doesn't want to * handle. Thus you cannot kill init even with a SIGKILL even by mistake. */ -long -ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +void +ia64_do_signal (struct sigscratch *scr, long in_syscall) { struct k_sigaction ka; + sigset_t *oldset; siginfo_t info; long restart = in_syscall; long errno = scr->pt.r8; @@ -493,9 +453,17 @@ ia64_do_signal (sigset_t *oldset, struct * doing anything if so. */ if (!user_mode(&scr->pt)) - return 0; + return; - if (!oldset) + if (try_to_freeze() && !signal_pending(current)) { + if ((long) scr->pt.r10 != -1) + restart = 0; + goto no_signal; + } + + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = &current->saved_sigmask; + else oldset = &current->blocked; /* @@ -548,8 +516,10 @@ ia64_do_signal (sigset_t *oldset, struct if (IS_IA32_PROCESS(&scr->pt)) { scr->pt.r8 = scr->pt.r1; scr->pt.cr_iip -= 2; - } else + } else { ia64_decrement_ip(&scr->pt); + scr->pt.r10 = 0; + } restart = 0; /* don't restart twice if handle_signal() fails... */ } } @@ -558,11 +528,19 @@ ia64_do_signal (sigset_t *oldset, struct * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &ka, &info, oldset, scr)) - return 1; + if (handle_signal(signr, &ka, &info, oldset, scr)) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + clear_thread_flag(TIF_RESTORE_SIGMASK); + return; + } } /* Did we come from a system call?
*/ +no_signal: if (restart) { /* Restart the system call - no handlers present */ if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR @@ -582,8 +560,15 @@ ia64_do_signal (sigset_t *oldset, struct ia64_decrement_ip(&scr->pt); if (errno == ERESTART_RESTARTBLOCK) scr->pt.r15 = __NR_restart_syscall; + scr->pt.r10 = 0; } } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + * back */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); + } } diff -uprN linux-2.6.18/arch/ia64/kernel/topology.c linux-2.6.18.ovz/arch/ia64/kernel/topology.c --- linux-2.6.18/arch/ia64/kernel/topology.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/topology.c 2007-06-13 06:55:05.000000000 -0400 @@ -36,6 +36,7 @@ int arch_register_cpu(int num) */ if (!can_cpei_retarget() && is_cpu_cpei_target(num)) sysfs_cpus[num].cpu.no_control = 1; + map_cpu_to_node(num, node_cpuid[num].nid); #endif return register_cpu(&sysfs_cpus[num].cpu, num); @@ -45,7 +46,8 @@ int arch_register_cpu(int num) void arch_unregister_cpu(int num) { - return unregister_cpu(&sysfs_cpus[num].cpu); + unregister_cpu(&sysfs_cpus[num].cpu); + unmap_cpu_from_node(num, cpu_to_node(num)); } EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); diff -uprN linux-2.6.18/arch/ia64/kernel/traps.c linux-2.6.18.ovz/arch/ia64/kernel/traps.c --- linux-2.6.18/arch/ia64/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/traps.c 2007-06-13 06:55:05.000000000 -0400 @@ -53,34 +53,6 @@ trap_init (void) fpswa_interface = __va(ia64_boot_param->fpswa); } -/* - * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock - * is acquired through the console unblank code) - */ -void -bust_spinlocks (int yes) -{ - int loglevel_save = console_loglevel; - - if (yes) { - oops_in_progress = 1; - return; - } - -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() without - * oops_in_progress set so that printk will give klogd a poke. Hold onto - * your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; -} - void die (const char *str, struct pt_regs *regs, long err) { diff -uprN linux-2.6.18/arch/ia64/kernel/unaligned.c linux-2.6.18.ovz/arch/ia64/kernel/unaligned.c --- linux-2.6.18/arch/ia64/kernel/unaligned.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/unaligned.c 2007-06-13 06:55:05.000000000 -0400 @@ -1290,7 +1290,7 @@ within_logging_rate_limit (void) { static unsigned long count, last_time; - if (jiffies - last_time > 5*HZ) + if (jiffies - last_time > 60 * HZ) count = 0; if (count < 5) { last_time = jiffies; diff -uprN linux-2.6.18/arch/ia64/kernel/unwind.c linux-2.6.18.ovz/arch/ia64/kernel/unwind.c --- linux-2.6.18/arch/ia64/kernel/unwind.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/unwind.c 2007-06-13 06:55:05.000000000 -0400 @@ -60,6 +60,7 @@ # define UNW_DEBUG_ON(n) unw_debug_level >= n /* Do not code a printk level, not all debug lines end in newline */ # define UNW_DPRINT(n, ...)
if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__) +# undef inline # define inline #else /* !UNW_DEBUG */ # define UNW_DEBUG_ON(n) 0 @@ -1943,9 +1944,9 @@ EXPORT_SYMBOL(unw_unwind); int unw_unwind_to_user (struct unw_frame_info *info) { - unsigned long ip, sp, pr = 0; + unsigned long ip, sp, pr = info->pr; - while (unw_unwind(info) >= 0) { + do { unw_get_sp(info, &sp); if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp) < IA64_PT_REGS_SIZE) { @@ -1963,7 +1964,7 @@ unw_unwind_to_user (struct unw_frame_inf __FUNCTION__, ip); return -1; } - } + } while (unw_unwind(info) >= 0); unw_get_ip(info, &ip); UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", __FUNCTION__, ip); diff -uprN linux-2.6.18/arch/ia64/kernel/vmlinux.lds.S linux-2.6.18.ovz/arch/ia64/kernel/vmlinux.lds.S --- linux-2.6.18/arch/ia64/kernel/vmlinux.lds.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/kernel/vmlinux.lds.S 2007-06-13 06:55:05.000000000 -0400 @@ -163,6 +163,7 @@ SECTIONS } #endif + . = ALIGN(8); __con_initcall_start = .; .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { *(.con_initcall.init) } diff -uprN linux-2.6.18/arch/ia64/mm/contig.c linux-2.6.18.ovz/arch/ia64/mm/contig.c --- linux-2.6.18/arch/ia64/mm/contig.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/mm/contig.c 2007-06-13 06:55:05.000000000 -0400 @@ -70,6 +70,7 @@ show_mem (void) printk("%ld pages in page table cache\n", pgtable_quicklist_total_size()); } +EXPORT_SYMBOL(show_mem); /* physical address where the bootmem map is located */ unsigned long bootmap_start; diff -uprN linux-2.6.18/arch/ia64/mm/discontig.c linux-2.6.18.ovz/arch/ia64/mm/discontig.c --- linux-2.6.18/arch/ia64/mm/discontig.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/mm/discontig.c 2007-06-13 06:55:05.000000000 -0400 @@ -46,6 +46,7 @@ static struct early_node_data mem_data[M static nodemask_t memory_less_mask __initdata; static pg_data_t *pgdat_list[MAX_NUMNODES]; +EXPORT_SYMBOL(pgdat_list); /* * To prevent cache aliasing effects, align per-node structures so that they @@ -592,6 +593,7 @@ void show_mem(void) pgtable_quicklist_total_size()); printk("%d free buffer pages\n", nr_free_buffer_pages()); } +EXPORT_SYMBOL(show_mem); /** * call_pernode_memory - use SRAT to call callback functions with node info diff -uprN linux-2.6.18/arch/ia64/mm/fault.c linux-2.6.18.ovz/arch/ia64/mm/fault.c --- linux-2.6.18/arch/ia64/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -153,7 +153,6 @@ ia64_do_page_fault (unsigned long addres if ((vma->vm_flags & mask) != mask) goto bad_area; - survive: /* * If for any reason at all we couldn't handle the fault, make * sure we exit gracefully rather than endlessly redo the @@ -278,13 +277,13 @@ ia64_do_page_fault (unsigned long addres out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk(KERN_CRIT "VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_exit(SIGKILL); + if (user_mode(regs)) { + /* + * A 0-order allocation always succeeds unless something + * really fatal happened: beancounter overdraft or OOM.
+ */ + force_sig(SIGKILL, current); + return; + } goto no_context; } diff -uprN linux-2.6.18/arch/ia64/mm/init.c linux-2.6.18.ovz/arch/ia64/mm/init.c --- linux-2.6.18/arch/ia64/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -36,6 +36,8 @@ #include #include +#include + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); @@ -95,7 +97,7 @@ check_pgt_cache(void) preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { - free_page((unsigned long)pgtable_quicklist_alloc()); + free_page((unsigned long)pgtable_quicklist_alloc(0)); } preempt_enable(); preempt_disable(); @@ -151,6 +153,10 @@ ia64_init_addr_space (void) ia64_set_rbs_bot(); + if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, + NULL, UB_SOFT)) + goto skip; + /* * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore * the problem. When the process attempts to write to the register backing store @@ -168,11 +174,16 @@ ia64_init_addr_space (void) if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); kmem_cache_free(vm_area_cachep, vma); + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); return; } up_write(&current->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); +skip: /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); diff -uprN linux-2.6.18/arch/ia64/sn/kernel/bte.c linux-2.6.18.ovz/arch/ia64/sn/kernel/bte.c --- linux-2.6.18/arch/ia64/sn/kernel/bte.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/sn/kernel/bte.c 2007-06-13 06:55:05.000000000 -0400 @@ -382,14 +382,13 @@ bte_result_t bte_unaligned_copy(u64 src, * bcopy to the destination. */ - /* Add the leader from source */ - headBteLen = len + (src & L1_CACHE_MASK); - /* Add the trailing bytes from footer. */ - headBteLen += L1_CACHE_BYTES - (headBteLen & L1_CACHE_MASK); - headBteSource = src & ~L1_CACHE_MASK; headBcopySrcOffset = src & L1_CACHE_MASK; headBcopyDest = dest; headBcopyLen = len; + + headBteSource = src - headBcopySrcOffset; + /* Add the leading and trailing bytes from source */ + headBteLen = L1_CACHE_ALIGN(len + headBcopySrcOffset); } if (headBcopyLen > 0) { diff -uprN linux-2.6.18/arch/ia64/sn/kernel/sn2/sn_hwperf.c linux-2.6.18.ovz/arch/ia64/sn/kernel/sn2/sn_hwperf.c --- linux-2.6.18/arch/ia64/sn/kernel/sn2/sn_hwperf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ia64/sn/kernel/sn2/sn_hwperf.c 2007-06-13 06:55:05.000000000 -0400 @@ -422,7 +422,7 @@ static int sn_topology_show(struct seq_f "coherency_domain %d, " "region_size %d\n", - partid, system_utsname.nodename, + partid, utsname()->nodename, shubtype ?
"shub2" : "shub1", (u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift, system_size, sharing_size, coher, region_size); diff -uprN linux-2.6.18/arch/m32r/kernel/entry.S linux-2.6.18.ovz/arch/m32r/kernel/entry.S --- linux-2.6.18/arch/m32r/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/m32r/kernel/entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -23,35 +23,35 @@ * updated in fork.c:copy_thread, signal.c:do_signal, * ptrace.c and ptrace.h * - * M32Rx/M32R2 M32R - * @(sp) - r4 ditto - * @(0x04,sp) - r5 ditto - * @(0x08,sp) - r6 ditto - * @(0x0c,sp) - *pt_regs ditto - * @(0x10,sp) - r0 ditto - * @(0x14,sp) - r1 ditto - * @(0x18,sp) - r2 ditto - * @(0x1c,sp) - r3 ditto - * @(0x20,sp) - r7 ditto - * @(0x24,sp) - r8 ditto - * @(0x28,sp) - r9 ditto - * @(0x2c,sp) - r10 ditto - * @(0x30,sp) - r11 ditto - * @(0x34,sp) - r12 ditto - * @(0x38,sp) - syscall_nr ditto - * @(0x3c,sp) - acc0h @(0x3c,sp) - acch - * @(0x40,sp) - acc0l @(0x40,sp) - accl - * @(0x44,sp) - acc1h @(0x44,sp) - dummy_acc1h - * @(0x48,sp) - acc1l @(0x48,sp) - dummy_acc1l - * @(0x4c,sp) - psw ditto - * @(0x50,sp) - bpc ditto - * @(0x54,sp) - bbpsw ditto - * @(0x58,sp) - bbpc ditto - * @(0x5c,sp) - spu (cr3) ditto - * @(0x60,sp) - fp (r13) ditto - * @(0x64,sp) - lr (r14) ditto - * @(0x68,sp) - spi (cr2) ditto - * @(0x6c,sp) - orig_r0 ditto + * M32R/M32Rx/M32R2 + * @(sp) - r4 + * @(0x04,sp) - r5 + * @(0x08,sp) - r6 + * @(0x0c,sp) - *pt_regs + * @(0x10,sp) - r0 + * @(0x14,sp) - r1 + * @(0x18,sp) - r2 + * @(0x1c,sp) - r3 + * @(0x20,sp) - r7 + * @(0x24,sp) - r8 + * @(0x28,sp) - r9 + * @(0x2c,sp) - r10 + * @(0x30,sp) - r11 + * @(0x34,sp) - r12 + * @(0x38,sp) - syscall_nr + * @(0x3c,sp) - acc0h + * @(0x40,sp) - acc0l + * @(0x44,sp) - acc1h ; ISA_DSP_LEVEL2 only + * @(0x48,sp) - acc1l ; ISA_DSP_LEVEL2 only + * @(0x4c,sp) - psw + * @(0x50,sp) - bpc + * @(0x54,sp) - bbpsw + * @(0x58,sp) - bbpc + * @(0x5c,sp) - spu (cr3) + * @(0x60,sp) - fp (r13) + * @(0x64,sp) - lr (r14) + * @(0x68,sp) - spi (cr2) + * @(0x6c,sp) - orig_r0 */ #include @@ -95,17 +95,10 @@ #define R11(reg) @(0x30,reg) #define R12(reg) @(0x34,reg) #define SYSCALL_NR(reg) @(0x38,reg) -#if defined(CONFIG_ISA_M32R2) && defined(CONFIG_ISA_DSP_LEVEL2) #define ACC0H(reg) @(0x3C,reg) #define ACC0L(reg) @(0x40,reg) #define ACC1H(reg) @(0x44,reg) #define ACC1L(reg) @(0x48,reg) -#elif defined(CONFIG_ISA_M32R2) || defined(CONFIG_ISA_M32R) -#define ACCH(reg) @(0x3C,reg) -#define ACCL(reg) @(0x40,reg) -#else -#error unknown isa configuration -#endif #define PSW(reg) @(0x4C,reg) #define BPC(reg) @(0x50,reg) #define BBPSW(reg) @(0x54,reg) diff -uprN linux-2.6.18/arch/m32r/kernel/init_task.c linux-2.6.18.ovz/arch/m32r/kernel/init_task.c --- linux-2.6.18/arch/m32r/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/m32r/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/m32r/kernel/sys_m32r.c linux-2.6.18.ovz/arch/m32r/kernel/sys_m32r.c --- linux-2.6.18/arch/m32r/kernel/sys_m32r.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/m32r/kernel/sys_m32r.c 2007-06-13 06:55:05.000000000 -0400 @@ -205,7 
+205,7 @@ asmlinkage int sys_uname(struct old_utsn if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } diff -uprN linux-2.6.18/arch/m68knommu/kernel/init_task.c linux-2.6.18.ovz/arch/m68knommu/kernel/init_task.c --- linux-2.6.18/arch/m68knommu/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/m68knommu/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/mips/kernel/init_task.c linux-2.6.18.ovz/arch/mips/kernel/init_task.c --- linux-2.6.18/arch/mips/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/mips/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/mips/kernel/linux32.c linux-2.6.18.ovz/arch/mips/kernel/linux32.c --- linux-2.6.18/arch/mips/kernel/linux32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/mips/kernel/linux32.c 2007-06-13 06:55:05.000000000 -0400 @@ -1039,7 +1039,7 @@ asmlinkage long sys32_newuname(struct ne int ret = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) ret = -EFAULT; up_read(&uts_sem); diff -uprN linux-2.6.18/arch/mips/kernel/syscall.c linux-2.6.18.ovz/arch/mips/kernel/syscall.c --- linux-2.6.18/arch/mips/kernel/syscall.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/mips/kernel/syscall.c 2007-06-13 06:55:05.000000000 -0400 @@ -231,7 +231,7 @@ out: */ asmlinkage int sys_uname(struct old_utsname __user * name) { - if (name && !copy_to_user(name, &system_utsname, sizeof (*name))) + if (name && !copy_to_user(name, utsname(), sizeof (*name))) return 0; return -EFAULT; } @@ -248,16 +248,21 @@ asmlinkage int sys_olduname(struct oldol if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) return -EFAULT; - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - error -= __put_user(0,name->sysname+__OLD_UTS_LEN); - error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - error -= __put_user(0,name->nodename+__OLD_UTS_LEN); - error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - error -= __put_user(0,name->release+__OLD_UTS_LEN); - error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - error -= __put_user(0,name->version+__OLD_UTS_LEN); - error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); - error = __put_user(0,name->machine+__OLD_UTS_LEN); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error -= __put_user(0, name->sysname + 
__OLD_UTS_LEN); + error -= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error -= __put_user(0, name->nodename + __OLD_UTS_LEN); + error -= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error -= __put_user(0, name->release + __OLD_UTS_LEN); + error -= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error -= __put_user(0, name->version + __OLD_UTS_LEN); + error -= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error = __put_user(0, name->machine + __OLD_UTS_LEN); error = error ? -EFAULT : 0; return error; diff -uprN linux-2.6.18/arch/mips/kernel/sysirix.c linux-2.6.18.ovz/arch/mips/kernel/sysirix.c --- linux-2.6.18/arch/mips/kernel/sysirix.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/mips/kernel/sysirix.c 2007-06-13 06:55:05.000000000 -0400 @@ -111,7 +111,7 @@ asmlinkage int irix_prctl(unsigned optio printk("irix_prctl[%s:%d]: Wants PR_ISBLOCKED\n", current->comm, current->pid); read_lock(&tasklist_lock); - task = find_task_by_pid(va_arg(args, pid_t)); + task = find_task_by_pid_ve(va_arg(args, pid_t)); error = -ESRCH; if (error) error = (task->run_list.next != NULL); @@ -884,7 +884,7 @@ asmlinkage int irix_getdomainname(char _ down_read(&uts_sem); if (len > __NEW_UTS_LEN) len = __NEW_UTS_LEN; - err = copy_to_user(name, system_utsname.domainname, len) ? -EFAULT : 0; + err = copy_to_user(name, utsname()->domainname, len) ? -EFAULT : 0; up_read(&uts_sem); return err; @@ -1127,11 +1127,11 @@ struct iuname { asmlinkage int irix_uname(struct iuname __user *buf) { down_read(&uts_sem); - if (copy_from_user(system_utsname.sysname, buf->sysname, 65) - || copy_from_user(system_utsname.nodename, buf->nodename, 65) - || copy_from_user(system_utsname.release, buf->release, 65) - || copy_from_user(system_utsname.version, buf->version, 65) - || copy_from_user(system_utsname.machine, buf->machine, 65)) { + if (copy_from_user(utsname()->sysname, buf->sysname, 65) + || copy_from_user(utsname()->nodename, buf->nodename, 65) + || copy_from_user(utsname()->release, buf->release, 65) + || copy_from_user(utsname()->version, buf->version, 65) + || copy_from_user(utsname()->machine, buf->machine, 65)) { return -EFAULT; } up_read(&uts_sem); diff -uprN linux-2.6.18/arch/parisc/Kconfig linux-2.6.18.ovz/arch/parisc/Kconfig --- linux-2.6.18/arch/parisc/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/parisc/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -194,6 +194,7 @@ config SMP config HOTPLUG_CPU bool + depends on !SCHED_VCPU default y if SMP select HOTPLUG diff -uprN linux-2.6.18/arch/parisc/hpux/sys_hpux.c linux-2.6.18.ovz/arch/parisc/hpux/sys_hpux.c --- linux-2.6.18/arch/parisc/hpux/sys_hpux.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/parisc/hpux/sys_hpux.c 2007-06-13 06:55:05.000000000 -0400 @@ -266,16 +266,21 @@ static int hpux_uname(struct hpux_utsnam down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1); - error |= __put_user(0,name->sysname+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1); - error |= __put_user(0,name->nodename+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1); - error |= __put_user(0,name->release+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->version,&system_utsname.version,HPUX_UTSLEN-1); - error |= __put_user(0,name->version+HPUX_UTSLEN-1); - error |= 
__copy_to_user(&name->machine,&system_utsname.machine,HPUX_UTSLEN-1); - error |= __put_user(0,name->machine+HPUX_UTSLEN-1); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->sysname + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->nodename + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->release, &utsname()->release, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->release + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->version, &utsname()->version, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->version + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->machine, &utsname()->machine, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->machine + HPUX_UTSLEN - 1); up_read(&uts_sem); @@ -373,8 +378,8 @@ int hpux_utssys(char *ubuf, int n, int t /* TODO: print a warning about using this? */ down_write(&uts_sem); error = -EFAULT; - if (!copy_from_user(system_utsname.sysname, ubuf, len)) { - system_utsname.sysname[len] = 0; + if (!copy_from_user(utsname()->sysname, ubuf, len)) { + utsname()->sysname[len] = 0; error = 0; } up_write(&uts_sem); @@ -400,8 +405,8 @@ int hpux_utssys(char *ubuf, int n, int t /* TODO: print a warning about this? */ down_write(&uts_sem); error = -EFAULT; - if (!copy_from_user(system_utsname.release, ubuf, len)) { - system_utsname.release[len] = 0; + if (!copy_from_user(utsname()->release, ubuf, len)) { + utsname()->release[len] = 0; error = 0; } up_write(&uts_sem); @@ -422,13 +427,13 @@ int hpux_getdomainname(char *name, int l down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(utsname()->domainname) + 1; if (nlen < len) len = nlen; if(len > __NEW_UTS_LEN) goto done; - if(copy_to_user(name, system_utsname.domainname, len)) + if(copy_to_user(name, utsname()->domainname, len)) goto done; err = 0; done: diff -uprN linux-2.6.18/arch/parisc/kernel/init_task.c linux-2.6.18.ovz/arch/parisc/kernel/init_task.c --- linux-2.6.18/arch/parisc/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/parisc/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/powerpc/Kconfig linux-2.6.18.ovz/arch/powerpc/Kconfig --- linux-2.6.18/arch/powerpc/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -594,6 +594,7 @@ config HIGHMEM bool "High memory support" depends on PPC32 +source "kernel/Kconfig.fairsched" source kernel/Kconfig.hz source kernel/Kconfig.preempt source "fs/Kconfig.binfmt" @@ -632,7 +633,7 @@ config IOMMU_VMERGE config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" - depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) + depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) && !SCHED_VCPU ---help--- Say Y here to be able to disable and re-enable individual CPUs at runtime on SMP machines. 
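The conversions from system_utsname to utsname() repeated through these hunks (mips and the hpux emulation above, powerpc and s390 below) assume the uts data has moved from a single global into per-task state, so each container sees its own node name. A hedged sketch of the accessor shape the converted call sites rely on (the nsproxy/uts_ns field names follow the mainline uts-namespace work and are assumptions here):

        /* Per-task view of the uts names (sketch only). */
        static inline struct new_utsname *sketch_utsname(void)
        {
                return &current->nsproxy->uts_ns->name;
        }

        /* Host (boot-time) view, for output that must not be virtualized. */
        static inline struct new_utsname *sketch_init_utsname(void)
        {
                return &init_uts_ns.name;
        }

This split is also why show_regs() and the pseries panel code below use init_utsname(): oops and boot output should report the host kernel, not a container's override.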
@@ -729,6 +730,15 @@ config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. +config NODES_SPAN_OTHER_NODES + def_bool y + depends on NEED_MULTIPLE_NODES + config PPC_64K_PAGES bool "64k page size" depends on PPC64 @@ -1051,6 +1061,8 @@ source "arch/powerpc/platforms/iseries/K source "lib/Kconfig" +source "kernel/ub/Kconfig" + menu "Instrumentation Support" depends on EXPERIMENTAL @@ -1069,6 +1081,8 @@ endmenu source "arch/powerpc/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" config KEYS_COMPAT diff -uprN linux-2.6.18/arch/powerpc/configs/pseries_defconfig linux-2.6.18.ovz/arch/powerpc/configs/pseries_defconfig --- linux-2.6.18/arch/powerpc/configs/pseries_defconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/configs/pseries_defconfig 2007-06-13 06:55:05.000000000 -0400 @@ -184,6 +184,7 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y +CONFIG_NODES_SPAN_OTHER_NODES=y # CONFIG_PPC_64K_PAGES is not set CONFIG_SCHED_SMT=y CONFIG_PROC_DEVICETREE=y diff -uprN linux-2.6.18/arch/powerpc/kernel/init_task.c linux-2.6.18.ovz/arch/powerpc/kernel/init_task.c --- linux-2.6.18/arch/powerpc/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -5,6 +5,7 @@ #include #include #include +#include #include static struct fs_struct init_fs = INIT_FS; @@ -12,6 +13,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/powerpc/kernel/misc_32.S linux-2.6.18.ovz/arch/powerpc/kernel/misc_32.S --- linux-2.6.18/arch/powerpc/kernel/misc_32.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/misc_32.S 2007-06-13 06:55:05.000000000 -0400 @@ -816,7 +816,7 @@ _GLOBAL(_get_SP) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff -uprN linux-2.6.18/arch/powerpc/kernel/misc_64.S linux-2.6.18.ovz/arch/powerpc/kernel/misc_64.S --- linux-2.6.18/arch/powerpc/kernel/misc_64.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/misc_64.S 2007-06-13 06:55:05.000000000 -0400 @@ -465,7 +465,7 @@ _GLOBAL(scom970_write) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) std r29,-24(r1) std r30,-16(r1) stdu r1,-STACK_FRAME_OVERHEAD(r1) diff -uprN linux-2.6.18/arch/powerpc/kernel/process.c linux-2.6.18.ovz/arch/powerpc/kernel/process.c --- linux-2.6.18/arch/powerpc/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -424,7 +424,7 @@ void show_regs(struct pt_regs * regs) printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", - regs, regs->trap, print_tainted(), system_utsname.release); + regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", 
regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); @@ -435,7 +435,7 @@ void show_regs(struct pt_regs * regs) current, current->pid, current->comm, task_thread_info(current)); #ifdef CONFIG_SMP - printk(" CPU: %d", smp_processor_id()); + printk(" CPU: %d, VCPU: %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current)); #endif /* CONFIG_SMP */ for (i = 0; i < 32; i++) { @@ -834,12 +834,12 @@ int validate_sp(unsigned long sp, struct return 1; #ifdef CONFIG_IRQSTACKS - stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; + stack_page = (unsigned long) hardirq_ctx[task_pcpu(p)]; if (sp >= stack_page + sizeof(struct thread_struct) && sp <= stack_page + THREAD_SIZE - nbytes) return 1; - stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; + stack_page = (unsigned long) softirq_ctx[task_pcpu(p)]; if (sp >= stack_page + sizeof(struct thread_struct) && sp <= stack_page + THREAD_SIZE - nbytes) return 1; @@ -950,6 +950,20 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); +long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + extern long ppc_kernel_thread(int (*fn)(void *), void *arg, + unsigned long flags); + + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + + return ppc_kernel_thread(fn, arg, flags); +} + #ifdef CONFIG_PPC64 void ppc64_runlatch_on(void) { diff -uprN linux-2.6.18/arch/powerpc/kernel/setup_64.c linux-2.6.18.ovz/arch/powerpc/kernel/setup_64.c --- linux-2.6.18/arch/powerpc/kernel/setup_64.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/setup_64.c 2007-06-13 06:55:05.000000000 -0400 @@ -420,7 +420,7 @@ void __init setup_system(void) smp_release_cpus(); #endif - printk("Starting Linux PPC64 %s\n", system_utsname.version); + printk("Starting Linux PPC64 %s\n", init_utsname()->version); printk("-----------------------------------------------------\n"); printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size); diff -uprN linux-2.6.18/arch/powerpc/kernel/syscalls.c linux-2.6.18.ovz/arch/powerpc/kernel/syscalls.c --- linux-2.6.18/arch/powerpc/kernel/syscalls.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/syscalls.c 2007-06-13 06:55:05.000000000 -0400 @@ -260,7 +260,7 @@ long ppc_newuname(struct new_utsname __u int err = 0; down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) + if (copy_to_user(name, utsname(), sizeof(*name))) err = -EFAULT; up_read(&uts_sem); if (!err) @@ -273,7 +273,7 @@ int sys_uname(struct old_utsname __user int err = 0; down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) + if (copy_to_user(name, utsname(), sizeof(*name))) err = -EFAULT; up_read(&uts_sem); if (!err) @@ -289,19 +289,19 @@ int sys_olduname(struct oldold_utsname _ return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &system_utsname.sysname, + error = __copy_to_user(&name->sysname, &utsname()->sysname, __OLD_UTS_LEN); error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &system_utsname.nodename, + error |= __copy_to_user(&name->nodename, &utsname()->nodename, __OLD_UTS_LEN); error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &system_utsname.release, + error |= __copy_to_user(&name->release, &utsname()->release, __OLD_UTS_LEN); error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, 
&system_utsname.version, + error |= __copy_to_user(&name->version, &utsname()->version, __OLD_UTS_LEN); error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &system_utsname.machine, + error |= __copy_to_user(&name->machine, &utsname()->machine, __OLD_UTS_LEN); error |= override_machine(name->machine); up_read(&uts_sem); diff -uprN linux-2.6.18/arch/powerpc/kernel/systbl.S linux-2.6.18.ovz/arch/powerpc/kernel/systbl.S --- linux-2.6.18/arch/powerpc/kernel/systbl.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/systbl.S 2007-06-13 06:55:05.000000000 -0400 @@ -41,5 +41,8 @@ #define sys_old_getrlimit sys_ni_syscall #endif +#define SYS_SKIP(from, to) .rept (to - from) +#define SYS_SKIP_END() .endr + _GLOBAL(sys_call_table) #include diff -uprN linux-2.6.18/arch/powerpc/kernel/traps.c linux-2.6.18.ovz/arch/powerpc/kernel/traps.c --- linux-2.6.18/arch/powerpc/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/kernel/traps.c 2007-06-13 06:55:05.000000000 -0400 @@ -818,7 +818,7 @@ void __kprobes program_check_exception(s void alignment_exception(struct pt_regs *regs) { - int fixed = 0; + int sig, code, fixed = 0; /* we don't implement logging of alignment exceptions */ if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS)) @@ -832,14 +832,16 @@ void alignment_exception(struct pt_regs /* Operand address was bad */ if (fixed == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_ACCERR, regs->dar); - else - /* Search exception table */ - bad_page_fault(regs, regs->dar, SIGSEGV); - return; + sig = SIGSEGV; + code = SEGV_ACCERR; + } else { + sig = SIGBUS; + code = BUS_ADRALN; } - _exception(SIGBUS, regs, BUS_ADRALN, regs->dar); + if (user_mode(regs)) + _exception(sig, regs, code, regs->dar); + else + bad_page_fault(regs, regs->dar, sig); } void StackOverflow(struct pt_regs *regs) diff -uprN linux-2.6.18/arch/powerpc/mm/fault.c linux-2.6.18.ovz/arch/powerpc/mm/fault.c --- linux-2.6.18/arch/powerpc/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -342,7 +342,6 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - survive: switch (handle_mm_fault(mm, vma, address, is_write)) { case VM_FAULT_MINOR: @@ -386,14 +385,12 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); if (user_mode(regs)) - do_exit(SIGKILL); + /* + * A 0-order allocation always succeeds unless something + * really fatal happened: beancounter overdraft or OOM. Den + */ + force_sig(SIGKILL, current); return SIGKILL; do_sigbus: diff -uprN linux-2.6.18/arch/powerpc/mm/init_64.c linux-2.6.18.ovz/arch/powerpc/mm/init_64.c --- linux-2.6.18/arch/powerpc/mm/init_64.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/mm/init_64.c 2007-06-13 06:55:05.000000000 -0400 @@ -184,7 +184,8 @@ void pgtable_cache_init(void) pgtable_cache[i] = kmem_cache_create(name, size, size, SLAB_HWCACHE_ALIGN | - SLAB_MUST_HWCACHE_ALIGN, + SLAB_MUST_HWCACHE_ALIGN | + SLAB_UBC | SLAB_NO_CHARGE, zero_ctor, NULL); if (!
pgtable_cache[i]) diff -uprN linux-2.6.18/arch/powerpc/mm/mem.c linux-2.6.18.ovz/arch/powerpc/mm/mem.c --- linux-2.6.18/arch/powerpc/mm/mem.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/mm/mem.c 2007-06-13 06:55:05.000000000 -0400 @@ -226,6 +226,7 @@ void show_mem(void) printk("%ld pages shared\n", shared); printk("%ld pages swap cached\n", cached); } +EXPORT_SYMBOL(show_mem); /* * Initialize the bootmem system and give it all the memory we diff -uprN linux-2.6.18/arch/powerpc/mm/pgtable_32.c linux-2.6.18.ovz/arch/powerpc/mm/pgtable_32.c --- linux-2.6.18/arch/powerpc/mm/pgtable_32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/mm/pgtable_32.c 2007-06-13 06:55:05.000000000 -0400 @@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_ZERO, PGDIR_ORDER); return ret; } @@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str #else gfp_t flags = GFP_KERNEL | __GFP_REPEAT; #endif + flags |= (__GFP_UBC | __GFP_SOFT_UBC); ptepage = alloc_pages(flags, 0); if (ptepage) diff -uprN linux-2.6.18/arch/powerpc/platforms/cell/spu_callbacks.c linux-2.6.18.ovz/arch/powerpc/platforms/cell/spu_callbacks.c --- linux-2.6.18/arch/powerpc/platforms/cell/spu_callbacks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/platforms/cell/spu_callbacks.c 2007-06-13 06:55:05.000000000 -0400 @@ -46,6 +46,9 @@ void *spu_syscall_table[] = { #define PPC_SYS_SPU(func) ppc_##func, #define SYSX_SPU(f, f3264, f32) f, +#define SYS_SKIP(from, to) [from ... to] = +#define SYS_SKIP_END() + #include }; diff -uprN linux-2.6.18/arch/powerpc/platforms/pseries/setup.c linux-2.6.18.ovz/arch/powerpc/platforms/pseries/setup.c --- linux-2.6.18/arch/powerpc/platforms/pseries/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/powerpc/platforms/pseries/setup.c 2007-06-13 06:55:05.000000000 -0400 @@ -328,7 +328,7 @@ static int __init pSeries_init_panel(voi { /* Manually leave the kernel version on the panel. 
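The SYS_SKIP()/SYS_SKIP_END() pair defined in the two hunks above lets a single syscall list serve both consumers: the assembler side pads the gap by repeating the entry inside a .rept block, while the SPU C table turns it into a GNU C designated range initializer. A small stand-alone illustration of that initializer (hypothetical table, compiles with gcc):

        #include <stdio.h>

        static long sys_ni(void)   { return -38; }      /* -ENOSYS */
        static long sys_real(void) { return 0; }

        /*
         * [a ... b] = x is the GNU extension SYS_SKIP() expands to on the
         * C side: every slot in the range gets the same handler, so holes
         * in the numbering stay wired to the "not implemented" entry.
         */
        static long (*table[8])(void) = {
                [0] = sys_real,
                [1 ... 6] = sys_ni,
                [7] = sys_real,
        };

        int main(void)
        {
                printf("%ld %ld %ld\n", table[0](), table[3](), table[7]());
                return 0;
        }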
*/ ppc_md.progress("Linux ppc64\n", 0); - ppc_md.progress(system_utsname.release, 0); + ppc_md.progress(init_utsname()->version, 0); return 0; } diff -uprN linux-2.6.18/arch/ppc/Kconfig linux-2.6.18.ovz/arch/ppc/Kconfig --- linux-2.6.18/arch/ppc/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -953,6 +953,7 @@ config NR_CPUS config HIGHMEM bool "High memory support" +source "kernel/Kconfig.fairsched" source kernel/Kconfig.hz source kernel/Kconfig.preempt source "mm/Kconfig" @@ -1418,6 +1419,10 @@ source "arch/powerpc/oprofile/Kconfig" source "arch/ppc/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" +source "kernel/ub/Kconfig" + source "crypto/Kconfig" diff -uprN linux-2.6.18/arch/ppc/kernel/misc.S linux-2.6.18.ovz/arch/ppc/kernel/misc.S --- linux-2.6.18/arch/ppc/kernel/misc.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/kernel/misc.S 2007-06-13 06:55:05.000000000 -0400 @@ -1003,7 +1003,7 @@ _GLOBAL(_get_SP) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff -uprN linux-2.6.18/arch/ppc/kernel/traps.c linux-2.6.18.ovz/arch/ppc/kernel/traps.c --- linux-2.6.18/arch/ppc/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/kernel/traps.c 2007-06-13 06:55:05.000000000 -0400 @@ -708,7 +708,7 @@ void single_step_exception(struct pt_reg void alignment_exception(struct pt_regs *regs) { - int fixed; + int sig, code, fixed = 0; fixed = fix_alignment(regs); if (fixed == 1) { @@ -717,14 +717,16 @@ void alignment_exception(struct pt_regs return; } if (fixed == -EFAULT) { - /* fixed == -EFAULT means the operand address was bad */ - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_ACCERR, regs->dar); - else - bad_page_fault(regs, regs->dar, SIGSEGV); - return; + sig = SIGSEGV; + code = SEGV_ACCERR; + } else { + sig = SIGBUS; + code = BUS_ADRALN; } - _exception(SIGBUS, regs, BUS_ADRALN, regs->dar); + if (user_mode(regs)) + _exception(sig, regs, code, regs->dar); + else + bad_page_fault(regs, regs->dar, sig); } void StackOverflow(struct pt_regs *regs) diff -uprN linux-2.6.18/arch/ppc/mm/fault.c linux-2.6.18.ovz/arch/ppc/mm/fault.c --- linux-2.6.18/arch/ppc/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -248,7 +248,6 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - survive: switch (handle_mm_fault(mm, vma, address, is_write)) { case VM_FAULT_MINOR: current->min_flt++; @@ -291,14 +290,12 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); if (user_mode(regs)) - do_exit(SIGKILL); + /* + * A 0-order allocation always succeeds unless something + * really fatal happened: beancounter overdraft or OOM.
Den + */ + force_sig(SIGKILL, current); return SIGKILL; do_sigbus: diff -uprN linux-2.6.18/arch/ppc/mm/init.c linux-2.6.18.ovz/arch/ppc/mm/init.c --- linux-2.6.18/arch/ppc/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -131,6 +131,7 @@ void show_mem(void) printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); } +EXPORT_SYMBOL(show_mem); /* Free up now-unused memory */ static void free_sec(unsigned long start, unsigned long end, const char *name) diff -uprN linux-2.6.18/arch/ppc/mm/pgtable.c linux-2.6.18.ovz/arch/ppc/mm/pgtable.c --- linux-2.6.18/arch/ppc/mm/pgtable.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/ppc/mm/pgtable.c 2007-06-13 06:55:05.000000000 -0400 @@ -83,7 +83,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_ZERO, PGDIR_ORDER); return ret; } @@ -117,6 +118,7 @@ struct page *pte_alloc_one(struct mm_str #else gfp_t flags = GFP_KERNEL | __GFP_REPEAT; #endif + flags |= (__GFP_UBC | __GFP_SOFT_UBC); ptepage = alloc_pages(flags, 0); if (ptepage) diff -uprN linux-2.6.18/arch/s390/Kconfig linux-2.6.18.ovz/arch/s390/Kconfig --- linux-2.6.18/arch/s390/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -51,6 +51,10 @@ config 64BIT Select this option if you have a 64 bit IBM zSeries machine and want to use the 64 bit addressing mode. +config 32BIT + bool + default y if !64BIT + config SMP bool "Symmetric multi-processing support" ---help--- @@ -84,7 +88,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" - depends on SMP + depends on SMP && !SCHED_VCPU select HOTPLUG default n help @@ -491,8 +495,12 @@ source "arch/s390/oprofile/Kconfig" source "arch/s390/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/ub/Kconfig" diff -uprN linux-2.6.18/arch/s390/kernel/init_task.c linux-2.6.18.ovz/arch/s390/kernel/init_task.c --- linux-2.6.18/arch/s390/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/s390/kernel/process.c linux-2.6.18.ovz/arch/s390/kernel/process.c --- linux-2.6.18/arch/s390/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -165,9 +165,10 @@ void show_regs(struct pt_regs *regs) struct task_struct *tsk = current; printk("CPU: %d %s\n", task_thread_info(tsk)->cpu, print_tainted()); - printk("Process %s (pid: %d, task: %p, ksp: %p)\n", - current->comm, current->pid, (void *) tsk, - (void *) tsk->thread.ksp); + printk("Process %s (pid: %d, veid: %d, task: %p, ksp: %p)\n", + current->comm, current->pid, + VEID(VE_TASK_INFO(current)->owner_env), + (void *) tsk, (void *) tsk->thread.ksp); show_registers(regs); /* Show stack backtrace if 
pt_regs is from kernel mode */ @@ -188,6 +189,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + if (!ve_is_super(get_exec_env())) { + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | PSW_MASK_EXT; regs.psw.addr = (unsigned long) kernel_thread_starter | PSW_ADDR_AMODE; diff -uprN linux-2.6.18/arch/s390/kernel/smp.c linux-2.6.18.ovz/arch/s390/kernel/smp.c --- linux-2.6.18/arch/s390/kernel/smp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/kernel/smp.c 2007-06-13 06:55:05.000000000 -0400 @@ -526,6 +526,17 @@ int __devinit start_secondary(void *cpuv { /* Setup the cpu */ cpu_init(); + +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; + /* + * Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. + */ + VE_TASK_INFO(idle)->sleep_time = 0; +#endif + preempt_disable(); /* init per CPU timer */ init_cpu_timer(); @@ -834,6 +845,11 @@ void __init smp_prepare_cpus(unsigned in for_each_possible_cpu(cpu) if (cpu != smp_processor_id()) smp_create_idle(cpu); + +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif } void __devinit smp_prepare_boot_cpu(void) diff -uprN linux-2.6.18/arch/s390/lib/Makefile linux-2.6.18.ovz/arch/s390/lib/Makefile --- linux-2.6.18/arch/s390/lib/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/lib/Makefile 2007-06-13 06:55:05.000000000 -0400 @@ -7,3 +7,4 @@ EXTRA_AFLAGS := -traditional lib-y += delay.o string.o lib-y += $(if $(CONFIG_64BIT),uaccess64.o,uaccess.o) lib-$(CONFIG_SMP) += spinlock.o +lib-$(CONFIG_32BIT) += div64.o diff -uprN linux-2.6.18/arch/s390/lib/div64.c linux-2.6.18.ovz/arch/s390/lib/div64.c --- linux-2.6.18/arch/s390/lib/div64.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/arch/s390/lib/div64.c 2007-06-13 06:55:05.000000000 -0400 @@ -0,0 +1,151 @@ +/* + * arch/s390/lib/div64.c + * + * __div64_32 implementation for 31 bit. + * + * Copyright (C) IBM Corp. 2006 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + */ + +#include +#include + +#ifdef CONFIG_MARCH_G5 + +/* + * Function to divide an unsigned 64 bit integer by an unsigned + * 31 bit integer using signed 64/32 bit division. + */ +static uint32_t __div64_31(uint64_t *n, uint32_t base) +{ + register uint32_t reg2 asm("2"); + register uint32_t reg3 asm("3"); + uint32_t *words = (uint32_t *) n; + uint32_t tmp; + + /* Special case base==1, remainder = 0, quotient = n */ + if (base == 1) + return 0; + /* + * Special case base==0 will cause a fixed point divide exception + * on the dr instruction and may not happen anyway. For the + * following calculation we can assume base > 1. The first + * signed 64 / 32 bit division with an upper half of 0 will + * give the correct upper half of the 64 bit quotient. + */ + reg2 = 0UL; + reg3 = words[0]; + asm volatile( + " dr %0,%2\n" + : "+d" (reg2), "+d" (reg3) : "d" (base) : "cc" ); + words[0] = reg3; + reg3 = words[1]; + /* + * To get the lower half of the 64 bit quotient and the 32 bit + * remainder we have to use a little trick. Since we only have + * a signed division the quotient can get too big. To avoid this + * the 64 bit dividend is halved, then the signed division will + * work. 
Afterwards the quotient and the remainder are doubled. + * If the last bit of the dividend has been one the remainder + * is increased by one then checked against the base. If the + * remainder has overflown subtract base and increase the + * quotient. Simple, no ? + */ + asm volatile( + " nr %2,%1\n" + " srdl %0,1\n" + " dr %0,%3\n" + " alr %0,%0\n" + " alr %1,%1\n" + " alr %0,%2\n" + " clr %0,%3\n" + " jl 0f\n" + " slr %0,%3\n" + " alr %1,%2\n" + "0:\n" + : "+d" (reg2), "+d" (reg3), "=d" (tmp) + : "d" (base), "2" (1UL) : "cc" ); + words[1] = reg3; + return reg2; +} + +/* + * Function to divide an unsigned 64 bit integer by an unsigned + * 32 bit integer using the unsigned 64/31 bit division. + */ +uint32_t __div64_32(uint64_t *n, uint32_t base) +{ + uint32_t r; + + /* + * If the most significant bit of base is set, divide n by + * (base/2). That allows to use 64/31 bit division and gives a + * good approximation of the result: n = (base/2)*q + r. The + * result needs to be corrected with two simple transformations. + * If base is already < 2^31-1 __div64_31 can be used directly. + */ + r = __div64_31(n, ((signed) base < 0) ? (base/2) : base); + if ((signed) base < 0) { + uint64_t q = *n; + /* + * First transformation: + * n = (base/2)*q + r + * = ((base/2)*2)*(q/2) + ((q&1) ? (base/2) : 0) + r + * Since r < (base/2), r + (base/2) < base. + * With q1 = (q/2) and r1 = r + ((q&1) ? (base/2) : 0) + * n = ((base/2)*2)*q1 + r1 with r1 < base. + */ + if (q & 1) + r += base/2; + q >>= 1; + /* + * Second transformation. ((base/2)*2) could have lost the + * last bit. + * n = ((base/2)*2)*q1 + r1 + * = base*q1 - ((base&1) ? q1 : 0) + r1 + */ + if (base & 1) { + int64_t rx = r - q; + /* + * base is >= 2^31. The worst case for the while + * loop is n=2^64-1 base=2^31+1. That gives a + * maximum for q=(2^64-1)/2^31 = 0x1ffffffff. Since + * base >= 2^31 the loop is finished after a maximum + * of three iterations. 
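The halve/divide/double correction described in the comment above is easier to verify in portable C. The sketch below is a rough model of __div64_32() for the base >= 2^31 case, illustrative only: it uses ordinary C division where the s390 code uses the signed 'dr' instruction, and the helper name is made up.

#include <stdint.h>
#include <stdio.h>

/* Portable model of the correction steps, not the s390 implementation. */
static uint32_t model_div64_32(uint64_t *n, uint32_t base)
{
	uint64_t q, r;

	if ((int32_t) base >= 0) {
		/* base < 2^31: the 64/31-bit division handles it directly */
		r = *n % base;
		*n /= base;
		return (uint32_t) r;
	}

	/* Divide by base/2 first: n = (base/2)*q + r, with r < base/2 */
	q = *n / (base / 2);
	r = *n % (base / 2);

	/* First transformation: fold the low bit of q into r (r stays < base) */
	if (q & 1)
		r += base / 2;
	q >>= 1;

	/* Second transformation: (base/2)*2 may have dropped base's low bit */
	if (base & 1) {
		int64_t rx = (int64_t) r - (int64_t) q;

		while (rx < 0) {	/* at most three iterations, as noted above */
			rx += base;
			q--;
		}
		r = (uint64_t) rx;
	}
	*n = q;
	return (uint32_t) r;
}

int main(void)
{
	uint64_t n = 0xffffffffffffffffULL, copy = n;
	uint32_t base = 0x80000001u;	/* the worst case named in the comment */
	uint32_t rem = model_div64_32(&n, base);

	/* must agree with native 64-bit division */
	printf("ok=%d\n", n == copy / base && rem == copy % base);
	return 0;
}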
+ */ + while (rx < 0) { + rx += base; + q--; + } + r = rx; + } + *n = q; + } + return r; +} + +#else /* MARCH_G5 */ + +uint32_t __div64_32(uint64_t *n, uint32_t base) +{ + register uint32_t reg2 asm("2"); + register uint32_t reg3 asm("3"); + uint32_t *words = (uint32_t *) n; + + reg2 = 0UL; + reg3 = words[0]; + asm volatile( + " dlr %0,%2\n" + : "+d" (reg2), "+d" (reg3) : "d" (base) : "cc" ); + words[0] = reg3; + reg3 = words[1]; + asm volatile( + " dlr %0,%2\n" + : "+d" (reg2), "+d" (reg3) : "d" (base) : "cc" ); + words[1] = reg3; + return reg2; +} + +#endif /* MARCH_G5 */ + +EXPORT_SYMBOL(__div64_32); diff -uprN linux-2.6.18/arch/s390/lib/uaccess.S linux-2.6.18.ovz/arch/s390/lib/uaccess.S --- linux-2.6.18/arch/s390/lib/uaccess.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/lib/uaccess.S 2007-06-13 06:55:05.000000000 -0400 @@ -40,7 +40,17 @@ __copy_from_user_asm: # move with the reduced length which is < 256 5: mvcp 0(%r5,%r2),0(%r4),%r0 slr %r3,%r5 -6: lr %r2,%r3 + alr %r2,%r5 +6: lr %r5,%r3 # copy remaining size + ahi %r5,-1 # subtract 1 for xc loop + bras %r4,8f + xc 0(1,%r2),0(%r2) +7: xc 0(256,%r2),0(%r2) + la %r2,256(%r2) +8: ahi %r5,-256 + jnm 7b + ex %r5,0(%r4) +9: lr %r2,%r3 br %r14 .section __ex_table,"a" .long 0b,4b diff -uprN linux-2.6.18/arch/s390/lib/uaccess64.S linux-2.6.18.ovz/arch/s390/lib/uaccess64.S --- linux-2.6.18/arch/s390/lib/uaccess64.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/lib/uaccess64.S 2007-06-13 06:55:05.000000000 -0400 @@ -40,7 +40,17 @@ __copy_from_user_asm: # move with the reduced length which is < 256 5: mvcp 0(%r5,%r2),0(%r4),%r0 slgr %r3,%r5 -6: lgr %r2,%r3 + algr %r2,%r5 +6: lgr %r5,%r3 # copy remaining size + aghi %r5,-1 # subtract 1 for xc loop + bras %r4,8f + xc 0(1,%r2),0(%r2) +7: xc 0(256,%r2),0(%r2) + la %r2,256(%r2) +8: aghi %r5,-256 + jnm 7b + ex %r5,0(%r4) +9: lgr %r2,%r3 br %r14 .section __ex_table,"a" .quad 0b,4b diff -uprN linux-2.6.18/arch/s390/mm/fault.c linux-2.6.18.ovz/arch/s390/mm/fault.c --- linux-2.6.18/arch/s390/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -60,17 +60,9 @@ void bust_spinlocks(int yes) if (yes) { oops_in_progress = 1; } else { - int loglevel_save = console_loglevel; console_unblank(); oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... 
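The extra exception-path code added to __copy_from_user_asm in the two uaccess hunks above implements a contract worth spelling out: when a copy from userspace faults partway through, the uncopied tail of the kernel destination buffer must be zeroed, otherwise callers that ignore the short-copy return value would read stale kernel memory. A minimal C model of that contract, with a hypothetical raw_copy callback standing in for the mvcp loop:

#include <stddef.h>
#include <string.h>

/* raw_copy returns the number of bytes it could NOT copy, like the asm. */
size_t copy_from_user_model(void *to, const void *from, size_t n,
			    size_t (*raw_copy)(void *, const void *, size_t))
{
	size_t left = raw_copy(to, from, n);

	if (left)			/* faulted: zero the uncopied tail */
		memset((char *) to + (n - left), 0, left);
	return left;			/* 0 on complete success */
}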
- */ - console_loglevel = 15; - printk(" "); - console_loglevel = loglevel_save; + wake_up_klogd(); } } diff -uprN linux-2.6.18/arch/s390/mm/init.c linux-2.6.18.ovz/arch/s390/mm/init.c --- linux-2.6.18/arch/s390/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/s390/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -90,6 +90,7 @@ void show_mem(void) printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); } +EXPORT_SYMBOL(show_mem); extern unsigned long __initdata zholes_size[]; /* diff -uprN linux-2.6.18/arch/sh/kernel/init_task.c linux-2.6.18.ovz/arch/sh/kernel/init_task.c --- linux-2.6.18/arch/sh/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -12,6 +13,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/sh/kernel/kgdb_stub.c linux-2.6.18.ovz/arch/sh/kernel/kgdb_stub.c --- linux-2.6.18/arch/sh/kernel/kgdb_stub.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh/kernel/kgdb_stub.c 2007-06-13 06:55:05.000000000 -0400 @@ -412,7 +412,7 @@ static struct task_struct *get_thread(in if (pid == PID_MAX) pid = 0; /* First check via PID */ - thread = find_task_by_pid(pid); + thread = find_task_by_pid_all(pid); if (thread) return thread; diff -uprN linux-2.6.18/arch/sh/kernel/process.c linux-2.6.18.ovz/arch/sh/kernel/process.c --- linux-2.6.18/arch/sh/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -26,6 +26,7 @@ #include #include #include +#include static int hlt_counter=0; diff -uprN linux-2.6.18/arch/sh/kernel/setup.c linux-2.6.18.ovz/arch/sh/kernel/setup.c --- linux-2.6.18/arch/sh/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh/kernel/setup.c 2007-06-13 06:55:05.000000000 -0400 @@ -481,7 +481,7 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "machine\t\t: %s\n", get_system_type()); seq_printf(m, "processor\t: %d\n", cpu); - seq_printf(m, "cpu family\t: %s\n", system_utsname.machine); + seq_printf(m, "cpu family\t: %s\n", init_utsname()->machine); seq_printf(m, "cpu type\t: %s\n", get_cpu_subtype()); show_cpuflags(m); diff -uprN linux-2.6.18/arch/sh/kernel/sys_sh.c linux-2.6.18.ovz/arch/sh/kernel/sys_sh.c --- linux-2.6.18/arch/sh/kernel/sys_sh.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh/kernel/sys_sh.c 2007-06-13 06:55:05.000000000 -0400 @@ -267,7 +267,7 @@ asmlinkage int sys_uname(struct old_utsn if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } diff -uprN linux-2.6.18/arch/sh64/kernel/init_task.c linux-2.6.18.ovz/arch/sh64/kernel/init_task.c --- linux-2.6.18/arch/sh64/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh64/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = 
INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); struct pt_regs fake_swapper_regs; diff -uprN linux-2.6.18/arch/sh64/kernel/process.c linux-2.6.18.ovz/arch/sh64/kernel/process.c --- linux-2.6.18/arch/sh64/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh64/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -908,7 +908,7 @@ asids_proc_info(char *buf, char **start, int len=0; struct task_struct *p; read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_ve(p) { int pid = p->pid; struct mm_struct *mm; if (!pid) continue; diff -uprN linux-2.6.18/arch/sh64/kernel/sys_sh64.c linux-2.6.18.ovz/arch/sh64/kernel/sys_sh64.c --- linux-2.6.18/arch/sh64/kernel/sys_sh64.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sh64/kernel/sys_sh64.c 2007-06-13 06:55:05.000000000 -0400 @@ -279,7 +279,7 @@ asmlinkage int sys_uname(struct old_utsn if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } diff -uprN linux-2.6.18/arch/sparc/kernel/entry.S linux-2.6.18.ovz/arch/sparc/kernel/entry.S --- linux-2.6.18/arch/sparc/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc/kernel/entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -32,13 +32,12 @@ #include #include #include +#include #include #define curptr g6 -#define NR_SYSCALLS 300 /* Each OS is different... */ - /* These are just handy. */ #define _SV save %sp, -STACKFRAME_SZ, %sp #define _RS restore diff -uprN linux-2.6.18/arch/sparc/kernel/init_task.c linux-2.6.18.ovz/arch/sparc/kernel/init_task.c --- linux-2.6.18/arch/sparc/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -12,6 +13,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/sparc/kernel/sys_sparc.c linux-2.6.18.ovz/arch/sparc/kernel/sys_sparc.c --- linux-2.6.18/arch/sparc/kernel/sys_sparc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc/kernel/sys_sparc.c 2007-06-13 06:55:05.000000000 -0400 @@ -475,13 +475,13 @@ asmlinkage int sys_getdomainname(char __ down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(init_utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) goto out; err = -EFAULT; - if (!copy_to_user(name, system_utsname.domainname, nlen)) + if (!copy_to_user(name, init_utsname()->domainname, nlen)) err = 0; out: diff -uprN linux-2.6.18/arch/sparc/kernel/sys_sunos.c linux-2.6.18.ovz/arch/sparc/kernel/sys_sunos.c --- linux-2.6.18/arch/sparc/kernel/sys_sunos.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc/kernel/sys_sunos.c 2007-06-13 06:55:05.000000000 -0400 @@ -483,13 +483,18 @@ asmlinkage int sunos_uname(struct sunos_ { int ret; down_read(&uts_sem); - ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], 
sizeof(name->sname) - 1); + ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], + sizeof(name->sname) - 1); if (!ret) { - ret |= __copy_to_user(&name->nname[0], &system_utsname.nodename[0], sizeof(name->nname) - 1); + ret |= __copy_to_user(&name->nname[0], &utsname()->nodename[0], + sizeof(name->nname) - 1); ret |= __put_user('\0', &name->nname[8]); - ret |= __copy_to_user(&name->rel[0], &system_utsname.release[0], sizeof(name->rel) - 1); - ret |= __copy_to_user(&name->ver[0], &system_utsname.version[0], sizeof(name->ver) - 1); - ret |= __copy_to_user(&name->mach[0], &system_utsname.machine[0], sizeof(name->mach) - 1); + ret |= __copy_to_user(&name->rel[0], &utsname()->release[0], + sizeof(name->rel) - 1); + ret |= __copy_to_user(&name->ver[0], &utsname()->version[0], + sizeof(name->ver) - 1); + ret |= __copy_to_user(&name->mach[0], &utsname()->machine[0], + sizeof(name->mach) - 1); } up_read(&uts_sem); return ret ? -EFAULT : 0; diff -uprN linux-2.6.18/arch/sparc64/Kconfig linux-2.6.18.ovz/arch/sparc64/Kconfig --- linux-2.6.18/arch/sparc64/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -138,6 +138,8 @@ config NR_CPUS depends on SMP default "32" +source "kernel/Kconfig.fairsched" + source "drivers/cpufreq/Kconfig" config US3_FREQ @@ -431,8 +433,12 @@ endmenu source "arch/sparc64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/ub/Kconfig" diff -uprN linux-2.6.18/arch/sparc64/kernel/central.c linux-2.6.18.ovz/arch/sparc64/kernel/central.c --- linux-2.6.18/arch/sparc64/kernel/central.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/central.c 2007-06-13 06:55:05.000000000 -0400 @@ -126,6 +126,10 @@ static void probe_other_fhcs(void) int board; u32 tmp; + if (dp->parent && + dp->parent->parent != NULL) + continue; + fhc = (struct linux_fhc *) central_alloc_bootmem(sizeof(struct linux_fhc)); if (fhc == NULL) diff -uprN linux-2.6.18/arch/sparc64/kernel/entry.S linux-2.6.18.ovz/arch/sparc64/kernel/entry.S --- linux-2.6.18/arch/sparc64/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -22,11 +22,10 @@ #include #include #include +#include #define curptr g6 -#define NR_SYSCALLS 300 /* Each OS is different... 
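Dropping the hard-coded NR_SYSCALLS 300 from the sparc and sparc64 entry code, in favor of the constant from the included header, matters for the rest of this patch: the syscall tables below are padded out to slot 513 for the fairsched and user-beancounter entries, and the entry path bounds-checks the syscall number against this limit before indexing the table. A hypothetical dispatcher makes the failure mode obvious; this is a sketch, not the sparc assembly:

#include <errno.h>

long dispatch(unsigned int nr, long (*table[])(void),
	      unsigned int nr_syscalls)
{
	/* a limit still hard-coded at 300 would reject slots 500-513 */
	if (nr >= nr_syscalls)
		return -ENOSYS;
	return table[nr]();
}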
*/ - .text .align 32 diff -uprN linux-2.6.18/arch/sparc64/kernel/init_task.c linux-2.6.18.ovz/arch/sparc64/kernel/init_task.c --- linux-2.6.18/arch/sparc64/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -13,6 +14,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/sparc64/kernel/of_device.c linux-2.6.18.ovz/arch/sparc64/kernel/of_device.c --- linux-2.6.18/arch/sparc64/kernel/of_device.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/of_device.c 2007-06-13 06:55:05.000000000 -0400 @@ -398,16 +398,22 @@ static void of_bus_sbus_count_cells(stru *sizec = 1; } -static int of_bus_sbus_map(u32 *addr, const u32 *range, int na, int ns, int pna) -{ - return of_bus_default_map(addr, range, na, ns, pna); -} - -static unsigned int of_bus_sbus_get_flags(u32 *addr) +/* + * FHC/Central bus specific translator. + * + * This is just needed to hard-code the address and size cell + * counts. 'fhc' and 'central' nodes lack the #address-cells and + * #size-cells properties, and if you walk to the root on such + * Enterprise boxes all you'll get is a #size-cells of 2 which is + * not what we want to use. + */ +static int of_bus_fhc_match(struct device_node *np) { - return IORESOURCE_MEM; + return !strcmp(np->name, "fhc") || + !strcmp(np->name, "central"); } +#define of_bus_fhc_count_cells of_bus_sbus_count_cells /* * Array of bus specific translators @@ -429,8 +435,17 @@ static struct of_bus of_busses[] = { .addr_prop_name = "reg", .match = of_bus_sbus_match, .count_cells = of_bus_sbus_count_cells, - .map = of_bus_sbus_map, - .get_flags = of_bus_sbus_get_flags, + .map = of_bus_default_map, + .get_flags = of_bus_default_get_flags, + }, + /* FHC */ + { + .name = "fhc", + .addr_prop_name = "reg", + .match = of_bus_fhc_match, + .count_cells = of_bus_fhc_count_cells, + .map = of_bus_default_map, + .get_flags = of_bus_default_get_flags, }, /* Default */ { diff -uprN linux-2.6.18/arch/sparc64/kernel/pci_common.c linux-2.6.18.ovz/arch/sparc64/kernel/pci_common.c --- linux-2.6.18/arch/sparc64/kernel/pci_common.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/pci_common.c 2007-06-13 06:55:05.000000000 -0400 @@ -330,19 +330,6 @@ __init get_device_resource(struct linux_ return res; } -static int __init pdev_resource_collisions_expected(struct pci_dev *pdev) -{ - if (pdev->vendor != PCI_VENDOR_ID_SUN) - return 0; - - if (pdev->device == PCI_DEVICE_ID_SUN_RIO_EBUS || - pdev->device == PCI_DEVICE_ID_SUN_RIO_1394 || - pdev->device == PCI_DEVICE_ID_SUN_RIO_USB) - return 1; - - return 0; -} - static void __init pdev_record_assignments(struct pci_pbm_info *pbm, struct pci_dev *pdev) { @@ -400,19 +387,23 @@ static void __init pdev_record_assignmen pbm->parent->resource_adjust(pdev, res, root); if (request_resource(root, res) < 0) { + int rnum; + /* OK, there is some conflict. But this is fine * since we'll reassign it in the fixup pass. * - * We notify the user that OBP made an error if it - * is a case we don't expect. 
+ * Do not print the warning for ROM resources + * as such a conflict is quite common and + * harmless as the ROM bar is disabled. */ - if (!pdev_resource_collisions_expected(pdev)) { - printk(KERN_ERR "PCI: Address space collision on region %ld " + rnum = (res - &pdev->resource[0]); + if (rnum != PCI_ROM_RESOURCE) + printk(KERN_ERR "PCI: Resource collision, " + "region %d " "[%016lx:%016lx] of device %s\n", - (res - &pdev->resource[0]), + rnum, res->start, res->end, pci_name(pdev)); - } } } } diff -uprN linux-2.6.18/arch/sparc64/kernel/pci_iommu.c linux-2.6.18.ovz/arch/sparc64/kernel/pci_iommu.c --- linux-2.6.18/arch/sparc64/kernel/pci_iommu.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/pci_iommu.c 2007-06-13 06:55:05.000000000 -0400 @@ -281,7 +281,7 @@ static void pci_4u_free_consistent(struc spin_lock_irqsave(&iommu->lock, flags); - free_npages(iommu, dvma, npages); + free_npages(iommu, dvma - iommu->page_table_map_base, npages); spin_unlock_irqrestore(&iommu->lock, flags); diff -uprN linux-2.6.18/arch/sparc64/kernel/pci_sabre.c linux-2.6.18.ovz/arch/sparc64/kernel/pci_sabre.c --- linux-2.6.18/arch/sparc64/kernel/pci_sabre.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/pci_sabre.c 2007-06-13 06:55:05.000000000 -0400 @@ -1196,7 +1196,7 @@ static void pbm_register_toplevel_resour &pbm->mem_space); } -static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_begin) +static void sabre_pbm_init(struct pci_controller_info *p, struct device_node *dp, u32 dma_start, u32 dma_end) { struct pci_pbm_info *pbm; struct device_node *node; @@ -1261,6 +1261,8 @@ static void sabre_pbm_init(struct pci_co node = node->sibling; } if (simbas_found == 0) { + struct resource *rp; + /* No APBs underneath, probably this is a hummingbird * system. */ @@ -1302,8 +1304,10 @@ static void sabre_pbm_init(struct pci_co pbm->io_space.end = pbm->io_space.start + (1UL << 24) - 1UL; pbm->io_space.flags = IORESOURCE_IO; - pbm->mem_space.start = p->pbm_A.controller_regs + SABRE_MEMSPACE; - pbm->mem_space.end = pbm->mem_space.start + (unsigned long)dma_begin - 1UL; + pbm->mem_space.start = + (p->pbm_A.controller_regs + SABRE_MEMSPACE); + pbm->mem_space.end = + (pbm->mem_space.start + ((1UL << 32UL) - 1UL)); pbm->mem_space.flags = IORESOURCE_MEM; if (request_resource(&ioport_resource, &pbm->io_space) < 0) { @@ -1315,6 +1319,17 @@ static void sabre_pbm_init(struct pci_co prom_halt(); } + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) { + prom_printf("Cannot allocate IOMMU resource.\n"); + prom_halt(); + } + rp->name = "IOMMU"; + rp->start = pbm->mem_space.start + (unsigned long) dma_start; + rp->end = pbm->mem_space.start + (unsigned long) dma_end - 1UL; + rp->flags = IORESOURCE_BUSY; + request_resource(&pbm->mem_space, rp); + pci_register_legacy_regions(&pbm->io_space, &pbm->mem_space); } @@ -1450,5 +1465,5 @@ void sabre_init(struct device_node *dp, /* * Look for APB underneath. 
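The one-line pci_4u_free_consistent() change a little above is a real bug fix: the IOMMU allocator's bitmap is indexed by entry number, so the DMA address has to be rebased against page_table_map_base before freeing, and passing the raw dvma indexed far outside the arena. A simplified model with hypothetical types (an IO_PAGE_SHIFT of 13, the sparc64 8K IOMMU page, is assumed):

#include <stdint.h>

#define IO_PAGE_SHIFT 13

static void free_npages_model(uint8_t *map, uint64_t map_base,
			      uint64_t dvma, int npages)
{
	/* rebase the DMA address to a bitmap entry, as the fix does */
	uint64_t entry = (dvma - map_base) >> IO_PAGE_SHIFT;
	int i;

	for (i = 0; i < npages; i++)
		map[(entry + i) / 8] &= ~(1u << ((entry + i) % 8));
}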
*/ - sabre_pbm_init(p, dp, vdma[0]); + sabre_pbm_init(p, dp, vdma[0], vdma[0] + vdma[1]); } diff -uprN linux-2.6.18/arch/sparc64/kernel/process.c linux-2.6.18.ovz/arch/sparc64/kernel/process.c --- linux-2.6.18/arch/sparc64/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -676,6 +676,13 @@ pid_t kernel_thread(int (*fn)(void *), v { long retval; + /* Don't allow kernel_thread() inside VE */ + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; + } + /* If the parent runs before fn(arg) is called by the child, * the input registers of this function can be clobbered. * So we stash 'fn' and 'arg' into global registers which diff -uprN linux-2.6.18/arch/sparc64/kernel/prom.c linux-2.6.18.ovz/arch/sparc64/kernel/prom.c --- linux-2.6.18/arch/sparc64/kernel/prom.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/prom.c 2007-06-13 06:55:05.000000000 -0400 @@ -794,7 +794,7 @@ static unsigned int schizo_irq_build(str return virt_irq; } -static void schizo_irq_trans_init(struct device_node *dp) +static void __schizo_irq_trans_init(struct device_node *dp, int is_tomatillo) { struct linux_prom64_registers *regs; struct schizo_irq_data *irq_data; @@ -808,11 +808,24 @@ static void schizo_irq_trans_init(struct dp->irq_trans->data = irq_data; irq_data->pbm_regs = regs[0].phys_addr; - irq_data->sync_reg = regs[3].phys_addr + 0x1a18UL; + if (is_tomatillo) + irq_data->sync_reg = regs[3].phys_addr + 0x1a18UL; + else + irq_data->sync_reg = 0UL; irq_data->portid = of_getintprop_default(dp, "portid", 0); irq_data->chip_version = of_getintprop_default(dp, "version#", 0); } +static void schizo_irq_trans_init(struct device_node *dp) +{ + __schizo_irq_trans_init(dp, 0); +} + +static void tomatillo_irq_trans_init(struct device_node *dp) +{ + __schizo_irq_trans_init(dp, 1); +} + static unsigned int pci_sun4v_irq_build(struct device_node *dp, unsigned int devino, void *_data) @@ -1051,8 +1064,8 @@ static struct irq_trans pci_irq_trans_ta { "pci108e,8001", schizo_irq_trans_init }, { "SUNW,schizo+", schizo_irq_trans_init }, { "pci108e,8002", schizo_irq_trans_init }, - { "SUNW,tomatillo", schizo_irq_trans_init }, - { "pci108e,a801", schizo_irq_trans_init }, + { "SUNW,tomatillo", tomatillo_irq_trans_init }, + { "pci108e,a801", tomatillo_irq_trans_init }, { "SUNW,sun4v-pci", pci_sun4v_irq_trans_init }, }; #endif @@ -1080,23 +1093,22 @@ static void sun4v_vdev_irq_trans_init(st static void irq_trans_init(struct device_node *dp) { - const char *model; #ifdef CONFIG_PCI + const char *model; int i; #endif +#ifdef CONFIG_PCI model = of_get_property(dp, "model", NULL); if (!model) model = of_get_property(dp, "compatible", NULL); - if (!model) - return; - -#ifdef CONFIG_PCI - for (i = 0; i < ARRAY_SIZE(pci_irq_trans_table); i++) { - struct irq_trans *t = &pci_irq_trans_table[i]; + if (model) { + for (i = 0; i < ARRAY_SIZE(pci_irq_trans_table); i++) { + struct irq_trans *t = &pci_irq_trans_table[i]; - if (!strcmp(model, t->name)) - return t->init(dp); + if (!strcmp(model, t->name)) + return t->init(dp); + } } #endif #ifdef CONFIG_SBUS @@ -1104,8 +1116,9 @@ static void irq_trans_init(struct device !strcmp(dp->name, "sbi")) return sbus_irq_trans_init(dp); #endif - if (!strcmp(dp->name, "central")) - return central_irq_trans_init(dp->child); + if (!strcmp(dp->name, "fhc") && + !strcmp(dp->parent->name, "central")) + return central_irq_trans_init(dp); if 
(!strcmp(dp->name, "virtual-devices")) return sun4v_vdev_irq_trans_init(dp); } @@ -1517,7 +1530,7 @@ static char * __init get_one_property(ph return buf; } -static struct device_node * __init create_node(phandle node) +static struct device_node * __init create_node(phandle node, struct device_node *parent) { struct device_node *dp; @@ -1526,6 +1539,7 @@ static struct device_node * __init creat dp = prom_early_alloc(sizeof(*dp)); dp->unique_id = unique_id++; + dp->parent = parent; kref_init(&dp->kref); @@ -1544,12 +1558,11 @@ static struct device_node * __init build { struct device_node *dp; - dp = create_node(node); + dp = create_node(node, parent); if (dp) { *(*nextp) = dp; *nextp = &dp->allnext; - dp->parent = parent; dp->path_component_name = build_path_component(dp); dp->full_name = build_full_name(dp); @@ -1565,7 +1578,7 @@ void __init prom_build_devicetree(void) { struct device_node **nextp; - allnodes = create_node(prom_root_node); + allnodes = create_node(prom_root_node, NULL); allnodes->path_component_name = ""; allnodes->full_name = "/"; diff -uprN linux-2.6.18/arch/sparc64/kernel/sparc64_ksyms.c linux-2.6.18.ovz/arch/sparc64/kernel/sparc64_ksyms.c --- linux-2.6.18/arch/sparc64/kernel/sparc64_ksyms.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/sparc64_ksyms.c 2007-06-13 06:55:05.000000000 -0400 @@ -315,6 +315,7 @@ EXPORT_SYMBOL(copy_from_user_fixup); EXPORT_SYMBOL(copy_in_user_fixup); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(mem_map_zero); /* Various address conversion macros use this. */ EXPORT_SYMBOL(sparc64_valid_addr_bitmap); diff -uprN linux-2.6.18/arch/sparc64/kernel/sys_sparc.c linux-2.6.18.ovz/arch/sparc64/kernel/sys_sparc.c --- linux-2.6.18/arch/sparc64/kernel/sys_sparc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/sys_sparc.c 2007-06-13 06:55:05.000000000 -0400 @@ -712,13 +712,13 @@ asmlinkage long sys_getdomainname(char _ down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) goto out; err = -EFAULT; - if (!copy_to_user(name, system_utsname.domainname, nlen)) + if (!copy_to_user(name, utsname()->domainname, nlen)) err = 0; out: diff -uprN linux-2.6.18/arch/sparc64/kernel/sys_sparc32.c linux-2.6.18.ovz/arch/sparc64/kernel/sys_sparc32.c --- linux-2.6.18/arch/sparc64/kernel/sys_sparc32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/sys_sparc32.c 2007-06-13 06:55:05.000000000 -0400 @@ -841,7 +841,7 @@ asmlinkage long sys32_utimes(char __user return -EFAULT; } - return do_utimes(AT_FDCWD, filename, (tvs ? &ktvs[0] : NULL)); + return do_utimes(AT_FDCWD, filename, (tvs ? &ktvs[0] : NULL), 0); } /* These are here just in case some old sparc32 binary calls it. 
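The system_utsname to utsname() conversions repeated throughout these hunks are the UTS side of the virtualization: each VE carries its own uts_namespace, utsname() resolves to the calling task's copy, and init_utsname() always names the host's boot-time values. That is why user-visible paths such as sys_uname() and sparc64's sys_getdomainname() switch to utsname(), while boot banners and the pSeries panel use init_utsname(). A toy model of the lookup, with a hypothetical struct layout rather than the kernel's:

#include <stdio.h>
#include <string.h>

struct uts_ns {
	char nodename[65];
};

static struct uts_ns init_uts_ns = { "host" };
static struct uts_ns *task_uts = &init_uts_ns;	/* per-task in the kernel */

#define utsname()	(task_uts)	/* the caller's namespace */
#define init_utsname()	(&init_uts_ns)	/* always the host's values */

int main(void)
{
	struct uts_ns ve = *init_utsname();

	strcpy(ve.nodename, "ve101");
	task_uts = &ve;			/* enter the VE's namespace */
	printf("%s inside, %s on the host\n",
	       utsname()->nodename, init_utsname()->nodename);
	return 0;
}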
*/ diff -uprN linux-2.6.18/arch/sparc64/kernel/sys_sunos32.c linux-2.6.18.ovz/arch/sparc64/kernel/sys_sunos32.c --- linux-2.6.18/arch/sparc64/kernel/sys_sunos32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/sys_sunos32.c 2007-06-13 06:55:05.000000000 -0400 @@ -439,16 +439,16 @@ asmlinkage int sunos_uname(struct sunos_ int ret; down_read(&uts_sem); - ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], + ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], sizeof(name->sname) - 1); - ret |= copy_to_user(&name->nname[0], &system_utsname.nodename[0], + ret |= copy_to_user(&name->nname[0], &utsname()->nodename[0], sizeof(name->nname) - 1); ret |= put_user('\0', &name->nname[8]); - ret |= copy_to_user(&name->rel[0], &system_utsname.release[0], + ret |= copy_to_user(&name->rel[0], &utsname()->release[0], sizeof(name->rel) - 1); - ret |= copy_to_user(&name->ver[0], &system_utsname.version[0], + ret |= copy_to_user(&name->ver[0], &utsname()->version[0], sizeof(name->ver) - 1); - ret |= copy_to_user(&name->mach[0], &system_utsname.machine[0], + ret |= copy_to_user(&name->mach[0], &utsname()->machine[0], sizeof(name->mach) - 1); up_read(&uts_sem); return (ret ? -EFAULT : 0); diff -uprN linux-2.6.18/arch/sparc64/kernel/systbls.S linux-2.6.18.ovz/arch/sparc64/kernel/systbls.S --- linux-2.6.18/arch/sparc64/kernel/systbls.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/systbls.S 2007-06-13 06:55:05.000000000 -0400 @@ -81,6 +81,24 @@ sys_call_table32: .word sys_fchmodat, sys_faccessat, compat_sys_pselect6, compat_sys_ppoll, sys_unshare /*300*/ .word compat_sys_set_robust_list, compat_sys_get_robust_list + .rept 500-302 + .word sys_nis_syscall + .endr + .word sys_fairsched_mknod /* 500 */ + .word sys_fairsched_rmnod + .word sys_fairsched_chwt + .word sys_fairsched_mvpr + .word sys_fairsched_rate + .word sys_nis_syscall /* 505 */ + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_getluid /* 510 */ + .word sys_setluid + .word compat_sys_setublimit + .word compat_sys_ubstat + #endif /* CONFIG_COMPAT */ /* Now the 64-bit native Linux syscall table. */ @@ -151,6 +169,25 @@ sys_call_table: .word sys_fchmodat, sys_faccessat, sys_pselect6, sys_ppoll, sys_unshare /*300*/ .word sys_set_robust_list, sys_get_robust_list + .rept 500-302 + .word sys_nis_syscall + .endr + .word sys_fairsched_mknod /* 500 */ + .word sys_fairsched_rmnod + .word sys_fairsched_chwt + .word sys_fairsched_mvpr + .word sys_fairsched_rate + .word sys_nis_syscall /* 505 */ + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_getluid /* 510 */ + .word sys_setluid + .word sys_setublimit + .word sys_ubstat + + #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \ defined(CONFIG_SOLARIS_EMUL_MODULE) /* Now the 32-bit SunOS syscall table. */ @@ -263,4 +300,7 @@ sunos_sys_table: .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys, sunos_nosys, sunos_nosys + .rept 520-302 + .word sunos_nosys + .endr #endif diff -uprN linux-2.6.18/arch/sparc64/kernel/time.c linux-2.6.18.ovz/arch/sparc64/kernel/time.c --- linux-2.6.18/arch/sparc64/kernel/time.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/time.c 2007-06-13 06:55:05.000000000 -0400 @@ -983,7 +983,7 @@ static struct time_interpolator sparc64_ }; /* The quotient formula is taken from the IA64 port. 
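The systbls.S padding above is the userspace-visible ABI of this patch: slots 302-499 are filled with sys_nis_syscall, and the fairsched (500-504) and user-beancounter (510-513) entries sit at fixed numbers, so userspace reaches them with raw syscall(2) rather than through libc. A sketch for this sparc64 table only; the slot numbers are architecture-specific, and the __NR_getluid_ovz name is made up for the example:

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define __NR_getluid_ovz 510	/* slot from the table above */

int main(void)
{
	long luid = syscall(__NR_getluid_ovz);

	printf("luid: %ld\n", luid);
	return 0;
}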
*/ -#define SPARC64_NSEC_PER_CYC_SHIFT 30UL +#define SPARC64_NSEC_PER_CYC_SHIFT 10UL void __init time_init(void) { unsigned long clock = sparc64_init_timers(); diff -uprN linux-2.6.18/arch/sparc64/kernel/traps.c linux-2.6.18.ovz/arch/sparc64/kernel/traps.c --- linux-2.6.18/arch/sparc64/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/kernel/traps.c 2007-06-13 06:55:05.000000000 -0400 @@ -2216,6 +2216,10 @@ void die_if_kernel(char *str, struct pt_ " \\__U_/\n"); printk("%s(%d): %s [#%d]\n", current->comm, current->pid, str, ++die_counter); + printk("VE:EXCVE %d:%d, CPU %d, VCPU %d:%d\n", + VEID(VE_TASK_INFO(current)->owner_env), VEID(get_exec_env()), + smp_processor_id(), + task_vsched_id(current), task_cpu(current)); notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); __asm__ __volatile__("flushw"); __show_regs(regs); diff -uprN linux-2.6.18/arch/sparc64/mm/init.c linux-2.6.18.ovz/arch/sparc64/mm/init.c --- linux-2.6.18/arch/sparc64/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -418,6 +418,7 @@ void show_mem(void) printk("%ld pages of RAM\n", num_physpages); printk("%d free pages\n", nr_free_pages()); } +EXPORT_SYMBOL(show_mem); void mmu_info(struct seq_file *m) { @@ -920,8 +921,7 @@ static unsigned long __init bootmem_init if (sparc_ramdisk_image || sparc_ramdisk_image64) { unsigned long ramdisk_image = sparc_ramdisk_image ? sparc_ramdisk_image : sparc_ramdisk_image64; - if (ramdisk_image >= (unsigned long)_end - 2 * PAGE_SIZE) - ramdisk_image -= KERNBASE; + ramdisk_image -= KERNBASE; initrd_start = ramdisk_image + phys_base; initrd_end = initrd_start + sparc_ramdisk_size; if (initrd_end > end_of_phys_memory) { diff -uprN linux-2.6.18/arch/sparc64/solaris/misc.c linux-2.6.18.ovz/arch/sparc64/solaris/misc.c --- linux-2.6.18/arch/sparc64/solaris/misc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/sparc64/solaris/misc.c 2007-06-13 06:55:05.000000000 -0400 @@ -248,7 +248,7 @@ asmlinkage int solaris_utssys(u32 buf, u /* Let's cheat */ err = set_utsfield(v->sysname, "SunOS", 1, 0); down_read(&uts_sem); - err |= set_utsfield(v->nodename, system_utsname.nodename, + err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1); up_read(&uts_sem); err |= set_utsfield(v->release, "2.6", 0, 0); @@ -272,7 +272,7 @@ asmlinkage int solaris_utsname(u32 buf) /* Why should we not lie a bit? 
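The SPARC64_NSEC_PER_CYC_SHIFT change from 30 to 10 a little above looks cosmetic but almost certainly is not. Assuming the usual pattern for this quotient (the surrounding time.c code is not in the hunk), ns = (ticks * quotient) >> SHIFT with quotient of roughly (NSEC_PER_SEC << SHIFT) / clock, the 64-bit product overflows after only seconds of uptime at SHIFT = 30, while SHIFT = 10 buys months of headroom at a small precision cost. A back-of-the-envelope check:

#include <stdio.h>

int main(void)
{
	const double clock = 1e9;	/* assume a 1 GHz tick source */
	const double nsec_per_sec = 1e9;
	int shift;

	for (shift = 30; shift >= 10; shift -= 20) {
		double quot = nsec_per_sec * (double) (1UL << shift) / clock;
		double secs = 18446744073709551616.0 /* 2^64 */
				/ (clock * quot);

		printf("shift %2d: product wraps after ~%.0f s\n", shift, secs);
	}
	return 0;
}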
*/ down_read(&uts_sem); err = set_utsfield(v->sysname, "SunOS", 0, 0); - err |= set_utsfield(v->nodename, system_utsname.nodename, 1, 1); + err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1); err |= set_utsfield(v->release, "5.6", 0, 0); err |= set_utsfield(v->version, "Generic", 0, 0); err |= set_utsfield(v->machine, machine(), 0, 0); @@ -304,7 +304,7 @@ asmlinkage int solaris_sysinfo(int cmd, case SI_HOSTNAME: r = buffer + 256; down_read(&uts_sem); - for (p = system_utsname.nodename, q = buffer; + for (p = utsname()->nodename, q = buffer; q < r && *p && *p != '.'; *q++ = *p++); up_read(&uts_sem); *q = 0; diff -uprN linux-2.6.18/arch/um/Kconfig linux-2.6.18.ovz/arch/um/Kconfig --- linux-2.6.18/arch/um/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -1,3 +1,8 @@ +config DEFCONFIG_LIST + string + option defconfig_list + default "arch/$ARCH/defconfig" + # UML uses the generic IRQ sugsystem config GENERIC_HARDIRQS bool diff -uprN linux-2.6.18/arch/um/Makefile-x86_64 linux-2.6.18.ovz/arch/um/Makefile-x86_64 --- linux-2.6.18/arch/um/Makefile-x86_64 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/Makefile-x86_64 2007-06-13 06:55:05.000000000 -0400 @@ -1,7 +1,7 @@ # Copyright 2003 - 2004 Pathscale, Inc # Released under the GPL -core-y += arch/um/sys-x86_64/ +core-y += arch/um/sys-x86_64/ arch/x86_64/crypto/ START := 0x60000000 #We #undef __x86_64__ for kernelspace, not for userspace where diff -uprN linux-2.6.18/arch/um/drivers/mconsole_kern.c linux-2.6.18.ovz/arch/um/drivers/mconsole_kern.c --- linux-2.6.18/arch/um/drivers/mconsole_kern.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/drivers/mconsole_kern.c 2007-06-13 06:55:05.000000000 -0400 @@ -106,9 +106,9 @@ void mconsole_version(struct mc_request { char version[256]; - sprintf(version, "%s %s %s %s %s", system_utsname.sysname, - system_utsname.nodename, system_utsname.release, - system_utsname.version, system_utsname.machine); + sprintf(version, "%s %s %s %s %s", utsname()->sysname, + utsname()->nodename, utsname()->release, + utsname()->version, utsname()->machine); mconsole_reply(req, version, 0, 0); } @@ -734,7 +734,7 @@ static void do_stack_trace(struct mc_req from = current; - to = find_task_by_pid(pid_requested); + to = find_task_by_pid_all(pid_requested); if((to == NULL) || (pid_requested == 0)) { mconsole_reply(req, "Couldn't find that pid", 1, 0); return; diff -uprN linux-2.6.18/arch/um/include/common-offsets.h linux-2.6.18.ovz/arch/um/include/common-offsets.h --- linux-2.6.18/arch/um/include/common-offsets.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/include/common-offsets.h 2007-06-13 06:55:05.000000000 -0400 @@ -15,3 +15,4 @@ DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG); DEFINE(UM_ELF_CLASS, ELF_CLASS); DEFINE(UM_ELFCLASS32, ELFCLASS32); DEFINE(UM_ELFCLASS64, ELFCLASS64); +DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); diff -uprN linux-2.6.18/arch/um/include/kern_util.h linux-2.6.18.ovz/arch/um/include/kern_util.h --- linux-2.6.18/arch/um/include/kern_util.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/include/kern_util.h 2007-06-13 06:55:05.000000000 -0400 @@ -6,7 +6,6 @@ #ifndef __KERN_UTIL_H__ #define __KERN_UTIL_H__ -#include "linux/threads.h" #include "sysdep/ptrace.h" #include "sysdep/faultinfo.h" diff -uprN linux-2.6.18/arch/um/include/sysdep-i386/kernel-offsets.h linux-2.6.18.ovz/arch/um/include/sysdep-i386/kernel-offsets.h --- 
linux-2.6.18/arch/um/include/sysdep-i386/kernel-offsets.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/include/sysdep-i386/kernel-offsets.h 2007-06-13 06:55:05.000000000 -0400 @@ -1,6 +1,7 @@ #include #include #include +#include #include #define DEFINE(sym, val) \ diff -uprN linux-2.6.18/arch/um/include/sysdep-x86_64/kernel-offsets.h linux-2.6.18.ovz/arch/um/include/sysdep-x86_64/kernel-offsets.h --- linux-2.6.18/arch/um/include/sysdep-x86_64/kernel-offsets.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/include/sysdep-x86_64/kernel-offsets.h 2007-06-13 06:55:05.000000000 -0400 @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff -uprN linux-2.6.18/arch/um/kernel/init_task.c linux-2.6.18.ovz/arch/um/kernel/init_task.c --- linux-2.6.18/arch/um/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -9,6 +9,7 @@ #include "linux/sched.h" #include "linux/init_task.h" #include "linux/mqueue.h" +#include "linux/nsproxy.h" #include "asm/uaccess.h" #include "asm/pgtable.h" #include "user_util.h" @@ -17,6 +18,7 @@ static struct fs_struct init_fs = INIT_FS; struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); diff -uprN linux-2.6.18/arch/um/kernel/skas/process_kern.c linux-2.6.18.ovz/arch/um/kernel/skas/process_kern.c --- linux-2.6.18/arch/um/kernel/skas/process_kern.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/kernel/skas/process_kern.c 2007-06-13 06:55:05.000000000 -0400 @@ -208,7 +208,7 @@ void kill_off_processes_skas(void) int pid, me; me = os_getpid(); - for_each_process(p){ + for_each_process_all(p){ if(p->mm == NULL) continue; diff -uprN linux-2.6.18/arch/um/kernel/syscall.c linux-2.6.18.ovz/arch/um/kernel/syscall.c --- linux-2.6.18/arch/um/kernel/syscall.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/kernel/syscall.c 2007-06-13 06:55:05.000000000 -0400 @@ -110,7 +110,7 @@ long sys_uname(struct old_utsname __user if (!name) return -EFAULT; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -126,21 +126,21 @@ long sys_olduname(struct oldold_utsname down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname, + error = __copy_to_user(&name->sysname, &utsname()->sysname, __OLD_UTS_LEN); - error |= __put_user(0,name->sysname+__OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename, + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, __OLD_UTS_LEN); - error |= __put_user(0,name->nodename+__OLD_UTS_LEN); - error |= __copy_to_user(&name->release,&system_utsname.release, + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, __OLD_UTS_LEN); - error |= __put_user(0,name->release+__OLD_UTS_LEN); - error |= __copy_to_user(&name->version,&system_utsname.version, + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, __OLD_UTS_LEN); - error |= __put_user(0,name->version+__OLD_UTS_LEN); - error |= 
__copy_to_user(&name->machine,&system_utsname.machine, + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, __OLD_UTS_LEN); - error |= __put_user(0,name->machine+__OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); up_read(&uts_sem); diff -uprN linux-2.6.18/arch/um/kernel/tt/process_kern.c linux-2.6.18.ovz/arch/um/kernel/tt/process_kern.c --- linux-2.6.18/arch/um/kernel/tt/process_kern.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/kernel/tt/process_kern.c 2007-06-13 06:55:05.000000000 -0400 @@ -307,7 +307,7 @@ void kill_off_processes_tt(void) int me; me = os_getpid(); - for_each_process(p){ + for_each_process_all(p){ if(p->thread.mode.tt.extern_pid != me) os_kill_process(p->thread.mode.tt.extern_pid, 0); } @@ -450,7 +450,7 @@ int is_valid_pid(int pid) struct task_struct *task; read_lock(&tasklist_lock); - for_each_process(task){ + for_each_process_all(task){ if(task->thread.mode.tt.extern_pid == pid){ read_unlock(&tasklist_lock); return(1); diff -uprN linux-2.6.18/arch/um/kernel/um_arch.c linux-2.6.18.ovz/arch/um/kernel/um_arch.c --- linux-2.6.18/arch/um/kernel/um_arch.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/kernel/um_arch.c 2007-06-13 06:55:05.000000000 -0400 @@ -167,7 +167,7 @@ static char *usage_string = static int __init uml_version_setup(char *line, int *add) { - printf("%s\n", system_utsname.release); + printf("%s\n", init_utsname()->release); exit(0); return 0; @@ -278,7 +278,7 @@ static int __init Usage(char *line, int { const char **p; - printf(usage_string, system_utsname.release); + printf(usage_string, init_utsname()->release); p = &__uml_help_start; while (p < &__uml_help_end) { printf("%s", *p); @@ -403,7 +403,7 @@ int linux_main(int argc, char **argv) /* Reserve up to 4M after the current brk */ uml_reserved = ROUND_4M(brk_start) + (1 << 22); - setup_machinename(system_utsname.machine); + setup_machinename(init_utsname()->machine); #ifdef CONFIG_CMDLINE_ON_HOST argv1_begin = argv[1]; diff -uprN linux-2.6.18/arch/um/os-Linux/process.c linux-2.6.18.ovz/arch/um/os-Linux/process.c --- linux-2.6.18/arch/um/os-Linux/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/os-Linux/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -141,11 +141,9 @@ void os_usr1_process(int pid) * syscalls, and also breaks with clone(), which does not unshare the TLS. */ -inline _syscall0(pid_t, getpid) - int os_getpid(void) { - return(getpid()); + return syscall(__NR_getpid); } int os_getpgrp(void) diff -uprN linux-2.6.18/arch/um/os-Linux/sys-i386/tls.c linux-2.6.18.ovz/arch/um/os-Linux/sys-i386/tls.c --- linux-2.6.18/arch/um/os-Linux/sys-i386/tls.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/os-Linux/sys-i386/tls.c 2007-06-13 06:55:05.000000000 -0400 @@ -1,10 +1,10 @@ #include +#include +#include #include #include "sysdep/tls.h" #include "user_util.h" -static _syscall1(int, get_thread_area, user_desc_t *, u_info); - /* Checks whether host supports TLS, and sets *tls_min according to the value * valid on the host. * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. 
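The UML hunks above (os_getpid() and the TLS probes) drop the legacy _syscallN() macros, which expand kernel-header stubs into userspace objects and, as the comment retained above hints, break with clone() since it does not unshare the TLS. The replacement is glibc's generic syscall(2) wrapper, which needs no per-call stub; shown standalone:

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* what os_getpid() now does: one generic wrapper, no stub */
	printf("pid via syscall(2): %ld\n", (long) syscall(SYS_getpid));
	return 0;
}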
*/ @@ -17,7 +17,7 @@ void check_host_supports_tls(int *suppor user_desc_t info; info.entry_number = val[i]; - if (get_thread_area(&info) == 0) { + if(syscall(__NR_get_thread_area, &info) == 0){ *tls_min = val[i]; *supports_tls = 1; return; diff -uprN linux-2.6.18/arch/um/os-Linux/tls.c linux-2.6.18.ovz/arch/um/os-Linux/tls.c --- linux-2.6.18/arch/um/os-Linux/tls.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/os-Linux/tls.c 2007-06-13 06:55:05.000000000 -0400 @@ -1,6 +1,8 @@ #include #include +#include #include +#include #include "sysdep/tls.h" #include "uml-config.h" @@ -48,14 +50,11 @@ int os_get_thread_area(user_desc_t *info #ifdef UML_CONFIG_MODE_TT #include "linux/unistd.h" -static _syscall1(int, get_thread_area, user_desc_t *, u_info); -static _syscall1(int, set_thread_area, user_desc_t *, u_info); - int do_set_thread_area_tt(user_desc_t *info) { int ret; - ret = set_thread_area(info); + ret = syscall(__NR_set_thread_area, info); if (ret < 0) { ret = -errno; } @@ -66,7 +65,7 @@ int do_get_thread_area_tt(user_desc_t *i { int ret; - ret = get_thread_area(info); + ret = syscall(__NR_get_thread_area, info); if (ret < 0) { ret = -errno; } diff -uprN linux-2.6.18/arch/um/sys-x86_64/stub_segv.c linux-2.6.18.ovz/arch/um/sys-x86_64/stub_segv.c --- linux-2.6.18/arch/um/sys-x86_64/stub_segv.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/sys-x86_64/stub_segv.c 2007-06-13 06:55:05.000000000 -0400 @@ -5,7 +5,6 @@ #include #include -#include #include #include "uml-config.h" #include "sysdep/sigcontext.h" diff -uprN linux-2.6.18/arch/um/sys-x86_64/syscalls.c linux-2.6.18.ovz/arch/um/sys-x86_64/syscalls.c --- linux-2.6.18/arch/um/sys-x86_64/syscalls.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/sys-x86_64/syscalls.c 2007-06-13 06:55:05.000000000 -0400 @@ -21,7 +21,7 @@ asmlinkage long sys_uname64(struct new_u { int err; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); diff -uprN linux-2.6.18/arch/um/sys-x86_64/sysrq.c linux-2.6.18.ovz/arch/um/sys-x86_64/sysrq.c --- linux-2.6.18/arch/um/sys-x86_64/sysrq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/um/sys-x86_64/sysrq.c 2007-06-13 06:55:05.000000000 -0400 @@ -16,7 +16,7 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), system_utsname.release); + current->pid, current->comm, print_tainted(), init_utsname()->release); printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff, PT_REGS_RIP(regs)); printk("\nRSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), diff -uprN linux-2.6.18/arch/v850/kernel/init_task.c linux-2.6.18.ovz/arch/v850/kernel/init_task.c --- linux-2.6.18/arch/v850/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/v850/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS (init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM (init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/x86_64/Kconfig 
linux-2.6.18.ovz/arch/x86_64/Kconfig --- linux-2.6.18/arch/x86_64/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/Kconfig 2007-06-13 06:55:05.000000000 -0400 @@ -276,6 +276,8 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/Kconfig.fairsched" + source "kernel/Kconfig.preempt" config NUMA @@ -376,7 +378,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL + depends on SMP && HOTPLUG && EXPERIMENTAL && !SCHED_VCPU help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu/cpu#. @@ -654,8 +656,12 @@ endmenu source "arch/x86_64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/ub/Kconfig" diff -uprN linux-2.6.18/arch/x86_64/boot/compressed/head.S linux-2.6.18.ovz/arch/x86_64/boot/compressed/head.S --- linux-2.6.18/arch/x86_64/boot/compressed/head.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/boot/compressed/head.S 2007-06-13 06:55:05.000000000 -0400 @@ -34,7 +34,7 @@ startup_32: cld cli - movl $(__KERNEL_DS),%eax + movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es movl %eax,%fs @@ -76,7 +76,7 @@ startup_32: jnz 3f addl $8,%esp xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $__PHYSICAL_START /* * We come here, if we were loaded high. @@ -104,7 +104,7 @@ startup_32: popl %eax # hcount movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted - ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine + ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine /* * Routine (template) for moving the decompressed kernel in place, @@ -127,7 +127,7 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $__PHYSICAL_START move_routine_end: @@ -137,5 +137,5 @@ user_stack: .fill 4096,4,0 stack_start: .long user_stack+4096 - .word __KERNEL_DS + .word __BOOT_DS diff -uprN linux-2.6.18/arch/x86_64/boot/setup.S linux-2.6.18.ovz/arch/x86_64/boot/setup.S --- linux-2.6.18/arch/x86_64/boot/setup.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/boot/setup.S 2007-06-13 06:55:05.000000000 -0400 @@ -728,7 +728,7 @@ flush_instr: subw $DELTA_INITSEG, %si shll $4, %esi # Convert to 32-bit pointer # NOTE: For high loaded big kernels we need a -# jmpi 0x100000,__KERNEL_CS +# jmpi 0x100000,__BOOT_CS # # but we yet haven't reloaded the CS register, so the default size # of the target offset still is 16 bit. @@ -739,7 +739,7 @@ flush_instr: .byte 0x66, 0xea # prefix + jmpi-opcode code32: .long 0x1000 # will be set to 0x100000 # for big kernels - .word __KERNEL_CS + .word __BOOT_CS # Here's a bunch of information about your current kernel.. 
kernel_version: .ascii UTS_RELEASE diff -uprN linux-2.6.18/arch/x86_64/ia32/ia32_aout.c linux-2.6.18.ovz/arch/x86_64/ia32/ia32_aout.c --- linux-2.6.18/arch/x86_64/ia32/ia32_aout.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/ia32_aout.c 2007-06-13 06:55:05.000000000 -0400 @@ -347,14 +347,14 @@ static int load_aout_binary(struct linux if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) { - printk(KERN_NOTICE "executable not page aligned\n"); + ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); error_time2 = jiffies; } if ((fd_offset & ~PAGE_MASK) != 0 && (jiffies-error_time) > 5*HZ) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_dentry->d_name.name); error_time = jiffies; @@ -467,7 +467,7 @@ static int load_aout_library(struct file static unsigned long error_time; if ((jiffies-error_time) > 5*HZ) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_dentry->d_name.name); error_time = jiffies; diff -uprN linux-2.6.18/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.18.ovz/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.6.18/arch/x86_64/ia32/ia32_binfmt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/ia32_binfmt.c 2007-06-13 06:55:05.000000000 -0400 @@ -27,12 +27,14 @@ #include #include +#include + #define ELF_NAME "elf/i386" #define AT_SYSINFO 32 #define AT_SYSINFO_EHDR 33 -int sysctl_vsyscall32 = 1; +int sysctl_vsyscall32 = 0; #define ARCH_DLINFO do { \ if (sysctl_vsyscall32) { \ @@ -352,9 +354,15 @@ int ia32_setup_arg_pages(struct linux_bi bprm->loader += stack_base; bprm->exec += stack_base; + ret = -ENOMEM; + if (ub_memory_charge(mm, stack_top - + (PAGE_MASK & (unsigned long)bprm->p), + VM_STACK_FLAGS, NULL, UB_SOFT)) + goto err_charge; + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) - return -ENOMEM; + goto err_alloc; memset(mpnt, 0, sizeof(*mpnt)); @@ -371,11 +379,8 @@ int ia32_setup_arg_pages(struct linux_bi mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
PAGE_COPY_EXEC : PAGE_COPY; - if ((ret = insert_vm_struct(mm, mpnt))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } + if ((ret = insert_vm_struct(mm, mpnt))) + goto err_insert; mm->stack_vm = mm->total_vm = vma_pages(mpnt); } @@ -390,6 +395,15 @@ int ia32_setup_arg_pages(struct linux_bi up_write(&mm->mmap_sem); return 0; + +err_insert: + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, mpnt); +err_alloc: + ub_memory_uncharge(mm, stack_top - (PAGE_MASK & (unsigned long)bprm->p), + VM_STACK_FLAGS, NULL); +err_charge: + return ret; } EXPORT_SYMBOL(ia32_setup_arg_pages); diff -uprN linux-2.6.18/arch/x86_64/ia32/ia32_signal.c linux-2.6.18.ovz/arch/x86_64/ia32/ia32_signal.c --- linux-2.6.18/arch/x86_64/ia32/ia32_signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/ia32_signal.c 2007-06-13 06:55:05.000000000 -0400 @@ -37,7 +37,6 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) @@ -116,22 +115,17 @@ asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask, struct pt_regs *regs) { - sigset_t saveset; - mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; + current->saved_sigmask = current->blocked; siginitset(¤t->blocked, mask); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; } asmlinkage long @@ -508,11 +502,11 @@ int ia32_setup_frame(int sig, struct k_s current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -604,9 +598,9 @@ int ia32_setup_rt_frame(int sig, struct current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } diff -uprN linux-2.6.18/arch/x86_64/ia32/ia32entry.S linux-2.6.18.ovz/arch/x86_64/ia32/ia32entry.S --- linux-2.6.18/arch/x86_64/ia32/ia32entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/ia32entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -508,7 +508,7 @@ ia32_sys_call_table: .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ + .quad quiet_ni_syscall /* vm86old */ .quad compat_sys_wait4 .quad sys_swapoff /* 115 */ .quad sys32_sysinfo @@ -526,7 +526,7 @@ ia32_sys_call_table: .quad sys_init_module .quad sys_delete_module .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys_quotactl + .quad sys32_quotactl .quad sys_getpgid .quad sys_fchdir .quad quiet_ni_syscall /* bdflush */ @@ -561,7 +561,7 @@ ia32_sys_call_table: .quad sys_mremap .quad sys_setresuid16 .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl diff -uprN linux-2.6.18/arch/x86_64/ia32/ptrace32.c 
linux-2.6.18.ovz/arch/x86_64/ia32/ptrace32.c --- linux-2.6.18/arch/x86_64/ia32/ptrace32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/ptrace32.c 2007-06-13 06:55:05.000000000 -0400 @@ -239,6 +239,7 @@ asmlinkage long sys32_ptrace(long reques case PTRACE_SINGLESTEP: case PTRACE_DETACH: case PTRACE_SYSCALL: + case PTRACE_OLDSETOPTIONS: case PTRACE_SETOPTIONS: return sys_ptrace(request, pid, addr, data); diff -uprN linux-2.6.18/arch/x86_64/ia32/sys_ia32.c linux-2.6.18.ovz/arch/x86_64/ia32/sys_ia32.c --- linux-2.6.18/arch/x86_64/ia32/sys_ia32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/sys_ia32.c 2007-06-13 06:55:05.000000000 -0400 @@ -782,25 +782,26 @@ asmlinkage long sys32_olduname(struct ol if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - __put_user(0,name->sysname+__OLD_UTS_LEN); - __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - __put_user(0,name->nodename+__OLD_UTS_LEN); - __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - __put_user(0,name->release+__OLD_UTS_LEN); - __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - __put_user(0,name->version+__OLD_UTS_LEN); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + __put_user(0, name->sysname + __OLD_UTS_LEN); + __copy_to_user(&name->nodename, &utsname()->nodename, __OLD_UTS_LEN); + __put_user(0, name->nodename + __OLD_UTS_LEN); + __copy_to_user(&name->release, &utsname()->release, __OLD_UTS_LEN); + __put_user(0, name->release + __OLD_UTS_LEN); + __copy_to_user(&name->version, &utsname()->version, __OLD_UTS_LEN); + __put_user(0, name->version + __OLD_UTS_LEN); { char *arch = "x86_64"; if (personality(current->personality) == PER_LINUX32) arch = "i686"; - __copy_to_user(&name->machine,arch,strlen(arch)+1); + __copy_to_user(&name->machine, arch, strlen(arch) + 1); } up_read(&uts_sem); @@ -816,7 +817,7 @@ long sys32_uname(struct old_utsname __us if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); @@ -897,18 +898,6 @@ long sys32_fadvise64_64(int fd, __u32 of advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, char __user * buf, size_t len) { diff -uprN linux-2.6.18/arch/x86_64/ia32/syscall32.c linux-2.6.18.ovz/arch/x86_64/ia32/syscall32.c --- linux-2.6.18/arch/x86_64/ia32/syscall32.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/ia32/syscall32.c 2007-06-13 06:55:05.000000000 -0400 @@ -14,6 +14,8 @@ #include #include +#include + extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; extern int sysctl_vsyscall32; @@ -47,32 +49,45 @@ int syscall32_setup_pages(struct linux_b int npages = 
(VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + unsigned long flags; int ret; + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | + mm->def_flags; + + ret = -ENOMEM; + if (ub_memory_charge(mm, VSYSCALL32_END - VSYSCALL32_BASE, + flags, NULL, UB_SOFT)) + goto err_charge; + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) - return -ENOMEM; + goto err_alloc; memset(vma, 0, sizeof(struct vm_area_struct)); /* Could randomize here */ vma->vm_start = VSYSCALL32_BASE; vma->vm_end = VSYSCALL32_END; /* MAYWRITE to allow gdb to COW and set breakpoints */ - vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; - vma->vm_flags |= mm->def_flags; + vma->vm_flags = flags; vma->vm_page_prot = protection_map[vma->vm_flags & 7]; vma->vm_ops = &syscall32_vm_ops; vma->vm_mm = mm; down_write(&mm->mmap_sem); - if ((ret = insert_vm_struct(mm, vma))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); - return ret; - } + if ((ret = insert_vm_struct(mm, vma))) + goto err_ins; mm->total_vm += npages; up_write(&mm->mmap_sem); return 0; + +err_ins: + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); +err_alloc: + ub_memory_uncharge(mm, VSYSCALL32_END - VSYSCALL32_BASE, flags, NULL); +err_charge: + return ret; } static int __init init_syscall32(void) diff -uprN linux-2.6.18/arch/x86_64/kernel/acpi/wakeup.S linux-2.6.18.ovz/arch/x86_64/kernel/acpi/wakeup.S --- linux-2.6.18/arch/x86_64/kernel/acpi/wakeup.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/acpi/wakeup.S 2007-06-13 06:55:05.000000000 -0400 @@ -77,7 +77,7 @@ wakeup_code: .byte 0x66, 0xea # prefix + jmpi-opcode .long wakeup_32 - __START_KERNEL_map - .word __KERNEL_CS + .word __BOOT_CS .code32 wakeup_32: @@ -96,13 +96,13 @@ wakeup_32: jnc bogus_cpu movl %edx,%edi - movw $__KERNEL_DS, %ax + movw $__BOOT_DS, %ax movw %ax, %ds movw %ax, %es movw %ax, %fs movw %ax, %gs - movw $__KERNEL_DS, %ax + movw $__BOOT_DS, %ax movw %ax, %ss mov $(wakeup_stack - __START_KERNEL_map), %esp @@ -187,7 +187,7 @@ reach_compatibility_mode: wakeup_jumpvector: .long wakeup_long64 - __START_KERNEL_map - .word __KERNEL_CS + .word __BOOT_CS .code64 diff -uprN linux-2.6.18/arch/x86_64/kernel/entry.S linux-2.6.18.ovz/arch/x86_64/kernel/entry.S --- linux-2.6.18/arch/x86_64/kernel/entry.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/entry.S 2007-06-13 06:55:05.000000000 -0400 @@ -146,8 +146,17 @@ /* rdi: prev */ ENTRY(ret_from_fork) CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 call schedule_tail +ret_from_fork_tail: GET_THREAD_INFO(%rcx) + btr $TIF_RESUME,threadinfo_flags(%rcx) + jc x86_64_ret_from_resume + +ret_from_fork_check: testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) jnz rff_trace rff_action: @@ -163,6 +172,19 @@ rff_trace: call syscall_trace_leave GET_THREAD_INFO(%rcx) jmp rff_action + +x86_64_ret_from_resume: + movq (%rsp),%rax + testq %rax,%rax + jz 1f + movq %rsp,%rdi + call *%rax +1: + addq $256,%rsp + cmpq $0,ORIG_RAX(%rsp) + jge ret_from_fork_tail + RESTORE_REST + jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -270,7 +292,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f /* Really a signal */ @@ 
-382,7 +404,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_NOTIFY_RESUME|_TIF_RESTORE_SIGMASK|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -600,7 +622,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs TRACE_IRQS_ON sti @@ -955,7 +977,7 @@ ENTRY(kernel_thread) xorl %r9d,%r9d # clone now - call do_fork + call do_fork_kthread movq %rax,RAX(%rsp) xorl %edi,%edi diff -uprN linux-2.6.18/arch/x86_64/kernel/head.S linux-2.6.18.ovz/arch/x86_64/kernel/head.S --- linux-2.6.18/arch/x86_64/kernel/head.S 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/head.S 2007-06-13 06:55:05.000000000 -0400 @@ -41,7 +41,7 @@ startup_32: */ /* Initialize the %ds segment register */ - movl $__KERNEL_DS,%eax + movl $__BOOT_DS,%eax movl %eax,%ds /* Load new GDT with the 64bit segments using 32bit descriptor */ @@ -184,7 +184,14 @@ startup_64: /* esi is pointer to real mode structure with interesting info. pass it to C */ movl %esi, %edi - + + /* Switch to __KERNEL_CS. The segment is the same, but the selector + * is different. */ + pushq $__KERNEL_CS + pushq $switch_cs + lretq +switch_cs: + /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address , this is only possible as indirect @@ -246,7 +253,7 @@ pGDT32: .org 0xf10 ljumpvector: .long startup_64-__START_KERNEL_map - .word __KERNEL_CS + .word __BOOT_CS ENTRY(stext) ENTRY(_stext) @@ -357,21 +364,30 @@ gdt: .align PAGE_SIZE /* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ + Hopefully nobody expects them at a fixed place (Wine?) + Descriptors rearranged to place 32bit and TLS selectors in the same + places, because it is really necessary: sysret/exit mandates the order + of kernel/user cs/ds, so we have to extend the gdt.
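[Editor's aside, not part of the patch: the "sysret/exit mandates the order" remark above refers to hardware behavior — syscall/sysret derive CS and SS from MSR_STAR instead of loading them from the GDT, so the descriptors must sit at fixed relative offsets. A standalone C sketch of that constraint; the slot numbers assume the rearranged GDT listed just below:]

#include <stdio.h>

/* syscall : CS = STAR[47:32],      SS = STAR[47:32] + 8
 * sysret32: CS = STAR[63:48],      SS = STAR[63:48] + 8
 * sysret64: CS = STAR[63:48] + 16, SS = STAR[63:48] + 8
 * Hence kernel CS/DS must be adjacent, and user32-CS / user-DS /
 * user64-CS must be three consecutive GDT entries, in that order. */
int main(void)
{
	unsigned kernel_cs = 12 << 3;        /* __KERNEL_CS, slot 12 */
	unsigned user32_cs = (14 << 3) | 3;  /* __USER32_CS, slot 14, RPL 3 */

	printf("syscall : CS=0x%02x SS=0x%02x\n", kernel_cs, kernel_cs + 8);
	printf("sysret32: CS=0x%02x SS=0x%02x\n", user32_cs, user32_cs + 8);
	printf("sysret64: CS=0x%02x SS=0x%02x\n", user32_cs + 16, user32_cs + 8);
	return 0;
}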
+*/ ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ - .quad 0,0 /* TSS */ - .quad 0,0 /* LDT */ - .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000000000000000 /* 0 NULL descriptor */ + .quad 0x0 /* 1 unused */ + .quad 0x00af9a000000ffff /* 2 __BOOT_CS */ + .quad 0x00cf92000000ffff /* 3 __BOOT_DS */ + .quad 0,0 /* 4,5 TSS */ + .quad 0,0,0 /* 6-8 three TLS descriptors */ + .quad 0,0 /* 9,10 LDT */ + .quad 0x00cf9a000000ffff /* 11 __KERNEL32_CS */ + .quad 0x00af9a000000ffff /* 12 __KERNEL_CS */ + .quad 0x00cf92000000ffff /* 13 __KERNEL_DS */ + .quad 0x00cffa000000ffff /* 14 __USER32_CS */ + .quad 0x00cff2000000ffff /* 15 __USER_DS, __USER32_DS */ + .quad 0x00affa000000ffff /* 16 __USER_CS */ + .quad 0x0 /* 17 unused */ + .quad 0,0,0,0,0,0 + .quad 0,0,0,0,0,0,0,0 + gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff -uprN linux-2.6.18/arch/x86_64/kernel/init_task.c linux-2.6.18.ovz/arch/x86_64/kernel/init_task.c --- linux-2.6.18/arch/x86_64/kernel/init_task.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/init_task.c 2007-06-13 06:55:05.000000000 -0400 @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ static struct files_struct init_files = static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); EXPORT_SYMBOL(init_mm); diff -uprN linux-2.6.18/arch/x86_64/kernel/ldt.c linux-2.6.18.ovz/arch/x86_64/kernel/ldt.c --- linux-2.6.18/arch/x86_64/kernel/ldt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/ldt.c 2007-06-13 06:55:05.000000000 -0400 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,8 @@ #include #include +#include + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) { @@ -42,9 +45,9 @@ static int alloc_ldt(mm_context_t *pc, u oldsize = pc->size; mincount = (mincount+511)&(~511); if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); if (!newldt) return -ENOMEM; @@ -109,6 +112,7 @@ int init_new_context(struct task_struct } return retval; } +EXPORT_SYMBOL_GPL(init_new_context); /* * diff -uprN linux-2.6.18/arch/x86_64/kernel/nmi.c linux-2.6.18.ovz/arch/x86_64/kernel/nmi.c --- linux-2.6.18/arch/x86_64/kernel/nmi.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/nmi.c 2007-06-13 06:55:05.000000000 -0400 @@ -589,6 +589,7 @@ static __kprobes int dummy_nmi_callback( } static nmi_callback_t nmi_callback = dummy_nmi_callback; +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { @@ -598,9 +599,21 @@ asmlinkage __kprobes void do_nmi(struct add_pda(__nmi_count,1); if (!rcu_dereference(nmi_callback)(regs, cpu)) 
default_do_nmi(regs); + + nmi_ipi_callback(regs, cpu); nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + void set_nmi_callback(nmi_callback_t callback) { vmalloc_sync_all(); diff -uprN linux-2.6.18/arch/x86_64/kernel/pci-calgary.c linux-2.6.18.ovz/arch/x86_64/kernel/pci-calgary.c --- linux-2.6.18/arch/x86_64/kernel/pci-calgary.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/pci-calgary.c 2007-06-13 06:55:05.000000000 -0400 @@ -759,7 +759,16 @@ static inline unsigned int __init locate int rionodeid; u32 address; - rionodeid = (dev->bus->number % 15 > 4) ? 3 : 2; + /* + * Each Calgary has four busses. The first four busses (first Calgary) + * have RIO node ID 2, then the next four (second Calgary) have RIO + * node ID 3, the next four (third Calgary) have node ID 2 again, etc. + * We use a gross hack - relying on the dev->bus->number ordering, + * modulo 14 - to decide which Calgary a given bus is on. Busses 0, 1, + * 2 and 4 are on the first Calgary (id 2), 6, 8, a and c are on the + * second (id 3), and then it repeats modulo 14. + */ + rionodeid = (dev->bus->number % 14 > 4) ? 3 : 2; /* * register space address calculation as follows: * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase) @@ -767,7 +776,7 @@ static inline unsigned int __init locate * RioNodeId is 2 for first Calgary, 3 for second Calgary */ address = START_ADDRESS - - (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) + + (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 14)) + (0x100000) * (rionodeid - CHASSIS_BASE); return address; } diff -uprN linux-2.6.18/arch/x86_64/kernel/process.c linux-2.6.18.ovz/arch/x86_64/kernel/process.c --- linux-2.6.18/arch/x86_64/kernel/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/process.c 2007-06-13 06:55:05.000000000 -0400 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,8 @@ #include #include -asmlinkage extern void ret_from_fork(void); +asmlinkage extern void execve(void); +EXPORT_SYMBOL_GPL(execve); unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; @@ -291,11 +293,12 @@ void __show_regs(struct pt_regs * regs) print_modules(); printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), - system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); + if (decode_call_traces) + printk_address(regs->rip); printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", @@ -332,11 +335,26 @@ void __show_regs(struct pt_regs * regs) void show_regs(struct pt_regs *regs) { - printk("CPU %d:", smp_processor_id()); + printk("CPU %d, VCPU %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current)); __show_regs(regs); show_trace(NULL, regs, (void *)(regs + 1)); } +void smp_show_regs(struct pt_regs *regs, void *data) +{ + static DEFINE_SPINLOCK(show_regs_lock); + + if (regs == NULL) + return; + + bust_spinlocks(1); + spin_lock(&show_regs_lock); + printk("----------- IPI show regs -----------\n"); + show_regs(regs); + spin_unlock(&show_regs_lock); + bust_spinlocks(0); +} + /* * Free current 
thread data structures etc.. */ @@ -571,6 +589,9 @@ __switch_to(struct task_struct *prev_p, prev->gsindex = gsindex; } + /* Must be after DS reload */ + unlazy_fpu(prev_p); + /* * Switch the PDA and FPU contexts. */ @@ -578,10 +599,6 @@ __switch_to(struct task_struct *prev_p, write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); - /* This must be here to ensure both math_state_restore() and - kernel_fpu_begin() work consistently. - And the AMD workaround requires it to be after DS reload. */ - unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -838,3 +855,20 @@ unsigned long arch_align_stack(unsigned sp -= get_random_int() % 8192; return sp & ~0xf; } + +long do_fork_kthread(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + if (ve_allow_kthreads || ve_is_super(get_exec_env())) + return do_fork(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr); + + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside VE\n"); + dump_stack(); + return -EPERM; +} diff -uprN linux-2.6.18/arch/x86_64/kernel/ptrace.c linux-2.6.18.ovz/arch/x86_64/kernel/ptrace.c --- linux-2.6.18/arch/x86_64/kernel/ptrace.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/ptrace.c 2007-06-13 06:55:05.000000000 -0400 @@ -295,6 +295,15 @@ static unsigned long getreg(struct task_ return child->thread.fs; case offsetof(struct user_regs_struct, gs_base): return child->thread.gs; + case offsetof(struct user_regs_struct, cs): + if (test_tsk_thread_flag(child, TIF_SYSCALL_TRACE)) { + val = get_stack_long(child, regno - sizeof(struct pt_regs)); + if (val == __USER_CS) + return 0x33; + if (val == __USER32_CS) + return 0x23; + } + /* fall through */ default: regno = regno - sizeof(struct pt_regs); val = get_stack_long(child, regno); @@ -576,8 +585,10 @@ static void syscall_trace(struct pt_regs current_thread_info()->flags, current->ptrace); #endif + set_pn_state(current, (regs->rax != -ENOSYS) ? PN_STOP_LEAVE : PN_STOP_ENTRY); ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); + clear_pn_state(current); /* * this isn't the same as continuing with a signal, but it will do * for normal use. 
strace only continues with a signal if the diff -uprN linux-2.6.18/arch/x86_64/kernel/setup.c linux-2.6.18.ovz/arch/x86_64/kernel/setup.c --- linux-2.6.18/arch/x86_64/kernel/setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/setup.c 2007-06-13 06:55:05.000000000 -0400 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1010,7 +1011,10 @@ static void __cpuinit init_intel(struct if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + if (c->x86 == 15) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); @@ -1257,7 +1261,7 @@ static int show_cpuinfo(struct seq_file #ifdef CONFIG_SMP - if (!cpu_online(c-cpu_data)) + if (!vcpu_online(c - cpu_data)) return 0; #endif @@ -1278,9 +1282,13 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "stepping\t: unknown\n"); if (cpu_has(c,X86_FEATURE_TSC)) { +#ifndef CONFIG_FAIRSCHED unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); if (!freq) freq = cpu_khz; +#else + unsigned int freq = (unsigned int)ve_scale_khz(cpu_khz); +#endif seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } diff -uprN linux-2.6.18/arch/x86_64/kernel/setup64.c linux-2.6.18.ovz/arch/x86_64/kernel/setup64.c --- linux-2.6.18/arch/x86_64/kernel/setup64.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/setup64.c 2007-06-13 06:55:05.000000000 -0400 @@ -178,6 +178,8 @@ void __cpuinit check_efer(void) } } +unsigned long kernel_eflags; + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -290,4 +292,8 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 7); fpu_init(); + + raw_local_save_flags(kernel_eflags); } + +EXPORT_SYMBOL_GPL(cpu_gdt_descr); diff -uprN linux-2.6.18/arch/x86_64/kernel/signal.c linux-2.6.18.ovz/arch/x86_64/kernel/signal.c --- linux-2.6.18/arch/x86_64/kernel/signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/signal.c 2007-06-13 06:55:05.000000000 -0400 @@ -38,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_s sigset_t *set, struct pt_regs * regs); asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { @@ -341,11 +310,11 @@ static int setup_rt_frame(int sig, struc current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } /* @@ -408,7 +377,7 @@ handle_signal(unsigned long sig, siginfo #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret) { + if (ret == 0) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -425,9 +394,10 @@ handle_signal(unsigned long sig, siginfo * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; + sigset_t *oldset; siginfo_t info; int signr; @@ -438,9 +408,14 @@ int do_signal(struct pt_regs *regs, sigs * if so. */ if (!user_mode(regs)) - return 1; + return; - if (!oldset) + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else oldset = ¤t->blocked; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -454,9 +429,18 @@ int do_signal(struct pt_regs *regs, sigs set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + clear_thread_flag(TIF_RESTORE_SIGMASK); + } + return; } +no_signal: /* Did we come from a system call? 
*/ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ @@ -474,10 +458,16 @@ int do_signal(struct pt_regs *regs, sigs regs->rip -= 2; } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + * back */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +void do_notify_resume(struct pt_regs *regs, sigset_t *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", @@ -491,8 +481,8 @@ void do_notify_resume(struct pt_regs *re } /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); + if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) + do_signal(regs); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff -uprN linux-2.6.18/arch/x86_64/kernel/smp.c linux-2.6.18.ovz/arch/x86_64/kernel/smp.c --- linux-2.6.18/arch/x86_64/kernel/smp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/smp.c 2007-06-13 06:55:05.000000000 -0400 @@ -28,6 +28,7 @@ #include #include #include +#include /* * Smarter SMP flushing macros. @@ -448,6 +449,84 @@ int smp_call_function (void (*func) (voi } EXPORT_SYMBOL(smp_call_function); +static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED; +static struct nmi_call_data_struct { + smp_nmi_function func; + void *info; + atomic_t started; + atomic_t finished; + cpumask_t cpus_called; + int wait; +} *nmi_call_data; + +static int smp_nmi_callback(struct pt_regs * regs, int cpu) +{ + smp_nmi_function func; + void *info; + int wait; + + func = nmi_call_data->func; + info = nmi_call_data->info; + wait = nmi_call_data->wait; + ack_APIC_irq(); + /* prevent from calling func() multiple times */ + if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) + return 0; + /* + * notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&nmi_call_data->started); + /* at this point the nmi_call_data structure is out of scope */ + irq_enter(); + func(regs, info); + irq_exit(); + if (wait) + atomic_inc(&nmi_call_data->finished); + + return 0; +} + +int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + struct nmi_call_data_struct data; + int cpus; + + cpus = num_online_cpus() - 1; + if (!cpus) + return 0; + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.started, 0); + atomic_set(&data.finished, 0); + cpus_clear(data.cpus_called); + /* prevent this cpu from calling func if NMI happens */ + cpu_set(smp_processor_id(), data.cpus_called); + + if (!spin_trylock(&nmi_call_lock)) + return -1; + + nmi_call_data = &data; + set_nmi_ipi_callback(smp_nmi_callback); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(APIC_DM_NMI); + while (atomic_read(&data.started) != cpus) + barrier(); + + unset_nmi_ipi_callback(); + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&nmi_call_lock); + + return 0; +} + void smp_stop_cpu(void) { unsigned long flags; diff -uprN linux-2.6.18/arch/x86_64/kernel/sys_x86_64.c linux-2.6.18.ovz/arch/x86_64/kernel/sys_x86_64.c --- linux-2.6.18/arch/x86_64/kernel/sys_x86_64.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/sys_x86_64.c 
2007-06-13 06:55:05.000000000 -0400 @@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts { int err; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); diff -uprN linux-2.6.18/arch/x86_64/kernel/time.c linux-2.6.18.ovz/arch/x86_64/kernel/time.c --- linux-2.6.18/arch/x86_64/kernel/time.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/time.c 2007-06-13 06:55:05.000000000 -0400 @@ -960,7 +960,7 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000) return 1; #endif return 0; diff -uprN linux-2.6.18/arch/x86_64/kernel/traps.c linux-2.6.18.ovz/arch/x86_64/kernel/traps.c --- linux-2.6.18/arch/x86_64/kernel/traps.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/kernel/traps.c 2007-06-13 06:55:05.000000000 -0400 @@ -123,6 +123,11 @@ void printk_address(unsigned long addres char *delim = ":"; char namebuf[128]; + if (!decode_call_traces) { + printk("[<%016lx>]", address); + return; + } + symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); if (!symname) { @@ -399,7 +404,7 @@ static void _show_stack(struct task_stru if (((long) stack & (THREAD_SIZE-1)) == 0) break; } - if (i && ((i % 4) == 0)) + if (i && ((i % 4) == 0) && decode_call_traces) printk("\n"); printk(" %016lx", *stack++); touch_nmi_watchdog(); @@ -433,10 +438,12 @@ void show_registers(struct pt_regs *regs rsp = regs->rsp; - printk("CPU %d ", cpu); + printk("CPU: %d ", cpu); __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, + VEID(VE_TASK_INFO(current)->owner_env), + task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -584,6 +591,7 @@ void __kprobes die_nmi(char *str, struct crash_kexec(regs); if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); + smp_nmi_call_function(smp_show_regs, NULL, 1); printk("console shuts up ...\n"); oops_end(flags); nmi_exit(); diff -uprN linux-2.6.18/arch/x86_64/lib/Makefile linux-2.6.18.ovz/arch/x86_64/lib/Makefile --- linux-2.6.18/arch/x86_64/lib/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/lib/Makefile 2007-06-13 06:55:05.000000000 -0400 @@ -5,6 +5,7 @@ CFLAGS_csum-partial.o := -funroll-loops obj-y := io.o iomap_copy.o +obj-$(CONFIG_SMP) += msr-on-cpu.o cpuid-on-cpu.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ diff -uprN linux-2.6.18/arch/x86_64/lib/cpuid-on-cpu.c linux-2.6.18.ovz/arch/x86_64/lib/cpuid-on-cpu.c --- linux-2.6.18/arch/x86_64/lib/cpuid-on-cpu.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/arch/x86_64/lib/cpuid-on-cpu.c 2007-06-13 06:55:05.000000000 -0400 @@ -0,0 +1 @@ +#include "../../i386/lib/cpuid-on-cpu.c" diff -uprN linux-2.6.18/arch/x86_64/lib/msr-on-cpu.c linux-2.6.18.ovz/arch/x86_64/lib/msr-on-cpu.c --- linux-2.6.18/arch/x86_64/lib/msr-on-cpu.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/arch/x86_64/lib/msr-on-cpu.c 2007-06-13 06:55:05.000000000 -0400 @@ 
-0,0 +1 @@ +#include "../../i386/lib/msr-on-cpu.c" diff -uprN linux-2.6.18/arch/x86_64/mm/fault.c linux-2.6.18.ovz/arch/x86_64/mm/fault.c --- linux-2.6.18/arch/x86_64/mm/fault.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/mm/fault.c 2007-06-13 06:55:05.000000000 -0400 @@ -75,27 +75,6 @@ static inline int notify_page_fault(enum } #endif -void bust_spinlocks(int yes) -{ - int loglevel_save = console_loglevel; - if (yes) { - oops_in_progress = 1; - } else { -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; - } -} - /* Sometimes the CPU reports invalid exceptions on prefetch. Check that here and ignore. Opcode checker based on code by Richard Brunner */ @@ -329,7 +308,7 @@ static int vmalloc_fault(unsigned long a } int page_fault_trace = 0; -int exception_trace = 1; +int exception_trace = 0; /* * This routine handles page faults. It determines the address, @@ -400,7 +379,7 @@ asmlinkage void __kprobes do_page_fault( local_irq_enable(); if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + ve_printk(VE_LOG, "pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); if (unlikely(error_code & PF_RSVD)) @@ -413,7 +392,6 @@ asmlinkage void __kprobes do_page_fault( if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; - again: /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunatly, in the case of an @@ -519,7 +497,7 @@ bad_area_nosemaphore: return; if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { - printk( + ve_printk(VE_LOG, "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, tsk->pid, address, regs->rip, @@ -569,7 +547,8 @@ no_context: else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); + if (decode_call_traces) + printk_address(regs->rip); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -586,13 +565,14 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); + if (error_code & 4) { + /* + * A 0-order allocation always succeeds unless something really + * fatal has happened: a beancounter overdraft or OOM. + */ + force_sig(SIGKILL, tsk); + return; + } goto no_context; do_sigbus: diff -uprN linux-2.6.18/arch/x86_64/mm/init.c linux-2.6.18.ovz/arch/x86_64/mm/init.c --- linux-2.6.18/arch/x86_64/mm/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/x86_64/mm/init.c 2007-06-13 06:55:05.000000000 -0400 @@ -87,6 +87,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages shared\n",shared); printk(KERN_INFO "%lu pages swap cached\n",cached); } +EXPORT_SYMBOL(show_mem); int after_bootmem; diff -uprN linux-2.6.18/arch/xtensa/kernel/syscalls.c linux-2.6.18.ovz/arch/xtensa/kernel/syscalls.c --- linux-2.6.18/arch/xtensa/kernel/syscalls.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/arch/xtensa/kernel/syscalls.c 2007-06-13 06:55:05.000000000 -0400 @@ -128,7 +128,7 @@ out: int sys_uname(struct old_utsname * name) { - if (name && !copy_to_user(name, &system_utsname, sizeof (*name))) + if (name && !copy_to_user(name, utsname(), sizeof (*name))) return 0; return -EFAULT; } diff -uprN linux-2.6.18/block/cfq-iosched.c linux-2.6.18.ovz/block/cfq-iosched.c --- linux-2.6.18/block/cfq-iosched.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/block/cfq-iosched.c 2007-06-13 06:55:05.000000000 -0400 @@ -12,6 +12,11 @@ #include #include #include +#include +#include +#include +#include +#include /* * tunables @@ -26,6 +31,7 @@ static const int cfq_slice_sync = HZ / 1 static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_ub_slice = HZ / 2; #define CFQ_IDLE_GRACE (HZ / 10) #define CFQ_SLICE_SCALE (5) @@ -63,13 +69,11 @@ static DEFINE_SPINLOCK(cfq_exit_lock); #define rq_rb_key(rq) (rq)->sector static kmem_cache_t *crq_pool; -static kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; static atomic_t ioc_count = ATOMIC_INIT(0); static struct completion *ioc_gone; -#define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -87,118 +91,6 @@ static struct completion *ioc_gone; #define sample_valid(samples) ((samples) > 80) -/* - * Per block device queue structure - */ -struct cfq_data { - request_queue_t *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct list_head rr_list[CFQ_PRIO_LISTS]; - struct list_head busy_rr; - struct list_head cur_rr; - struct list_head idle_rr; - unsigned int busy_queues; - - /* - * non-ordered list of empty cfqq's - */ - struct list_head empty_list; - - /* - * cfqq lookup hash - */ - struct hlist_head *cfq_hash; - - /* - * global crq hash for all queues - */ - struct hlist_head *crq_hash; - - mempool_t *crq_pool; - - int rq_in_driver; - int hw_tag; - - /* - * schedule slice state info - */ - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; - struct cfq_io_context *active_cic; - int cur_prio, cur_end_prio; - unsigned int dispatch_slice; - - struct timer_list idle_class_timer; - - sector_t last_sector; - unsigned long last_end_request; - - unsigned int rq_starved; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_queued; - unsigned int cfq_fifo_expire[2]; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; - unsigned int cfq_slice[2]; - unsigned int cfq_slice_async_rq; - unsigned int cfq_slice_idle; - - struct list_head
cic_list; -}; - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - atomic_t ref; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* cfqq lookup hash */ - struct hlist_node cfq_hash; - /* hash key */ - unsigned int key; - /* on either rr or empty list of cfqd */ - struct list_head cfq_list; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct cfq_rq *next_crq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - unsigned long slice_start; - unsigned long slice_end; - unsigned long slice_left; - unsigned long service_last; - - /* number of requests that are on the dispatch list */ - int on_dispatch[2]; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - /* various state flags, see below */ - unsigned int flags; -}; - struct cfq_rq { struct rb_node rb_node; sector_t rb_key; @@ -269,7 +161,70 @@ CFQ_CRQ_FNS(is_sync); static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, + struct task_struct *tsk, struct ub_iopriv *iopriv, gfp_t gfp_mask); +static void cfq_put_queue(struct cfq_queue *cfqq); + +#ifdef CONFIG_UBC_IO_PRIO +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, pid_t key) +{ + if (key != CFQ_KEY_ASYNC) + return cfqd->virt_mode ? &get_io_ub()->iopriv : &get_ub0()->iopriv; + else + return cfqd->write_virt_mode ? &get_io_ub()->iopriv : &get_ub0()->iopriv; +} + +static inline void cfq_put_async_queue(struct cfq_data *cfqd) +{ + struct user_beancounter *ub; + struct cfq_bc_data *cfq_bc; + int i; + + rcu_read_lock(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + for (i = 0; i < CFQ_PRIO_LISTS; i++) + if (cfq_bc->async_cfqq[i]) { + cfq_put_queue(cfq_bc->async_cfqq[i]); + cfq_bc->async_cfqq[i] = NULL; + } + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + rcu_read_unlock(); +} +#else +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, pid_t key) +{ + return NULL; +} + +static inline void cfq_put_async_queue(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + int i; + + cfq_bc = &cfqd->cfq_bc; + for (i = 0; i < CFQ_PRIO_LISTS; i++) + if (cfq_bc->async_cfqq[i]) { + cfq_put_queue(cfq_bc->async_cfqq[i]); + cfq_bc->async_cfqq[i] = NULL; + } +} +#endif + +static inline struct user_beancounter *ub_by_iopriv(struct ub_iopriv *iopriv) +{ +#ifdef CONFIG_UBC_IO_PRIO + return container_of(iopriv, struct user_beancounter, iopriv); +#else + return NULL; +#endif +} /* * lots of deadline iosched dupes, can be abstracted later... 
@@ -324,9 +279,12 @@ static int cfq_queue_empty(request_queue return !cfqd->busy_queues; } -static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) +static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync) { - if (rw == READ || rw == WRITE_SYNC) + /* + * Use the per-process queue for read requests and synchronous writes + */ + if (!(rw & REQ_RW) || is_sync) return task->pid; return CFQ_KEY_ASYNC; @@ -459,17 +417,19 @@ static void cfq_update_next_crq(struct c static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { - struct cfq_data *cfqd = cfqq->cfqd; struct list_head *list, *entry; + struct cfq_bc_data *cfq_bc; BUG_ON(!cfq_cfqq_on_rr(cfqq)); list_del(&cfqq->cfq_list); + cfq_bc = cfqq->cfq_bc; + if (cfq_class_rt(cfqq)) - list = &cfqd->cur_rr; + list = &cfq_bc->cur_rr; else if (cfq_class_idle(cfqq)) - list = &cfqd->idle_rr; + list = &cfq_bc->idle_rr; else { /* * if cfqq has requests in flight, don't allow it to be @@ -479,16 +439,16 @@ static void cfq_resort_rr_list(struct cf * sporadically or synchronously */ if (cfq_cfqq_dispatched(cfqq)) - list = &cfqd->busy_rr; + list = &cfq_bc->busy_rr; else - list = &cfqd->rr_list[cfqq->ioprio]; + list = &cfq_bc->rr_list[cfqq->ioprio]; } /* * if queue was preempted, just add to front to be fair. busy_rr * isn't sorted, but insert at the back for fairness. */ - if (preempted || list == &cfqd->busy_rr) { + if (preempted || list == &cfq_bc->busy_rr) { if (preempted) list = list->prev; @@ -522,6 +482,7 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, s BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + bc_inc_rqnum(cfqq); cfq_resort_rr_list(cfqq, 0); } @@ -535,6 +496,7 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, s BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + bc_dec_rqnum(cfqq); } /* @@ -621,7 +583,7 @@ static struct request * cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) { struct task_struct *tsk = current; - pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); + pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio)); struct cfq_queue *cfqq; struct rb_node *n; sector_t sector; @@ -737,6 +699,34 @@ cfq_merged_requests(request_queue_t *q, cfq_remove_request(next); } +static int cfq_allow_merge(request_queue_t *q, struct request *rq, + struct bio *bio) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + const int rw = bio_data_dir(bio); + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq; + pid_t key; + + /* + * Disallow merge of a sync bio into an async request. + */ + if ((bio_data_dir(bio) == READ || bio_sync(bio)) + && !cfq_crq_is_sync(crq)) + return 0; + + /* + * Lookup the cfqq that this bio will be queued with. Allow + * merge only if rq is queued there.
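[Editor's aside, not part of the patch: the reworked cfq_queue_pid() above is the heart of the sync/async split — reads and synchronous writes keep a per-process queue, while plain writes collapse onto one shared async key (which the patch then resolves per beancounter). A standalone C sketch of the rule; REQ_RW and CFQ_KEY_ASYNC are stand-in values, not the kernel's:]

#include <stdio.h>

#define REQ_RW        1
#define CFQ_KEY_ASYNC (-1)

/* Read, or a write marked synchronous -> per-process key; else async. */
static int cfq_queue_key(int pid, int rw, int is_sync)
{
	if (!(rw & REQ_RW) || is_sync)
		return pid;
	return CFQ_KEY_ASYNC;
}

int main(void)
{
	printf("read        -> %d\n", cfq_queue_key(42, 0, 0));       /* 42 */
	printf("sync write  -> %d\n", cfq_queue_key(42, REQ_RW, 1));  /* 42 */
	printf("async write -> %d\n", cfq_queue_key(42, REQ_RW, 0));  /* -1 */
	return 0;
}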
+ */ + key = cfq_queue_pid(current, rw, bio_sync(bio)); + cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio); + if (cfqq == crq->cfq_queue) + return 1; + + return 0; +} + static inline void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { @@ -820,14 +810,19 @@ static inline void cfq_slice_expired(str static int cfq_get_next_prio_level(struct cfq_data *cfqd) { int prio, wrap; + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) + return -1; prio = -1; wrap = 0; do { int p; - for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { - if (!list_empty(&cfqd->rr_list[p])) { + for (p = cfq_bc->cur_prio; p <= cfq_bc->cur_end_prio; p++) { + if (!list_empty(&cfq_bc->rr_list[p])) { prio = p; break; } @@ -835,9 +830,9 @@ static int cfq_get_next_prio_level(struc if (prio != -1) break; - cfqd->cur_prio = 0; - if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { - cfqd->cur_end_prio = 0; + cfq_bc->cur_prio = 0; + if (++cfq_bc->cur_end_prio == CFQ_PRIO_LISTS) { + cfq_bc->cur_end_prio = 0; if (wrap) break; wrap = 1; @@ -849,16 +844,16 @@ static int cfq_get_next_prio_level(struc BUG_ON(prio >= CFQ_PRIO_LISTS); - list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); + list_splice_init(&cfq_bc->rr_list[prio], &cfq_bc->cur_rr); - cfqd->cur_prio = prio + 1; - if (cfqd->cur_prio > cfqd->cur_end_prio) { - cfqd->cur_end_prio = cfqd->cur_prio; - cfqd->cur_prio = 0; - } - if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { - cfqd->cur_prio = 0; - cfqd->cur_end_prio = 0; + cfq_bc->cur_prio = prio + 1; + if (cfq_bc->cur_prio > cfq_bc->cur_end_prio) { + cfq_bc->cur_end_prio = cfq_bc->cur_prio; + cfq_bc->cur_prio = 0; + } + if (cfq_bc->cur_end_prio == CFQ_PRIO_LISTS) { + cfq_bc->cur_prio = 0; + cfq_bc->cur_end_prio = 0; } return prio; @@ -867,35 +862,44 @@ static int cfq_get_next_prio_level(struc static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq = NULL; + struct cfq_bc_data *cfq_bc; + + bc_schedule_active(cfqd); + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) + goto out; /* * if current list is non-empty, grab first entry. if it is empty, * get next prio level and grab first entry then if any are spliced */ - if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) - cfqq = list_entry_cfqq(cfqd->cur_rr.next); + if (!list_empty(&cfq_bc->cur_rr) + || cfq_get_next_prio_level(cfqd) != -1) + cfqq = list_entry_cfqq(cfq_bc->cur_rr.next); /* * If no new queues are available, check if the busy list has some * before falling back to idle io. 
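[Editor's aside, not part of the patch: cfq_set_active_queue() around this point picks the next queue in a fixed order — the current round-robin list (refilled by cfq_get_next_prio_level()), then queues with requests already in flight, and idle-class queues only once the CFQ_IDLE_GRACE period has passed; otherwise the idle-class timer is armed. A compact standalone C sketch of that fallback order:]

#include <stdio.h>

/* Arguments are 1 if the corresponding list is non-empty; returns
 * which list supplies the next active queue. */
static const char *pick(int cur_rr, int busy_rr, int idle_rr, int grace_over)
{
	if (cur_rr)
		return "cur_rr";
	if (busy_rr)
		return "busy_rr";
	if (idle_rr && grace_over)
		return "idle_rr";
	return "none (idle-class timer armed)";
}

int main(void)
{
	printf("%s\n", pick(1, 1, 1, 1)); /* cur_rr  */
	printf("%s\n", pick(0, 1, 1, 1)); /* busy_rr */
	printf("%s\n", pick(0, 0, 1, 0)); /* none (idle-class timer armed) */
	return 0;
}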
*/ - if (!cfqq && !list_empty(&cfqd->busy_rr)) - cfqq = list_entry_cfqq(cfqd->busy_rr.next); + if (!cfqq && !list_empty(&cfq_bc->busy_rr)) + cfqq = list_entry_cfqq(cfq_bc->busy_rr.next); /* * if we have idle queues and no rt or be queues had pending * requests, either allow immediate service if the grace period * has passed or arm the idle grace timer */ - if (!cfqq && !list_empty(&cfqd->idle_rr)) { + if (!cfqq && !list_empty(&cfq_bc->idle_rr)) { unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; if (time_after_eq(jiffies, end)) - cfqq = list_entry_cfqq(cfqd->idle_rr.next); + cfqq = list_entry_cfqq(cfq_bc->idle_rr.next); else mod_timer(&cfqd->idle_class_timer, end); } +out: __cfq_set_active_queue(cfqd, cfqq); return cfqq; } @@ -948,9 +952,9 @@ static void cfq_dispatch_insert(request_ struct cfq_queue *cfqq = crq->cfq_queue; struct request *rq; - cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); cfq_remove_request(crq->request); cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; + cfqq->cfq_bc->on_dispatch++; elv_dispatch_sort(q, crq->request); rq = list_entry(q->queue_head.prev, struct request, queuelist); @@ -1029,7 +1033,8 @@ static struct cfq_queue *cfq_select_queu /* * slice has expired */ - if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end)) + if (!cfq_cfqq_must_dispatch(cfqq) && + (time_after(now, cfqq->slice_end) || bc_expired(cfqd))) goto expire; /* @@ -1128,7 +1133,7 @@ cfq_forced_dispatch_cfqqs(struct list_he } static int -cfq_forced_dispatch(struct cfq_data *cfqd) +__cfq_forced_dispatch(struct cfq_bc_data *cfqd) { int i, dispatched = 0; @@ -1139,6 +1144,26 @@ cfq_forced_dispatch(struct cfq_data *cfq dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); + return dispatched; +} + +static int +cfq_forced_dispatch(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + int dispatched; + + dispatched = 0; + /* + * We use here _safe iterating, because + * __cfq_forced_dispatch() produces list_del() implicitly + */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &cfqd->act_cfq_bc_head, act_cfq_bc_list) { + dispatched += __cfq_forced_dispatch(cfq_bc); + } + cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1225,7 +1250,7 @@ static void cfq_put_queue(struct cfq_que static inline struct cfq_queue * __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, - const int hashval) + const int hashval, struct ub_iopriv *iopriv) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; struct hlist_node *entry; @@ -1234,8 +1259,13 @@ __cfq_find_cfq_hash(struct cfq_data *cfq hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) { const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio); - if (__cfqq->key == key && (__p == prio || !prio)) - return __cfqq; + if (__cfqq->key == key && (__p == prio || !prio)) { + if (key != CFQ_KEY_ASYNC || !iopriv) + return __cfqq; + /* async queue => compare owner beancounter */ + if (__cfqq->cfq_bc->ub_iopriv == iopriv) + return __cfqq; + } } return NULL; @@ -1244,7 +1274,11 @@ __cfq_find_cfq_hash(struct cfq_data *cfq static struct cfq_queue * cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) { - return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); + struct ub_iopriv *iopriv; + + iopriv = cfqq_ub_iopriv(cfqd, key); + return __cfq_find_cfq_hash(cfqd, key, prio, + hash_long(key, CFQ_QHASH_SHIFT), iopriv); } static void 
cfq_free_io_context(struct io_context *ioc) @@ -1287,6 +1321,10 @@ static void cfq_exit_single_io_context(s spin_lock(q->queue_lock); + /* + * cic->cfqq[ASYNC] is always NULL and the put of async queues + * happens on appropriate bc death or device unplug + */ if (cic->cfqq[ASYNC]) { if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue)) __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0); @@ -1395,18 +1433,25 @@ static void cfq_init_prio_data(struct cf static inline void changed_ioprio(struct cfq_io_context *cic) { struct cfq_data *cfqd = cic->key; + struct ub_iopriv *iopriv; struct cfq_queue *cfqq; + unsigned long flags; if (unlikely(!cfqd)) return; - spin_lock(cfqd->queue->queue_lock); + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + /* + * cic->cfqq[ASYNC] is always NULL, ioprio change + * for async queues happens automatically + */ cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; + iopriv = cfqq_ub_iopriv(cfqd, CFQ_KEY_ASYNC); new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task, - GFP_ATOMIC); + iopriv, GFP_ATOMIC); if (new_cfqq) { cic->cfqq[ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -1417,7 +1462,7 @@ static inline void changed_ioprio(struct if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); - spin_unlock(cfqd->queue->queue_lock); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } /* @@ -1445,15 +1490,16 @@ static int cfq_ioc_set_ioprio(struct io_ static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, - gfp_t gfp_mask) + struct ub_iopriv *iopriv, gfp_t gfp_mask) { const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; unsigned short ioprio; + struct cfq_bc_data *cfq_bc = NULL; retry: ioprio = tsk->ioprio; - cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval, iopriv); if (!cfqq) { if (new_cfqq) { @@ -1462,16 +1508,31 @@ retry: } else if (gfp_mask & __GFP_WAIT) { spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + if (new_cfqq) { + cfq_bc = bc_findcreate_cfq_bc(iopriv, + cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, new_cfqq); + new_cfqq = NULL; + } + } spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else { cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); if (!cfqq) goto out; + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, cfqq); + cfqq = NULL; + goto out; + } } memset(cfqq, 0, sizeof(*cfqq)); + cfqq->cfq_bc = cfq_bc; INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); INIT_LIST_HEAD(&cfqq->fifo); @@ -1546,6 +1607,7 @@ cfq_cic_link(struct cfq_data *cfqd, stru struct rb_node **p; struct rb_node *parent; struct cfq_io_context *__cic; + unsigned long flags; void *k; cic->ioc = ioc; @@ -1573,11 +1635,11 @@ restart: BUG(); } - spin_lock(&cfq_exit_lock); + spin_lock_irqsave(&cfq_exit_lock, flags); rb_link_node(&cic->rb_node, parent, p); rb_insert_color(&cic->rb_node, &ioc->cic_root); list_add(&cic->queue_list, &cfqd->cic_list); - spin_unlock(&cfq_exit_lock); + spin_unlock_irqrestore(&cfq_exit_lock, flags); } /* @@ -1731,8 +1793,13 @@ cfq_should_preempt(struct cfq_data *cfqd static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { struct cfq_queue *__cfqq, *next; + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) + return; - list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) + list_for_each_entry_safe(__cfqq, next, 
&cfq_bc->cur_rr, cfq_list) cfq_resort_rr_list(__cfqq, 1); if (!cfqq->slice_left) @@ -1845,6 +1912,7 @@ static void cfq_completed_request(reques WARN_ON(!cfqq->on_dispatch[sync]); cfqd->rq_in_driver--; cfqq->on_dispatch[sync]--; + cfqq->cfq_bc->on_dispatch--; if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; @@ -1951,6 +2019,9 @@ static int cfq_may_queue(request_queue_t struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; struct cfq_queue *cfqq; + unsigned int key; + + key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC); /* * don't force setup of a queue from here, as a call to may_queue @@ -1958,7 +2029,7 @@ static int cfq_may_queue(request_queue_t * so just lookup a possibly existing queue, or return 'may queue' * if that fails */ - cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio); + cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); if (cfqq) { cfq_init_prio_data(cfqq); cfq_prio_boost(cfqq); @@ -2005,10 +2076,33 @@ static void cfq_put_request(request_queu rq->elevator_private = NULL; cfq_check_waiters(q, cfqq); + put_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv)); cfq_put_queue(cfqq); } } +static int cfq_get_async_cfqq_index(struct task_struct *tsk) +{ + int index; + int ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + + switch (ioprio_class) { + default: + printk(KERN_ERR "cfq: bad class %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + index = task_nice_ioprio(tsk); + break; + case IOPRIO_CLASS_RT: + case IOPRIO_CLASS_BE: + index = task_ioprio(tsk); + break; + case IOPRIO_CLASS_IDLE: + index = 7; + break; + } + return index; +} + /* * Allocate cfq data structures associated with this request. */ @@ -2020,29 +2114,51 @@ cfq_set_request(request_queue_t *q, stru struct task_struct *tsk = current; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); - pid_t key = cfq_queue_pid(tsk, rw); + pid_t key = cfq_queue_pid(tsk, rw, rq->flags & REQ_RW_SYNC); struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; int is_sync = key != CFQ_KEY_ASYNC; + struct ub_iopriv *iopriv; + struct cfq_bc_data *cfq_bc = NULL; + int cfqq_index; might_sleep_if(gfp_mask & __GFP_WAIT); cic = cfq_get_io_context(cfqd, gfp_mask); + iopriv = cfqq_ub_iopriv(cfqd, key); + if (!is_sync) + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); spin_lock_irqsave(q->queue_lock, flags); - if (!cic) + if (!cic || (!is_sync && cfq_bc == NULL)) goto queue_fail; - if (!cic->cfqq[is_sync]) { - cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask); - if (!cfqq) - goto queue_fail; - - cic->cfqq[is_sync] = cfqq; - } else - cfqq = cic->cfqq[is_sync]; + /* + * We store task's sync cfqq at IO context as usual, + * and async cfqqs are stored at cfq_bc_data + */ + if (is_sync) { + if (!cic->cfqq[is_sync]) { + cfqq = cfq_get_queue(cfqd, key, tsk, iopriv, gfp_mask); + if (!cfqq) + goto queue_fail; + + cic->cfqq[is_sync] = cfqq; + } else + cfqq = cic->cfqq[is_sync]; + } else { + cfqq_index = cfq_get_async_cfqq_index(tsk); + if (!cfq_bc->async_cfqq[cfqq_index]) { + cfqq = cfq_get_queue(cfqd, key, tsk, iopriv, gfp_mask); + if (!cfqq) + goto queue_fail; + + cfq_bc->async_cfqq[cfqq_index] = cfqq; + } else + cfqq = cfq_bc->async_cfqq[cfqq_index]; + } cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); @@ -2065,6 +2181,11 @@ cfq_set_request(request_queue_t *q, stru cfq_clear_crq_is_sync(crq); rq->elevator_private = crq; + /* + * We can't get iopriv here. Otherwise if prioritization + * was suddenly disabled we get a wrong beancounter. 
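[Editor's aside, not part of the patch: the get_beancounter() call just below, in cfq_set_request(), pairs with the put_beancounter() added to cfq_put_request() above — every in-flight request pins the beancounter that owns its cfq_bc data, so the beancounter cannot be freed while the request is queued. A simplified standalone C sketch of the pairing, not the kernel API:]

#include <stdio.h>

struct beancounter { int refs; };

static void get_bc(struct beancounter *bc) { bc->refs++; }
static void put_bc(struct beancounter *bc) { bc->refs--; }

int main(void)
{
	struct beancounter bc = { .refs = 1 };

	get_bc(&bc);                 /* cfq_set_request(): pin for the rq  */
	printf("in flight: refs=%d\n", bc.refs);
	put_bc(&bc);                 /* cfq_put_request(): unpin           */
	printf("completed: refs=%d\n", bc.refs);
	return 0;
}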
+ */
+ get_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv));
 return 0;
 }
@@ -2203,6 +2324,11 @@ static void cfq_exit_queue(elevator_t *e
 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
 struct cfq_io_context,
 queue_list);
+ /*
+ * putting async queues on all beancounters,
+ * ->cfqq[ASYNC] below always equals NULL
+ */
+ cfq_put_async_queue(cfqd);
 if (cic->cfqq[ASYNC]) {
 cfq_put_queue(cic->cfqq[ASYNC]);
 cic->cfqq[ASYNC] = NULL;
 }
@@ -2220,6 +2346,8 @@ static void cfq_exit_queue(elevator_t *e
 cfq_shutdown_timer_wq(cfqd);
+ bc_cfq_exit_queue(cfqd);
+
 mempool_destroy(cfqd->crq_pool);
 kfree(cfqd->crq_hash);
 kfree(cfqd->cfq_hash);
@@ -2237,12 +2365,15 @@ static void *cfq_init_queue(request_queu
 memset(cfqd, 0, sizeof(*cfqd));
- for (i = 0; i < CFQ_PRIO_LISTS; i++)
- INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
- INIT_LIST_HEAD(&cfqd->busy_rr);
- INIT_LIST_HEAD(&cfqd->cur_rr);
- INIT_LIST_HEAD(&cfqd->idle_rr);
+ INIT_LIST_HEAD(&cfqd->act_cfq_bc_head);
+#ifndef CONFIG_UBC_IO_PRIO
+ cfq_init_cfq_bc(&cfqd->cfq_bc);
+ /*
+ * Adding ub0 to the active list in order to serve the force-dispatch
+ * case uniformly. Note that nobody removes ub0 from this list.
+ */
+ list_add_tail(&cfqd->cfq_bc.act_cfq_bc_list, &cfqd->act_cfq_bc_head);
+#endif
 INIT_LIST_HEAD(&cfqd->empty_list);
 INIT_LIST_HEAD(&cfqd->cic_list);
@@ -2285,6 +2416,9 @@ static void *cfq_init_queue(request_queu
 cfqd->cfq_slice[1] = cfq_slice_sync;
 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_ub_slice = cfq_ub_slice;
+ cfqd->virt_mode = 1;
+ cfqd->write_virt_mode = 1;
 return cfqd;
 out_crqpool:
@@ -2367,6 +2501,9 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd-
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+SHOW_FUNCTION(cfq_ub_slice_show, cfqd->cfq_ub_slice, 1);
+SHOW_FUNCTION(cfq_virt_mode_show, cfqd->virt_mode, 0);
+SHOW_FUNCTION(cfq_write_virt_mode_show, cfqd->write_virt_mode, 0);
 #undef SHOW_FUNCTION
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2395,6 +2532,9 @@ STORE_FUNCTION(cfq_slice_idle_store, &cf
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_ub_slice_store, &cfqd->cfq_ub_slice, 1, UINT_MAX, 1);
+STORE_FUNCTION(cfq_virt_mode_store, &cfqd->virt_mode, 0, 1, 0);
+STORE_FUNCTION(cfq_write_virt_mode_store, &cfqd->write_virt_mode, 0, 1, 0);
 #undef STORE_FUNCTION
 #define CFQ_ATTR(name) \
@@ -2411,6 +2551,9 @@ static struct elv_fs_entry cfq_attrs[] =
 CFQ_ATTR(slice_async),
 CFQ_ATTR(slice_async_rq),
 CFQ_ATTR(slice_idle),
+ CFQ_ATTR(ub_slice),
+ CFQ_ATTR(virt_mode),
+ CFQ_ATTR(write_virt_mode),
 __ATTR_NULL
 };
@@ -2419,6 +2562,7 @@ static struct elevator_type iosched_cfq
 .elevator_merge_fn = cfq_merge,
 .elevator_merged_fn = cfq_merged_request,
 .elevator_merge_req_fn = cfq_merged_requests,
+ .elevator_allow_merge_fn = cfq_allow_merge,
 .elevator_dispatch_fn = cfq_dispatch_requests,
 .elevator_add_req_fn = cfq_insert_request,
 .elevator_activate_req_fn = cfq_activate_request,
@@ -2433,6 +2577,7 @@ static struct elevator_type iosched_cfq
 .elevator_init_fn = cfq_init_queue,
 .elevator_exit_fn = cfq_exit_queue,
 .trim = cfq_trim,
+ .put_queue = cfq_put_queue,
 },
 .elevator_attrs = cfq_attrs,
 .elevator_name = "cfq",
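[Editorial note] The cfq hunks above route plain async writes to per-beancounter queues: cfq_may_queue() and cfq_set_request() now derive the hash key from a three-argument cfq_queue_pid() that also receives the request's sync flag. The body of that helper is not part of this excerpt; as a hedged sketch only (shape inferred from the callers above and the 2.6.18 flag names, not confirmed by the patch), it would look roughly like:

/*
 * Illustrative sketch -- not part of the patch. Reads and sync writes
 * keep their per-task key, so each task owns its sync cfq_queue; plain
 * async writes collapse onto the shared CFQ_KEY_ASYNC key, which is what
 * lets the hunks above hang async queues off the beancounter instead of
 * the io_context.
 */
static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
{
	if (!(rw & REQ_RW) || is_sync)
		return task->pid;

	return CFQ_KEY_ASYNC;
}

diff -uprN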
linux-2.6.18/block/elevator.c linux-2.6.18.ovz/block/elevator.c --- linux-2.6.18/block/elevator.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/block/elevator.c 2007-06-13 06:55:05.000000000 -0400 @@ -39,6 +39,23 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); +kmem_cache_t *cfq_pool; + +/* + * Query io scheduler to see if the current process issuing bio may be + * merged with rq. + */ +static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) +{ + request_queue_t *q = rq->q; + elevator_t *e = q->elevator; + + if (e->ops->elevator_allow_merge_fn) + return e->ops->elevator_allow_merge_fn(q, rq, bio); + + return 1; +} + /* * can we safely merge with this request? */ @@ -54,13 +71,16 @@ inline int elv_rq_merge_ok(struct reques return 0; /* - * same device and no special stuff set, merge is ok + * must be same device and not a special request */ - if (rq->rq_disk == bio->bi_bdev->bd_disk && - !rq->waiting && !rq->special) - return 1; + if (rq->rq_disk != bio->bi_bdev->bd_disk || + rq->waiting || rq->special) + return 0; - return 0; + if (!elv_iosched_allow_merge(rq, bio)) + return 0; + + return 1; } EXPORT_SYMBOL(elv_rq_merge_ok); @@ -763,12 +783,12 @@ void elv_unregister(struct elevator_type */ if (e->ops.trim) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (p->io_context) e->ops.trim(p->io_context); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } @@ -892,7 +912,7 @@ ssize_t elv_iosched_show(request_queue_t struct list_head *entry; int len = 0; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&elv_list_lock); list_for_each(entry, &elv_list) { struct elevator_type *__e; @@ -902,7 +922,7 @@ ssize_t elv_iosched_show(request_queue_t else len += sprintf(name+len, "%s ", __e->elevator_name); } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&elv_list_lock); len += sprintf(len+name, "\n"); return len; diff -uprN linux-2.6.18/block/genhd.c linux-2.6.18.ovz/block/genhd.c --- linux-2.6.18/block/genhd.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/block/genhd.c 2007-06-13 06:55:05.000000000 -0400 @@ -17,6 +17,7 @@ #include struct subsystem block_subsys; +EXPORT_SYMBOL(block_subsys); static DEFINE_MUTEX(block_subsys_lock); /* diff -uprN linux-2.6.18/block/ll_rw_blk.c linux-2.6.18.ovz/block/ll_rw_blk.c --- linux-2.6.18/block/ll_rw_blk.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/block/ll_rw_blk.c 2007-06-13 06:55:05.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -2067,15 +2068,16 @@ static void freed_request(request_queue_ * Returns NULL on failure, with queue_lock held. * Returns !NULL on success, with queue_lock *not held*. */ -static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, +static struct request *get_request(request_queue_t *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; struct io_context *ioc = NULL; + const int rw = rw_flags & 0x01; int may_queue, priv; - may_queue = elv_may_queue(q, rw, bio); + may_queue = elv_may_queue(q, rw_flags, bio); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -2123,7 +2125,7 @@ static struct request *get_request(reque spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); + rq = blk_alloc_request(q, rw_flags, bio, priv, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. 
Undo anything @@ -2172,12 +2174,13 @@ out: * * Called with q->queue_lock held, and returns with it unlocked. */ -static struct request *get_request_wait(request_queue_t *q, int rw, +static struct request *get_request_wait(request_queue_t *q, int rw_flags, struct bio *bio) { + const int rw = rw_flags & 0x01; struct request *rq; - rq = get_request(q, rw, bio, GFP_NOIO); + rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); struct request_list *rl = &q->rq; @@ -2185,7 +2188,7 @@ static struct request *get_request_wait( prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, bio, GFP_NOIO); + rq = get_request(q, rw_flags, bio, GFP_NOIO); if (!rq) { struct io_context *ioc; @@ -2849,6 +2852,7 @@ static int __make_request(request_queue_ int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; unsigned short prio; sector_t sector; + int rw_flags; sector = bio->bi_sector; nr_sectors = bio_sectors(bio); @@ -2931,10 +2935,19 @@ static int __make_request(request_queue_ get_rq: /* + * This sync check and mask will be re-done in init_request_from_bio(), + * but we need to set it earlier to expose the sync flag to the + * rq allocator and io schedulers. + */ + rw_flags = bio_data_dir(bio); + if (sync) + rw_flags |= REQ_RW_SYNC; + + /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request_wait(q, rw, bio); + req = get_request_wait(q, rw_flags, bio); /* * After dropping the lock and possibly sleeping here, our request @@ -3021,6 +3034,7 @@ void generic_make_request(struct bio *bi { request_queue_t *q; sector_t maxsector; + sector_t old_sector; int ret, nr_sectors = bio_sectors(bio); dev_t old_dev; @@ -3049,7 +3063,7 @@ void generic_make_request(struct bio *bi * NOTE: we don't repeat the blk_size check for each new device. * Stacking drivers are expected to know what they are doing. */ - maxsector = -1; + old_sector = -1; old_dev = 0; do { char b[BDEVNAME_SIZE]; @@ -3083,15 +3097,30 @@ end_io: */ blk_partition_remap(bio); - if (maxsector != -1) + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, - maxsector); + old_sector); blk_add_trace_bio(q, bio, BLK_TA_QUEUE); - maxsector = bio->bi_sector; + old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; + maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + if (maxsector) { + sector_t sector = bio->bi_sector; + + if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { + /* + * This may well happen - partitions are not checked + * to make sure they are within the size of the + * whole device. 
+ */ + handle_bad_sector(bio); + goto end_io; + } + } + ret = q->make_request_fn(q, bio); } while (ret); } @@ -3115,10 +3144,12 @@ void submit_bio(int rw, struct bio *bio) BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); bio->bi_rw |= rw; - if (rw & WRITE) + if (rw & WRITE) { count_vm_events(PGPGOUT, count); - else + } else { + task_io_account_read(bio->bi_size); count_vm_events(PGPGIN, count); + } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; diff -uprN linux-2.6.18/block/scsi_ioctl.c linux-2.6.18.ovz/block/scsi_ioctl.c --- linux-2.6.18/block/scsi_ioctl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/block/scsi_ioctl.c 2007-06-13 06:55:05.000000000 -0400 @@ -246,10 +246,10 @@ static int sg_io(struct file *file, requ switch (hdr->dxfer_direction) { default: return -EINVAL; - case SG_DXFER_TO_FROM_DEV: case SG_DXFER_TO_DEV: writing = 1; break; + case SG_DXFER_TO_FROM_DEV: case SG_DXFER_FROM_DEV: break; } @@ -286,9 +286,8 @@ static int sg_io(struct file *file, requ * fill in request structure */ rq->cmd_len = hdr->cmd_len; + memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */ memcpy(rq->cmd, cmd, hdr->cmd_len); - if (sizeof(rq->cmd) != hdr->cmd_len) - memset(rq->cmd + hdr->cmd_len, 0, sizeof(rq->cmd) - hdr->cmd_len); memset(sense, 0, sizeof(sense)); rq->sense = sense; diff -uprN linux-2.6.18/drivers/base/class.c linux-2.6.18.ovz/drivers/base/class.c --- linux-2.6.18/drivers/base/class.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/base/class.c 2007-06-13 06:55:05.000000000 -0400 @@ -71,8 +71,13 @@ static struct kobj_type ktype_class = { }; /* Hotplug events for classes go to the class_obj subsys */ -static decl_subsys(class, &ktype_class, NULL); +decl_subsys(class, &ktype_class, NULL); +#ifndef CONFIG_VE +#define visible_class_subsys class_subsys +#else +#define visible_class_subsys (*get_exec_env()->class_subsys) +#endif int class_create_file(struct class * cls, const struct class_attribute * attr) { @@ -148,7 +153,7 @@ int class_register(struct class * cls) if (error) return error; - subsys_set_kset(cls, class_subsys); + subsys_set_kset(cls, visible_class_subsys); error = subsystem_register(&cls->subsys); if (!error) { @@ -420,8 +425,13 @@ static struct kset_uevent_ops class_ueve .uevent = class_uevent, }; -static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); +decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); +#ifndef CONFIG_VE +#define visible_class_obj_subsys class_obj_subsys +#else +#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys) +#endif static int class_device_add_attrs(struct class_device * cd) { @@ -499,7 +509,7 @@ static ssize_t store_uevent(struct class void class_device_initialize(struct class_device *class_dev) { - kobj_set_kset_s(class_dev, class_obj_subsys); + kobj_set_kset_s(class_dev, visible_class_obj_subsys); kobject_init(&class_dev->kobj); INIT_LIST_HEAD(&class_dev->node); } @@ -877,12 +887,19 @@ void class_interface_unregister(struct c class_put(parent); } - +void prepare_sysfs_classes(void) +{ +#ifdef CONFIG_VE + get_ve0()->class_subsys = &class_subsys; + get_ve0()->class_obj_subsys = &class_obj_subsys; +#endif +} int __init classes_init(void) { int retval; + prepare_sysfs_classes(); retval = subsystem_register(&class_subsys); if (retval) return retval; @@ -918,3 +935,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi EXPORT_SYMBOL_GPL(class_interface_register); EXPORT_SYMBOL_GPL(class_interface_unregister); + +EXPORT_SYMBOL(class_subsys); 
+EXPORT_SYMBOL(class_obj_subsys); diff -uprN linux-2.6.18/drivers/block/DAC960.c linux-2.6.18.ovz/drivers/block/DAC960.c --- linux-2.6.18/drivers/block/DAC960.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/DAC960.c 2007-06-13 06:55:05.000000000 -0400 @@ -7115,7 +7115,7 @@ static struct pci_device_id DAC960_id_ta { .vendor = PCI_VENDOR_ID_MYLEX, .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM, - .subvendor = PCI_ANY_ID, + .subvendor = PCI_VENDOR_ID_MYLEX, .subdevice = PCI_ANY_ID, .driver_data = (unsigned long) &DAC960_GEM_privdata, }, diff -uprN linux-2.6.18/drivers/block/Kconfig linux-2.6.18.ovz/drivers/block/Kconfig --- linux-2.6.18/drivers/block/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -460,6 +460,8 @@ config CDROM_PKTCDVD_WCACHE source "drivers/s390/block/Kconfig" +source "drivers/block/drbd/Kconfig" + config ATA_OVER_ETH tristate "ATA over Ethernet support" depends on NET diff -uprN linux-2.6.18/drivers/block/Makefile linux-2.6.18.ovz/drivers/block/Makefile --- linux-2.6.18/drivers/block/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/Makefile 2007-06-13 06:55:08.000000000 -0400 @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o +obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff -uprN linux-2.6.18/drivers/block/cciss.c linux-2.6.18.ovz/drivers/block/cciss.c --- linux-2.6.18/drivers/block/cciss.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/cciss.c 2007-06-13 06:55:05.000000000 -0400 @@ -1302,6 +1302,12 @@ static void cciss_softirq_done(struct re complete_buffers(rq->bio, rq->errors); + if (blk_fs_request(rq)) { + const int rw = rq_data_dir(rq); + + disk_stat_add(rq->rq_disk, sectors[rw], rq->nr_sectors); + } + #ifdef CCISS_DEBUG printk("Done with %p\n", rq); #endif /* CCISS_DEBUG */ diff -uprN linux-2.6.18/drivers/block/cpqarray.c linux-2.6.18.ovz/drivers/block/cpqarray.c --- linux-2.6.18/drivers/block/cpqarray.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/cpqarray.c 2007-06-13 06:55:05.000000000 -0400 @@ -1000,6 +1000,7 @@ static inline void complete_buffers(stru */ static inline void complete_command(cmdlist_t *cmd, int timeout) { + struct request *rq = cmd->rq; int ok=1; int i, ddir; @@ -1031,12 +1032,18 @@ static inline void complete_command(cmdl pci_unmap_page(hba[cmd->ctlr]->pci_dev, cmd->req.sg[i].addr, cmd->req.sg[i].size, ddir); - complete_buffers(cmd->rq->bio, ok); + complete_buffers(rq->bio, ok); - add_disk_randomness(cmd->rq->rq_disk); + if (blk_fs_request(rq)) { + const int rw = rq_data_dir(rq); - DBGPX(printk("Done with %p\n", cmd->rq);); - end_that_request_last(cmd->rq, ok ? 1 : -EIO); + disk_stat_add(rq->rq_disk, sectors[rw], rq->nr_sectors); + } + + add_disk_randomness(rq->rq_disk); + + DBGPX(printk("Done with %p\n", rq);); + end_that_request_last(rq, ok ? 
1 : -EIO); } /* diff -uprN linux-2.6.18/drivers/block/drbd/Kconfig linux-2.6.18.ovz/drivers/block/drbd/Kconfig --- linux-2.6.18/drivers/block/drbd/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,32 @@ +# +# DRBD device driver configuration +# +config BLK_DEV_DRBD + tristate "DRBD Distributed Replicated Block Device support" + select INET + select PROC_FS + select CONNECTOR + select CRYPTO + select CRYPTO_HMAC + ---help--- + DRBD is a block device which is designed to build high availability + clusters. This is done by mirroring a whole block device via (a + dedicated) network. You could see it as a network RAID 1. + + Each minor device has a state, which can be 'primary' or 'secondary'. + On the node with the primary device the application is supposed to + run and to access the device (/dev/drbdX). Every write is sent to the + local 'lower level block device' and via network to the node with the + device in 'secondary' state. + The secondary device simply writes the data to its lower level block + device. Currently no read-balancing via the network is done. + + DRBD can also be used with "shared-disk semantics" (primary-primary), + even though it is a "shared-nothing cluster". You'd need to use a + cluster file system on top of that for cache coherency. + + DRBD management is done through user-space tools. + For automatic failover you need a cluster manager (e.g. heartbeat). + See also: http://www.drbd.org/, http://www.linux-ha.org + + If unsure, say N. diff -uprN linux-2.6.18/drivers/block/drbd/Makefile linux-2.6.18.ovz/drivers/block/drbd/Makefile --- linux-2.6.18/drivers/block/drbd/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/Makefile 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,11 @@ +#CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror + +drbd-objs := drbd_buildtag.o drbd_bitmap.o drbd_proc.o \ + drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \ + lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o + +ifndef CONFIG_CONNECTOR + drbd-objs += connector.o cn_queue.o +endif + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff -uprN linux-2.6.18/drivers/block/drbd/drbd_actlog.c linux-2.6.18.ovz/drivers/block/drbd/drbd_actlog.c --- linux-2.6.18/drivers/block/drbd/drbd_actlog.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_actlog.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1471 @@ +/* +-*- linux-c -*- + drbd_actlog.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2007, Philipp Reisner . + Copyright (C) 2003-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* This is what I like so much about the linux kernel:
+ * if you have a close look, you can almost always reuse code by someone else
+ * ;)
+ * this is mostly from drivers/md/md.c
+ */
+STATIC int _drbd_md_sync_page_io(drbd_dev *mdev,
+ struct drbd_backing_dev *bdev,
+ struct page *page, sector_t sector,
+ int rw, int size)
+{
+ struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ struct completion event;
+ int ok;
+
+ bio->bi_bdev = bdev->md_bdev;
+ bio->bi_sector = sector;
+ ok = (bio_add_page(bio, page, size, 0) == size);
+ if(!ok) goto out;
+ init_completion(&event);
+ bio->bi_private = &event;
+ bio->bi_end_io = drbd_md_io_complete;
+
+ if (FAULT_ACTIVE(mdev, (rw & WRITE)? DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+ bio->bi_rw |= rw;
+ bio_endio(bio,bio->bi_size,-EIO);
+ }
+ else {
+#ifdef BIO_RW_SYNC
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+#else
+ submit_bio(rw, bio);
+ drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev));
+#endif
+ }
+ wait_for_completion(&event);
+ ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ out:
+ bio_put(bio);
+ return ok;
+}
+
+int drbd_md_sync_page_io(drbd_dev *mdev, struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ int hardsect,mask,ok,offset=0;
+ struct page *iop = mdev->md_io_page;
+
+ D_ASSERT(semaphore_is_locked(&mdev->md_io_mutex));
+
+ if (!bdev->md_bdev) {
+ if (DRBD_ratelimit(5*HZ,5)) {
+ ERR("bdev->md_bdev==NULL\n");
+ dump_stack();
+ }
+ return 0;
+ }
+
+ hardsect = drbd_get_hardsect(bdev->md_bdev);
+ if(hardsect == 0) hardsect = MD_HARDSECT;
+
+ // in case hardsect != 512 [ s390 only? ]
+ if( hardsect != MD_HARDSECT ) {
+ if(!mdev->md_io_tmpp) {
+ struct page *page = alloc_page(GFP_NOIO);
+ if(!page) return 0;
+
+ WARN("Meta data's bdev hardsect = %d != %d\n",
+ hardsect, MD_HARDSECT);
+ WARN("Workaround engaged (has performance impact).\n");
+
+ mdev->md_io_tmpp = page;
+ }
+
+ mask = ( hardsect / MD_HARDSECT ) - 1;
+ D_ASSERT( mask == 1 || mask == 3 || mask == 7 );
+ D_ASSERT( hardsect == (mask+1) * MD_HARDSECT );
+ offset = sector & mask;
+ sector = sector & ~mask;
+ iop = mdev->md_io_tmpp;
+
+ if (rw == WRITE) {
+ void *p = page_address(mdev->md_io_page);
+ void *hp = page_address(mdev->md_io_tmpp);
+
+ ok = _drbd_md_sync_page_io(mdev, bdev,iop,
+ sector,READ,hardsect);
+
+ if (unlikely(!ok)) {
+ ERR("drbd_md_sync_page_io(,%llus,READ [hardsect!=512]) failed!\n",
+ (unsigned long long)sector);
+ return 0;
+ }
+
+ memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT);
+ }
+ }
+
+#if DUMP_MD >= 3
+ INFO("%s [%d]:%s(,%llus,%s)\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, rw ? "WRITE" : "READ");
+#endif
+
+ if (sector < drbd_md_first_sector(bdev) || sector > drbd_md_last_sector(bdev)) {
+ ALERT("%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, rw ? "WRITE" : "READ");
+ }
+
+ ok = _drbd_md_sync_page_io(mdev, bdev,iop,sector,rw,hardsect);
+ if (unlikely(!ok)) {
+ ERR("drbd_md_sync_page_io(,%llus,%s) failed!\n",
+ (unsigned long long)sector,rw ? "WRITE" : "READ");
"WRITE" : "READ"); + return 0; + } + + if( hardsect != MD_HARDSECT && rw == READ ) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_HARDSECT, MD_HARDSECT); + } + + return ok; +} + + +struct __attribute__((packed)) al_transaction { + u32 magic; + u32 tr_number; + // u32 tr_generation; //TODO + struct __attribute__((packed)) { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; + // I do not believe that all storage medias can guarantee atomic + // 512 byte write operations. When the journal is read, only + // transactions with correct xor_sums are considered. +}; // sizeof() = 512 byte + + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +} ; + +struct update_al_work { + struct drbd_work w; + struct lc_element * al_ext; + struct completion event; + unsigned int enr; +}; + +STATIC int w_al_write_transaction(struct Drbd_Conf *, struct drbd_work *, int); + +static inline +struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + unsigned long al_flags=0; + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/AL_EXT_PER_BM_SECT); + if (unlikely(bm_ext!=NULL)) { + if(test_bit(BME_NO_WRITES,&bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + //INFO("Delaying app write until sync read is done\n"); + return 0; + } + } + al_ext = lc_get(mdev->act_log,enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + WARN("Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + WARN("Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +/* FIXME + * this should be able to return failure when meta data update has failed. + */ +void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt)>0); + + MTRACE(TraceTypeALExts,TraceLvlMetrics, + INFO("al_begin_io( sec=%llus (al_enr=%u) (rs_enr=%d) )\n", + (unsigned long long) sector, enr, + (int)BM_SECT_TO_EXT(sector)); + ); + + wait_event(mdev->al_wait, (al_ext = _al_get(mdev,enr)) ); + + if (al_ext->lc_number != enr) { + // We have to do write an transaction to AL. + unsigned int evicted; + + evicted = al_ext->lc_number; + + if(mdev->state.conn < Connected && evicted != LC_FREE ) { + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT ); + } + + /* drbd_al_write_transaction(mdev,al_ext,enr); + generic_make_request() are serialized on the + current->bio_tail list now. Therefore we have + to deligate writing something to AL to the + worker thread. 
+ init_completion(&al_work.event);
+ al_work.al_ext = al_ext;
+ al_work.enr = enr;
+ al_work.w.cb = w_al_write_transaction;
+ drbd_queue_work_front(&mdev->data.work,&al_work.w);
+ wait_for_completion(&al_work.event);
+
+ mdev->al_writ_cnt++;
+
+ /*
+ DUMPI(al_ext->lc_number);
+ DUMPI(mdev->act_log->new_number);
+ */
+ spin_lock_irq(&mdev->al_lock);
+ lc_changed(mdev->act_log,al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
+ }
+}
+
+void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector)
+{
+ unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9));
+ struct lc_element *extent;
+ unsigned long flags;
+
+ MTRACE(TraceTypeALExts,TraceLvlMetrics,
+ INFO("al_complete_io( sec=%llus (al_enr=%u) (rs_enr=%d) )\n",
+ (unsigned long long) sector, enr,
+ (int)BM_SECT_TO_EXT(sector));
+ );
+
+ spin_lock_irqsave(&mdev->al_lock,flags);
+
+ extent = lc_find(mdev->act_log,enr);
+
+ if(!extent) {
+ spin_unlock_irqrestore(&mdev->al_lock,flags);
+ ERR("al_complete_io() called on inactive extent %u\n",enr);
+ return;
+ }
+
+ if( lc_put(mdev->act_log,extent) == 0 ) {
+ wake_up(&mdev->al_wait);
+ }
+
+ spin_unlock_irqrestore(&mdev->al_lock,flags);
+}
+
+STATIC int
+w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int unused)
+{
+ int i,n,mx;
+ unsigned int extent_nr;
+ struct al_transaction* buffer;
+ sector_t sector;
+ u32 xor_sum=0;
+
+ struct lc_element *updated = ((struct update_al_work*)w)->al_ext;
+ unsigned int new_enr = ((struct update_al_work*)w)->enr;
+
+ down(&mdev->md_io_mutex); // protects md_io_buffer, al_tr_cycle, ...
+ buffer = (struct al_transaction*)page_address(mdev->md_io_page);
+
+ buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+ buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+ n = lc_index_of(mdev->act_log, updated);
+
+ buffer->updates[0].pos = cpu_to_be32(n);
+ buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+#if 0 /* Use this printf with the test_al.pl program */
+ ERR("T%03d S%03d=E%06d\n", mdev->al_tr_number,n,new_enr);
+#endif
+
+ xor_sum ^= new_enr;
+
+ mx = min_t(int,AL_EXTENTS_PT,
+ mdev->act_log->nr_elements - mdev->al_tr_cycle);
+ for(i=0;i<mx;i++) {
+ extent_nr = lc_entry(mdev->act_log,
+ mdev->al_tr_cycle+i)->lc_number;
+ buffer->updates[i+1].pos = cpu_to_be32(mdev->al_tr_cycle+i);
+ buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+ xor_sum ^= extent_nr;
+ }
+ for(;i<AL_EXTENTS_PT;i++) {
+ buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+ buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+ xor_sum ^= LC_FREE;
+ }
+ mdev->al_tr_cycle += AL_EXTENTS_PT;
+ if(mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle=0;
+
+ buffer->xor_sum = cpu_to_be32(xor_sum);
+
+// warning LGE check outcome of addition u64/sector_t/s32
+// warning LGE "FIXME code missing"
+ sector = mdev->bc->md.md_offset + mdev->bc->md.al_offset + mdev->al_tr_pos;
+
+ if(!drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) {
+ drbd_chk_io_error(mdev, 1, TRUE);
+ drbd_io_error(mdev, TRUE);
+ }
+
+ if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) {
+ mdev->al_tr_pos=0;
+ }
+ D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+ mdev->al_tr_number++;
+
+ up(&mdev->md_io_mutex);
+
+ complete(&((struct update_al_work*)w)->event);
+
+ return 1;
+}
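[Editorial note] w_al_write_transaction() above folds the updated slot and every cyclic slot -- LC_FREE fillers included -- into xor_sum before the 512-byte block goes to disk, so a torn write is detectable. A reader accepts a transaction only when the recomputed XOR matches; a minimal sketch of that check (editorial, not part of the patch, using the al_transaction layout defined above):

/* Sketch: recompute the checksum the way drbd_al_read_tr() below does. */
static int al_transaction_valid(const struct al_transaction *t)
{
	u32 xor_sum = 0;
	int i;

	if (be32_to_cpu(t->magic) != DRBD_MAGIC)
		return 0;

	/* every slot, used or LC_FREE filler, was XORed in by the writer */
	for (i = 0; i < 1 + AL_EXTENTS_PT; i++)
		xor_sum ^= be32_to_cpu(t->updates[i].extent);

	return xor_sum == be32_to_cpu(t->xor_sum);
}

+/**
+ * drbd_al_read_tr: Reads a single transaction record from the
+ * on-disk activity log.
+ * Returns -1 on IO error, 0 on checksum error and 1 if it is a valid
+ * record.
+ */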
+STATIC int drbd_al_read_tr(struct Drbd_Conf *mdev,
+ struct drbd_backing_dev *bdev,
+ struct al_transaction* b,
+ int index)
+{
+ sector_t sector;
+ int rv,i;
+ u32 xor_sum=0;
+
+ sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+ if(!drbd_md_sync_page_io(mdev,bdev,sector,READ)) {
+ // Don't process errors normally, as this is done before
+ // the disk is attached!
+ return -1;
+ }
+
+ rv = ( be32_to_cpu(b->magic) == DRBD_MAGIC );
+
+ for(i=0;i<AL_EXTENTS_PT+1;i++) {
+ xor_sum ^= be32_to_cpu(b->updates[i].extent);
+ }
+ rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+ return rv;
+}
+
+/**
+ * drbd_al_read_log: Restores the activity log from its on-disk
+ * representation. Returns 1 on success, returns 0 when
+ * reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct Drbd_Conf *mdev,struct drbd_backing_dev *bdev)
+{
+ struct al_transaction* buffer;
+ int from=-1,to=-1,i,cnr, overflow=0,rv;
+ u32 from_tnr=-1, to_tnr=0;
+ int active_extents=0;
+ int transactions=0;
+ int mx;
+
+ mx = div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT);
+
+ /* lock out all other meta data io for now,
+ * and make sure the page is mapped.
+ */
+ down(&mdev->md_io_mutex);
+ buffer = page_address(mdev->md_io_page);
+
+ // Find the valid transaction in the log
+ for(i=0;i<=mx;i++) {
+ rv = drbd_al_read_tr(mdev,bdev,buffer,i);
+ if(rv == 0) continue;
+ if(rv == -1) {
+ up(&mdev->md_io_mutex);
+ return 0;
+ }
+ cnr = be32_to_cpu(buffer->tr_number);
+ // INFO("index %d valid tnr=%d\n",i,cnr);
+
+ if(cnr == -1) overflow=1;
+
+ if(cnr < from_tnr && !overflow) {
+ from = i;
+ from_tnr = cnr;
+ }
+ if(cnr > to_tnr) {
+ to = i;
+ to_tnr = cnr;
+ }
+ }
+
+ if(from == -1 || to == -1) {
+ WARN("No usable activity log found.\n");
+
+ up(&mdev->md_io_mutex);
+ return 1;
+ }
+
+ // Read the valid transactions.
+ // INFO("Reading from %d to %d.\n",from,to);
+
+ /* this should better be handled by a for loop, no?
+ */
+ i=from;
+ while(1) {
+ int j,pos;
+ unsigned int extent_nr;
+ unsigned int trn;
+
+ rv = drbd_al_read_tr(mdev,bdev,buffer,i);
+ ERR_IF(rv == 0) goto cancel;
+ if(rv == -1) {
+ up(&mdev->md_io_mutex);
+ return 0;
+ }
+
+ trn=be32_to_cpu(buffer->tr_number);
+
+ spin_lock_irq(&mdev->al_lock);
+
+ /* This loop runs backwards because in the cyclic
+ elements there might be an old version of the
+ updated element (in slot 0). So the element in slot 0
+ can overwrite old versions. */
+ for(j=AL_EXTENTS_PT;j>=0;j--) {
+ pos = be32_to_cpu(buffer->updates[j].pos);
+ extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+ if(extent_nr == LC_FREE) continue;
+
+ //if(j<3) INFO("T%03d S%03d=E%06d\n",trn,pos,extent_nr);
+ lc_set(mdev->act_log,extent_nr,pos);
+ active_extents++;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ transactions++;
+
+ cancel:
+ if( i == to) break;
+ i++;
+ if( i > mx ) i=0;
+ }
+
+ mdev->al_tr_number = to_tnr+1;
+ mdev->al_tr_pos = to;
+ if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) {
+ mdev->al_tr_pos=0;
+ }
+
+ /* ok, we are done with it */
+ up(&mdev->md_io_mutex);
+
+ INFO("Found %d transactions (%d active extents) in activity log.\n",
+ transactions,active_extents);
+
+ return 1;
+}
+
+void drbd_al_to_on_disk_bm_slow(struct Drbd_Conf *mdev)
+{
+ int i;
+ unsigned int enr;
+
+ WARN("Using the slow drbd_al_to_on_disk_bm()\n");
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ if (inc_local_if_state(mdev,Attaching)) {
+ for(i=0;i<mdev->act_log->nr_elements;i++) {
+ enr = lc_entry(mdev->act_log,i)->lc_number;
+ if(enr == LC_FREE) continue;
+ /* Really slow: if we have al-extents 16..19 active,
+ * sector 4 will be written four times! Synchronous! */
+ drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
+ }
+
+ dec_local(mdev);
+ } else D_ASSERT(0);
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+}
+
+struct drbd_atodb_wait {
+ atomic_t count;
+ struct completion io_done;
+ struct Drbd_Conf *mdev;
+ int error;
+};
+
+STATIC int atodb_endio(struct bio *bio, unsigned int bytes_done, int error)
+{
+ struct drbd_atodb_wait *wc = bio->bi_private;
+ struct Drbd_Conf *mdev=wc->mdev;
+ struct page *page;
+
+ if (bio->bi_size) return 1;
+
+ drbd_chk_io_error(mdev,error,TRUE);
+ if(error && wc->error == 0) wc->error=error;
+
+ if (atomic_dec_and_test(&wc->count)) {
+ complete(&wc->io_done);
+ }
+
+ page = bio->bi_io_vec[0].bv_page;
+ if(page) put_page(page);
+ bio_put(bio);
+ mdev->bm_writ_cnt++;
+ dec_local(mdev);
+
+ return 0;
+}
+
+#define S2W(s) ((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL))
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+STATIC int atodb_prepare_unless_covered(struct Drbd_Conf *mdev,
+ struct bio **bios,
+ struct page **page,
+ unsigned int *page_offset,
+ unsigned int enr,
+ struct drbd_atodb_wait *wc)
+{
+ int i=0,allocated_page=0;
+ struct bio *bio;
+ struct page *np;
+ sector_t on_disk_sector = enr + mdev->bc->md.md_offset + mdev->bc->md.bm_offset;
+ int offset;
+
+ // check if that enr is already covered by an already created bio.
+ while( (bio=bios[i]) ) {
+ if(bio->bi_sector == on_disk_sector) return 0;
+ i++;
+ }
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if(bio==NULL) return -ENOMEM;
+
+ bio->bi_bdev = mdev->bc->md_bdev;
+ bio->bi_sector = on_disk_sector;
+
+ bios[i] = bio;
+
+ if(*page_offset == PAGE_SIZE) {
+ np = alloc_page(__GFP_HIGHMEM);
+ /* no memory leak, bio gets cleaned up by caller */
+ if(np == NULL) return -ENOMEM;
+ *page = np;
+ *page_offset = 0;
+ allocated_page=1;
+ }
+
+ offset = S2W(enr);
+ drbd_bm_get_lel( mdev, offset,
+ min_t(size_t,S2W(1), drbd_bm_words(mdev) - offset),
+ kmap(*page) + *page_offset );
+ kunmap(*page);
+
+ if(bio_add_page(bio, *page, MD_HARDSECT, *page_offset)!=MD_HARDSECT) {
+ /* no memory leak, page gets cleaned up by caller */
+ return -EIO;
+ }
+
+ if(!allocated_page) get_page(*page);
+
+ *page_offset += MD_HARDSECT;
+
+ bio->bi_private = wc;
+ bio->bi_end_io = atodb_endio;
+
+ atomic_inc(&wc->count);
+ /* we already know that we may do this...
+ * inc_local_if_state(mdev,Attaching);
+ * so just get the extra reference, so that the local_cnt
+ * reflects the number of pending IO requests DRBD has at its
+ * backing device.
+ */
+ atomic_inc(&mdev->local_cnt);
+ return 0;
+}
+
+/**
+ * drbd_al_to_on_disk_bm:
+ * Writes the areas of the bitmap which are covered by the AL.
+ * called when we detach (unconfigure) local storage,
+ * or when we go from Primary to Secondary state.
+ */
+void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev)
+{
+ int i, nr_elements;
+ unsigned int enr;
+ struct bio **bios;
+ struct page *page;
+ unsigned int page_offset=PAGE_SIZE;
+ struct drbd_atodb_wait wc;
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ nr_elements = mdev->act_log->nr_elements;
+
+ bios = kzalloc(sizeof(struct bio*) * nr_elements, GFP_KERNEL);
+
+ if(!bios) {
+ lc_unlock(mdev->act_log);
+
+ drbd_al_to_on_disk_bm_slow(mdev);
+ return;
+ }
+
+ if (inc_local_if_state(mdev,Attaching)) {
+ atomic_set(&wc.count,0);
+ init_completion(&wc.io_done);
+ wc.mdev = mdev;
+ wc.error = 0;
+
+ for(i=0;i<nr_elements;i++) {
+ enr = lc_entry(mdev->act_log,i)->lc_number;
+ if(enr == LC_FREE) continue;
+ /* next statement also does atomic_inc wc.count */
+ if(atodb_prepare_unless_covered(mdev,bios,&page,
+ &page_offset,
+ enr/AL_EXT_PER_BM_SECT,
+ &wc))
+ goto abort;
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ /* all prepared, submit them */
+ for(i=0;i<nr_elements;i++) {
+ if (bios[i]==NULL) break;
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+ bios[i]->bi_rw = WRITE;
+ bio_endio(bios[i],bios[i]->bi_size,-EIO);
+ } else {
+ submit_bio(WRITE, bios[i]);
+ }
+
+ }
+
+ drbd_blk_run_queue(bdev_get_queue(mdev->bc->md_bdev));
+
+ // In case we did not submit a single IO do not wait for
+ // them to complete. ( Because we would wait forever here. )
+ //
+ // In case we had IOs and they are already complete, there
+ // is no point in waiting anyway.
+ // Therefore this if() ...
+ if(atomic_read(&wc.count)) wait_for_completion(&wc.io_done);
+
+ dec_local(mdev);
+
+ if(wc.error) drbd_io_error(mdev, TRUE);
+
+ } else D_ASSERT(0);
+
+ kfree(bios);
+ return;
+
+ abort:
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ // free everything by calling the endio callback directly.
+ for(i=0;i<nr_elements && bios[i];i++) {
+ bios[i]->bi_size=0;
+ atodb_endio(bios[i], MD_HARDSECT, 0);
+ }
+ kfree(bios);
+ dec_local(mdev);
+
+ drbd_al_to_on_disk_bm_slow(mdev); //.. and take the slow path.
+}
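[Editorial note] atodb_prepare_unless_covered() above builds at most one bio per 512-byte on-disk bitmap sector, so several 4MB AL extents that land in the same sector share a single write. A small sketch of the sector arithmetic it relies on (hypothetical helper for illustration only, not part of the patch):

/*
 * Sketch: where an AL extent's dirty-bitmap data lives on disk. Mirrors
 * the on_disk_sector computation in atodb_prepare_unless_covered() and
 * the enr/AL_EXT_PER_BM_SECT division done by its caller above.
 */
static sector_t al_enr_to_bm_sector(struct Drbd_Conf *mdev, unsigned int al_enr)
{
	/* several 4MB AL extents map into one 512-byte bitmap sector */
	unsigned int bm_sect = al_enr / AL_EXT_PER_BM_SECT;

	return bm_sect + mdev->bc->md.md_offset + mdev->bc->md.bm_offset;
}

+/**
+ * drbd_al_apply_to_bm: Sets the bits in the bitmap that are described
+ * by the active extents of the AL.
+ */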
+void drbd_al_apply_to_bm(struct Drbd_Conf *mdev)
+{
+ unsigned int enr;
+ unsigned long add=0;
+ char ppb[10];
+ int i;
+
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+ for(i=0;i<mdev->act_log->nr_elements;i++) {
+ enr = lc_entry(mdev->act_log,i)->lc_number;
+ if(enr == LC_FREE) continue;
+ add += drbd_bm_ALe_set_all(mdev, enr);
+ }
+
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+
+ INFO("Marked additional %s as out-of-sync based on AL.\n",
+ ppsize(ppb,Bit2KB(add)));
+}
+
+static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext)
+{
+ int rv;
+
+ spin_lock_irq(&mdev->al_lock);
+ rv = (al_ext->refcnt == 0);
+ if(likely(rv)) lc_del(mdev->act_log,al_ext);
+ spin_unlock_irq(&mdev->al_lock);
+
+ if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n");
+
+ return rv;
+}
+
+/**
+ * drbd_al_shrink: Removes all active extents from the AL. (but does not
+ * write any transactions)
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct Drbd_Conf *mdev)
+{
+ struct lc_element *al_ext;
+ int i;
+
+ D_ASSERT( test_bit(__LC_DIRTY,&mdev->act_log->flags) );
+
+ for(i=0;i<mdev->act_log->nr_elements;i++) {
+ al_ext = lc_entry(mdev->act_log,i);
+ if(al_ext->lc_number == LC_FREE) continue;
+ wait_event(mdev->al_wait, _try_lc_del(mdev,al_ext));
+ }
+
+ wake_up(&mdev->al_wait);
+}
+
+STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused)
+{
+ struct update_odbm_work *udw = (struct update_odbm_work*)w;
+
+ if( !inc_local_if_state(mdev,Attaching) ) {
+ if (DRBD_ratelimit(5*HZ,5))
+ WARN("Can not update on disk bitmap, local IO disabled.\n");
+ return 1;
+ }
+
+ drbd_bm_write_sect(mdev, udw->enr );
+ dec_local(mdev);
+
+ kfree(udw);
+
+ if(drbd_bm_total_weight(mdev) <= mdev->rs_failed &&
+ ( mdev->state.conn == SyncSource || mdev->state.conn == SyncTarget ||
+ mdev->state.conn == PausedSyncS || mdev->state.conn == PausedSyncT ) ) {
+ drbd_bm_lock(mdev);
+ drbd_resync_finished(mdev);
+ drbd_bm_unlock(mdev);
+ }
+
+ return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an inc_local() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector,
+ int count, int success)
+{
+ struct bm_extent* ext;
+ struct update_odbm_work * udw;
+
+ unsigned int enr;
+
+ MUST_HOLD(&mdev->al_lock);
+ D_ASSERT(atomic_read(&mdev->local_cnt));
+
+ // I simply assume that a sector/size pair never crosses
+ // a 16 MB extent border. (Currently this is true...)
+ enr = BM_SECT_TO_EXT(sector);
+
+ ext = (struct bm_extent *) lc_get(mdev->resync,enr);
+ if (ext) {
+ if( ext->lce.lc_number == enr) {
+ if (success)
+ ext->rs_left -= count;
+ else
+ ext->rs_failed += count;
+ if (ext->rs_left < ext->rs_failed) {
+ ERR("BAD! sector=%llus enr=%u rs_left=%d rs_failed=%d count=%d\n",
+ (unsigned long long)sector,
+ ext->lce.lc_number, ext->rs_left, ext->rs_failed, count);
+ dump_stack();
+ // FIXME brrrgs. should never happen!
+ drbd_force_state(mdev,NS(conn,Disconnecting));
+ return;
+ }
+ } else {
+ //WARN("Counting bits in %d (resync LRU small?)\n",enr);
+ // This element should be in the cache
+ // since drbd_rs_begin_io() pulled it already in.
+
+ // OR an application write finished, and therefore
+ // we set something in this area in sync.
+ int rs_left = drbd_bm_e_weight(mdev,enr);
+ if (ext->flags != 0) {
+ WARN("changing resync lce: %d[%u;%02lx]"
+ " -> %d[%u;00]\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->flags, enr, rs_left);
+ ext->flags = 0;
+ }
+ if( ext->rs_failed ) {
+ WARN("Kicking resync_lru element enr=%u "
+ "out with rs_failed=%d\n",
+ ext->lce.lc_number, ext->rs_failed);
+ set_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags);
+ }
+ ext->rs_left = rs_left;
+ ext->rs_failed = success ? 0 : count;
+ lc_changed(mdev->resync,&ext->lce);
+ }
+ lc_put(mdev->resync,&ext->lce);
+ // no race, we are within the al_lock!
+
+ if (ext->rs_left == ext->rs_failed) {
+ ext->rs_failed = 0;
+
+ udw=kmalloc(sizeof(*udw),GFP_ATOMIC);
+ if(udw) {
+ udw->enr = ext->lce.lc_number;
+ udw->w.cb = w_update_odbm;
+ drbd_queue_work_front(&mdev->data.work,&udw->w);
+ } else {
+ WARN("Could not kmalloc an udw\n");
+ set_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags);
+ }
+ }
+ } else {
+ ERR("lc_get() failed! locked=%d/%d flags=%lu\n",
+ mdev->resync_locked,
+ mdev->resync->nr_elements,
+ mdev->resync->flags);
+ }
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size bytes of data starting from sector. Only clear the bits of the
+ * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on SyncTarget and receiver on SyncSource.
+ *
+ */
+void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr,ebnr,lbnr,bnr;
+ unsigned long count = 0;
+ sector_t esector, nr_sectors;
+ int wake_up=0;
+ unsigned long flags;
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ ERR("drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+ (unsigned long long)sector,size);
+ return;
+ }
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size>>9) -1;
+
+ ERR_IF(sector >= nr_sectors) return;
+ ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ /* we clear it (in sync).
+ * round up start sector, round down end sector. we make sure we only
+ * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+ if (unlikely(esector < BM_SECT_PER_BIT-1)) {
+ return;
+ } else if (unlikely(esector == (nr_sectors-1))) {
+ ebnr = lbnr;
+ } else {
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ }
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+ MTRACE(TraceTypeResync, TraceLvlMetrics,
+ INFO("drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
+ (unsigned long long)sector, size, sbnr, ebnr);
+ );
+
+ if (sbnr > ebnr) return;
+
+ /*
+ * ok, (capacity & 7) != 0 sometimes, but who cares...
+ * we count rs_{total,left} in bits, not sectors.
+ */
+ spin_lock_irqsave(&mdev->al_lock,flags);
+ for(bnr=sbnr; bnr <= ebnr; bnr++) {
+ if (drbd_bm_clear_bit(mdev,bnr)) count++;
+ }
+ if (count) {
+ // we need the lock for drbd_try_clear_on_disk_bm
+ if(jiffies - mdev->rs_mark_time > HZ*10) {
+ /* should be rolling marks, but we only estimate anyway. */
+ if( mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+ mdev->state.conn != PausedSyncT &&
+ mdev->state.conn != PausedSyncS ) {
+ mdev->rs_mark_time =jiffies;
+ mdev->rs_mark_left =drbd_bm_total_weight(mdev);
+ }
+ }
+ if( inc_local_if_state(mdev,Attaching) ) {
+ drbd_try_clear_on_disk_bm(mdev,sector,count,TRUE);
+ dec_local(mdev);
+ }
+ /* just wake_up unconditional now,
+ * various lc_changed(), lc_put() in drbd_try_clear_on_disk_bm().
*/ + wake_up=1; + } + spin_unlock_irqrestore(&mdev->al_lock,flags); + if(wake_up) wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + unsigned long sbnr,ebnr,lbnr; + sector_t esector, nr_sectors; + + /* Find codepoints that call set_out_of_sync() + unsigned long flags; + unsigned int enr; + struct bm_extent* ext; + + if(inc_local(mdev)) { + enr = BM_SECT_TO_EXT(sector); + spin_lock_irqsave(&mdev->al_lock,flags); + ext = (struct bm_extent *) lc_find(mdev->resync,enr); + if (ext) { + WARN("BAD! things will happen, find this.\n"); + dump_stack(); + } + spin_unlock_irqrestore(&mdev->al_lock,flags); + dec_local(mdev); + } + */ + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("sector: %llus, size: %d\n",(unsigned long long)sector,size); + return; + } + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", + (unsigned long long)sector, size, sbnr, ebnr); + ); + + /* ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. */ + drbd_bm_set_bits_in_irq(mdev,sbnr,ebnr); +} + +static inline +struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_locked > mdev->resync->nr_elements-3) { + //WARN("bme_get() does not lock all elements\n"); + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + bm_ext = (struct bm_extent*) lc_get(mdev->resync,enr); + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev,enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync,(struct lc_element*)bm_ext); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) mdev->resync_locked++; + set_bit(BME_NO_WRITES,&bm_ext->flags); + } + rs_flags=mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) wake_up(&mdev->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) { + WARN("Have to wait for element" + " (resync LRU too small?)\n"); + } + if (rs_flags & LC_DIRTY) { + BUG(); // WARN("Ongoing RS update (???)\n"); + } + } + + return bm_ext; +} + +static inline int _is_in_al(drbd_dev* mdev, unsigned int enr) +{ + struct lc_element* al_ext; + int rv=0; + + spin_lock_irq(&mdev->al_lock); + if(unlikely(enr == mdev->act_log->new_number)) rv=1; + else { + al_ext = lc_find(mdev->act_log,enr); + if(al_ext) { + if (al_ext->refcnt) rv=1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if(unlikely(rv)) { + INFO("Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io: Gets an extent in the resync LRU cache and sets it + * to BME_LOCKED. + * + * @sector: The sector number + * + * sleeps on al_wait. + * returns 1 if successful. 
+ * returns 0 if interrupted.
+ */
+int drbd_rs_begin_io(drbd_dev* mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct bm_extent* bm_ext;
+ int i, sig;
+
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("drbd_rs_begin_io: sector=%llus (rs_end=%d)\n",
+ (unsigned long long)sector,enr);
+ );
+
+ sig = wait_event_interruptible( mdev->al_wait,
+ (bm_ext = _bme_get(mdev,enr)) );
+ if (sig) return 0;
+
+ if(test_bit(BME_LOCKED,&bm_ext->flags)) return 1;
+
+ for(i=0;i<AL_EXT_PER_BM_SECT;i++) {
+ sig = wait_event_interruptible( mdev->al_wait,
+ !_is_in_al(mdev,enr*AL_EXT_PER_BM_SECT+i) );
+ if (sig) {
+ spin_lock_irq(&mdev->al_lock);
+ if( lc_put(mdev->resync,&bm_ext->lce) == 0 ) {
+ clear_bit(BME_NO_WRITES,&bm_ext->flags);
+ mdev->resync_locked--;
+ wake_up(&mdev->al_wait);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ return 0;
+ }
+ }
+
+ set_bit(BME_LOCKED,&bm_ext->flags);
+
+ return 1;
+}
+
+/**
+ * drbd_try_rs_begin_io: Gets an extent in the resync LRU cache, sets it
+ * to BME_NO_WRITES, then tries to set it to BME_LOCKED.
+ *
+ * @sector: The sector number
+ *
+ * does not sleep.
+ * returns zero if we could set BME_LOCKED and can proceed,
+ * -EAGAIN if we need to try again.
+ */
+int drbd_try_rs_begin_io(drbd_dev* mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+ struct bm_extent* bm_ext;
+ int i;
+
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("drbd_try_rs_begin_io: sector=%llus\n",
+ (unsigned long long)sector);
+ );
+
+ spin_lock_irq(&mdev->al_lock);
+ if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+ /* in case you have very heavy scattered io, it may
+ * stall the syncer indefinitely if we give up the ref count
+ * when we try again and requeue.
+ *
+ * if we don't give up the refcount, but the next time
+ * we are scheduled this extent has been "synced" by new
+ * application writes, we'd miss the lc_put on the
+ * extent we kept the refcount on.
+ * so we remembered which extent we had to try again, and
+ * if the next requested one is something else, we do
+ * the lc_put here...
+ * we also have to wake_up
+ */
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("dropping %u, apparently got 'synced' "
+ "by application io\n", mdev->resync_wenr);
+ );
+ bm_ext = (struct bm_extent*)lc_find(mdev->resync,mdev->resync_wenr);
+ if (bm_ext) {
+ D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags));
+ D_ASSERT(test_bit(BME_NO_WRITES,&bm_ext->flags));
+ clear_bit(BME_NO_WRITES,&bm_ext->flags);
+ mdev->resync_wenr = LC_FREE;
+ lc_put(mdev->resync,&bm_ext->lce);
+ wake_up(&mdev->al_wait);
+ } else {
+ ALERT("LOGIC BUG\n");
+ }
+ }
+ bm_ext = (struct bm_extent*)lc_try_get(mdev->resync,enr);
+ if (bm_ext) {
+ if (test_bit(BME_LOCKED,&bm_ext->flags)) {
+ goto proceed;
+ }
+ if (!test_and_set_bit(BME_NO_WRITES,&bm_ext->flags)) {
+ mdev->resync_locked++;
+ } else {
+ /* we did set the BME_NO_WRITES,
+ * but then could not set BME_LOCKED,
+ * so we tried again.
+ * drop the extra reference. */
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("dropping extra reference on %u\n",enr);
+ );
+ bm_ext->lce.refcnt--;
+ D_ASSERT(bm_ext->lce.refcnt > 0);
+ }
+ goto check_al;
+ } else {
+ if (mdev->resync_locked > mdev->resync->nr_elements-3)
+ goto try_again;
+ bm_ext = (struct bm_extent*)lc_get(mdev->resync,enr);
+ if (!bm_ext) {
+ const unsigned long rs_flags = mdev->resync->flags;
+ if (rs_flags & LC_STARVING) {
+ WARN("Have to wait for element"
+ " (resync LRU too small?)\n");
+ }
+ if (rs_flags & LC_DIRTY) {
+ BUG(); // WARN("Ongoing RS update (???)\n");
+ }
+ goto try_again;
+ }
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(mdev,enr);
+ bm_ext->rs_failed = 0;
+ lc_changed(mdev->resync,(struct lc_element*)bm_ext);
+ wake_up(&mdev->al_wait);
+ D_ASSERT(test_bit(BME_LOCKED,&bm_ext->flags) == 0);
+ }
+ set_bit(BME_NO_WRITES,&bm_ext->flags);
+ D_ASSERT(bm_ext->lce.refcnt == 1);
+ mdev->resync_locked++;
+ goto check_al;
+ }
+ check_al:
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("checking al for %u\n",enr);
+ );
+ for (i=0;i<AL_EXT_PER_BM_SECT;i++) {
+ if (unlikely(al_enr+i == mdev->act_log->new_number))
+ goto try_again;
+ if (lc_is_used(mdev->act_log,al_enr+i))
+ goto try_again;
+ }
+ set_bit(BME_LOCKED,&bm_ext->flags);
+ proceed:
+ mdev->resync_wenr = LC_FREE;
+ spin_unlock_irq(&mdev->al_lock);
+ return 0;
+
+ try_again:
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("need to try again for %u\n",enr);
+ );
+ if (bm_ext) mdev->resync_wenr = enr;
+ spin_unlock_irq(&mdev->al_lock);
+ return -EAGAIN;
+}
+
+void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct bm_extent* bm_ext;
+ unsigned long flags;
+
+ MTRACE(TraceTypeResync, TraceLvlAll,
+ INFO("drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n",
+ (long long)sector, enr);
+ );
+
+ spin_lock_irqsave(&mdev->al_lock,flags);
+ bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr);
+ if(!bm_ext) {
+ spin_unlock_irqrestore(&mdev->al_lock,flags);
+ ERR("drbd_rs_complete_io() called, but extent not found\n");
+ return;
+ }
+
+ if(bm_ext->lce.refcnt == 0) {
+ spin_unlock_irqrestore(&mdev->al_lock,flags);
+ ERR("drbd_rs_complete_io(,%llu [=%u]) called, but refcnt is 0!?\n",
+ (unsigned long long)sector, enr);
+ return;
+ }
+
+ if( lc_put(mdev->resync,(struct lc_element *)bm_ext) == 0 ) {
+ clear_bit(BME_LOCKED,&bm_ext->flags);
+ clear_bit(BME_NO_WRITES,&bm_ext->flags);
+ mdev->resync_locked--;
+ wake_up(&mdev->al_wait);
+ }
+
+ spin_unlock_irqrestore(&mdev->al_lock,flags);
+}
+
+/**
+ * drbd_rs_cancel_all: Removes extents from the resync LRU. Even
+ * if they are BME_LOCKED.
+ */
+void drbd_rs_cancel_all(drbd_dev* mdev)
+{
+ struct bm_extent* bm_ext;
+ int i;
+
+ MTRACE(TraceTypeResync, TraceLvlMetrics,
+ INFO("drbd_rs_cancel_all\n");
+ );
+
+ spin_lock_irq(&mdev->al_lock);
+
+ if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there.
+ for(i=0;i<mdev->resync->nr_elements;i++) {
+ bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i);
+ if(bm_ext->lce.lc_number == LC_FREE) continue;
+ bm_ext->lce.refcnt = 0; // Rude but ok.
+ bm_ext->rs_left = 0;
+ clear_bit(BME_LOCKED,&bm_ext->flags);
+ clear_bit(BME_NO_WRITES,&bm_ext->flags);
+ lc_del(mdev->resync,&bm_ext->lce);
+ }
+ mdev->resync->used=0;
+ dec_local(mdev);
+ }
+ mdev->resync_locked = 0;
+ mdev->resync_wenr = LC_FREE;
+ spin_unlock_irq(&mdev->al_lock);
+ wake_up(&mdev->al_wait);
+}
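[Editorial note] drbd_try_rs_begin_io() above never sleeps: when an extent is still busy it parks the extent number in resync_wenr, keeps the reference, and reports -EAGAIN, expecting the caller to come back later. A hypothetical caller loop (editorial sketch, not part of the patch; a real caller would typically requeue its work item rather than poll like this):

/* Sketch: lock one resync extent, do the IO, release it. */
int resync_one_extent(drbd_dev *mdev, sector_t sector)
{
	while (drbd_try_rs_begin_io(mdev, sector) == -EAGAIN) {
		if (signal_pending(current))
			return -EINTR;
		/* extent busy (app writes or AL activity); retry later */
		schedule_timeout_interruptible(HZ / 10);
	}
	/* ... extent is BME_LOCKED here, submit resync requests ... */
	drbd_rs_complete_io(mdev, sector);
	return 0;
}

+/**
+ * drbd_rs_del_all: Gracefully removes all extents from the resync LRU.
+ * There may still be a reference held by someone; in that case this
+ * function returns -EAGAIN.
+ * In case all elements got removed it returns zero.
+ */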
+int drbd_rs_del_all(drbd_dev* mdev)
+{
+ struct bm_extent* bm_ext;
+ int i;
+
+ MTRACE(TraceTypeResync, TraceLvlMetrics,
+ INFO("drbd_rs_del_all\n");
+ );
+
+ spin_lock_irq(&mdev->al_lock);
+
+ if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there.
+ for(i=0;i<mdev->resync->nr_elements;i++) {
+ bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i);
+ if(bm_ext->lce.lc_number == LC_FREE) continue;
+ if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+ INFO("dropping %u in drbd_rs_del_all, "
+ "apparently got 'synced' by application io\n",
+ mdev->resync_wenr);
+ D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags));
+ D_ASSERT(test_bit(BME_NO_WRITES,&bm_ext->flags));
+ clear_bit(BME_NO_WRITES,&bm_ext->flags);
+ mdev->resync_wenr = LC_FREE;
+ lc_put(mdev->resync,&bm_ext->lce);
+ }
+ if(bm_ext->lce.refcnt != 0) {
+ INFO("Retrying drbd_rs_del_all() later. "
+ "refcnt=%d\n",bm_ext->lce.refcnt);
+ dec_local(mdev);
+ spin_unlock_irq(&mdev->al_lock);
+ return -EAGAIN;
+ }
+ D_ASSERT(bm_ext->rs_left == 0);
+ D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags));
+ D_ASSERT(!test_bit(BME_NO_WRITES,&bm_ext->flags));
+ lc_del(mdev->resync,&bm_ext->lce);
+ }
+ D_ASSERT(mdev->resync->used==0);
+ dec_local(mdev);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
+ return 0;
+}
+
+/* Record information on a failure to resync the specified blocks
+ *
+ * called on SyncTarget when resync write fails or NegRSDReply received
+ *
+ */
+void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr,ebnr,lbnr,bnr;
+ unsigned long count = 0;
+ sector_t esector, nr_sectors;
+ int wake_up=0;
+
+ MTRACE(TraceTypeResync, TraceLvlSummary,
+ INFO("drbd_rs_failed_io: sector=%llus, size=%u\n",
+ (unsigned long long)sector,size);
+ );
+
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ ERR("drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+ (unsigned long long)sector,size);
+ return;
+ }
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size>>9) -1;
+
+ ERR_IF(sector >= nr_sectors) return;
+ ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ /*
+ * round up start sector, round down end sector. we make sure we only
+ * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+ if (unlikely(esector < BM_SECT_PER_BIT-1)) {
+ return;
+ } else if (unlikely(esector == (nr_sectors-1))) {
+ ebnr = lbnr;
+ } else {
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ }
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+ if (sbnr > ebnr) return;
+
+ /*
+ * ok, (capacity & 7) != 0 sometimes, but who cares...
+ * we count rs_{total,left} in bits, not sectors.
+ */
+ spin_lock_irq(&mdev->al_lock);
+ for(bnr=sbnr; bnr <= ebnr; bnr++) {
+ if (drbd_bm_test_bit(mdev,bnr)>0) count++;
+ }
+ if (count) {
+ mdev->rs_failed += count;
+
+ if( inc_local_if_state(mdev,Attaching) ) {
+ drbd_try_clear_on_disk_bm(mdev,sector,count,FALSE);
+ dec_local(mdev);
+ }
+
+ /* just wake_up unconditional now,
+ * various lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
*/ + wake_up=1; + } + spin_unlock_irq(&mdev->al_lock); + if(wake_up) wake_up(&mdev->al_wait); +} diff -uprN linux-2.6.18/drivers/block/drbd/drbd_bitmap.c linux-2.6.18.ovz/drivers/block/drbd/drbd_bitmap.c --- linux-2.6.18/drivers/block/drbd/drbd_bitmap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_bitmap.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1184 @@ +/* +-*- linux-c -*- + drbd_bitmap.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2007, Philipp Reisner . + Copyright (C) 2004-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include // for memset +#include /* for D_ASSERT(in_interrupt()) */ + + +#include +#include "drbd_int.h" + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + * + * unfortunately this currently means that this file is not + * yet selfcontained, because it needs to know about how to receive + * the bitmap from the peer via the data socket. + * This is to be solved with some sort of + * drbd_bm_copy(mdev,offset,size,unsigned long*) ... + + * Note that since find_first_bit returns int, this implementation + * "only" supports up to 1<<(32+12) == 16 TB... non issue, since + * currently DRBD is limited to ca 3.8 TB storage anyways. + * + * we will eventually change the implementation to not allways hold the full + * bitmap in memory, but only some 'lru_cache' of the on disk bitmap, + * since vmalloc'ing mostly unused 128M is antisocial. + + * THINK + * I'm not yet sure whether this file should be bits only, + * or wether I want it to do all the sector<->bit calculation in here. + */ + +// warning LGE "verify all spin_lock_irq here, and their call path" +// warning LGE "and change to irqsave where applicable" +// warning LGE "so we don't accidentally nest spin_lock_irq()" +/* + * NOTE + * Access to the *bm is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bit is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * FIXME + * for performance reasons, when we _know_ we have irq disabled, we should + * probably introduce some _in_irq variants, so we know to only spin_lock(). + * + * FIXME + * Actually you need to serialize all resize operations. + * but then, resize is a drbd state change, and it should be serialized + * already. Unfortunately it is not (yet), so two concurrent resizes, like + * attach storage (drbdsetup) and receive the peers size (drbd receiver) + * may eventually blow things up. + * Therefore, + * you may only change the other members when holding + * the bm_change mutex _and_ the bm_lock. + * thus reading them holding either is safe. 
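+ * for illustration (a sketch, not part of the code below): a resize that
+ * follows this rule would do
+ *	down(&b->bm_change);
+ *	spin_lock_irq(&b->bm_lock);
+ *	... update bm_bits, bm_words, bm_dev_capacity ...
+ *	spin_unlock_irq(&b->bm_lock);
+ *	up(&b->bm_change);
+ * while a reader may hold either one of the two.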
+ * this is sort of overkill, but I rather do it right + * than have two resize operations interfere somewhen. + */ +struct drbd_bitmap { + unsigned long *bm; + spinlock_t bm_lock; + /* WARNING unsigned long bm_fo and friends: + * 32bit number of bit offset is just enough for 512 MB bitmap. + * it will blow up if we make the bitmap bigger... + * not that it makes much sense to have a bitmap that large, + * rather change the granularity to 16k or 64k or something. + * (that implies other problems, however...) + */ + unsigned long bm_fo; // next offset for drbd_bm_find_next + unsigned long bm_set; // nr of set bits; THINK maybe atomic_t ? + unsigned long bm_bits; + size_t bm_words; + sector_t bm_dev_capacity; + struct semaphore bm_change; // serializes resize operations + + atomic_t bm_async_io; + wait_queue_head_t bm_io_wait; + + unsigned long bm_flags; + + // { REMOVE + unsigned long bm_line; + char *bm_file; + // } +}; + +// { REMOVE once we serialize all state changes properly +#define D_BUG_ON(x) ERR_IF(x) { dump_stack(); } +#define BM_LOCKED 0 +#define BM_MD_IO_ERROR (BITS_PER_LONG-1) // 31? 63? + +#if 0 // simply disabled for now... +#define MUST_NOT_BE_LOCKED() do { \ + if (test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap is locked by %s:%lu\n", \ + __FILE__, __LINE__, b->bm_file,b->bm_line); \ + dump_stack(); \ + } \ + } \ +} while (0) +#define MUST_BE_LOCKED() do { \ + if (!test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap not locked!\n", \ + __FILE__, __LINE__); \ + dump_stack(); \ + } \ + } \ +} while (0) +#else +#define MUST_NOT_BE_LOCKED() do {(void)b;} while (0) +#define MUST_BE_LOCKED() do {(void)b;} while (0) +#endif +void __drbd_bm_lock(drbd_dev *mdev, char* file, int line) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_set_bit(BM_LOCKED,&b->bm_flags)) { + b->bm_file = file; + b->bm_line = line; + } else if (DRBD_ratelimit(5*HZ,5)) { + ERR("%s:%d: bitmap already locked by %s:%lu\n", + file, line, b->bm_file,b->bm_line); + /* + dump_stack(); + ERR("This is no oops, but debug stack trace only.\n"); + ERR("If you get this often, or in reproducable situations, " + "notify \n"); + */ + } + spin_unlock_irq(&b->bm_lock); +} +void drbd_bm_unlock(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_clear_bit(BM_LOCKED,&mdev->bitmap->bm_flags)) { + ERR("bitmap not locked in bm_unlock\n"); + } else { + /* FIXME if we got a "is already locked" previously, + * we unlock here even though we actually MUST NOT do so... */ + b->bm_file = NULL; + b->bm_line = -1; + } + spin_unlock_irq(&b->bm_lock); +} + +#if 0 +// has been very helpful to indicate that rs_total and rs_left have been +// used in a non-smp safe way... +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ + if ( (b->bm_set != mdev->rs_total) && \ + (b->bm_set != mdev->rs_left) ) { \ + if ( DRBD_ratelimit(5*HZ,5) ) { \ + ERR("%s:%d: ?? 
bm_set=%lu; rs_total=%lu, rs_left=%lu\n",\ + __FILE__ , __LINE__ , \ + b->bm_set, mdev->rs_total, mdev->rs_left ); \ + } \ + } \ +} while (0) +#else +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + if (b->bm_dev_capacity != drbd_get_capacity(mdev->this_bdev)) { \ + ERR("%s:%d: bm_dev_capacity:%llu drbd_get_capacity:%llu\n", \ + __FILE__, __LINE__, \ + (unsigned long long) b->bm_dev_capacity, \ + (unsigned long long) drbd_get_capacity(mdev->this_bdev));\ + } \ +} while (0) +#endif +// } + +#if DUMP_MD >= 3 +/* debugging aid */ +STATIC void bm_end_info(drbd_dev *mdev, const char* where) +{ + struct drbd_bitmap *b = mdev->bitmap; + size_t w = (b->bm_bits-1) >> LN2_BPL; + + INFO("%s: bm_set=%lu\n", where, b->bm_set); + INFO("bm[%d]=0x%lX\n", w, b->bm[w]); + w++; + + if ( w < b->bm_words ) { + D_ASSERT(w == b->bm_words -1); + INFO("bm[%d]=0x%lX\n",w,b->bm[w]); + } +} +#else +#define bm_end_info(ignored...) ((void)(0)) +#endif + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * drbd_dev*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +/* FIXME TODO sometimes I use "int offset" as index into the bitmap. + * since we currently are LIMITED to (128<<11)-64-8 sectors of bitmap, + * this is ok [as long as we dont run on a 24 bit arch :)]. + * But it is NOT strictly ok. + */ + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. + */ +int drbd_bm_init(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + D_BUG_ON(b); + b = kzalloc(sizeof(struct drbd_bitmap),GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + init_MUTEX(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + mdev->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(drbd_dev *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(drbd_dev *mdev) +{ + ERR_IF (!mdev->bitmap) return; + /* FIXME I think we should explicitly change the device size to zero + * before this... + * + D_BUG_ON(mdev->bitmap->bm); + */ + vfree(mdev->bitmap->bm); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Rerturns the number of bits cleared. 
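+ * (worked example, for illustration: with BITS_PER_LONG == 64 and
+ *  bm_bits == 70, mask is (1UL<<6)-1, so only the six valid bits of the
+ *  last used word survive; on 32 bit, where bm_words is padded up to a
+ *  64 bit boundary, the second branch additionally zeroes the padding
+ *  word.  The hweight_long() calls count what was dropped.)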
+ */ +STATIC int bm_clear_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared=0; + + if ( w < b->bm_words ) { + cleared = hweight_long(b->bm[w] & ~mask); + b->bm[w++] &= mask; + } + + if ( w < b->bm_words ) { + cleared += hweight_long(b->bm[w]); + b->bm[w++]=0; + } + + return cleared; +} + +STATIC void bm_set_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + + if ( w < b->bm_words ) { + b->bm[w++] |= ~mask; + } + + if ( w < b->bm_words ) { + b->bm[w++] = ~(0UL); + } +} + +STATIC unsigned long bm_count_bits(struct drbd_bitmap * b, int just_read) +{ + unsigned long *bm = b->bm; + unsigned long *ep = b->bm + b->bm_words; + unsigned long bits = 0; + + while ( bm < ep ) { + /* on little endian, this is *bm = *bm; + * and should be optimized away by the compiler */ + if (just_read) *bm = lel_to_cpu(*bm); + bits += hweight_long(*bm++); + } + + return bits; +} + +void _drbd_bm_recount_bits(drbd_dev *mdev, char* file, int line) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long flags, bits; + + ERR_IF(!b) return; + + spin_lock_irqsave(&b->bm_lock,flags); + bits = bm_count_bits(b,0); + if(bits != b->bm_set) { + ERR("bm_set was %lu, corrected to %lu. %s:%d\n", + b->bm_set,bits,file,line); + b->bm_set = bits; + } + spin_unlock_irqrestore(&b->bm_lock,flags); +} + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + +/* + * make sure the bitmap has enough room for the attached storage, + * if neccessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initiallized to all bits set. + */ +int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, bytes, words, *nbm, *obm = 0; + int err = 0, growing; + + ERR_IF(!b) return -ENOMEM; + MUST_BE_LOCKED(); + + ERR_IF (down_trylock(&b->bm_change)) { + down(&b->bm_change); + } + + INFO("drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + obm = b->bm; + b->bm = NULL; + b->bm_fo = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + goto free_obm; + } else { + bits = BM_SECT_TO_BIT(ALIGN(capacity,BM_SECTORS_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits,64) >> LN2_BPL; + + D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12)); + + if ( words == b->bm_words ) { + /* optimize: capacity has changed, + * but only within one long word worth of bits. + * just update the bm_dev_capacity and bm_bits members. 
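+		 * (e.g. a capacity change that moves bm_bits from 4104
+		 *  to 4120 leaves words == ALIGN(bits,64) >> LN2_BPL
+		 *  unchanged at ALIGN == 4160, so this fast path applies;
+		 *  the numbers are illustrative only)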
+ */ + spin_lock_irq(&b->bm_lock); + b->bm_bits = bits; + b->bm_dev_capacity = capacity; + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + goto out; + } else { + /* one extra long to catch off by one errors */ + bytes = (words+1)*sizeof(long); + nbm = vmalloc(bytes); + if (!nbm) { + ERR("bitmap: failed to vmalloc %lu bytes\n",bytes); + err = -ENOMEM; + goto out; + } + } + spin_lock_irq(&b->bm_lock); + obm = b->bm; + // brgs. move several MB within spinlock... + // FIXME this should go into userspace! + if (obm) { + bm_set_surplus(b); + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); + memcpy(nbm,obm,min_t(size_t,b->bm_words,words)*sizeof(long)); + } + growing = words > b->bm_words; + if (growing) { // set all newly allocated bits + // start at -1, just to be sure. + memset( nbm + (b->bm_words?:1)-1 , 0xff, + (words - ((b->bm_words?:1)-1)) * sizeof(long) ); + b->bm_set += bits - b->bm_bits; + } + nbm[words] = DRBD_MAGIC; + b->bm = nbm; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + bm_clear_surplus(b); + if( !growing ) b->bm_set = bm_count_bits(b,0); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + INFO("resync bitmap: bits=%lu words=%lu\n",bits,words); + } + free_obm: + vfree(obm); // vfree(NULL) is noop + out: + up(&b->bm_change); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +unsigned long drbd_bm_total_weight(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + // MUST_BE_LOCKED(); well. yes. but ... + + spin_lock_irqsave(&b->bm_lock,flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock,flags); + + return s; +} + +size_t drbd_bm_words(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + /* FIXME + * actually yes. really. otherwise it could just change its size ... + * but it triggers all the time... + * MUST_BE_LOCKED(); + */ + + return b->bm_words; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + */ +void drbd_bm_merge_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. 
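+ * (unlike drbd_bm_merge_lel above, which ORs the buffer into the
+ *  bitmap, this variant overwrites the target words outright)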
+ */ +void drbd_bm_set_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + if ( (offset >= b->bm_words) || + (offset+number > b->bm_words) || + (number > PAGE_SIZE/sizeof(long)) || + (number <= 0) ) { + // yes, there is "%z", but that gives compiler warnings... + ERR("offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + return; + } + + // MUST_BE_LOCKED(); yes. but not neccessarily globally... + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + bm = b->bm + offset; + while(number--) *buffer++ = cpu_to_lel(*bm++); + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0xff,b->bm_words*sizeof(long)); + bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +int drbd_bm_async_io_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + struct drbd_bitmap *b = bio->bi_private; + + if (bio->bi_size) + return 1; + + if (error) { + /* doh. what now? + * for now, set all bits, and flag MD_IO_ERROR + */ + /* FIXME kmap_atomic memset etc. pp. */ + __set_bit(BM_MD_IO_ERROR,&b->bm_flags); + } + if (atomic_dec_and_test(&b->bm_async_io)) + wake_up(&b->bm_io_wait); + + bio_put(bio); + + return 0; +} + +STATIC void drbd_bm_page_io_async(drbd_dev *mdev, struct drbd_bitmap *b, int page_nr, int rw) +{ + /* we are process context. we always get a bio */ + /* THINK: do we need GFP_NOIO here? 
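+	 * (probably yes: under memory pressure GFP_KERNEL may recurse
+	 *  into writeback against this very device, which GFP_NOIO
+	 *  would rule out)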
*/ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct page *page = vmalloc_to_page((char*)(b->bm) + (PAGE_SIZE*page_nr)); + unsigned int len; + sector_t on_disk_sector = mdev->bc->md.md_offset + mdev->bc->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small flexible external meta data device */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(mdev->bc) - on_disk_sector + 1)<<9); + + D_DUMPLU(on_disk_sector); + D_DUMPI(len); + + bio->bi_bdev = mdev->bc->md_bdev; + bio->bi_sector = on_disk_sector; + bio_add_page(bio, page, len, 0); + bio->bi_private = b; + bio->bi_end_io = drbd_bm_async_io_complete; + + if (FAULT_ACTIVE(mdev, (rw&WRITE)?DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio,bio->bi_size,-EIO); + } + else + submit_bio(rw, bio); +} +/* read one sector of the on disk bitmap into memory. + * on disk bitmap is little endian. + * @enr is _sector_ offset from start of on disk bitmap (aka bm-extent nr). + * returns 0 on success, -EIO on failure + */ +int drbd_bm_read_sect(drbd_dev *mdev,unsigned long enr) +{ + sector_t on_disk_sector = mdev->bc->md.md_offset + mdev->bc->md.bm_offset + enr; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global ... + + down(&mdev->md_io_mutex); + if(drbd_md_sync_page_io(mdev,mdev->bc,on_disk_sector,READ)) { + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("read_sect: sector=%lus offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + drbd_bm_set_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + } else { + int i; + err = -EIO; + ERR( "IO ERROR reading bitmap sector %lu " + "(meta-disk sector %llu)\n", + enr, (unsigned long long)on_disk_sector ); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_read: Read the whole bitmap from its on disk location. + * + * currently only called from "drbd_ioctl_set_disk" + * FIXME need to be able to return an error!! + * + */ +# if defined(__LITTLE_ENDIAN) + /* nothing to do, on disk == in memory */ +# define bm_cpu_to_lel(x) ((void)0) +# else +void bm_cpu_to_lel(struct drbd_bitmap *b) +{ + /* need to cpu_to_lel all the pages ... + * this may be optimized by using + * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; + * the following is still not optimal, but better than nothing */ + const unsigned long *end = b->bm+b->bm_words; + unsigned long *bm; + if (b->bm_set == 0) { + /* no page at all; avoid swap if all is 0 */ + return; + } else if (b->bm_set == b->bm_bits) { + /* only the last words */ + bm = end-2; + } else { + /* all pages */ + bm = b->bm; + } + for (; bm < end; bm++) { + *bm = cpu_to_lel(*bm); + } +} +# endif +/* lel_to_cpu == cpu_to_lel */ +# define bm_lel_to_cpu(x) bm_cpu_to_lel(x) + +STATIC int drbd_bm_rw(struct Drbd_Conf *mdev, int rw) +{ + struct drbd_bitmap *b = mdev->bitmap; + /* sector_t sector; */ + int bm_words, num_pages, i; + unsigned long now; + char ppb[10]; + int err = 0; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; + + /* OK, I manipulate the bitmap low level, + * and I expect to be the exclusive user. + * If not, I am really in a bad mood... 
+ * to catch such bugs early, make all people who want to access the + * bitmap while I read/write it dereference a NULL pointer :-> + */ + mdev->bitmap = NULL; + + if(rw == WRITE) bm_cpu_to_lel(b); + + now = jiffies; + atomic_set(&b->bm_async_io, num_pages); + __clear_bit(BM_MD_IO_ERROR,&b->bm_flags); + + for (i = 0; i < num_pages; i++) { + /* let the layers below us try to merge these bios... */ + drbd_bm_page_io_async(mdev,b,i,rw); + } + + drbd_blk_run_queue(bdev_get_queue(mdev->bc->md_bdev)); + wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); + INFO("%s of bitmap took %lu jiffies\n", + rw == READ ? "reading" : "writing", jiffies - now); + + if (test_bit(BM_MD_IO_ERROR,&b->bm_flags)) { + ALERT("we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + err = -EIO; + } + + now = jiffies; + if(rw == WRITE) { + bm_lel_to_cpu(b); + } else /* rw == READ */ { + /* just read, if neccessary adjust endianness */ + b->bm_set = bm_count_bits(b, 1); + INFO("recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + + /* ok, done, + * now it is visible again + */ + + mdev->bitmap = b; + + INFO("%s marked out-of-sync by on disk bit-map.\n", + ppsize(ppb,drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10)) ); + + return err; +} + +int drbd_bm_read(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + int err=0; + + if (b->bm) { + // bitmap size > 0 + err = drbd_bm_rw(mdev, READ); + + if (err == 0) + b->bm[b->bm_words] = DRBD_MAGIC; + } + + return err; +} + +/** + * drbd_bm_write_sect: Writes a 512 byte piece of the bitmap to its + * on disk location. On disk bitmap is little endian. + * + * @enr: The _sector_ offset from the start of the bitmap. + * + */ +int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + mdev->bc->md.md_offset + mdev->bc->md.bm_offset; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global... + + down(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("write_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + if (num_words < S2W(1)) { + memset(page_address(mdev->md_io_page),0,MD_HARDSECT); + } + drbd_bm_get_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + if (!drbd_md_sync_page_io(mdev,mdev->bc,on_disk_sector,WRITE)) { + int i; + err = -EIO; + ERR( "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %llus)\n", + enr, (unsigned long long)on_disk_sector ); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_write: Write the whole bitmap to its on disk location. 
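+ * (thin wrapper around drbd_bm_rw(mdev,WRITE); drbd_bm_read() above is
+ *  the counterpart for the READ direction)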
+ */ +int drbd_bm_write(struct Drbd_Conf *mdev) +{ + int err = drbd_bm_rw(mdev, WRITE); + + INFO("%lu KB now marked out-of-sync by on disk bit-map.\n", + drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10) ); + + return err; +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); \ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0,b->bm_words*sizeof(long)); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +void drbd_bm_reset_find(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + + ERR_IF(!b) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + b->bm_fo = 0; + spin_unlock_irq(&b->bm_lock); + +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * this returns a bit number, NOT a sector! + */ +unsigned long drbd_bm_find_next(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + + ERR_IF(!b) return i; + ERR_IF(!b->bm) return i; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (b->bm_fo < b->bm_bits) { + i = find_next_bit(b->bm,b->bm_bits,b->bm_fo); + } else if (b->bm_fo > b->bm_bits) { + ERR("bm_fo=%lu bm_bits=%lu\n",b->bm_fo, b->bm_bits); + } + if (i >= b->bm_bits) { + i = -1UL; + b->bm_fo = 0; + } else { + b->bm_fo = i+1; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +void drbd_bm_set_find(drbd_dev *mdev, unsigned long i) +{ + struct drbd_bitmap *b = mdev->bitmap; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + + b->bm_fo = min_t(unsigned long, i, b->bm_bits); + + spin_unlock_irq(&b->bm_lock); +} + + +int drbd_bm_rs_done(drbd_dev *mdev) +{ + return mdev->bitmap->bm_fo == 0; +} + +// THINK maybe the D_BUG_ON(i<0)s in set/clear/test should be not that strict? + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_set_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +/* + * only called from drbd_set_out_of_sync. + * strange_state blubber is already in place there... + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) + ERR("%s in drbd_bm_set_bit\n", conns_to_name(mdev->cstate)); +*/ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_set_bit(bitnr, b->bm)); + b->bm_set += !i; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* returns number of bits actually changed (0->1) + * wants bitnr, not sector */ +int drbd_bm_set_bits_in_irq(drbd_dev *mdev, const unsigned long s, const unsigned long e) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bitnr; + int c = 0; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +#if 0 + /* hm. I assumed that, when inside of lock_irq/unlock_irq, + * in_interrupt() would be true ? + * how else can I assert that this called with irq disabled without using + * spin_lock_irqsave? 
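+	 * (for what it is worth: irqs_disabled() tests exactly that;
+	 * in_interrupt() only reflects the hardirq/softirq counters, so it
+	 * stays false inside a plain spin_lock_irq() region)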
*/ + D_BUG_ON(!in_interrupt()); /* called within spin_lock_irq(&mdev->req_lock) */ +#endif + + spin_lock(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + for (bitnr = s; bitnr <=e; bitnr++) { + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + } else { + c += (0 == __test_and_set_bit(bitnr, b->bm)); + } + } + b->bm_set += c; + spin_unlock(&b->bm_lock); + return c; +} + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_clear_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long flags; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_clear_bit(bitnr, b->bm)); + b->bm_set -= i; + } + spin_unlock_irqrestore(&b->bm_lock,flags); + + /* clearing bits should only take place when sync is in progress! + * this is only called from drbd_set_in_sync. + * strange_state blubber is already in place there ... + if (i && mdev->cstate <= Connected) + ERR("drbd_bm_clear_bit: cleared a bitnr=%lu while %s\n", + bitnr, conns_to_name(mdev->cstate)); + */ + + return i; +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (bitnr < b->bm_bits) { + i = test_bit(bitnr, b->bm) ? 1 : 0; + } else if (bitnr == b->bm_bits) { + i = -1; + } else /* (bitnr > b->bm_bits) */ { + ERR("bitnr=%lu > bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... 
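+ * (scale, for illustration: one such 512 byte sector holds 4096 bits,
+ *  so at 4 KiB per bit a single bm-extent covers 16 MiB of the device)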
+ * + */ +int drbd_bm_e_weight(drbd_dev *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + + s = S2W(enr); + e = min((size_t)S2W(enr+1),b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + } else { + ERR("start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock,flags); +#if DUMP_MD >= 3 + INFO("enr=%lu weight=%d e=%d s=%d\n", enr, count, e, s); +#endif + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(drbd_dev *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long weight; + int count, s, e; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + n = e-s; + memset(b->bm+s,-1,n*sizeof(long)); + b->bm_set += n*BITS_PER_LONG - count; + if (e == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + } + } else { + ERR("start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} diff -uprN linux-2.6.18/drivers/block/drbd/drbd_buildtag.c linux-2.6.18.ovz/drivers/block/drbd/drbd_buildtag.c --- linux-2.6.18/drivers/block/drbd/drbd_buildtag.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_buildtag.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,6 @@ +/* automatically generated. DO NOT EDIT. */ +const char * drbd_buildtag(void) +{ + return "SVN Revision: 2881" + " build by phil@mescal, 2007-05-07 17:22:02"; +} diff -uprN linux-2.6.18/drivers/block/drbd/drbd_compat_wrappers.h linux-2.6.18.ovz/drivers/block/drbd/drbd_compat_wrappers.h --- linux-2.6.18/drivers/block/drbd/drbd_compat_wrappers.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_compat_wrappers.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,340 @@ +/* + * FIXME this file is bound to die, renamed or included in drbd_int.h + */ + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +# error "use a 2.6 kernel, please" +#endif + + +/* struct page has a union in 2.6.15 ... 
+ * an anonymous union and struct since 2.6.16 + * or in fc5 "2.6.15" */ +#include +#ifndef page_private +# define page_private(page) ((page)->private) +# define set_page_private(page, v) ((page)->private = (v)) +#endif + +#include // for fsync_bdev + +/* see get_sb_bdev and bd_claim */ +extern char* drbd_sec_holder; + +// bi_end_io handlers +// int (bio_end_io_t) (struct bio *, unsigned int, int); +extern int drbd_md_io_complete (struct bio *bio, unsigned int bytes_done, int error); + +extern int drbd_endio_read_sec (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_endio_write_sec(struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_endio_pri (struct bio *bio, unsigned int bytes_done, int error); + +static inline sector_t drbd_get_hardsect(struct block_device *bdev) +{ + return bdev->bd_disk->queue->hardsect_size; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, + sector_t size) +{ + /* set_capacity(mdev->this_bdev->bd_disk, size); */ + set_capacity(mdev->vdisk,size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_bdev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio,BIO_UPTODATE) + +#ifdef CONFIG_HIGHMEM +/* + * I don't know why there is no bvec_kmap, only bvec_kmap_irq ... + * + * we do a sock_recvmsg into the target buffer, + * so we obviously cannot use the bvec_kmap_irq variant. -lge + * + * Most likely it is only due to performance anyways: + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + unsigned long addr; + + addr = (unsigned long) kmap(bvec->bv_page); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bvec->bv_offset; +} + +static inline void drbd_bio_kunmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + + kunmap(bvec->bv_page); +} + +#else +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return page_address(bvec->bv_page) + bvec->bv_offset; +} +static inline void drbd_bio_kunmap(struct bio *bio) +{ + // do nothing. +} +#endif + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) return 1; + } + + return 0; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(drbd_dev *mdev, int rw, int fault_type, struct bio *bio) +{ + bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined. 
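+	/* a bio that never got a target device is a bug in the caller;
+	 * report it and complete with -ENODEV instead of oopsing in the
+	 * block layer.  Otherwise either fail the bio right here when
+	 * fault injection says so, or hand it down the stack. */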
+ + if (!bio->bi_bdev) { + printk(KERN_ERR DEVICE_NAME "%d: drbd_generic_make_request: bio->bi_bdev == NULL\n", + mdev_to_minor(mdev)); + dump_stack(); + bio_endio(bio, bio->bi_size, -ENODEV); + return; + } + + if (FAULT_ACTIVE(mdev, fault_type)) + bio_endio(bio,bio->bi_size,-EIO); + else + generic_make_request(bio); +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + request_queue_t *q; + q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if(!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + // unplugging should not happen automatically... + } + spin_unlock_irq(q->queue_lock); +} + +static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + struct page *page = bvec->bv_page; + size_t size = bvec->bv_len; + int offset = bvec->bv_offset; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#ifdef DEFINE_SOCK_CREATE_KERN +#define sock_create_kern sock_create +#endif + +#ifdef USE_KMEM_CACHE_S +typedef struct kmem_cache_s drbd_kmem_cache_t; +#else +typedef struct kmem_cache drbd_kmem_cache_t; +#endif + +#ifdef NEED_BACKPORT_OF_ATOMIC_ADD + +#if defined(__x86_64__) + +static __inline__ int atomic_add_return(int i, atomic_t *v) +{ + int __i = i; + __asm__ __volatile__( + LOCK_PREFIX "xaddl %0, %1;" + :"=r"(i) + :"m"(v->counter), "0"(i)); + return i + __i; +} + +static __inline__ int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i,v); +} + +#define atomic_inc_return(v) (atomic_add_return(1,v)) +#define atomic_dec_return(v) (atomic_sub_return(1,v)) + +#elif defined(__i386__) || defined(__arch_um__) + +static __inline__ int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if(unlikely(boot_cpu_data.x86==3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + __asm__ __volatile__( + LOCK_PREFIX "xaddl %0, %1;" + :"=r"(i) + :"m"(v->counter), "0"(i)); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +static __inline__ int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i,v); +} + +#define atomic_inc_return(v) (atomic_add_return(1,v)) +#define atomic_dec_return(v) (atomic_sub_return(1,v)) + +#else +# error "You need to copy/past atomic_inc_return()/atomic_dec_return() here" +# error "for your architecture. (Hint: Kernels after 2.6.10 have those" +# error "by default! Using a later kernel might be less effort!)" +#endif + +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +/* With Linux-2.6.19 the crypto API changed! */ +/* This is not a generic backport of the new api, it just implements + the corner case of "hmac(xxx)". */ + +#define CRYPTO_ALG_ASYNC 4711 +#define CRYPTO_ALG_TYPE_HASH CRYPTO_ALG_TYPE_DIGEST + +struct crypto_hash { + struct crypto_tfm *base; + const u8 *key; + int keylen; +}; + +struct hash_desc { + struct crypto_hash *tfm; + u32 flags; +}; + +static inline struct crypto_hash * +crypto_alloc_hash(char *alg_name, u32 type, u32 mask) +{ + struct crypto_hash *ch; + char *closing_bracket; + + // "hmac(xxx)" is in alg_name we need that xxx. 
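+	// e.g. alg_name == "hmac(sha1)": the two checks below reject
+	// anything shorter than "hmac(x)", and alg_name+5 with the ')'
+	// temporarily patched to NUL yields the inner name "sha1"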
+ closing_bracket = strchr(alg_name,')'); + if(!closing_bracket) return NULL; + if(closing_bracket-alg_name < 6) return NULL; + + ch = kmalloc(sizeof(struct crypto_hash),GFP_KERNEL); + if(!ch) return NULL; + + *closing_bracket = 0; + ch->base = crypto_alloc_tfm(alg_name + 5, 0); + *closing_bracket = ')'; + + if (ch->base == NULL) { + kfree(ch); + return NULL; + } + + return ch; +} + +static inline int +crypto_hash_setkey(struct crypto_hash *hash,const u8 *key,unsigned int keylen) +{ + hash->key = key; + hash->keylen = keylen; + + return 0; +} + +static inline int +crypto_hash_digest(struct hash_desc *desc, struct scatterlist *sg, + unsigned int nbytes, u8 *out) +{ + + crypto_hmac(desc->tfm->base, (u8*)desc->tfm->key, + &desc->tfm->keylen, sg, 1 /* ! */ , out); + /* ! this is not generic. Would need to convert nbytes -> nsg */ + + return 0; +} + +static inline void crypto_free_hash(struct crypto_hash *tfm) +{ + crypto_free_tfm(tfm->base); + kfree(tfm); +} + +static inline unsigned int crypto_hash_digestsize(struct crypto_hash *tfm) +{ + return crypto_tfm_alg_digestsize(tfm->base); +} + +static inline struct crypto_tfm *crypto_hash_tfm(struct crypto_hash *tfm) +{ + return tfm->base; +} + +#endif + +#ifdef NEED_BACKPORT_OF_KZALLOC +static inline void *kzalloc(size_t size, int flags) +{ + void *rv = kmalloc(size,flags); + if(rv) memset(rv,0,size); + + return rv; +} +#endif diff -uprN linux-2.6.18/drivers/block/drbd/drbd_int.h linux-2.6.18.ovz/drivers/block/drbd/drbd_int.h --- linux-2.6.18/drivers/block/drbd/drbd_int.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_int.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1925 @@ +/* + drbd_int.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lru_cache.h" + +// module parameter, defined in drbd_main.c +extern int minor_count; +extern int allow_oos; +extern int major_nr; +extern int use_nbd_major; + +#ifdef DRBD_ENABLE_FAULTS +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +#include +#ifdef DRBD_MAJOR +# warning "FIXME. DRBD_MAJOR is now officially defined in major.h" +#endif + +#include +#include +#define MAJOR_NR major_nr + +#undef DEVICE_NAME +#define DEVICE_NAME "drbd" + +// XXX do we need this? +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. 
+ */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + * + * FIXME btw, we should register some reboot notifier. + */ +#define DRBD_SIGKILL SIGHUP + +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 // All EEs on the free list should have this value + // freshly allocated EEs get !ID_VACANT (== 1) + // so if it says "cannot dereference null + // pointer at adress 0x00000001, it is most + // probably one of these :( +#define is_syncer_block_id(id) ((id)==ID_SYNCER) + +struct Drbd_Conf; +typedef struct Drbd_Conf drbd_dev; + +#ifdef DBG_ALL_SYMBOLS +# define STATIC +#else +# define STATIC static +#endif + +#ifdef PARANOIA +# define PARANOIA_BUG_ON(x) BUG_ON(x) +#else +# define PARANOIA_BUG_ON(x) +#endif + +/* + * Some Message Macros + *************************/ + +// handy macro: DUMPP(somepointer) +#define DUMPP(A) ERR( #A " = %p in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLU(A) ERR( #A " = %lu in %s:%d\n", (unsigned long)(A),__FILE__,__LINE__); +#define DUMPLLU(A) ERR( #A " = %llu in %s:%d\n",(unsigned long long)(A),__FILE__,__LINE__); +#define DUMPLX(A) ERR( #A " = %lx in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPI(A) ERR( #A " = %d in %s:%d\n", (int)(A),__FILE__,__LINE__); + +#define DUMPST(A) DUMPLLU((unsigned long long)(A)) + +#if 0 +#define D_DUMPP(A) DUMPP(A) +#define D_DUMPLU(A) DUMPLU(A) +#define D_DUMPLLU(A) DUMPLLU(A) +#define D_DUMPLX(A) DUMPLX(A) +#define D_DUMPI(A) DUMPI(A) +#else +#define D_DUMPP(A) +#define D_DUMPLU(A) +#define D_DUMPLLU(A) +#define D_DUMPLX(A) +#define D_DUMPI(A) +#endif + +// Info: do not remove the spaces around the "," before ## +// Otherwise this is not portable from gcc-2.95 to gcc-3.3 +#define PRINTK(level,fmt,args...) \ + printk(level DEVICE_NAME "%d: " fmt, \ + mdev->minor , ##args) + +#define ALERT(fmt,args...) PRINTK(KERN_ALERT, fmt , ##args) +#define ERR(fmt,args...) PRINTK(KERN_ERR, fmt , ##args) +#define WARN(fmt,args...) PRINTK(KERN_WARNING, fmt , ##args) +#define INFO(fmt,args...) PRINTK(KERN_INFO, fmt , ##args) +#define DBG(fmt,args...) PRINTK(KERN_DEBUG, fmt , ##args) + +/* see kernel/printk.c:printk_ratelimit + * macro, so it is easy do have independend rate limits at different locations + * "initializer element not constant ..." 
with kernel 2.4 :( + * so I initialize toks to something large + */ +#define DRBD_ratelimit(ratelimit_jiffies,ratelimit_burst) \ +({ \ + int __ret; \ + static unsigned long toks = 0x80000000UL; \ + static unsigned long last_msg; \ + static int missed; \ + unsigned long now = jiffies; \ + toks += now - last_msg; \ + last_msg = now; \ + if (toks > (ratelimit_burst * ratelimit_jiffies)) \ + toks = ratelimit_burst * ratelimit_jiffies; \ + if (toks >= ratelimit_jiffies) { \ + int lost = missed; \ + missed = 0; \ + toks -= ratelimit_jiffies; \ + if (lost) \ + WARN("%d messages suppressed in %s:%d.\n",\ + lost , __FILE__ , __LINE__ ); \ + __ret=1; \ + } else { \ + missed++; \ + __ret=0; \ + } \ + __ret; \ +}) + + +#ifdef DBG_ASSERTS +extern void drbd_assert_breakpoint(drbd_dev*, char *, char *, int ); +# define D_ASSERT(exp) if (!(exp)) \ + drbd_assert_breakpoint(mdev,#exp,__FILE__,__LINE__) +#else +# define D_ASSERT(exp) if (!(exp)) \ + ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__,__LINE__) +#endif +#define ERR_IF(exp) if (({ \ + int _b = (exp)!=0; \ + if (_b) ERR("%s: (" #exp ") in %s:%d\n", __func__, __FILE__,__LINE__); \ + _b; \ + })) + +// Defines to control fault insertion +enum { + DRBD_FAULT_MD_WR = 0, + DRBD_FAULT_MD_RD, + DRBD_FAULT_RS_WR, + DRBD_FAULT_RS_RD, + DRBD_FAULT_DT_WR, + DRBD_FAULT_DT_RD, + DRBD_FAULT_DT_RA, // READA = Read ahead + + DRBD_FAULT_MAX, +}; + +#ifdef DRBD_ENABLE_FAULTS +extern unsigned int _drbd_insert_fault(drbd_dev *mdev, unsigned int type); +static inline int +drbd_insert_fault(drbd_dev *mdev, unsigned int type) { + return (fault_rate && + (enable_faults & (1< +// integer division, round _UP_ to the next integer +#define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 1 : 0) ) +// usual integer division +#define div_floor(A,B) ( (A)/(B) ) + +/* + * Compatibility Section + *************************/ + +#define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sighand->siglock, flags) +#define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sighand->siglock, flags) +#define RECALC_SIGPENDING() recalc_sigpending(); + +#if defined(DBG_SPINLOCKS) && defined(__SMP__) +# define MUST_HOLD(lock) if(!spin_is_locked(lock)) { ERR("Not holding lock! in %s\n", __FUNCTION__ ); } +#else +# define MUST_HOLD(lock) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) +# define HAVE_KERNEL_SENDMSG 1 +#else +# define HAVE_KERNEL_SENDMSG 0 +#endif + + +/* + * our structs + *************************/ + +#define SET_MDEV_MAGIC(x) \ + ({ typecheck(struct Drbd_Conf*,x); \ + (x)->magic = (long)(x) ^ DRBD_MAGIC; }) +#define IS_VALID_MDEV(x) \ + ( typecheck(struct Drbd_Conf*,x) && \ + ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0)) + +/* drbd_meta-data.c (still in drbd_main.c) */ +#define DRBD_MD_MAGIC (DRBD_MAGIC+4) // 4th incarnation of the disk layout. + +extern struct Drbd_Conf **minor_table; + +/*** + * on the wire + *********************************************************************/ + +typedef enum { + Data, + DataReply, // Response to DataRequest + RSDataReply, // Response to RSDataRequest + Barrier, + ReportBitMap, + BecomeSyncTarget, + BecomeSyncSource, + UnplugRemote, // Used at various times to hint the peer to hurry up + DataRequest, // Used to ask for a data block + RSDataRequest, // Used to ask for a data block + SyncParam, + ReportProtocol, + ReportUUIDs, + ReportSizes, + ReportState, + ReportSyncUUID, + AuthChallenge, + AuthResponse, + StateChgRequest, + + Ping, // These are sent on the meta socket... 
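+	        // (everything from Ping onward is received by the asender
+	        // thread, which services the meta socket; the commands
+	        // above travel on the data socket)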
+ PingAck, + RecvAck, // Used in protocol B + WriteAck, // Used in protocol C + RSWriteAck, // Is a WriteAck, additionally call set_in_sync(). + DiscardAck, // Used in protocol C, two-primaries conflict detection + NegAck, // Sent if local disk is unusable + NegDReply, // Local disk is broken... + NegRSDReply, // Local disk is broken... + BarrierAck, + StateChgReply, + + MAX_CMD, + MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ... + MAX_OPT_CMD, + + /* FIXME + * to get a more useful error message with drbd-8 <-> drbd 0.7.x, + * these could be reimplemented as special case of HandShake. */ + HandShakeM = 0xfff1, // First Packet on the MetaSock + HandShakeS = 0xfff2, // First Packet on the Socket + + HandShake = 0xfffe // FIXED for the next century! +} Drbd_Packet_Cmd; + +static inline const char* cmdname(Drbd_Packet_Cmd cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [Data] = "Data", + [DataReply] = "DataReply", + [RSDataReply] = "RSDataReply", + [Barrier] = "Barrier", + [ReportBitMap] = "ReportBitMap", + [BecomeSyncTarget] = "BecomeSyncTarget", + [BecomeSyncSource] = "BecomeSyncSource", + [UnplugRemote] = "UnplugRemote", + [DataRequest] = "DataRequest", + [RSDataRequest] = "RSDataRequest", + [SyncParam] = "SyncParam", + [ReportProtocol] = "ReportProtocol", + [ReportUUIDs] = "ReportUUIDs", + [ReportSizes] = "ReportSizes", + [ReportState] = "ReportState", + [ReportSyncUUID] = "ReportSyncUUID", + [AuthChallenge] = "AuthChallenge", + [AuthResponse] = "AuthResponse", + [Ping] = "Ping", + [PingAck] = "PingAck", + [RecvAck] = "RecvAck", + [WriteAck] = "WriteAck", + [RSWriteAck] = "RSWriteAck", + [DiscardAck] = "DiscardAck", + [NegAck] = "NegAck", + [NegDReply] = "NegDReply", + [NegRSDReply] = "NegRSDReply", + [BarrierAck] = "BarrierAck", + [StateChgRequest] = "StateChgRequest", + [StateChgReply] = "StateChgReply" + }; + + if (Data > cmd || cmd >= MAX_CMD) { + switch (cmd) { + case HandShakeM: + return "HandShakeM"; + break; + case HandShakeS: + return "HandShakeS"; + break; + case HandShake: + return "HandShake"; + break; + default: + return "Unknown"; + break; + } + } + return cmdnames[cmd]; +} + + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +typedef struct { + u32 magic; + u16 command; + u16 length; // bytes of data after this header + char payload[0]; +} __attribute((packed)) Drbd_Header; +// 8 bytes. packet FIXED for the next century! + +/* + * short commands, packets without payload, plain Drbd_Header: + * Ping + * PingAck + * BecomeSyncTarget + * BecomeSyncSource + * UnplugRemote + */ + +/* + * commands with out-of-struct payload: + * ReportBitMap (no additional fields) + * Data, DataReply (see Drbd_Data_Packet) + */ + +#define DP_HARDBARRIER 1 +#define DP_RW_SYNC 2 +#define DP_MAY_SET_IN_SYNC 4 + +typedef struct { + Drbd_Header head; + u64 sector; // 64 bits sector number + u64 block_id; // Used in protocol B&C for the address of the req. 
+ u32 seq_num; + u32 dp_flags; +} __attribute((packed)) Drbd_Data_Packet; + +/* + * commands which share a struct: + * Drbd_BlockAck_Packet: + * RecvAck (proto B), WriteAck (proto C), + * DiscardAck (proto C, two-primaries conflict detection) + * Drbd_BlockRequest_Packet: + * DataRequest, RSDataRequest + */ +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __attribute((packed)) Drbd_BlockAck_Packet; + + +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockRequest_Packet; + +/* + * commands with their own struct for additional fields: + * HandShake + * Barrier + * BarrierAck + * SyncParam + * ReportParams + */ + +typedef struct { + Drbd_Header head; // 8 bytes + u32 protocol_version; + u32 feature_flags; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u64 reserverd[8]; +} __attribute((packed)) Drbd_HandShake_Packet; +// 80 bytes, FIXED for the next century + +typedef struct { + Drbd_Header head; + u32 barrier; // barrier number _handle_ only + u32 pad; // make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Barrier_Packet; + +typedef struct { + Drbd_Header head; + u32 barrier; + u32 set_size; +} __attribute((packed)) Drbd_BarrierAck_Packet; + +typedef struct { + Drbd_Header head; + u32 rate; +} __attribute((packed)) Drbd_SyncParam_Packet; + +typedef struct { + Drbd_Header head; + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 want_lose; + u32 two_primaries; +} __attribute((packed)) Drbd_Protocol_Packet; + +typedef struct { + Drbd_Header head; + u64 uuid[EXT_UUID_SIZE]; +} __attribute((packed)) Drbd_GenCnt_Packet; + +typedef struct { + Drbd_Header head; + u64 uuid; +} __attribute((packed)) Drbd_SyncUUID_Packet; + +typedef struct { + Drbd_Header head; + u64 d_size; // size of disk + u64 u_size; // user requested size + u64 c_size; // current exported size + u32 max_segment_size; // Maximal size of a BIO + u32 queue_order_type; +} __attribute((packed)) Drbd_Sizes_Packet; + +typedef struct { + Drbd_Header head; + u32 state; +} __attribute((packed)) Drbd_State_Packet; + +typedef struct { + Drbd_Header head; + u32 mask; + u32 val; +} __attribute((packed)) Drbd_Req_State_Packet; + +typedef struct { + Drbd_Header head; + u32 retcode; +} __attribute((packed)) Drbd_RqS_Reply_Packet; + +typedef struct { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __attribute((packed)) Drbd06_Parameter_P; + +typedef struct { + Drbd_Header head; + u64 block_id; + u32 seq_num; + u32 pad; +} __attribute((packed)) Drbd_Discard_Packet; + +typedef union { + Drbd_Header head; + Drbd_HandShake_Packet HandShake; + Drbd_Data_Packet Data; + Drbd_BlockAck_Packet BlockAck; + Drbd_Barrier_Packet Barrier; + Drbd_BarrierAck_Packet BarrierAck; + Drbd_SyncParam_Packet SyncParam; + Drbd_Protocol_Packet Protocol; + Drbd_Sizes_Packet Sizes; + Drbd_GenCnt_Packet GenCnt; + Drbd_State_Packet State; + Drbd_Req_State_Packet ReqState; + Drbd_RqS_Reply_Packet RqSReply; + Drbd_BlockRequest_Packet BlockRequest; +} __attribute((packed)) Drbd_Polymorph_Packet; + +/**********************************************************************/ + +typedef enum { + None, + Running, + Exiting, + Restarting +} Drbd_thread_state; + +struct Drbd_thread { + spinlock_t t_lock; + struct task_struct 
*task; + struct completion startstop; + Drbd_thread_state t_state; + int (*function) (struct Drbd_thread *); + drbd_dev *mdev; +}; + +static inline Drbd_thread_state get_t_state(struct Drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return (volatile int)thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and Tl_epoch_entry are descendants of drbd_work. + */ +struct drbd_work; +typedef int (*drbd_work_cb)(drbd_dev*, struct drbd_work*, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +struct drbd_barrier; +struct drbd_request { + struct drbd_work w; + drbd_dev *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ + + /* barrier_nr: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * starting a new epoch... + */ + + /* up to here, the struct layout is identical to Tl_epoch_entry; + * we might be able to use that to our advantage... */ + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; +}; + +struct drbd_barrier { + struct drbd_work w; + struct list_head requests; // requests before + struct drbd_barrier *next; // pointer to the next barrier + unsigned int br_number; // the barriers identifier. + int n_req; // number of requests attached before this barrier +}; + +typedef struct drbd_request drbd_request_t; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send WriteAck + read_ee .. [RS]DataRequest being read +*/ + +struct Tl_epoch_entry { + struct drbd_work w; + drbd_dev *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int barrier_nr; + + /* up to here, the struct layout is identical to drbd_request; + * we might be able to use that to our advantage... */ + + unsigned int barrier_nr2; + /* If we issue the bio with BIO_RW_BARRIER we have to + send a barrier ACK before we send the ACK to this + write. We store the barrier number in here. 
+ In case the barrier after this write has been coalesced + as well, we set it's barrier_nr into barrier_nr2 */ + + unsigned int flags; + u64 block_id; +}; + +/* ee flag bits */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_CONFLICT_PENDING, + __EE_MAY_SET_IN_SYNC, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) + +/* global flag bits */ +enum { + ISSUE_BARRIER, // next Data is preceeded by a Barrier + SIGNAL_ASENDER, // whether asender wants to be interrupted + SEND_PING, // whether asender should send a ping asap + WRITE_ACK_PENDING, // so BarrierAck won't overtake WriteAck + WORK_PENDING, // completion flag for drbd_disconnect + STOP_SYNC_TIMER, // tell timer to cancel itself + UNPLUG_QUEUED, // only relevant with kernel 2.4 + UNPLUG_REMOTE, // whether sending a "UnplugRemote" makes sense + MD_DIRTY, // current gen counts and flags not yet on disk + DISCARD_CONCURRENT, // Set on one node, cleared on the peer! + USE_DEGR_WFC_T, // Use degr-wfc-timeout instead of wfc-timeout. + CLUSTER_ST_CHANGE, // Cluster wide state change going on... + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, // This node was a crashed primary. Gets + // cleared when the state.conn goes into + // Connected state. + WRITE_BM_AFTER_RESYNC // A kmalloc() during resync failed +}; + +struct drbd_bitmap; // opaque for Drbd_Conf + +// TODO sort members for performance +// MAYBE group them further + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; // producers up it, worker down()s it + spinlock_t q_lock; // to protect the list. +}; + +/* If Philipp agrees, we remove the "mutex", and make_request will only + * (throttle on "queue full" condition and) queue it to the worker thread... + * which then is free to do whatever is needed, and has exclusive send access + * to the data socket ... + */ +struct drbd_socket { + struct drbd_work_queue work; + struct semaphore mutex; + struct socket *socket; + Drbd_Polymorph_Packet sbuf; // this way we get our + Drbd_Polymorph_Packet rbuf; // send/receive buffers off the stack +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + u64 uuid[UUID_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to al area */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* u32 al_nr_extents; important for restoring the AL + * is stored into sync_conf.al_extents, which in turn + * gets applied to act_log->nr_elements + */ +}; + +// for sync_conf and other types... +#define PACKET(name, number, fields) struct name { fields }; +#define INTEGER(pn,pr,member) int member; +#define INT64(pn,pr,member) __u64 member; +#define BIT(pn,pr,member) unsigned member : 1; +#define STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include "linux/drbd_nl.h" + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct file *lo_file; + struct file *md_file; + struct drbd_md md; + struct disk_conf dc; /* The user provided config... 
*/ +}; + +struct Drbd_Conf { +#ifdef PARANOIA + long magic; +#endif + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct net_conf *net_conf; // protected by inc_net() and dec_net() + struct syncer_conf sync_conf; + struct drbd_backing_dev *bc; // protected by inc_local() dec_local() + + sector_t p_size; /* partner's disk size */ + request_queue_t *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + struct drbd_socket data; // for data/barrier/cstate/parameter packets + struct drbd_socket meta; // for ping/ack (metadata) packets + volatile unsigned long last_received; // in jiffies, either socket + volatile unsigned int ko_count; + struct drbd_work resync_work, + unplug_work, + md_sync_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + + drbd_state_t new_state_tmp; // Used after attach while negotiating new disk state. + drbd_state_t state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; // upon each state change. + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; // Requests we need to complete + atomic_t ap_pending_cnt; // AP data packets on the wire, ack expected + atomic_t rs_pending_cnt; // RS request/data packets on the wire + atomic_t unacked_cnt; // Need to send replys for + atomic_t local_cnt; // Waiting for local disk to signal completion + atomic_t net_cnt; // Users of net_conf + spinlock_t req_lock; + struct drbd_barrier* unused_spare_barrier; /* for pre-allocation */ + struct drbd_barrier* newest_barrier; + struct drbd_barrier* oldest_barrier; + struct hlist_head * tl_hash; + unsigned int tl_hash_s; + // sector_t rs_left; // blocks not up-to-date [unit BM_BLOCK_SIZE] + // moved into bitmap->bm_set + unsigned long rs_total; // blocks to sync in this run [unit BM_BLOCK_SIZE] + unsigned long rs_failed; // number of sync IOs that failed in this run + unsigned long rs_start; // Syncer's start time [unit jiffies] + unsigned long rs_paused; // cumulated time in PausedSyncX state [unit jiffies] + unsigned long rs_mark_left;// block not up-to-date at mark [unit BM_BLOCK_SIZE] + unsigned long rs_mark_time;// marks's time [unit jiffies] + struct Drbd_thread receiver; + struct Drbd_thread worker; + struct Drbd_thread asender; + struct drbd_bitmap* bitmap; + struct lru_cache* resync; // Used to track operations of resync... + unsigned int resync_locked; // Number of locked elements in resync LRU + unsigned int resync_wenr; // resync extent number waiting for application requests + int open_cnt; + u64 *p_uuid; + /* FIXME clean comments, restructure so it is more obvious which + * members are protected by what */ + unsigned int epoch_size; + struct list_head active_ee; // IO in progress + struct list_head sync_ee; // IO in progress + struct list_head done_ee; // send ack + struct list_head read_ee; // IO in progress + struct list_head net_ee; // zero-copy network send in progress + struct hlist_head * ee_hash; // is proteced by req_lock! 
+ unsigned int ee_hash_s; + struct Tl_epoch_entry * last_write_w_barrier; // ee_lock, single thread + int next_barrier_nr; // ee_lock, single thread + struct hlist_head * app_reads_hash; // is proteced by req_lock + struct list_head resync_reads; + atomic_t pp_in_use; + wait_queue_head_t ee_wait; + struct page *md_io_page; // one page buffer for md_io + struct page *md_io_tmpp; // in case hardsect != 512 [ s390 only? ] + struct semaphore md_io_mutex; // protects the md_io_buffer + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache* act_log; // activity log + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; // position of the next transaction in the journal + struct crypto_hash* cram_hmac_tfm; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + int minor; + unsigned long comm_bm_set; // communicated number of set bits. +}; + +static inline drbd_dev *minor_to_mdev(int minor) +{ + drbd_dev *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; +} + +static inline int mdev_to_minor(drbd_dev *mdev) +{ + return mdev->minor; +} + +/* returns 1 if it was successfull, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(drbd_dev *mdev) +{ + down(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + up(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(drbd_dev *mdev) +{ + up(&mdev->data.mutex); +} + + +/* + * function declarations + *************************/ + +// drbd_main.c + +enum chg_state_flags { + ChgStateHard = 1, + ChgStateVerbose = 2, + ScheduleAfter = 4, +}; + +extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, + drbd_state_t mask, drbd_state_t val); +extern void drbd_force_state(drbd_dev*, drbd_state_t, drbd_state_t); +extern int _drbd_request_state(drbd_dev*, drbd_state_t, drbd_state_t, + enum chg_state_flags); +extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags ); +extern void print_st_err(drbd_dev*, drbd_state_t, drbd_state_t, int ); +extern void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, + enum chg_state_flags); +extern int drbd_thread_start(struct Drbd_thread *thi); +extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait); +extern void drbd_thread_signal(struct Drbd_thread *thi); +extern void drbd_free_resources(drbd_dev *mdev); +extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(drbd_dev *mdev); +extern struct drbd_barrier *_tl_add_barrier(drbd_dev *,struct drbd_barrier *); +extern void drbd_free_sock(drbd_dev *mdev); +extern int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(drbd_dev *mdev); +extern int drbd_send_uuids(drbd_dev *mdev); +extern int drbd_send_sync_uuid(drbd_dev *mdev, u64 val); +extern int drbd_send_sizes(drbd_dev *mdev); +extern int drbd_send_state(drbd_dev *mdev); +extern int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(drbd_dev *mdev, int 
use_data_socket,
+			Drbd_Packet_Cmd cmd, Drbd_Header *h, size_t size);
+extern int drbd_send_cmd2(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
+			char* data, size_t size);
+extern int drbd_send_sync_param(drbd_dev *mdev, struct syncer_conf *sc);
+extern int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,
+			u32 set_size);
+extern int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
+			struct Tl_epoch_entry *e);
+extern int drbd_send_ack_rp(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
+			Drbd_BlockRequest_Packet *rp);
+extern int drbd_send_ack_dp(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
+			Drbd_Data_Packet *dp);
+extern int _drbd_send_page(drbd_dev *mdev, struct page *page,
+			int offset, size_t size);
+extern int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
+			struct Tl_epoch_entry *e);
+extern int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req);
+extern int _drbd_send_barrier(drbd_dev *mdev, struct drbd_barrier *barrier);
+extern int drbd_send_drequest(drbd_dev *mdev, int cmd,
+			sector_t sector, int size, u64 block_id);
+extern int drbd_send_bitmap(drbd_dev *mdev);
+extern int _drbd_send_bitmap(drbd_dev *mdev);
+extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode);
+extern void drbd_free_bc(struct drbd_backing_dev* bc);
+extern int drbd_io_error(drbd_dev* mdev, int forcedetach);
+extern void drbd_mdev_cleanup(drbd_dev *mdev);
+
+// drbd_meta-data.c (still in drbd_main.c)
+extern void drbd_md_sync(drbd_dev *mdev);
+extern int drbd_md_read(drbd_dev *mdev, struct drbd_backing_dev * bdev);
+// maybe define them below as inline?
+extern void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val);
+extern void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val);
+extern void drbd_uuid_new_current(drbd_dev *mdev);
+extern void drbd_uuid_set_bm(drbd_dev *mdev, u64 val);
+extern void drbd_md_set_flag(drbd_dev *mdev, int flags);
+extern void drbd_md_clear_flag(drbd_dev *mdev, int flags);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(drbd_dev *mdev);
+
+/* Meta data layout
+   We reserve a 128MB Block (4k aligned)
+   * either at the end of the backing device
+   * or on a separate meta data device. */
+
+#define MD_RESERVED_SECT ( 128LU << 11 ) // 128 MB, unit sectors
+// The following numbers are sectors
+#define MD_AL_OFFSET 8 // 8 Sectors after start of meta area
+#define MD_AL_MAX_SIZE 64 // = 32 kb LOG ~ 3776 extents ~ 14 GB Storage
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) //Allows up to about 3.8TB
+
+#define MD_HARDSECT_B 9 // Since the smallest IO unit is usually 512 byte
+#define MD_HARDSECT (1<<MD_HARDSECT_B)
+
+/* activity log */
+#define AL_EXTENT_SIZE_B 22 // One extent represents 4M Storage
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SIZE_B)
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define BM_BLOCK_SIZE_B 12 // 4k per bit
+#define BM_BLOCK_SIZE (1<<BM_BLOCK_SIZE_B)
+#define BM_EXT_SIZE_B 24 // one bitmap extent covers 16M of storage
+#define BM_EXT_SIZE (1<<BM_EXT_SIZE_B)
+
+#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SIZE_B-9))
+#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SIZE_B-9))
+#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
+
+/* bit to represented kilobyte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SIZE_B-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located */
+#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SIZE_B-9))
+
+/* how many _storage_ sectors we have per bitmap extent */
+#define BM_SECT_PER_EXT (1ULL << (BM_EXT_SIZE_B-9))
+
+/* in one sector of the bitmap, we have this many activity_log extents.
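+ * (worked example, assuming the sizes defined above: one 512 byte bitmap
+ *  sector holds 4096 bits, each covering a 4k block, i.e. 16M of storage;
+ *  with 4M activity log extents that is 1 << (24 - 22) = 4 AL extents)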
+ */
+#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B) )
+#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL))
+
+
+#define BM_BLOCKS_PER_BM_EXT_B ( BM_EXT_SIZE_B - BM_BLOCK_SIZE_B )
+#define BM_BLOCKS_PER_BM_EXT_MASK ( (1<<BM_BLOCKS_PER_BM_EXT_B) - 1 )
+
+#ifdef ENABLE_DYNAMIC_TRACE
+extern int trace_level;
+extern int trace_type;
+extern int trace_devs;
+
+static inline int
+is_trace(unsigned int type, unsigned int level) {
+	return ((trace_level >= level) && (type & trace_type));
+}
+static inline int
+is_mdev_trace(drbd_dev *mdev, unsigned int type, unsigned int level) {
+	return (is_trace(type, level) &&
+		( ( 1 << mdev_to_minor(mdev)) & trace_devs));
+}
+
+#define MTRACE(type,lvl,code...) \
+do { \
+	if (unlikely(is_mdev_trace(mdev,type,lvl))) { \
+		code \
+	} \
+} while (0)
+
+#define TRACE(type,lvl,code...) \
+do { \
+	if (unlikely(is_trace(type,lvl))) { \
+		code \
+	} \
+} while (0)
+
+// Buffer printing support
+// DbgPrintFlags: used for Flags arg to DbgPrintBuffer
+// - DBGPRINT_BUFFADDR; if set, each line starts with the
+//   virtual address of the line being output. If clear,
+//   each line starts with the offset from the beginning
+//   of the buffer.
+typedef enum {
+	DBGPRINT_BUFFADDR = 0x0001,
+} DbgPrintFlags;
+
+extern void drbd_print_uuid(drbd_dev *mdev, unsigned int idx);
+
+extern void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
+			const void *buffer, const void *buffer_va,
+			unsigned int length);
+
+// Bio printing support
+extern void _dump_bio(drbd_dev *mdev, struct bio *bio, int complete);
+
+static inline void dump_bio(drbd_dev *mdev, struct bio *bio, int complete) {
+	MTRACE(TraceTypeRq,TraceLvlSummary,
+	       _dump_bio(mdev, bio, complete);
+	);
+}
+
+// Packet dumping support
+extern void _dump_packet(drbd_dev *mdev, struct socket *sock,
+			int recv, Drbd_Polymorph_Packet *p, char* file, int line);
+
+static inline void
+dump_packet(drbd_dev *mdev, struct socket *sock,
+	    int recv, Drbd_Polymorph_Packet *p, char* file, int line)
+{
+	MTRACE(TraceTypePacket, TraceLvlSummary,
+	       _dump_packet(mdev,sock,recv,p,file,line);
+	);
+}
+
+#else
+
+#define MTRACE(ignored...) ((void)0)
+#define TRACE(ignored...) ((void)0)
+
+#define dump_bio(ignored...) ((void)0)
+#define dump_packet(ignored...) ((void)0)
+#endif
+
+// drbd_req
+extern int drbd_make_request_26(request_queue_t *q, struct bio *bio);
+extern int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req);
+extern int drbd_merge_bvec(request_queue_t *, struct bio *, struct bio_vec *);
+extern int is_valid_ar_handle(drbd_request_t *, sector_t);
+
+
+// drbd_nl.c
+extern char* ppsize(char* buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct Drbd_Conf*, struct drbd_backing_dev*);
+extern int drbd_determin_dev_size(drbd_dev*);
+extern void drbd_setup_queue_param(drbd_dev *mdev, unsigned int);
+extern int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force);
+extern int drbd_ioctl(struct inode *inode, struct file *file,
+		      unsigned int cmd, unsigned long arg);
+drbd_disks_t drbd_try_outdate_peer(drbd_dev *mdev);
+extern long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg);
+extern int drbd_khelper(drbd_dev *mdev, char* cmd);
+
+// drbd_worker.c
+extern int drbd_worker(struct Drbd_thread *thi);
+extern void drbd_alter_sa(drbd_dev *mdev, int na);
+extern void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side);
+extern void resume_next_sg(drbd_dev* mdev);
+extern void suspend_other_sg(drbd_dev* mdev);
+extern int drbd_resync_finished(drbd_dev *mdev);
+// maybe rather drbd_main.c ?
+extern int drbd_md_sync_page_io(drbd_dev *mdev, struct drbd_backing_dev *bdev, + sector_t sector, int rw); +// worker callbacks +extern int w_req_cancel_conflict (drbd_dev *, struct drbd_work *, int); +extern int w_read_retry_remote (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_data_req (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_rsdata_req (drbd_dev *, struct drbd_work *, int); +extern int w_resync_inactive (drbd_dev *, struct drbd_work *, int); +extern int w_resume_next_sg (drbd_dev *, struct drbd_work *, int); +extern int w_io_error (drbd_dev *, struct drbd_work *, int); +extern int w_send_write_hint (drbd_dev *, struct drbd_work *, int); +extern int w_make_resync_request (drbd_dev *, struct drbd_work *, int); +extern int w_send_dblock (drbd_dev *, struct drbd_work *, int); +extern int w_send_barrier (drbd_dev *, struct drbd_work *, int); +extern int w_send_read_req (drbd_dev *, struct drbd_work *, int); +extern int w_prev_work_done (drbd_dev *, struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); + +#if 0 +#define BD_CLAIM(bdev,holder) ({ \ + int r = bd_claim(bdev,holder); \ + printk(KERN_INFO "drbd: %u = bd_claim(%p,%p); [%p;%u]\n", \ + r, bdev, holder, bdev->bd_holder, bdev->bd_holders); \ + r; }) + +#define BD_RELEASE(bdev) do { \ + printk(KERN_INFO "drbd: pre: bd_release(%p); [%p;%u]\n", \ + bdev, bdev->bd_holder, bdev->bd_holders); \ + bd_release(bdev); \ + printk(KERN_INFO "drbd: post: bd_release(%p); [%p;%u]\n", \ + bdev, bdev->bd_holder, bdev->bd_holders); \ + } while (0) +#else +#define BD_CLAIM(bdev,holder) bd_claim(bdev,holder) +#define BD_RELEASE(bdev) bd_release(bdev) +#endif + +// drbd_receiver.c +extern int drbd_release_ee(drbd_dev* mdev,struct list_head* list); +extern struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + unsigned int gfp_mask); +extern void drbd_free_ee(drbd_dev *mdev, struct Tl_epoch_entry* e); +extern void drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); +extern void _drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); +extern void drbd_set_recv_tcq(drbd_dev *mdev, int tcq_enabled); +extern void _drbd_clear_done_ee(drbd_dev *mdev); + +static inline void drbd_tcp_cork(struct socket *sock) +{ +#if 1 + mm_segment_t oldfs = get_fs(); + int val = 1; + + set_fs(KERNEL_DS); + tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); + set_fs(oldfs); +#else + tcp_sk(sock->sk)->nonagle |= TCP_NAGLE_CORK; +#endif +} + +static inline void drbd_tcp_flush(struct socket *sock) +{ +#if 1 + mm_segment_t oldfs = get_fs(); + int val = 0; + + set_fs(KERNEL_DS); + tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); + set_fs(oldfs); +#else + tcp_sk(sock->sk)->nonagle &= ~TCP_NAGLE_CORK; + tcp_push_pending_frames(sock->sk, tcp_sk(sock->sk)); +#endif +} + +// drbd_proc.c +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char* conns_to_name(drbd_conns_t s); +extern const char* roles_to_name(drbd_role_t s); + +// drbd_actlog.c +extern void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_try_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(drbd_dev* mdev); +extern int 
drbd_rs_del_all(drbd_dev* mdev); +extern void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size); +extern int drbd_al_read_log(struct Drbd_Conf *mdev,struct drbd_backing_dev *); +extern void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_in_sync(mdev,sector,size) \ + __drbd_set_in_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_out_of_sync(mdev,sector,size) \ + __drbd_set_out_of_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void drbd_al_apply_to_bm(struct Drbd_Conf *mdev); +extern void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev); +extern void drbd_al_shrink(struct Drbd_Conf *mdev); + + +// drbd_nl.c + +void drbd_nl_cleanup(void); +int __init drbd_nl_init(void); +void drbd_bcast_state(drbd_dev *mdev); + +/* + * inline helper functions + *************************/ + +#define peer_mask role_mask +#define pdsk_mask disk_mask +#define susp_mask 1 +#define user_isp_mask 1 +#define aftr_isp_mask 1 + +#define NS(T,S) ({drbd_state_t mask; mask.i=0; mask.T = T##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T = (S); val;}) +#define NS2(T1,S1,T2,S2) \ + ({drbd_state_t mask; mask.i=0; mask.T1 = T1##_mask; \ + mask.T2 = T2##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T1 = (S1); \ + val.T2 = (S2); val;}) +#define NS3(T1,S1,T2,S2,T3,S3) \ + ({drbd_state_t mask; mask.i=0; mask.T1 = T1##_mask; \ + mask.T2 = T2##_mask; mask.T3 = T3##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val;}) + +#define _NS(D,T,S) D,({drbd_state_t ns; ns.i = D->state.i; ns.T = (S); ns;}) +#define _NS2(D,T1,S1,T2,S2) \ + D,({drbd_state_t ns; ns.i = D->state.i; ns.T1 = (S1); \ + ns.T2 = (S2); ns;}) +#define _NS3(D,T1,S1,T2,S2,T3,S3) \ + D,({drbd_state_t ns; ns.i = D->state.i; ns.T1 = (S1); \ + ns.T2 = (S2); ns.T3 = (S3); ns;}) + +static inline void drbd_state_lock(drbd_dev *mdev) +{ + wait_event(mdev->misc_wait, + !test_and_set_bit(CLUSTER_ST_CHANGE,&mdev->flags)); +} + +static inline void drbd_state_unlock(drbd_dev *mdev) +{ + clear_bit(CLUSTER_ST_CHANGE,&mdev->flags); + wake_up(&mdev->misc_wait); +} + +static inline int drbd_request_state(drbd_dev* mdev, drbd_state_t mask, + drbd_state_t val) +{ + return _drbd_request_state(mdev, mask, val, ChgStateVerbose); +} + +/** + * drbd_chk_io_error: Handles the on_io_error setting, should be called from + * all io completion handlers. See also drbd_io_error(). + */ +static inline void __drbd_chk_io_error(drbd_dev* mdev, int forcedetach) +{ + switch(mdev->bc->dc.on_io_error) { + case PassOn: /* FIXME would this be better named "Ignore"? */ + if (!forcedetach) { + if (printk_ratelimit()) + ERR("Local IO failed. Passing error on...\n"); + break; + } + /* NOTE fall through to detach case if forcedetach set */ + case Detach: + case CallIOEHelper: + if (mdev->state.disk > Failed) { + _drbd_set_state(_NS(mdev,disk,Failed), + ChgStateHard|ScheduleAfter); + ERR("Local IO failed. 
Detaching...\n"); + } + break; + } +} + +static inline void drbd_chk_io_error(drbd_dev* mdev, int error, int forcedetach) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + __drbd_chk_io_error(mdev,forcedetach); + spin_unlock_irqrestore(&mdev->req_lock,flags); + } +} + +static inline int semaphore_is_locked(struct semaphore* s) +{ + if(!down_trylock(s)) { + up(s); + return 0; + } + return 1; +} + +/* Returns the first sector number of our meta data, + * which, for internal meta data, happens to be the maximum capacity + * we could agree upon with our peer + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/* returns the last sector number of our meta data, + * to be able to catch out of band md access */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_AL_OFFSET -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect; + } +} + +/* returns the capacity we announce to out peer */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return drbd_get_capacity(bdev->backing_bdev) + ? drbd_md_first_sector(bdev) + : 0; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return drbd_get_capacity(bdev->backing_bdev); + } +} + +/* returns the sector number of our meta data 'super' block */ +static inline sector_t drbd_md_ss__(drbd_dev *mdev, + struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + default: /* external, some index */ + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; + case DRBD_MD_INDEX_INTERNAL: + /* with drbd08, internal meta data is always "flexible" */ + case DRBD_MD_INDEX_FLEX_INT: + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + if (!bdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("bdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) + - MD_AL_OFFSET; + case DRBD_MD_INDEX_FLEX_EXT: + return 0; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list,&q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock,flags); + list_add(&w->list,&q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock,flags); +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock,flags); + list_add_tail(&w->list,&q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock,flags); +} + +static inline void wake_asender(drbd_dev *mdev) { + if(test_bit(SIGNAL_ASENDER, &mdev->flags)) { + force_sig(DRBD_SIG, mdev->asender.task); + } +} + +static inline void request_ping(drbd_dev *mdev) { + set_bit(SEND_PING,&mdev->flags); + wake_asender(mdev); +} + +static inline int 
drbd_send_short_cmd(drbd_dev *mdev, Drbd_Packet_Cmd cmd) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_DATA_SOCKET,cmd,&h,sizeof(h)); +} + +static inline int drbd_send_ping(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_META_SOCKET,Ping,&h,sizeof(h)); +} + +static inline int drbd_send_ping_ack(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_META_SOCKET,PingAck,&h,sizeof(h)); +} + +static inline void drbd_thread_stop(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,TRUE); +} + +static inline void drbd_thread_stop_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,FALSE); +} + +static inline void drbd_thread_restart_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,TRUE,FALSE); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, queue_for_net_write or queue_for_net_read); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, data_received) + * [from receive_DataReply] + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) + * [from got_BlockAck (WriteAck, RecvAck)] + * FIXME + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, send_failed or send_canceled) + * _req_mod(req, connection_lost_while_pending) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if(atomic_read(&mdev->which)<0) \ + ERR("in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + if(atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->misc_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * SyncTarget sends RSDataRequest (and expects RSDataReply) + * SyncSource sends RSDataReply (and expects WriteAck whith ID_SYNCER) + * (or NegAck with ID_SYNCER) + */ +static inline void inc_rs_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) + +/* counts how many answers we still need to send to the peer. 
+ * increased on + * receive_Data unless protocol A; + * we need to send a RecvAck (proto B) + * or WriteAck (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a WriteAck + * receive_DataRequest (receive_RSDataRequest) we need to send back Data + * receive_Barrier_* we need to send a BarrierAck + */ +static inline void inc_unacked(drbd_dev* mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#define dec_unacked(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + +#define sub_unacked(mdev, n) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + + +static inline void dec_net(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->net_cnt)) { + wake_up(&mdev->misc_wait); + } +} + +/** + * inc_net: Returns TRUE when it is ok to access mdev->net_conf. You + * should call dec_net() when finished looking at mdev->net_conf. + */ +static inline int inc_net(drbd_dev* mdev) +{ + int have_net_conf; + + atomic_inc(&mdev->net_cnt); + have_net_conf = mdev->state.conn >= Unconnected; + if(!have_net_conf) dec_net(mdev); + return have_net_conf; +} + +/* strictly speaking, + * these would have to hold the req_lock while looking at + * the disk state. But since we cannot submit within a spinlock, + * this is mood... + */ + +static inline void dec_local(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->local_cnt)) { + wake_up(&mdev->misc_wait); + } + D_ASSERT(atomic_read(&mdev->local_cnt)>=0); +} +/** + * inc_local: Returns TRUE when local IO is possible. If it returns + * TRUE you should call dec_local() after IO is completed. + */ +static inline int inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins ); + if( !io_allowed ) { + dec_local(mdev); + } + return io_allowed; +} +static inline int inc_local(drbd_dev* mdev) +{ + return inc_local_if_state(mdev, Inconsistent); +} + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(drbd_dev* mdev) +{ + int mxb = 1000000; /* arbitrary limit on open requests */ + if(inc_net(mdev)) { + mxb = mdev->net_conf->max_buffers; + dec_net(mdev); + } + return mxb; +} + +static inline int __inc_ap_bio_cond(drbd_dev* mdev) { + int mxb = drbd_get_max_buffers(mdev); + if (mdev->state.susp) return 0; + if (mdev->state.conn == WFBitMapS) return 0; + if (mdev->state.conn == WFBitMapT) return 0; + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&mdev->ap_bio_cnt) > mxb) return 0; + atomic_inc(&mdev->ap_bio_cnt); + return 1; +} + +/* I'd like to use wait_event_lock_irq, + * but I'm not sure when it got introduced, + * and not sure when it has 3 or 4 arguments */ +static inline void inc_ap_bio(drbd_dev* mdev) +{ + /* compare with after_state_ch, + * os.conn != WFBitMapS && ns.conn == WFBitMapS */ + DEFINE_WAIT(wait); + + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection handshake + * as long as we would exeed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. 
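+ * (presumably the race avoided: between our state test and the
+ * atomic_inc, the state could change to WFBitMapS while
+ * after_state_ch reads a stale ap_bio_cnt of zero; doing the
+ * test-and-inc together under req_lock keeps both sides consistent)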
*/ + + spin_lock_irq(&mdev->req_lock); + while (!__inc_ap_bio_cond(mdev)) { + prepare_to_wait(&mdev->misc_wait,&wait,TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + schedule(); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } + spin_unlock_irq(&mdev->req_lock); +} + +static inline void dec_ap_bio(drbd_dev* mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); + + D_ASSERT(ap_bio>=0); + if (ap_bio < mxb) wake_up(&mdev->misc_wait); +} + +static inline int seq_cmp(u32 a, u32 b) +{ + /* we assume wrap around at 32bit. + * for wrap around at 24bit (old atomic_t), + * we'd have to + * a <<= 8; b <<= 8; + */ + return ((s32)(a) - (s32)(b)); +} +#define seq_lt(a,b) (seq_cmp((a),(b)) < 0) +#define seq_gt(a,b) (seq_cmp((a),(b)) > 0) +#define seq_ge(a,b) (seq_cmp((a),(b)) >= 0) +#define seq_le(a,b) (seq_cmp((a),(b)) <= 0) +/* CAUTION: please no side effects in arguments! */ +#define seq_max(a,b) ((u32)(seq_gt((a),(b)) ? (a) : (b))) + +static inline void update_peer_seq(drbd_dev* mdev, unsigned int new_seq) +{ + unsigned int m; + spin_lock(&mdev->peer_seq_lock); + m = seq_max(mdev->peer_seq, new_seq); + mdev->peer_seq = m; + spin_unlock(&mdev->peer_seq_lock); + if (m == new_seq) wake_up(&mdev->seq_wait); +} + +static inline int drbd_queue_order_type(drbd_dev* mdev) +{ + int rv; +#if !defined(QUEUE_FLAG_ORDERED) + ERR_IF(mdev->bc == NULL) return QUEUE_ORDERED_NONE; + rv = bdev_get_queue(mdev->bc->backing_bdev)->ordered; +#else +# define QUEUE_ORDERED_NONE 0 +# define QUEUE_ORDERED_TAG 1 +# define QUEUE_ORDERED_FLUSH 2 +# warning "TCQ code disabled at compile time." + rv = QUEUE_ORDERED_NONE; // Kernels before 2.6.12 had not had TCQ support. +#endif + return rv; +} + +/* + * FIXME investigate what makes most sense: + * a) blk_run_queue(q); + * + * b) struct backing_dev_info *bdi; + * b1) bdi = &q->backing_dev_info; + * b2) bdi = mdev->bc->backing_bdev->bd_inode->i_mapping->backing_dev_info; + * blk_run_backing_dev(bdi,NULL); + * + * c) generic_unplug(q) ? __generic_unplug(q) ? + * + * d) q->unplug_fn(q), which is what all the drivers/md/ stuff uses... + * + */ +static inline void drbd_blk_run_queue(request_queue_t *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + if (!mdev->bc->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("backing_bdev==NULL in drbd_kick_lo! The following call trace is for debuggin purposes only. Don't worry.\n"); + dump_stack(); + } + } else { + drbd_blk_run_queue(bdev_get_queue(mdev->bc->backing_bdev)); + } +} +#endif diff -uprN linux-2.6.18/drivers/block/drbd/drbd_main.c linux-2.6.18.ovz/drivers/block/drbd/drbd_main.c --- linux-2.6.18/drivers/block/drbd/drbd_main.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_main.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,3268 @@ +/* +-*- Linux-c -*- + drbd.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ + +/* YES. We got an official device major from lanana + */ +#define LANANA_DRBD_MAJOR 147 + +struct after_state_chg_work { + struct drbd_work w; + drbd_state_t os; + drbd_state_t ns; + enum chg_state_flags flags; +}; + +int drbdd_init(struct Drbd_thread*); +int drbd_worker(struct Drbd_thread*); +int drbd_asender(struct Drbd_thread*); + +int drbd_init(void); +STATIC int drbd_open(struct inode *inode, struct file *file); +STATIC int drbd_close(struct inode *inode, struct file *file); +STATIC int w_after_state_ch(drbd_dev *mdev, struct drbd_work *w, int unused); +STATIC int w_md_sync(drbd_dev *mdev, struct drbd_work *w, int unused); +STATIC void md_sync_timer_fn(unsigned long data); + +MODULE_AUTHOR("Philipp Reisner , Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_ALIAS_BLOCKDEV_MAJOR(LANANA_DRBD_MAJOR); + +#include +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, int,0); +module_param(allow_oos, bool,0); + +#ifdef DRBD_ENABLE_FAULTS +int enable_faults = 0; +int fault_rate; +int fault_count; +int fault_devs; +module_param(enable_faults,int,0664); // bitmap of enabled faults +module_param(fault_rate,int,0664); // fault rate % value - applies to all enabled faults +module_param(fault_count,int,0664); // count of faults inserted +module_param(fault_devs,int,0644); // bitmap of devices to insert faults on +#endif + +// module parameter, defined +int major_nr = LANANA_DRBD_MAJOR; +int minor_count = 32; + +int allow_oos = 0; + +#ifdef ENABLE_DYNAMIC_TRACE +int trace_type = 0; // Bitmap of trace types to enable +int trace_level= 0; // Current trace level +int trace_devs = 0; // Bitmap of devices to trace + +module_param(trace_level,int,0644); +module_param(trace_type,int,0644); +module_param(trace_devs,int,0644); +#endif + +// global panic flag +volatile int drbd_did_panic = 0; + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct Drbd_Conf **minor_table = NULL; + +drbd_kmem_cache_t *drbd_request_cache; +drbd_kmem_cache_t *drbd_ee_cache; +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; + +/* I do not use a standard mempool, because: + 1) I want to hand out the preallocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. 
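+   (Sketch, for illustration only: under drbd_pp_lock, freeing a page
+    would do something like
+	set_page_private(page, (unsigned long)drbd_pp_pool);
+	drbd_pp_pool = page;
+    and allocation pops the head off again.)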
+ */ +struct page* drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +STATIC struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_close, +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +/************************* The transfer log start */ +STATIC int tl_init(drbd_dev *mdev) +{ + struct drbd_barrier *b; + + b=kmalloc(sizeof(struct drbd_barrier),GFP_KERNEL); + if(!b) return 0; + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->next=0; + b->br_number=4711; + b->n_req=0; + + mdev->oldest_barrier = b; + mdev->newest_barrier = b; + + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + + return 1; +} + +STATIC void tl_cleanup(drbd_dev *mdev) +{ + D_ASSERT(mdev->oldest_barrier == mdev->newest_barrier); + kfree(mdev->oldest_barrier); + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = 0; + } +} + +/** + * _tl_add_barrier: Adds a barrier to the TL. + * It returns the previously newest barrier + * (not the just created barrier) to the caller. + */ +struct drbd_barrier *_tl_add_barrier(drbd_dev *mdev,struct drbd_barrier *new) +{ + struct drbd_barrier *newest_before; + + INIT_LIST_HEAD(&new->requests); + INIT_LIST_HEAD(&new->w.list); + new->next=0; + new->n_req=0; + + newest_before = mdev->newest_barrier; + /* never send a barrier number == 0, because that is special-cased + * when using TCQ for our write ordering code */ + new->br_number = (newest_before->br_number+1) ?: 1; + mdev->newest_barrier->next = new; + mdev->newest_barrier = new; + + return newest_before; +} + +/* when we receive a barrier ack */ +void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_barrier *b; + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_barrier; + mdev->oldest_barrier = b->next; + + /* in protocol C this list should be empty, + * unless there is local io pending. + * in protocol A and B, this should not be empty, even though the + * master_bio's could already been completed. 
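+	 * (recall the wire protocol semantics: with protocol A the master bio
+	 * completes as soon as the data is handed to the socket, with B once
+	 * the RecvAck arrives, and only protocol C waits for the WriteAck --
+	 * so any requests still listed here are typically waiting for the
+	 * local disk only)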
*/ + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,tl_requests); + _req_mod(r, barrier_acked, 0); + } + list_del(&b->requests); + /* There could be requests on the list waiting for completion + of the write to the local disk, to avoid corruptions of + slab's data structures we have to remove the lists head */ + + spin_unlock_irq(&mdev->req_lock); + + D_ASSERT(b->br_number == barrier_nr); + D_ASSERT(b->n_req == set_size); + +#if 1 + if(b->br_number != barrier_nr) { + DUMPI(b->br_number); + DUMPI(barrier_nr); + } + if(b->n_req != set_size) { + DUMPI(b->n_req); + DUMPI(set_size); + } +#endif + + kfree(b); +} + + +/* called by drbd_disconnect (exiting receiver thread) + * or from some after_state_ch */ +void tl_clear(drbd_dev *mdev) +{ + struct drbd_barrier *b, *tmp; + + WARN("tl_clear()\n"); + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_barrier; + while ( b ) { + struct list_head *le, *tle; + struct drbd_request *r; + + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,tl_requests); + _req_mod(r, connection_lost_while_pending, 0); + } + tmp = b->next; + + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + if (b == mdev->newest_barrier) { + D_ASSERT(tmp == NULL); + b->br_number=4711; + b->n_req=0; + INIT_LIST_HEAD(&b->requests); + mdev->oldest_barrier = b; + break; + } + kfree(b); + b = tmp; + /* dec_ap_pending corresponding to _drbd_send_barrier; + * note: the barrier for the current epoch (newest_barrier) + * has not been sent yet, so we don't dec_ap_pending for it + * here, either */ + dec_ap_pending(mdev); + } + D_ASSERT(mdev->newest_barrier == mdev->oldest_barrier); + D_ASSERT(mdev->newest_barrier->br_number == 4711); + spin_unlock_irq(&mdev->req_lock); +} + +/** + * drbd_io_error: Handles the on_io_error setting, should be called in the + * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context. + * See also drbd_chk_io_error + * + * NOTE: we set ourselves FAILED here if on_io_error is Detach or Panic OR + * if the forcedetach flag is set. This flag is set when failures + * occur writing the meta data portion of the disk as they are + * not recoverable. We also try to write the "need full sync bit" here + * anyways. This is to make sure that you get a resynchronisation of + * the full device the next time you connect. + */ +int drbd_io_error(drbd_dev* mdev, int forcedetach) +{ + enum io_error_handler eh; + unsigned long flags; + int send,ok=1; + + eh = PassOn; + if(inc_local_if_state(mdev,Failed)) { + eh = mdev->bc->dc.on_io_error; + dec_local(mdev); + } + + if(!forcedetach && eh == PassOn) + return 1; + + spin_lock_irqsave(&mdev->req_lock,flags); + if( (send = (mdev->state.disk == Failed)) ) { + _drbd_set_state(_NS(mdev,disk,Diskless), + ChgStateHard|ScheduleAfter); + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if(!send) return ok; + + ok = drbd_send_state(mdev); + if (ok) WARN("Notified peer that my disk is broken.\n"); + else ERR("Sending state in drbd_io_error() failed\n"); + + // Make sure we try to flush meta-data to disk - we come + // in here because of a local disk error so it might fail + // but we still need to try -- both because the error might + // be in the data portion of the disk and because we need + // to ensure the md-sync-timer is stopped if running. 
+ drbd_md_sync(mdev); + + /* Releasing the backing device is done in after_state_ch() */ + + if(eh == CallIOEHelper) { + drbd_khelper(mdev,"local-io-error"); + } + + return ok; +} + +/** + * cl_wide_st_chg: + * Returns TRUE if this state change should be preformed as a cluster wide + * transaction. Of course it returns 0 as soon as the connection is lost. + */ +STATIC int cl_wide_st_chg(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns) +{ + return ( os.conn >= Connected && ns.conn >= Connected && + ( ( os.role != Primary && ns.role == Primary ) || + ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) || + ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) || + ( os.disk != Diskless && ns.disk == Diskless ) ) ) || + (os.conn >= Connected && ns.conn == Disconnecting); +} + +int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, + drbd_state_t mask, drbd_state_t val) +{ + unsigned long flags; + drbd_state_t os,ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock,flags); + if (rv==SS_Success && !(f&ScheduleAfter)) after_state_ch(mdev,os,ns,f); + + return rv; +} + +void drbd_force_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val) +{ + drbd_change_state(mdev,ChgStateHard,mask,val); +} + +STATIC int is_valid_state(drbd_dev* mdev, drbd_state_t ns); +STATIC int is_valid_state_transition(drbd_dev*, drbd_state_t, drbd_state_t); +STATIC int drbd_send_state_req(drbd_dev *, drbd_state_t, drbd_state_t); + +set_st_err_t _req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val) +{ + drbd_state_t os,ns; + unsigned long flags; + int rv; + + if(test_and_clear_bit(CL_ST_CHG_SUCCESS,&mdev->flags)) + return SS_CW_Success; + + if(test_and_clear_bit(CL_ST_CHG_FAIL,&mdev->flags)) + return SS_CW_FailedByPeer; + + rv=0; + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + if( !cl_wide_st_chg(mdev,os,ns) ) rv = SS_CW_NoNeed; + if( !rv ) { + rv = is_valid_state(mdev,ns); + if(rv==SS_Success) { + rv = is_valid_state_transition(mdev,ns,os); + if(rv==SS_Success) rv = 0; // cont waiting, otherwise fail. + } + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return rv; +} + +/** + * _drbd_request_state: + * This function is the most gracefull way to change state. For some state + * transition this function even does a cluster wide transaction. + * It has a cousin named drbd_request_state(), which is always verbose. + */ +int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, + enum chg_state_flags f) +{ + unsigned long flags; + drbd_state_t os,ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + + if(cl_wide_st_chg(mdev,os,ns)) { + rv = is_valid_state(mdev,ns); + if(rv == SS_Success ) rv = is_valid_state_transition(mdev,ns,os); + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if( rv < SS_Success ) { + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + drbd_state_lock(mdev); + if( !drbd_send_state_req(mdev,mask,val) ) { + drbd_state_unlock(mdev); + rv = SS_CW_FailedByPeer; + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + wait_event(mdev->state_wait,(rv=_req_st_cond(mdev,mask,val))); + + if( rv < SS_Success ) { + // nearly dead code. 
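+			// (as far as the surrounding logic suggests, this
+			// only triggers if the peer rejected the request or a
+			// concurrent local state change invalidated it while
+			// we were waiting)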
+ drbd_state_unlock(mdev); + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + drbd_state_unlock(mdev); + } + + rv = _drbd_set_state(mdev, ns, f); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if (rv==SS_Success && !(f&ScheduleAfter)) after_state_ch(mdev,os,ns,f); + + return rv; +} + + +STATIC void print_st(drbd_dev* mdev, char *name, drbd_state_t ns) +{ + ERR(" %s = { cs:%s st:%s/%s ds:%s/%s %c%c%c%c }\n", + name, + conns_to_name(ns.conn), + roles_to_name(ns.role), + roles_to_name(ns.peer), + disks_to_name(ns.disk), + disks_to_name(ns.pdsk), + ns.susp ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-' + ); +} + +void print_st_err(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, int err) +{ + ERR("State change failed: %s\n",set_st_err_name(err)); + print_st(mdev," state",os); + print_st(mdev,"wanted",ns); +} + + +#define peers_to_name roles_to_name +#define pdsks_to_name disks_to_name + +#define susps_to_name(A) ( (A) ? "1" : "0" ) +#define aftr_isps_to_name(A) ( (A) ? "1" : "0" ) +#define peer_isps_to_name(A) ( (A) ? "1" : "0" ) +#define user_isps_to_name(A) ( (A) ? "1" : "0" ) + +#define PSC(A) \ + ({ if( ns.A != os.A ) { \ + pbp += sprintf(pbp, #A "( %s -> %s ) ", \ + A##s_to_name(os.A), \ + A##s_to_name(ns.A)); \ + } }) + +STATIC int is_valid_state(drbd_dev* mdev, drbd_state_t ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum fencing_policy fp; + int rv=SS_Success; + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + if(inc_net(mdev)) { + if( !mdev->net_conf->two_primaries && + ns.role == Primary && ns.peer == Primary ) + rv=SS_TwoPrimaries; + dec_net(mdev); + } + + if( rv <= 0 ) /* already found a reason to abort */; + else if( ns.role == Secondary && mdev->open_cnt ) + rv=SS_DeviceInUse; + + else if( ns.role == Primary && ns.conn < Connected && + ns.disk < UpToDate ) rv=SS_NoUpToDateDisk; + + else if( fp >= Resource && + ns.role == Primary && ns.conn < Connected && + ns.pdsk >= DUnknown ) rv=SS_PrimaryNOP; + + else if( ns.role == Primary && ns.disk <= Inconsistent && + ns.pdsk <= Inconsistent ) rv=SS_NoUpToDateDisk; + + else if( ns.conn > Connected && + ns.disk < UpToDate && ns.pdsk < UpToDate ) + rv=SS_BothInconsistent; + + else if( ns.conn > Connected && + (ns.disk == Diskless || ns.pdsk == Diskless ) ) + rv=SS_SyncingDiskless; + + else if( (ns.conn == Connected || + ns.conn == WFBitMapS || + ns.conn == SyncSource || + ns.conn == PausedSyncS) && + ns.disk == Outdated ) rv=SS_ConnectedOutdates; + + return rv; +} + +STATIC int is_valid_state_transition(drbd_dev* mdev,drbd_state_t ns,drbd_state_t os) +{ + int rv=SS_Success; + + if( (ns.conn == StartingSyncT || ns.conn == StartingSyncS ) && + os.conn > Connected) rv=SS_ResyncRunning; + + if( ns.conn == Disconnecting && os.conn == StandAlone) + rv=SS_AlreadyStandAlone; + + if( ns.disk == Outdated && os.disk == Diskless) + rv=SS_CanNotOutdateDL; + + return rv; +} + +int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) +{ + drbd_state_t os; + int rv=SS_Success, warn_sync_abort=0; + enum fencing_policy fp; + + MUST_HOLD(&mdev->req_lock); + + os = mdev->state; + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + /* Early state sanitising. 
Dissalow the invalidate ioctl to connect */ + if( (ns.conn == StartingSyncS || ns.conn == StartingSyncT) && + os.conn < Connected ) { + ns.conn = os.conn; + ns.pdsk = os.pdsk; + } + + /* Dissalow Network errors to configure a device's network part */ + if( (ns.conn >= Timeout && ns.conn <= TearDown ) && + os.conn <= Disconnecting ) { + ns.conn = os.conn; + } + + /* Dissalow network errors (+TearDown) to overwrite each other. + Dissalow network errors to overwrite the Disconnecting state. */ + if( ( (os.conn >= Timeout && os.conn <= TearDown) + || os.conn == Disconnecting ) && + ns.conn >= Timeout && ns.conn <= TearDown ) { + ns.conn = os.conn; + } + + if( ns.conn < Connected ) { + ns.peer_isp = 0; + ns.peer = Unknown; + if ( ns.pdsk > DUnknown || + ns.pdsk < Inconsistent ) ns.pdsk = DUnknown; + } + + if( ns.conn <= Disconnecting && ns.disk == Diskless ) { + ns.pdsk = DUnknown; + } + + if( ns.conn > Connected && (ns.disk <= Failed || ns.pdsk <= Failed )) { + warn_sync_abort=1; + ns.conn = Connected; + } + + if( ns.conn >= Connected && + ( ns.disk == Consistent || ns.disk == Outdated ) ) { + switch(ns.conn) { + case WFBitMapT: + case PausedSyncT: + ns.disk = Outdated; + break; + case Connected: + case WFBitMapS: + case SyncSource: + case PausedSyncS: + ns.disk = UpToDate; + break; + case SyncTarget: + ns.disk = Inconsistent; + WARN("Implicit set disk state Inconsistent!\n"); + break; + } + if( os.disk == Outdated && ns.disk == UpToDate ) { + WARN("Implicit set disk from Outdate to UpToDate\n"); + } + } + + if( ns.conn >= Connected && + ( ns.pdsk == Consistent || ns.pdsk == Outdated ) ) { + switch(ns.conn) { + case Connected: + case WFBitMapT: + case PausedSyncT: + case SyncTarget: + ns.pdsk = UpToDate; + break; + case WFBitMapS: + case PausedSyncS: + ns.pdsk = Outdated; + break; + case SyncSource: + ns.pdsk = Inconsistent; + WARN("Implicit set pdsk Inconsistent!\n"); + break; + } + if( os.pdsk == Outdated && ns.pdsk == UpToDate ) { + WARN("Implicit set pdsk from Outdate to UpToDate\n"); + } + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < Connected && ns.disk == Negotiating ) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } + + if( fp == Stonith ) { + if(ns.role == Primary && + ns.conn < Connected && + ns.pdsk > Outdated ) { + ns.susp = 1; + } + } + + if( ns.aftr_isp || ns.peer_isp || ns.user_isp ) { + if(ns.conn == SyncSource) ns.conn=PausedSyncS; + if(ns.conn == SyncTarget) ns.conn=PausedSyncT; + } else { + if(ns.conn == PausedSyncS) ns.conn=SyncSource; + if(ns.conn == PausedSyncT) ns.conn=SyncTarget; + } + + if( ns.i == os.i ) return SS_NothingToDo; + + if( !(flags & ChgStateHard) ) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev,ns); + if(rv < SS_Success) { + /* If the old state was illegal as well, then let + this happen...*/ + + if( is_valid_state(mdev,os) == rv ) { + ERR("Forcing state change from bad state. 
" + "Error would be: '%s'\n", + set_st_err_name(rv)); + print_st(mdev,"old",os); + print_st(mdev,"new",ns); + rv = SS_Success; + } + } else rv = is_valid_state_transition(mdev,ns,os); + } + + if(rv < SS_Success) { + if( flags & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + if(warn_sync_abort) { + WARN("Resync aborted.\n"); + } + +#if DUMP_MD >= 2 + { + char *pbp,pb[300]; + pbp = pb; + *pbp=0; + PSC(role); + PSC(peer); + PSC(conn); + PSC(disk); + PSC(pdsk); + PSC(susp); + PSC(aftr_isp); + PSC(peer_isp); + PSC(user_isp); + INFO("%s\n", pb); + } +#endif + + mdev->state.i = ns.i; + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + + /** post-state-change actions **/ + if ( os.conn >= SyncSource && ns.conn <= Connected ) { + set_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + + if( (os.conn == PausedSyncT || os.conn == PausedSyncS) && + (ns.conn == SyncTarget || ns.conn == SyncSource) ) { + INFO("Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + if( ns.conn == SyncTarget ) { + D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags)); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + } + + if( (os.conn == SyncTarget || os.conn == SyncSource) && + (ns.conn == PausedSyncT || ns.conn == PausedSyncS) ) { + INFO("Resync suspended\n"); + mdev->rs_mark_time = jiffies; + if( ns.conn == PausedSyncT ) { + set_bit(STOP_SYNC_TIMER,&mdev->flags); + } + } + + if ( os.disk == Diskless && os.conn == StandAlone && + (ns.disk > Diskless || ns.conn >= Unconnected) ) { + int i; + i = try_module_get(THIS_MODULE); + D_ASSERT(i); + } + + if( flags & ScheduleAfter ) { + struct after_state_chg_work* ascw; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if(ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + drbd_queue_work(&mdev->data.work,&ascw->w); + } else { + WARN("Could not kmalloc an ascw\n"); + } + } + + return rv; +} + +STATIC int w_after_state_ch(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct after_state_chg_work* ascw; + + ascw = (struct after_state_chg_work*) w; + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + kfree(ascw); + + return 1; +} + +void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, + enum chg_state_flags flags) +{ + enum fencing_policy fp; + u32 mdf; + + if ( (os.conn != Connected && ns.conn == Connected) ) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if( mdev->p_uuid ) { + mdev->p_uuid[UUID_FLAGS] &= ~((u64)2); + } + } + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + + mdf = mdev->bc->md.flags & ~(MDF_Consistent|MDF_PrimaryInd| + MDF_ConnectedInd|MDF_WasUpToDate| + MDF_PeerOutDated ); + + if (test_bit(CRASHED_PRIMARY,&mdev->flags) || + mdev->state.role == Primary || + ( mdev->state.pdsk < Inconsistent && + mdev->state.peer == Primary ) ) mdf |= MDF_PrimaryInd; + if (mdev->state.conn > WFReportParams) mdf |= MDF_ConnectedInd; + if (mdev->state.disk > Inconsistent) mdf |= MDF_Consistent; + if (mdev->state.disk > Outdated) mdf |= MDF_WasUpToDate; + if (mdev->state.pdsk <= Outdated && + mdev->state.pdsk >= Inconsistent) mdf |= MDF_PeerOutDated; + if( mdf != mdev->bc->md.flags) { + mdev->bc->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + dec_local(mdev); + } + + /* Inform userspace about the change... */ + drbd_bcast_state(mdev); + + /* Here we have the actions that are performed after a + state change. 
This function might sleep */
+
+    if( fp == Stonith && ns.susp ) {
+        // case1: The outdate peer handler is successful:
+        // case2: The connection was established again:
+        if ( (os.pdsk > Outdated && ns.pdsk <= Outdated) || // case1
+             (os.conn < Connected && ns.conn >= Connected) ) {
+            tl_clear(mdev);
+            spin_lock_irq(&mdev->req_lock);
+            _drbd_set_state(_NS(mdev,susp,0),
+                            ChgStateVerbose | ScheduleAfter );
+            spin_unlock_irq(&mdev->req_lock);
+        }
+    }
+    // Do not change the order of the if above and below...
+    if (os.conn != WFBitMapS && ns.conn == WFBitMapS) {
+        /* compare with drbd_make_request_common,
+         * wait_event and inc_ap_bio.
+         * Note: we may lose connection whilst waiting here.
+         * no worries though, should work out ok... */
+        wait_event(mdev->misc_wait,
+                   mdev->state.conn != WFBitMapS ||
+                   !atomic_read(&mdev->ap_bio_cnt));
+        drbd_bm_lock(mdev);   // {
+        drbd_send_bitmap(mdev);
+        drbd_bm_unlock(mdev); // }
+    }
+
+    /* Lost contact to peer's copy of the data */
+    if ( (os.pdsk>=Inconsistent && os.pdsk!=DUnknown && os.pdsk!=Outdated) &&
+         (ns.pdsk<Inconsistent || ns.pdsk==DUnknown || ns.pdsk==Outdated) ) {
+        if ( mdev->p_uuid ) {
+            kfree(mdev->p_uuid);
+            mdev->p_uuid = NULL;
+        }
+        if (inc_local(mdev)) {
+            if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) {
+                /* Only do it if we have not yet done it... */
+                drbd_uuid_new_current(mdev);
+            }
+            if (ns.peer == Primary ) {
+                /* Note: The condition ns.peer == Primary implies
+                   that we are connected. Otherwise it would
+                   be ns.peer == Unknown. */
+                /* Our peer lost its disk.
+                   No rotation into BitMap-UUID! A FullSync is
+                   required after a primary detached from its disk! */
+                u64 uuid;
+                INFO("Creating new current UUID [no BitMap]\n");
+                get_random_bytes(&uuid, sizeof(u64));
+                drbd_uuid_set(mdev, Current, uuid);
+            }
+            dec_local(mdev);
+        }
+    }
+
+    if( ns.pdsk < Inconsistent ) {
+        /* Diskless peer becomes primary */
+        if (os.peer == Secondary && ns.peer == Primary ) {
+            drbd_uuid_new_current(mdev);
+        }
+        /* Diskless peer becomes secondary */
+        if (os.peer == Primary && ns.peer == Secondary ) {
+            drbd_al_to_on_disk_bm(mdev);
+        }
+    }
+
+    /* Last part of the attaching process ... */
+    if ( os.disk == Attaching && ns.disk == Negotiating ) {
+        drbd_send_sizes(mdev);  // to start sync...
+        drbd_send_uuids(mdev);
+        drbd_send_state(mdev);
+    }
+
+    /* We want to pause/continue resync, tell peer. */
+    if ( ( os.aftr_isp != ns.aftr_isp ) ||
+         ( os.user_isp != ns.user_isp ) ) {
+        drbd_send_state(mdev);
+    }
+
+    /* In case one of the isp bits got set, suspend other devices. */
+    if ( ( !os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+         ( ns.aftr_isp || ns.peer_isp || ns.user_isp) ) {
+        suspend_other_sg(mdev);
+    }
+
+    /* We are in the process of starting a full sync... */
+    if ( ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) ||
+         ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) ) {
+
+        drbd_bm_lock(mdev); // racy...
+
+        drbd_md_set_flag(mdev,MDF_FullSync);
+        drbd_md_sync(mdev);
+
+        drbd_bm_set_all(mdev);
+        drbd_bm_write(mdev);
+
+        drbd_md_clear_flag(mdev,MDF_FullSync);
+        drbd_md_sync(mdev);
+
+        drbd_bm_unlock(mdev);
+
+        if (ns.conn == StartingSyncT) {
+            spin_lock_irq(&mdev->req_lock);
+            _drbd_set_state(_NS(mdev,conn,WFSyncUUID),
+                            ChgStateVerbose | ScheduleAfter );
+            spin_unlock_irq(&mdev->req_lock);
+        } else /* StartingSyncS */ {
+            drbd_start_resync(mdev,SyncSource);
+        }
+    }
+
+    /* We are invalidating ourselves... */
+    if ( os.conn < Connected && ns.conn < Connected &&
+         os.disk > Inconsistent && ns.disk == Inconsistent ) {
+        drbd_bm_lock(mdev); // racy... 
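+        /* Same crash-safe sequence as in the StartingSync* branch
+         * above: persist MDF_FullSync first, then set and write out
+         * all bitmap bits, and clear the flag only after the bitmap
+         * has reached stable storage. If we crash in between, the
+         * flag alone still forces a full sync on the next attach
+         * (see _drbd_send_bitmap). */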
+ + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_unlock(mdev); + } + + if ( os.disk > Diskless && ns.disk == Diskless ) { + /* since inc_local() only works as long as disk>=Inconsistent, + and it is Diskless here, local_cnt can only go down, it can + not increase... It will reach zero */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + drbd_free_bc(mdev->bc); mdev->bc = NULL; + lc_free(mdev->resync); mdev->resync = NULL; + lc_free(mdev->act_log); mdev->act_log = NULL; + } + + // A resync finished or aborted, wake paused devices... + if ( (os.conn > Connected && ns.conn <= Connected) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp) ) { + resume_next_sg(mdev); + } + + // Receiver should clean up itself + if ( os.conn != Disconnecting && ns.conn == Disconnecting ) { + drbd_thread_signal(&mdev->receiver); + } + + // Now the receiver finished cleaning up itself, it should die now + if ( os.conn != StandAlone && ns.conn == StandAlone ) { + drbd_thread_stop_nowait(&mdev->receiver); + } + + // Upon network failure, we need to restart the receiver. + if ( os.conn > TearDown && + ns.conn <= TearDown && ns.conn >= Timeout) { + drbd_thread_restart_nowait(&mdev->receiver); + } + + if ( os.conn == StandAlone && ns.conn == Unconnected) { + drbd_thread_start(&mdev->receiver); + } + + if ( os.disk == Diskless && os.conn <= Disconnecting && + (ns.disk > Diskless || ns.conn >= Unconnected) ) { + if(!drbd_thread_start(&mdev->worker)) { + module_put(THIS_MODULE); + } + } + + /* FIXME what about Primary, Diskless, and then losing + * the connection? since we survive that "somehow", + * maybe we may not stop the worker yet, + * since that would call drbd_mdev_cleanup. + * after which we probably won't survive the next + * request from the upper layers ... BOOM again :( */ + if ( (os.disk > Diskless || os.conn > StandAlone) && + ns.disk == Diskless && ns.conn == StandAlone ) { + drbd_thread_stop_nowait(&mdev->worker); + } +} + + +STATIC int drbd_thread_setup(void* arg) +{ + struct Drbd_thread *thi = (struct Drbd_thread *) arg; + drbd_dev *mdev = thi->mdev; + int retval; + + daemonize("drbd_thread"); + D_ASSERT(get_t_state(thi) == Running); + D_ASSERT(thi->task == NULL); + spin_lock(&thi->t_lock); + thi->task = current; + smp_mb(); + spin_unlock(&thi->t_lock); + complete(&thi->startstop); // notify: thi->task is set. + + while(1) { + retval = thi->function(thi); + if(get_t_state(thi) != Restarting) break; + thi->t_state = Running; + } + + spin_lock(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + smp_mb(); + spin_unlock(&thi->t_lock); + + // THINK maybe two different completions? + complete(&thi->startstop); // notify: thi->task unset. + + return retval; +} + +STATIC void drbd_thread_init(drbd_dev *mdev, struct Drbd_thread *thi, + int (*func) (struct Drbd_thread *)) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + thi->function = func; + thi->mdev = mdev; +} + +int drbd_thread_start(struct Drbd_thread *thi) +{ + int pid; + drbd_dev *mdev = thi->mdev; + + spin_lock(&thi->t_lock); + + /* INFO("drbd_thread_start: %s [%d]: %s %d -> Running\n", + current->comm, current->pid, + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? 
"worker" : "NONSENSE", + thi->t_state); */ + + if (thi->t_state == None) { + init_completion(&thi->startstop); + D_ASSERT(thi->task == NULL); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + flush_signals(current); // otherw. may get -ERESTARTNOINTR + pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS); + if (pid < 0) { + ERR("Couldn't start thread (%d)\n", pid); + return FALSE; + } + wait_for_completion(&thi->startstop); // waits until thi->task is set + D_ASSERT(thi->task); + D_ASSERT(get_t_state(thi) == Running); + } else { + spin_unlock(&thi->t_lock); + } + + return TRUE; +} + + +void _drbd_thread_stop(struct Drbd_thread *thi, int restart,int wait) +{ + drbd_dev *mdev = thi->mdev; + Drbd_thread_state ns = restart ? Restarting : Exiting; + + spin_lock(&thi->t_lock); + + /* INFO("drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n", + current->comm, current->pid, + thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */ + + if (thi->t_state == None) { + spin_unlock(&thi->t_lock); + if(restart) drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock(&thi->t_lock); + return; + } + + thi->t_state = ns; + smp_mb(); + if (thi->task != current) { + if(wait) init_completion(&thi->startstop); + force_sig(DRBD_SIGKILL,thi->task); + } else D_ASSERT(!wait); + } + spin_unlock(&thi->t_lock); + + if (wait) { + D_ASSERT(thi->task != current); + wait_for_completion(&thi->startstop); + spin_lock(&thi->t_lock); + D_ASSERT(thi->task == NULL); + D_ASSERT(thi->t_state == None); + spin_unlock(&thi->t_lock); + } +} + +void drbd_thread_signal(struct Drbd_thread *thi) +{ + spin_lock(&thi->t_lock); + + if (thi->t_state == None) { + spin_unlock(&thi->t_lock); + return; + } + + if (thi->task != current) { + force_sig(DRBD_SIGKILL,thi->task); + } + + spin_unlock(&thi->t_lock); +} + +/* the appropriate socket mutex must be held already */ +int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags) +{ + int sent,ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(Drbd_Header)); + + dump_packet(mdev,sock,0,(void*)h, __FILE__, __LINE__); + sent = drbd_send(mdev,sock,h,size,msg_flags); + + ok = ( sent == size ); + if(!ok) { + ERR("short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + } + return ok; +} + +/* don't pass the socket. we may only look at it + * when we hold the appropriate socket mutex. + */ +int drbd_send_cmd(drbd_dev *mdev, int use_data_socket, + Drbd_Packet_Cmd cmd, Drbd_Header* h, size_t size) +{ + int ok = 0; + struct socket *sock; + + if (use_data_socket) { + down(&mdev->data.mutex); + sock = mdev->data.socket; + } else { + down(&mdev->meta.mutex); + sock = mdev->meta.socket; + } + + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... 
*/ + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); + } + + if (use_data_socket) { + up(&mdev->data.mutex); + } else + up(&mdev->meta.mutex); + return ok; +} + +int drbd_send_cmd2(drbd_dev *mdev, Drbd_Packet_Cmd cmd, char* data, + size_t size) +{ + Drbd_Header h; + int ok; + + h.magic = BE_DRBD_MAGIC; + h.command = cpu_to_be16(cmd); + h.length = cpu_to_be16(size); + + if (!drbd_get_data_sock(mdev)) + return 0; + + dump_packet(mdev,mdev->data.socket,0,(void*)&h, __FILE__, __LINE__); + + ok = ( sizeof(h) == drbd_send(mdev,mdev->data.socket,&h,sizeof(h),0) ); + ok = ok && ( size == drbd_send(mdev,mdev->data.socket,data,size,0) ); + + drbd_put_data_sock(mdev); + + return ok; +} + +int drbd_send_sync_param(drbd_dev *mdev, struct syncer_conf *sc) +{ + Drbd_SyncParam_Packet p; + + p.rate = cpu_to_be32(sc->rate); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,SyncParam,(Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_protocol(drbd_dev *mdev) +{ + Drbd_Protocol_Packet p; + + p.protocol = cpu_to_be32(mdev->net_conf->wire_protocol); + p.after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); + p.after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); + p.after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); + p.want_lose = cpu_to_be32(mdev->net_conf->want_lose); + p.two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportProtocol, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_uuids(drbd_dev *mdev) +{ + Drbd_GenCnt_Packet p; + int i; + u64 uuid_flags = 0; + + if(!inc_local_if_state(mdev,Negotiating)) return 1; // ok. + + for (i = Current; i < UUID_SIZE; i++) { + /* FIXME howto handle diskless ? */ + p.uuid[i] = mdev->bc + ? cpu_to_be64(mdev->bc->md.uuid[i]) + : 0; + } + + mdev->comm_bm_set = drbd_bm_total_weight(mdev); + p.uuid[UUID_SIZE] = cpu_to_be64(mdev->comm_bm_set); + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; + uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 
2 : 0; + p.uuid[UUID_FLAGS] = cpu_to_be64(uuid_flags); + + dec_local(mdev); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportUUIDs, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sync_uuid(drbd_dev *mdev, u64 val) +{ + Drbd_SyncUUID_Packet p; + + p.uuid = cpu_to_be64(val); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportSyncUUID, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sizes(drbd_dev *mdev) +{ + Drbd_Sizes_Packet p; + sector_t d_size, u_size; + int q_order_type; + int ok; + + if(inc_local_if_state(mdev,Negotiating)) { + D_ASSERT(mdev->bc->backing_bdev); + d_size = drbd_get_max_capacity(mdev->bc); + u_size = mdev->bc->dc.disk_size; + q_order_type = drbd_queue_order_type(mdev); + p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); + dec_local(mdev); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + } + + p.d_size = cpu_to_be64(d_size); + p.u_size = cpu_to_be64(u_size); + p.c_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + p.max_segment_size = cpu_to_be32(mdev->rq_queue->max_segment_size); + p.queue_order_type = cpu_to_be32(q_order_type); + + ok = drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportSizes, + (Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_state(drbd_dev *mdev) +{ + Drbd_State_Packet p; + + p.state = cpu_to_be32(mdev->state.i); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportState, + (Drbd_Header*)&p,sizeof(p)); +} + +STATIC int drbd_send_state_req(drbd_dev *mdev, drbd_state_t mask, drbd_state_t val) +{ + Drbd_Req_State_Packet p; + + p.mask = cpu_to_be32(mask.i); + p.val = cpu_to_be32(val.i); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,StateChgRequest, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sr_reply(drbd_dev *mdev, int retcode) +{ + Drbd_RqS_Reply_Packet p; + + p.retcode = cpu_to_be32(retcode); + + return drbd_send_cmd(mdev,USE_META_SOCKET,StateChgReply, + (Drbd_Header*)&p,sizeof(p)); +} + + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(drbd_dev *mdev) +{ + int want; + int ok=TRUE, bm_i=0; + size_t bm_words, num_words; + unsigned long *buffer; + Drbd_Header *p; + + ERR_IF(!mdev->bitmap) return FALSE; + + bm_words = drbd_bm_words(mdev); + p = vmalloc(PAGE_SIZE); // sleeps. cannot fail. + buffer = (unsigned long*)p->payload; + + if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + if (unlikely(mdev->state.disk <= Failed )) { + /* write_bm did fail! Leave full sync flag set in Meta Data + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + ERR("Failed to write bitmap to disk!\n"); + } + else { + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + } + } + + /* + * maybe TODO use some simple compression scheme, nowadays there are + * some such algorithms in the kernel anyways. 
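+ *
+ * The loop below streams the bitmap in chunks of at most
+ * BM_PACKET_WORDS longs per ReportBitMap packet; the final,
+ * payload-less packet (want == 0) tells the peer that the
+ * transfer is complete.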
+ */ + do { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + if (want) { + drbd_bm_get_lel(mdev, bm_i, num_words, buffer); + } + ok = _drbd_send_cmd(mdev,mdev->data.socket,ReportBitMap, + p, sizeof(*p) + want, 0); + bm_i += num_words; + } while (ok && want); + + vfree(p); + return ok; +} + +int drbd_send_bitmap(drbd_dev *mdev) +{ + int ok; + + if (!drbd_get_data_sock(mdev)) + return 0; + ok=_drbd_send_bitmap(mdev); + drbd_put_data_sock(mdev); + return ok; +} + +int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size) +{ + int ok; + Drbd_BarrierAck_Packet p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + ok = drbd_send_cmd(mdev,USE_META_SOCKET,BarrierAck,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/** + * _drbd_send_ack: + * This helper function expects the sector and block_id parameter already + * in big endian! + */ +STATIC int _drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + u64 sector, + u32 blksize, + u64 block_id) +{ + int ok; + Drbd_BlockAck_Packet p; + + p.sector = sector; + p.block_id = block_id; + p.blksize = blksize; + p.seq_num = cpu_to_be32(atomic_add_return(1,&mdev->packet_seq)); + + if (!mdev->meta.socket || mdev->state.conn < Connected) return FALSE; + ok=drbd_send_cmd(mdev,USE_META_SOCKET,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_ack_dp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_Data_Packet *dp) +{ + const int header_size = sizeof(Drbd_Data_Packet) - sizeof(Drbd_Header); + int data_size = ((Drbd_Header*)dp)->length - header_size; + + return _drbd_send_ack(mdev,cmd,dp->sector,cpu_to_be32(data_size), + dp->block_id); +} + +int drbd_send_ack_rp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_BlockRequest_Packet *rp) +{ + return _drbd_send_ack(mdev,cmd,rp->sector,rp->blksize,rp->block_id); +} + +int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, struct Tl_epoch_entry *e) +{ + return _drbd_send_ack(mdev,cmd, + cpu_to_be64(e->sector), + cpu_to_be32(e->size), + e->block_id); +} + +int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id) +{ + int ok; + Drbd_BlockRequest_Packet p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + /* FIXME BIO_RW_SYNC ? */ + + ok = drbd_send_cmd(mdev,USE_DATA_SOCKET,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +STATIC int we_should_drop_the_connection(drbd_dev *mdev, struct socket *sock) +{ + int drop_it; + // long elapsed = (long)(jiffies - mdev->last_received); + // DUMPLU(elapsed); // elapsed ignored for now. + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || (volatile int)mdev->state.conn < Connected; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if ( !drop_it ) { + ERR("[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == Primary) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + to the page into the skb, and to hand it over to the NIC. In + this process get_page() gets called. + + As soon as the page was really sent over the network put_page() + gets called by some part of the network layer. [ NIC driver? ] + + [ get_page() / put_page() increment/decrement the count. If count + reaches 0 the page will be freed. 
] + + This works nicely with pages from FSs. + But this means that in protocol A we might signal IO completion too early ! + + In order not to corrupt data during a resync we must make sure + that we do not reuse our own buffer pages (EEs) to early, therefore + we have the net_ee list. + + XFS seems to have problems, still, it submits pages with page_count == 0! + As a workaround, we disable sendpage on pages with page_count == 0 or PageSlab. +*/ +int _drbd_no_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int ret; + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent,ok; + int len = size; + +#ifdef SHOW_SENDPAGE_USAGE + unsigned long now = jiffies; + static unsigned long total = 0; + static unsigned long fallback = 0; + static unsigned long last_rep = 0; + + /* report statistics every hour, + * if we had at least one fallback. + */ + ++total; + if (fallback && time_before(last_rep+3600*HZ, now)) { + last_rep = now; + printk(KERN_INFO DEVICE_NAME + ": sendpage() omitted: %lu/%lu\n", fallback, total); + } +#endif + + /* PARANOIA. if this ever triggers, + * something in the layers above us is really kaputt. + *one roundtrip later: + * doh. it triggered. so XFS _IS_ really kaputt ... + * oh well... + */ + if ( (page_count(page) < 1) || PageSlab(page) ) { + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set... + */ +#ifdef SHOW_SENDPAGE_USAGE + ++fallback; +#endif + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + goto out; + } + + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket,page, + offset,len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + WARN("%s: size=%d len=%d sent=%d\n", + __func__,(int)size,len,sent); + break; + } + len -= sent; + offset += sent; + // FIXME test "last_received" ... + } while(len > 0 /* THINK && mdev->cstate >= Connected*/); + set_fs(oldfs); + + out: + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} + +STATIC int _drbd_send_zc_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (! 
_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, + bvec->bv_len) ) { + return 0; + } + } + + return 1; +} + +/* Used to send write requests + * Primary -> Peer (Data) + */ +int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req) +{ + int ok=1; + Drbd_Data_Packet p; + unsigned int dp_flags=0; + + if (!drbd_get_data_sock(mdev)) + return 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(Data); + p.head.length = cpu_to_be16(sizeof(p)-sizeof(Drbd_Header)+req->size); + + p.sector = cpu_to_be64(req->sector); + p.block_id = (unsigned long)req; + p.seq_num = cpu_to_be32( req->seq_num = + atomic_add_return(1,&mdev->packet_seq) ); + dp_flags = 0; + if(req->master_bio->bi_rw & BIO_RW_BARRIER) { + dp_flags |= DP_HARDBARRIER; + } + if(req->master_bio->bi_rw & BIO_RW_SYNC) { + dp_flags |= DP_RW_SYNC; + } + if(mdev->state.conn >= SyncSource && + mdev->state.conn <= PausedSyncT) { + dp_flags |= DP_MAY_SET_IN_SYNC; + } + + p.dp_flags = cpu_to_be32(dp_flags); + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + set_bit(UNPLUG_REMOTE,&mdev->flags); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if(ok) { + if(mdev->net_conf->wire_protocol == DRBD_PROT_A) { + ok = _drbd_send_bio(mdev,req->master_bio); + } else { + ok = _drbd_send_zc_bio(mdev,req->master_bio); + } + } + + drbd_put_data_sock(mdev); + return ok; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) Primary (DataReply) + * SyncSource -> SyncTarget (RSDataReply) + */ +int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e) +{ + int ok; + Drbd_Data_Packet p; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + e->size); + + p.sector = cpu_to_be64(e->sector); + p.block_id = e->block_id; + /* p.seq_num = 0; No sequence numbers here.. */ + + /* Only called by our kernel thread. + * This one may be interupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to ioctl or module unload. + */ + if (!drbd_get_data_sock(mdev)) + return 0; + + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if (ok) ok = _drbd_send_zc_bio(mdev,e->private_bio); + + drbd_put_data_sock(mdev); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags) +{ +#if !HAVE_KERNEL_SENDMSG + mm_segment_t oldfs; + struct iovec iov; +#else + struct kvec iov; +#endif + struct msghdr msg; + int rv,sent=0; + + if (!sock) return -1000; + + // THINK if (signal_pending) return ... ? 
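+    /* The loop below keeps sending until the whole buffer is out:
+     * -EINTR means flush signals and retry, -EAGAIN consults
+     * we_should_drop_the_connection(), any other error aborts.
+     * Callers detect a short transfer by comparing the return
+     * value against size. */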
+ + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = 0; + msg.msg_namelen = 0; +#if !HAVE_KERNEL_SENDMSG + msg.msg_iov = &iov; + msg.msg_iovlen = 1; +#endif + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + +#if !HAVE_KERNEL_SENDMSG + oldfs = get_fs(); + set_fs(KERNEL_DS); +#endif + + if (sock == mdev->data.socket) + mdev->ko_count = mdev->net_conf->ko_count; + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ +#if !HAVE_KERNEL_SENDMSG + rv = sock_sendmsg(sock, &msg, iov.iov_len ); +#else + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); +#endif + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev,sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR ) { +#if 0 + /* FIXME this happens all the time. + * we don't care for now! + * eventually this should be sorted out be the proper + * use of the SIGNAL_ASENDER bit... */ + if (DRBD_ratelimit(5*HZ,5)) { + DBG("Got a signal in drbd_send(,%c,)!\n", + sock == mdev->meta.socket ? 'm' : 's'); + // dump_stack(); + } +#endif + flush_signals(current); + rv = 0; + } + if (rv < 0) break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while(sent < size); + +#if !HAVE_KERNEL_SENDMSG + set_fs(oldfs); +#endif + + if (rv <= 0) { + if (rv != -EAGAIN) { + ERR("%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? "msock" : "sock", + rv); + drbd_force_state(mdev, NS(conn,BrokenPipe)); + } else + drbd_force_state(mdev, NS(conn,Timeout)); + } + + return sent; +} + +STATIC int drbd_open(struct inode *inode, struct file *file) +{ + drbd_dev *mdev; + unsigned long flags; + int rv=0; + + mdev = minor_to_mdev(MINOR(inode->i_rdev)); + if(!mdev) return -ENODEV; + + spin_lock_irqsave(&mdev->req_lock,flags); + /* to have a stable mdev->state.role and no race with updating open_cnt */ + + if (mdev->state.role != Primary) { + if (file->f_mode & FMODE_WRITE) { + rv = -EROFS; + } else if (!allow_oos) { + rv = -EMEDIUMTYPE; + } + } + + if(!rv) mdev->open_cnt++; + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return rv; +} + +STATIC int drbd_close(struct inode *inode, struct file *file) +{ + /* do not use *file (May be NULL, in case of a unmount :-) */ + drbd_dev *mdev; + + mdev = minor_to_mdev(MINOR(inode->i_rdev)); + if(!mdev) return -ENODEV; + + /* + printk(KERN_ERR DEVICE_NAME ": close(inode=%p,file=%p)" + "current=%p,minor=%d,wc=%d\n", inode, file, current, minor, + inode->i_writecount); + */ + + mdev->open_cnt--; + + return 0; +} + +STATIC void drbd_unplug_fn(request_queue_t *q) +{ + drbd_dev *mdev = q->queuedata; + + MTRACE(TraceTypeUnplug,TraceLvlSummary, + INFO("got unplugged ap_bio_count=%d\n", + atomic_read(&mdev->ap_bio_cnt)); + ); + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.pdsk >= Inconsistent && mdev->state.conn >= Connected) { + D_ASSERT(mdev->state.role == Primary); + if (test_and_clear_bit(UNPLUG_REMOTE,&mdev->flags)) { + /* add to the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... 
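+     * (for now, list_empty() on the work's list head is what tells
+     * us whether it is already queued)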
*/ + if (list_empty(&mdev->unplug_work.list)) + drbd_queue_work(&mdev->data.work,&mdev->unplug_work); + } + } + spin_unlock_irq(&mdev->req_lock); + + if(mdev->state.disk >= Inconsistent) drbd_kick_lo(mdev); +} + +void drbd_set_defaults(drbd_dev *mdev) +{ + mdev->sync_conf.after = DRBD_AFTER_DEF; + mdev->sync_conf.rate = DRBD_RATE_DEF; + mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; // 512 MB active set + mdev->state = (drbd_state_t){ { Secondary, + Unknown, + StandAlone, + Diskless, + DUnknown, + 0 } }; +} + +void drbd_init_set_defaults(drbd_dev *mdev) +{ + // the memset(,0,) did most of this + // note: only assignments, no allocation in here + +#ifdef PARANOIA + SET_MDEV_MAGIC(mdev); +#endif + + drbd_set_defaults(mdev); + + atomic_set(&mdev->ap_bio_cnt,0); + atomic_set(&mdev->ap_pending_cnt,0); + atomic_set(&mdev->rs_pending_cnt,0); + atomic_set(&mdev->unacked_cnt,0); + atomic_set(&mdev->local_cnt,0); + atomic_set(&mdev->net_cnt,0); + atomic_set(&mdev->packet_seq,0); + atomic_set(&mdev->pp_in_use, 0); + + init_MUTEX(&mdev->md_io_mutex); + init_MUTEX(&mdev->data.mutex); + init_MUTEX(&mdev->meta.mutex); + sema_init(&mdev->data.work.s,0); + sema_init(&mdev->meta.work.s,0); + + spin_lock_init(&mdev->data.work.q_lock); + spin_lock_init(&mdev->meta.work.q_lock); + + spin_lock_init(&mdev->al_lock); + spin_lock_init(&mdev->req_lock); + spin_lock_init(&mdev->peer_seq_lock); + + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->md_sync_work.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->unplug_work.cb = w_send_write_hint; + mdev->md_sync_work.cb = w_md_sync; + init_timer(&mdev->resync_timer); + init_timer(&mdev->md_sync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + mdev->md_sync_timer.function = md_sync_timer_fn; + mdev->md_sync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->misc_wait); + init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + init_waitqueue_head(&mdev->seq_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + +#ifdef __arch_um__ + INFO("mdev = 0x%p\n",mdev); +#endif +} + +void drbd_mdev_cleanup(drbd_dev *mdev) +{ + /* I'd like to cleanup completely, and memset(,0,) it. + * but I'd have to reinit it. + * FIXME: do the right thing... + */ + + /* list of things that may still + * hold data of the previous config + + * act_log ** re-initialized in set_disk + * on_io_error + + * al_tr_cycle ** re-initialized in ... FIXME?? + * al_tr_number + * al_tr_pos + + * backing_bdev ** re-initialized in drbd_free_ll_dev + * lo_file + * md_bdev + * md_file + * md_index + + * ko_count ** re-initialized in set_net + + * last_received ** currently ignored + + * mbds_id ** re-initialized in ... FIXME?? + + * resync ** re-initialized in ... FIXME?? + + *** no re-init necessary (?) *** + * md_io_page + * this_bdev + + * vdisk ? + + * rq_queue ** FIXME ASSERT ?? 
+ * newest_barrier
+ * oldest_barrier
+ */
+
+    drbd_thread_stop(&mdev->receiver);
+
+    /* no need to lock it, I'm the only thread alive */
+    if ( mdev->epoch_size != 0)
+        ERR("epoch_size:%d\n",mdev->epoch_size);
+    mdev->al_writ_cnt  =
+    mdev->bm_writ_cnt  =
+    mdev->read_cnt     =
+    mdev->recv_cnt     =
+    mdev->send_cnt     =
+    mdev->writ_cnt     =
+    mdev->p_size       =
+    mdev->rs_start     =
+    mdev->rs_total     =
+    mdev->rs_failed    =
+    mdev->rs_mark_left =
+    mdev->rs_mark_time = 0;
+    D_ASSERT(mdev->net_conf == NULL);
+    drbd_set_my_capacity(mdev,0);
+    drbd_bm_resize(mdev,0);
+
+    // just in case
+    drbd_free_resources(mdev);
+
+    /*
+     * currently we drbd_init_ee only on module load, so
+     * we may do drbd_release_ee only on module unload!
+     */
+    D_ASSERT(list_empty(&mdev->active_ee));
+    D_ASSERT(list_empty(&mdev->sync_ee));
+    D_ASSERT(list_empty(&mdev->done_ee));
+    D_ASSERT(list_empty(&mdev->read_ee));
+    D_ASSERT(list_empty(&mdev->net_ee));
+    D_ASSERT(list_empty(&mdev->resync_reads));
+    D_ASSERT(list_empty(&mdev->data.work.q));
+    D_ASSERT(list_empty(&mdev->meta.work.q));
+    D_ASSERT(list_empty(&mdev->resync_work.list));
+    D_ASSERT(list_empty(&mdev->unplug_work.list));
+
+}
+
+
+void drbd_destroy_mempools(void)
+{
+    struct page *page;
+
+    while(drbd_pp_pool) {
+        page = drbd_pp_pool;
+        drbd_pp_pool = (struct page*)page_private(page);
+        __free_page(page);
+        drbd_pp_vacant--;
+    }
+
+    /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+
+    if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool);
+    if (drbd_request_mempool) mempool_destroy(drbd_request_mempool);
+    if (drbd_ee_cache) kmem_cache_destroy(drbd_ee_cache);
+    if (drbd_request_cache) kmem_cache_destroy(drbd_request_cache);
+
+    drbd_ee_mempool = NULL;
+    drbd_request_mempool = NULL;
+    drbd_ee_cache = NULL;
+    drbd_request_cache = NULL;
+
+    return;
+}
+
+int drbd_create_mempools(void)
+{
+    struct page *page;
+    const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+    int i;
+
+    // prepare our caches and mempools
+    drbd_request_mempool = NULL;
+    drbd_ee_cache = NULL;
+    drbd_request_cache = NULL;
+    drbd_pp_pool = NULL;
+
+    // caches
+    drbd_request_cache = kmem_cache_create(
+        "drbd_req_cache", sizeof(drbd_request_t),
+        0, 0, NULL, NULL);
+    if (drbd_request_cache == NULL)
+        goto Enomem;
+
+    drbd_ee_cache = kmem_cache_create(
+        "drbd_ee_cache", sizeof(struct Tl_epoch_entry),
+        0, 0, NULL, NULL);
+    if (drbd_ee_cache == NULL)
+        goto Enomem;
+
+    // mempools
+    drbd_request_mempool = mempool_create( number,
+        mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+    if (drbd_request_mempool == NULL)
+        goto Enomem;
+
+    drbd_ee_mempool = mempool_create( number,
+        mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+    if (drbd_ee_mempool == NULL)
+        goto Enomem;
+
+    // drbd's page pool
+    spin_lock_init(&drbd_pp_lock);
+
+    for (i=0;i< number;i++) {
+        page = alloc_page(GFP_HIGHUSER);
+        if(!page) goto Enomem;
+        set_page_private(page,(unsigned long)drbd_pp_pool);
+        drbd_pp_pool = page;
+    }
+    drbd_pp_vacant = number;
+
+    return 0;
+
+  Enomem:
+    drbd_destroy_mempools(); // in case we allocated some
+    return -ENOMEM;
+}
+
+STATIC int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+                           void *unused)
+{
+    /* just so we have it. you never know what interesting things we
+     * might want to do here some day... 
+ */ + + return NOTIFY_DONE; +} + +STATIC struct notifier_block drbd_notifier = { + .notifier_call = drbd_notify_sys, +}; + + +STATIC void __exit drbd_cleanup(void) +{ + int i, rr; + + unregister_reboot_notifier(&drbd_notifier); + + drbd_nl_cleanup(); + + if (minor_table) { + if (drbd_proc) + remove_proc_entry("drbd",&proc_root); + i=minor_count; + while (i--) { + drbd_dev *mdev = minor_to_mdev(i); + struct gendisk **disk = &mdev->vdisk; + request_queue_t **q = &mdev->rq_queue; + + if(!mdev) continue; + drbd_free_resources(mdev); + + if (*disk) { + del_gendisk(*disk); + put_disk(*disk); + *disk = NULL; + } + if (*q) blk_put_queue(*q); + *q = NULL; + + D_ASSERT(mdev->open_cnt == 0); + if (mdev->this_bdev) bdput(mdev->this_bdev); + + tl_cleanup(mdev); + if (mdev->bitmap) drbd_bm_cleanup(mdev); + if (mdev->resync) lc_free(mdev->resync); + + rr = drbd_release_ee(mdev,&mdev->active_ee); + if(rr) ERR("%d EEs in active list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->sync_ee); + if(rr) ERR("%d EEs in sync list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->read_ee); + if(rr) ERR("%d EEs in read list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->done_ee); + if(rr) ERR("%d EEs in done list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->net_ee); + if(rr) ERR("%d EEs in net list found!\n",rr); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp,&mdev->data.work.q) { + DUMPP(lp); + } + }; + + if (mdev->md_io_page) + __free_page(mdev->md_io_page); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + + if (mdev->act_log) lc_free(mdev->act_log); + + if(mdev->ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash_s = 0; + mdev->ee_hash = NULL; + } + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = 0; + mdev->tl_hash = NULL; + } + if(mdev->app_reads_hash) { + kfree(mdev->app_reads_hash); + mdev->app_reads_hash = NULL; + } + if ( mdev->p_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } + } + drbd_destroy_mempools(); + } + + kfree(minor_table); + + if (unregister_blkdev(MAJOR_NR, DEVICE_NAME) != 0) + printk(KERN_ERR DEVICE_NAME": unregister of device failed\n"); + + printk(KERN_INFO DEVICE_NAME": module cleanup done.\n"); +} + +drbd_dev *drbd_new_device(int minor) +{ + drbd_dev *mdev = NULL; + struct gendisk *disk; + request_queue_t *q; + + mdev = kzalloc(sizeof(drbd_dev),GFP_KERNEL); + if(!mdev) goto Enomem; + + mdev->minor = minor; + + drbd_init_set_defaults(mdev); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) goto Enomem; + mdev->rq_queue = q; + q->queuedata = mdev; + q->max_segment_size = DRBD_MAX_SEGMENT_SIZE; + + disk = alloc_disk(1); + if (!disk) goto Enomem; + mdev->vdisk = disk; + + set_disk_ro( disk, TRUE ); + + disk->queue = q; + disk->major = MAJOR_NR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, DEVICE_NAME "%d", minor); + disk->private_data = mdev; + add_disk(disk); + + mdev->this_bdev = bdget(MKDEV(MAJOR_NR,minor)); + // we have no partitions. we contain only ourselves. + mdev->this_bdev->bd_contains = mdev->this_bdev; + + blk_queue_make_request(q, drbd_make_request_26); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &mdev->req_lock; // needed since we use + // plugging on a queue, that actually has no requests! + q->unplug_fn = drbd_unplug_fn; + + mdev->md_io_page = alloc_page(GFP_KERNEL); + if(!mdev->md_io_page) goto Enomem; + + if (drbd_bm_init(mdev)) goto Enomem; + // no need to lock access, we are still initializing the module. 
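+    // tl_init() allocates the transfer log with its initial barrier
+    // element (oldest_barrier == newest_barrier until the first real
+    // barrier is queued).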
+ if (!tl_init(mdev)) goto Enomem; + + mdev->app_reads_hash=kzalloc(APP_R_HSIZE*sizeof(void*),GFP_KERNEL); + if (!mdev->app_reads_hash) goto Enomem; + + return mdev; + + Enomem: + if(mdev) { + if(mdev->app_reads_hash) kfree(mdev->app_reads_hash); + if(mdev->md_io_page) __free_page(mdev->md_io_page); + kfree(mdev); + } + return NULL; +} + +int __init drbd_init(void) +{ + int err; + +#if 0 +// warning LGE "DEBUGGING" +/* I am too lazy to calculate this by hand -lge + */ +#define SZO(x) printk(KERN_ERR "sizeof(" #x ") = %d\n", sizeof(x)) + SZO(struct Drbd_Conf); + SZO(struct buffer_head); + SZO(Drbd_Polymorph_Packet); + SZO(struct drbd_socket); + SZO(struct bm_extent); + SZO(struct lc_element); + SZO(struct semaphore); + SZO(struct drbd_request); + SZO(struct bio); + SZO(wait_queue_head_t); + SZO(spinlock_t); + SZO(Drbd_Header); + SZO(Drbd_HandShake_Packet); + SZO(Drbd_Barrier_Packet); + SZO(Drbd_BarrierAck_Packet); + SZO(Drbd_SyncParam_Packet); + SZO(Drbd06_Parameter_P); + SZO(Drbd_Data_Packet); + SZO(Drbd_BlockAck_Packet); + printk(KERN_ERR "AL_EXTENTS_PT = %d\n",AL_EXTENTS_PT); + printk(KERN_ERR "DRBD_MAX_SECTORS = %llu\n",DRBD_MAX_SECTORS); + printk(KERN_ERR "DRBD_MAX_SECTORS_FLEX = %llu\n",DRBD_MAX_SECTORS_FLEX); +#define OOF(t,m) printk(KERN_ERR "offsetof("#t","#m") = %d\n", offsetof(t,m)) + OOF(struct Drbd_Conf,bitmap); + //OOF(struct drbd_bitmap,bm_set); + return -EBUSY; +#endif +#ifdef __arch_um__ + printk(KERN_INFO "drbd_module = 0x%p core = 0x%p\n", + THIS_MODULE,THIS_MODULE->module_core); +#endif + + if (sizeof(Drbd_HandShake_Packet) != 80) { + printk(KERN_ERR DEVICE_NAME + ": never change the size or layout of the HandShake packet.\n"); + return -EINVAL; + } + + if (1 > minor_count||minor_count > 255) { + printk(KERN_ERR DEVICE_NAME + ": invalid minor_count (%d)\n",minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + if( (err = drbd_nl_init()) ) { + return err; + } + + err = register_blkdev(MAJOR_NR, DEVICE_NAME); + if (err) { + printk(KERN_ERR DEVICE_NAME + ": unable to register block device major %d\n", + MAJOR_NR); + return err; + } + + register_reboot_notifier(&drbd_notifier); + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; // play safe for drbd_cleanup + minor_table = kzalloc(sizeof(drbd_dev *)*minor_count,GFP_KERNEL); + if(!minor_table) goto Enomem; + + if ((err = drbd_create_mempools())) + goto Enomem; + +#if CONFIG_PROC_FS + /* + * register with procfs + */ + drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , &proc_root); + + if (!drbd_proc) { + printk(KERN_ERR DEVICE_NAME": unable to register proc file\n"); + goto Enomem; + } + + drbd_proc->proc_fops = &drbd_proc_fops; + drbd_proc->owner = THIS_MODULE; +#else +# error "Currently drbd depends on the proc file system (CONFIG_PROC_FS)" +#endif + + printk(KERN_INFO DEVICE_NAME ": initialised. " + "Version: " REL_VERSION " (api:%d/proto:%d)\n", + API_VERSION,PRO_VERSION); + printk(KERN_INFO DEVICE_NAME ": %s\n", drbd_buildtag()); + printk(KERN_INFO DEVICE_NAME": registered as block device major %d\n", MAJOR_NR); + printk(KERN_INFO DEVICE_NAME": minor_table @ 0x%p\n", minor_table); + + return 0; // Success! 
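+    // Common error exit: drbd_cleanup() copes with partially
+    // initialised state and tears down whatever the steps above
+    // already set up.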
+ + Enomem: + drbd_cleanup(); + if (err == -ENOMEM) // currently always the case + printk(KERN_ERR DEVICE_NAME ": ran out of memory\n"); + else + printk(KERN_ERR DEVICE_NAME ": initialization failure\n"); + return err; +} + +void drbd_free_bc(struct drbd_backing_dev* bc) +{ + if(bc == NULL) return; + + BD_RELEASE(bc->backing_bdev); + BD_RELEASE(bc->md_bdev); + + fput(bc->lo_file); + fput(bc->md_file); + + kfree(bc); +} + +void drbd_free_sock(drbd_dev *mdev) +{ + if (mdev->data.socket) { + sock_release(mdev->data.socket); + mdev->data.socket = 0; + } + if (mdev->meta.socket) { + sock_release(mdev->meta.socket); + mdev->meta.socket = 0; + } +} + + +void drbd_free_resources(drbd_dev *mdev) +{ + if ( mdev->cram_hmac_tfm ) { + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + } + drbd_free_sock(mdev); + drbd_free_bc(mdev->bc); + mdev->bc=0; +} + +/*********************************/ +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; // last agreed size. + u64 uuid[UUID_SIZE]; // UUIDs. + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; // MDF + u32 magic; + u32 md_size_sect; + u32 al_offset; // offset to this block + u32 al_nr_extents; // important for restoring the AL + // `-- act_log->nr_elements <-- sync_conf.al_extents + u32 bm_offset; // offset to the bitmap, from here + u32 bm_bytes_per_bit; // BM_BLOCK_SIZE + u32 reserved_u32[4]; + +} __attribute((packed)); + +/** + * drbd_md_sync: + * Writes the meta data super block if the MD_DIRTY flag bit is set. + */ +void drbd_md_sync(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + sector_t sector; + int i; + + if (!test_and_clear_bit(MD_DIRTY,&mdev->flags)) return; + del_timer(&mdev->md_sync_timer); + + // We use here Failed and not Attaching because we try to write + // metadata even if we detach due to a disk failure! + if(!inc_local_if_state(mdev,Failed)) return; + + INFO("Writing meta data super block now.\n"); + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer,0,512); + + buffer->la_size=cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + for (i = Current; i < UUID_SIZE; i++) + buffer->uuid[i]=cpu_to_be64(mdev->bc->md.uuid[i]); + buffer->flags = cpu_to_be32(mdev->bc->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size_sect = cpu_to_be32(mdev->bc->md.md_size_sect); + buffer->al_offset = cpu_to_be32(mdev->bc->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(mdev->bc->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(mdev->bc->md.bm_offset); + + D_ASSERT(drbd_md_ss__(mdev,mdev->bc) == mdev->bc->md.md_offset); + sector = mdev->bc->md.md_offset; + +#if 0 + /* FIXME sooner or later I'd like to use the MD_DIRTY flag everywhere, + * so we can avoid unneccessary md writes. + */ + ERR_IF (!test_bit(MD_DIRTY,&mdev->flags)) { + dump_stack(); + } +#endif + + if (drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) { + clear_bit(MD_DIRTY,&mdev->flags); + } else { + /* this was a try anyways ... */ + ERR("meta data update failed!\n"); + + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + } + + // Update mdev->bc->md.la_size_sect, since we updated it on metadata. 
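+    // (la_size_sect caches the "last agreed" size we just wrote;
+    // drbd_md_read() restores it from disk on the next attach.)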
+    mdev->bc->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+
+    up(&mdev->md_io_mutex);
+    dec_local(mdev);
+}
+
+/**
+ * drbd_md_read:
+ * @bdev: describes the backing storage and the meta-data storage
+ * Reads the meta data from bdev. Returns 0 (NoError) on success, and an
+ * enum ret_codes in case something goes wrong.
+ * Currently only: MDIOError, MDInvalid.
+ */
+int drbd_md_read(drbd_dev *mdev, struct drbd_backing_dev *bdev)
+{
+    struct meta_data_on_disk * buffer;
+    int i,rv = NoError;
+
+    if(!inc_local_if_state(mdev,Attaching)) return MDIOError;
+
+    down(&mdev->md_io_mutex);
+    buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+
+    if ( ! drbd_md_sync_page_io(mdev,bdev,bdev->md.md_offset,READ) ) {
+        /* NOTE: can't do normal error processing here as this is
+           called BEFORE disk is attached */
+        ERR("Error while reading metadata.\n");
+        rv = MDIOError;
+        goto err;
+    }
+
+    if(be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
+        ERR("Error while reading metadata, magic not found.\n");
+        rv = MDInvalid;
+        goto err;
+    }
+    if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
+        ERR("unexpected al_offset: %d (expected %d)\n",
+            be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
+        rv = MDInvalid;
+        goto err;
+    }
+    if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+        ERR("unexpected bm_offset: %d (expected %d)\n",
+            be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+        rv = MDInvalid;
+        goto err;
+    }
+    if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+        ERR("unexpected md_size: %u (expected %u)\n",
+            be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+        rv = MDInvalid;
+        goto err;
+    }
+
+    if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+        ERR("unexpected bm_bytes_per_bit: %u (expected %u)\n",
+            be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+        rv = MDInvalid;
+        goto err;
+    }
+
+    bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+    for (i = Current; i < UUID_SIZE; i++)
+        bdev->md.uuid[i]=be64_to_cpu(buffer->uuid[i]);
+    bdev->md.flags = be32_to_cpu(buffer->flags);
+    mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
+    bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+    if (mdev->sync_conf.al_extents < 7)
+        mdev->sync_conf.al_extents = 127;
+    /* FIXME if this ever happens when reading meta data,
+     * it possibly screws up reading of the activity log?
+     */
+
+ err:
+    up(&mdev->md_io_mutex);
+    dec_local(mdev);
+
+    return rv;
+}
+
+/**
+ * drbd_md_mark_dirty:
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync(). 
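+ * If nobody does, md_sync_timer fires and the worker calls
+ * drbd_md_sync() itself as a safety net (see md_sync_timer_fn()
+ * and w_md_sync() below).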
+ */
+void drbd_md_mark_dirty(drbd_dev *mdev)
+{
+    set_bit(MD_DIRTY,&mdev->flags);
+    mod_timer(&mdev->md_sync_timer,jiffies + 5*HZ );
+}
+
+
+STATIC void drbd_uuid_move_history(drbd_dev *mdev)
+{
+    int i;
+
+    for ( i=History_start ; i<History_end ; i++ ) {
+        mdev->bc->md.uuid[i+1] = mdev->bc->md.uuid[i];
+
+        MTRACE(TraceTypeUuid,TraceLvlAll,
+               drbd_print_uuid(mdev,i+1);
+            );
+    }
+}
+
+void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val)
+{
+    if(idx == Current) {
+        if (mdev->state.role == Primary) {
+            val |= 1;
+        } else {
+            val &= ~((u64)1);
+        }
+    }
+
+    mdev->bc->md.uuid[idx] = val;
+
+    MTRACE(TraceTypeUuid,TraceLvlSummary,
+           drbd_print_uuid(mdev,idx);
+        );
+
+    drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val)
+{
+    if(mdev->bc->md.uuid[idx]) {
+        drbd_uuid_move_history(mdev);
+        mdev->bc->md.uuid[History_start]=mdev->bc->md.uuid[idx];
+        MTRACE(TraceTypeUuid,TraceLvlMetrics,
+               drbd_print_uuid(mdev,History_start);
+            );
+    }
+    _drbd_uuid_set(mdev,idx,val);
+}
+
+void drbd_uuid_new_current(drbd_dev *mdev)
+{
+    INFO("Creating new current UUID\n");
+    D_ASSERT(mdev->bc->md.uuid[Bitmap] == 0);
+    mdev->bc->md.uuid[Bitmap] = mdev->bc->md.uuid[Current];
+    MTRACE(TraceTypeUuid,TraceLvlMetrics,
+           drbd_print_uuid(mdev,Bitmap);
+        );
+
+    get_random_bytes(&mdev->bc->md.uuid[Current], sizeof(u64));
+    if (mdev->state.role == Primary) {
+        mdev->bc->md.uuid[Current] |= 1;
+    } else {
+        mdev->bc->md.uuid[Current] &= ~((u64)1);
+    }
+
+    MTRACE(TraceTypeUuid,TraceLvlSummary,
+           drbd_print_uuid(mdev,Current);
+        );
+
+    drbd_md_mark_dirty(mdev);
+}
+
+void drbd_uuid_set_bm(drbd_dev *mdev, u64 val)
+{
+    if( mdev->bc->md.uuid[Bitmap]==0 && val==0 ) return;
+
+    if(val==0) {
+        drbd_uuid_move_history(mdev);
+        mdev->bc->md.uuid[History_start]=mdev->bc->md.uuid[Bitmap];
+        mdev->bc->md.uuid[Bitmap]=0;
+
+        MTRACE(TraceTypeUuid,TraceLvlMetrics,
+               drbd_print_uuid(mdev,History_start);
+               drbd_print_uuid(mdev,Bitmap);
+            );
+    } else {
+        if( mdev->bc->md.uuid[Bitmap] ) WARN("bm UUID already set");
+
+        mdev->bc->md.uuid[Bitmap] = val;
+        mdev->bc->md.uuid[Bitmap] &= ~((u64)1);
+
+        MTRACE(TraceTypeUuid,TraceLvlMetrics,
+               drbd_print_uuid(mdev,Bitmap);
+            );
+    }
+    drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_md_set_flag(drbd_dev *mdev, int flag)
+{
+    MUST_HOLD(mdev->req_lock);
+    if ( (mdev->bc->md.flags & flag) != flag) {
+        drbd_md_mark_dirty(mdev);
+        mdev->bc->md.flags |= flag;
+    }
+}
+void drbd_md_clear_flag(drbd_dev *mdev, int flag)
+{
+    MUST_HOLD(mdev->req_lock);
+    if ( (mdev->bc->md.flags & flag) != 0 ) {
+        drbd_md_mark_dirty(mdev);
+        mdev->bc->md.flags &= ~flag;
+    }
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+    return ((bdev->md.flags & flag) != 0);
+}
+
+STATIC void md_sync_timer_fn(unsigned long data)
+{
+    drbd_dev* mdev = (drbd_dev*) data;
+
+    drbd_queue_work_front(&mdev->data.work,&mdev->md_sync_work);
+}
+
+STATIC int w_md_sync(drbd_dev *mdev, struct drbd_work *w, int unused)
+{
+    WARN("BUG! md_sync_timer expired! Worker calls drbd_md_sync().\n");
+    drbd_md_sync(mdev);
+
+    return 1;
+}
+
+#ifdef DRBD_ENABLE_FAULTS
+// Fault insertion support including random number generator shamelessly
+// stolen from kernel/rcutorture.c
+struct fault_random_state {
+    unsigned long state;
+    unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD 479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes(). 
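+ * The recurrence is state = state * FAULT_RANDOM_MULT +
+ * FAULT_RANDOM_ADD; swahw32() then swaps the 16-bit halves of the
+ * result, so the stronger high-order bits of the generator also
+ * show up in the low-order bits of the value we return.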
+ */ +STATIC unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (--rsp->count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +STATIC char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + "Meta-data write", + "Meta-data read", + "Resync write", + "Resync read", + "Data write", + "Data read", + "Data read ahead", + }; + + return (type < DRBD_FAULT_MAX)? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(drbd_dev *mdev, unsigned int type) +{ + static struct fault_random_state rrs = {0,0}; + + unsigned int ret = ( + (fault_devs == 0 || ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (printk_ratelimit()) + WARN("***Simulating %s failure\n", _drbd_fault_str(type)); + } + + return ret; +} +#endif + +#ifdef ENABLE_DYNAMIC_TRACE + +STATIC char *_drbd_uuid_str(unsigned int idx) { + static char *uuid_str[] = { + "Current", + "Bitmap", + "History_start", + "History_end", + "UUID_SIZE", + "UUID_FLAGS", + }; + + return (idx < EXT_UUID_SIZE) ? uuid_str[idx] : "*Unknown UUID index*"; +} + +/* Pretty print a UUID value */ +void +drbd_print_uuid(drbd_dev *mdev, unsigned int idx) { + INFO(" uuid[%s] now %016llX\n",_drbd_uuid_str(idx),mdev->bc->md.uuid[idx]); +} + + +/* + +drbd_print_buffer + +This routine dumps binary data to the debugging output. Can be +called at interrupt level. + +Arguments: + + prefix - String is output at the beginning of each line output + flags - Control operation of the routine. Currently defined + Flags are: + DBGPRINT_BUFFADDR; if set, each line starts with the + virtual address of the line being outupt. If clear, + each line starts with the offset from the beginning + of the buffer. + size - Indicates the size of each entry in the buffer. 
Supported + values are sizeof(char), sizeof(short) and sizeof(int) + buffer - Start address of buffer + buffer_va - Virtual address of start of buffer (normally the same + as Buffer, but having it separate allows it to hold + file address for example) + length - length of buffer + +*/ +void +drbd_print_buffer(const char *prefix,unsigned int flags,int size, + const void *buffer,const void *buffer_va, + unsigned int length) + +#define LINE_SIZE 16 +#define LINE_ENTRIES (int)(LINE_SIZE/size) +{ + const unsigned char *pstart; + const unsigned char *pstart_va; + const unsigned char *pend; + char bytes_str[LINE_SIZE*3+8],ascii_str[LINE_SIZE+8]; + char *pbytes=bytes_str,*pascii=ascii_str; + int offset=0; + long sizemask; + int field_width; + int index; + const unsigned char *pend_str; + const unsigned char *p; + int count; + + // verify size parameter + if (size != sizeof(char) && size != sizeof(short) && size != sizeof(int)) { + printk(KERN_DEBUG "drbd_print_buffer: ERROR invalid size %d\n", size); + return; + } + + sizemask = size-1; + field_width = size*2; + + // Adjust start/end to be on appropriate boundary for size + buffer = (const char *)((long)buffer & ~sizemask); + pend = (const unsigned char *)(((long)buffer + length + sizemask) & ~sizemask); + + if (flags & DBGPRINT_BUFFADDR) { + // Move start back to nearest multiple of line size if printing address + // This results in nicely formatted output with addresses being on + // line size (16) byte boundaries + pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); + } + else { + pstart = (const unsigned char *)buffer; + } + + // Set value of start VA to print if addresses asked for + pstart_va = (const unsigned char *)buffer_va - ((const unsigned char *)buffer-pstart); + + // Calculate end position to nicely align right hand side + pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); + + // Init strings + *pbytes = *pascii = '\0'; + + // Start at beginning of first line + p = pstart; + count=0; + + while (p < pend_str) { + if (p < (const unsigned char *)buffer || p >= pend) { + // Before start of buffer or after end- print spaces + pbytes += sprintf(pbytes,"%*c ",field_width,' '); + pascii += sprintf(pascii,"%*c",size,' '); + p += size; + } + else { + // Add hex and ascii to strings + int val; + switch (size) { + default: + case 1: + val = *(unsigned char *)p; + break; + case 2: + val = *(unsigned short *)p; + break; + case 4: + val = *(unsigned int *)p; + break; + } + + pbytes += sprintf(pbytes,"%0*x ",field_width,val); + + for (index = size; index; index--) { + *pascii++ = isprint(*p) ? *p : '.'; + p++; + } + } + + count++; + + if (count == LINE_ENTRIES || p >= pend_str) { + // Null terminate and print record + *pascii = '\0'; + printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", + prefix, + (flags & DBGPRINT_BUFFADDR) + ? 
(long)pstart_va : (long)offset, + LINE_ENTRIES*(field_width+1),bytes_str, + LINE_SIZE,ascii_str); + + // Move onto next line + pstart_va += (p-pstart); + pstart = p; + count = 0; + offset+= LINE_SIZE; + + // Re-init strings + pbytes = bytes_str; + pascii = ascii_str; + *pbytes = *pascii = '\0'; + } + } +} + +#define PSM(A) \ +do { \ + if( mask.A ) { \ + int i = snprintf(p, len, " " #A "( %s )", \ + A##s_to_name(val.A)); \ + if (i >= len) return op; \ + p += i; \ + len -= i; \ + } \ +} while (0) + +STATIC char *dump_st(char *p, int len, drbd_state_t mask, drbd_state_t val) +{ + char *op=p; + *p = '\0'; + PSM(role); + PSM(peer); + PSM(conn); + PSM(disk); + PSM(pdsk); + + return op; +} + +#define INFOP(fmt, args...) \ +do { \ + if (trace_level >= TraceLvlAll) { \ + INFO("%s:%d: %s [%d] %s %s " fmt , \ + file, line, current->comm, current->pid, \ + sockname, recv?"<<<":">>>", \ + ## args ); \ + } \ + else { \ + INFO("%s %s " fmt, sockname, \ + recv?"<<<":">>>", \ + ## args ); \ + } \ +} while (0) + +char *_dump_block_id(u64 block_id, char *buff) { + if (is_syncer_block_id(block_id)) + strcpy(buff,"SyncerId"); + else + sprintf(buff,"%llx",block_id); + + return buff; +} + +void +_dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line) +{ + char *sockname = sock == mdev->meta.socket ? "meta" : "data"; + int cmd = (recv == 2) ? p->head.command : be16_to_cpu(p->head.command); + char tmp[300]; + drbd_state_t m,v; + + switch (cmd) { + case HandShake: + INFOP("%s (protocol %u)\n", cmdname(cmd), be32_to_cpu(p->HandShake.protocol_version)); + break; + + case ReportBitMap: /* don't report this */ + break; + + case Data: + INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp), + be32_to_cpu(p->Data.seq_num), + be32_to_cpu(p->Data.dp_flags) + ); + break; + + case DataReply: + case RSDataReply: + INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp) + ); + break; + + case RecvAck: + case WriteAck: + case RSWriteAck: + case DiscardAck: + case NegAck: + case NegRSDReply: + INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockAck.sector), + be32_to_cpu(p->BlockAck.blksize), + _dump_block_id(p->BlockAck.block_id,tmp), + be32_to_cpu(p->BlockAck.seq_num) + ); + break; + + case DataRequest: + case RSDataRequest: + INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockRequest.sector), + be32_to_cpu(p->BlockRequest.blksize), + _dump_block_id(p->BlockRequest.block_id,tmp) + ); + break; + + case Barrier: + case BarrierAck: + INFOP("%s (barrier %u)\n", cmdname(cmd), p->Barrier.barrier); + break; + + case ReportUUIDs: + INFOP("%s Curr:%016llX, Bitmap:%016llX, HisSt:%016llX, HisEnd:%016llX\n", cmdname(cmd), + be64_to_cpu(p->GenCnt.uuid[Current]), + be64_to_cpu(p->GenCnt.uuid[Bitmap]), + be64_to_cpu(p->GenCnt.uuid[History_start]), + be64_to_cpu(p->GenCnt.uuid[History_end])); + break; + + case ReportSizes: + INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, max bio %x, q order %x)\n", cmdname(cmd), + (long long)(be64_to_cpu(p->Sizes.d_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.u_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.c_size)>>(20-9)), + be32_to_cpu(p->Sizes.max_segment_size), + be32_to_cpu(p->Sizes.queue_order_type)); + break; + + case ReportState: + v.i = be32_to_cpu(p->State.state); + m.i 
= 0xffffffff; + dump_st(tmp,sizeof(tmp),m,v); + INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); + break; + + case StateChgRequest: + m.i = be32_to_cpu(p->ReqState.mask); + v.i = be32_to_cpu(p->ReqState.val); + dump_st(tmp,sizeof(tmp),m,v); + INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); + break; + + case StateChgReply: + INFOP("%s (ret %x)\n", cmdname(cmd), + be32_to_cpu(p->RqSReply.retcode)); + break; + + case Ping: + case PingAck: + /* + * Dont trace pings at summary level + */ + if (trace_level < TraceLvlAll) + break; + /* fall through... */ + default: + INFOP("%s (%u)\n",cmdname(cmd), cmd); + break; + } +} + +// Debug routine to dump info about bio + +void _dump_bio(drbd_dev *mdev, struct bio *bio, int complete) +{ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lx" +#else +#define SECTOR_FORMAT "%lx" +#endif +#define SECTOR_SHIFT 9 + + unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); + char *faddr = (char *)(lowaddr); + struct bio_vec *bvec; + int segno; + + INFO("%s %s Bio:%p - %soffset " SECTOR_FORMAT ", size %x\n", + complete? "<<<":">>>", + bio_rw(bio)==WRITE?"Write":"Read",bio, + complete? (drbd_bio_uptodate(bio)? "Success, ":"Failed, ") : "", + bio->bi_sector << SECTOR_SHIFT, + bio->bi_size); + + if (trace_level >= TraceLvlMetrics && + ((bio_rw(bio) == WRITE) ^ complete) ) { + printk(KERN_DEBUG " ind page offset length\n"); + __bio_for_each_segment(bvec, bio, segno, 0) { + printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n",segno, + bvec->bv_page, bvec->bv_offset, bvec->bv_len); + + if (trace_level >= TraceLvlAll) { + char *bvec_buf; + unsigned long flags; + + bvec_buf = bvec_kmap_irq(bvec, &flags); + + drbd_print_buffer(" ",DBGPRINT_BUFFADDR,1, + bvec_buf, + faddr, + (bvec->bv_len <= 0x80)? bvec->bv_len : 0x80); + + bvec_kunmap_irq(bvec_buf, &flags); + + if (bvec->bv_len > 0x40) + printk(KERN_DEBUG " ....\n"); + + faddr += bvec->bv_len; + } + } + } +} +#endif + +module_init(drbd_init) +module_exit(drbd_cleanup) diff -uprN linux-2.6.18/drivers/block/drbd/drbd_nl.c linux-2.6.18.ovz/drivers/block/drbd/drbd_nl.c --- linux-2.6.18/drivers/block/drbd/drbd_nl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_nl.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1781 @@ +/* +-*- linux-c -*- + drbd_nl.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drbd_int.h" +#include +#include + +/* see get_sb_bdev and bd_claim */ +char *drbd_d_holder = "Hands off! this is DRBD's data storage device."; +char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; + + +// Generate the tag_list to struct functions +#define PACKET(name, number, fields) \ +int name ## _from_tags (drbd_dev *mdev, unsigned short* tags, struct name * arg) \ +{ \ + int tag; \ + int dlen; \ + \ + while( (tag = *tags++) != TT_END ) { \ + dlen = *tags++; \ + switch( tag_number(tag) ) { \ + fields \ + default: \ + if( tag & T_MANDATORY ) { \ + ERR("Unknown tag: %d\n",tag_number(tag)); \ + return 0; \ + } \ + } \ + tags = (unsigned short*)((char*)tags + dlen); \ + } \ + return 1; \ +} +#define INTEGER(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ + arg->member = *(int*)(tags); \ + break; +#define INT64(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ + arg->member = *(u64*)(tags); \ + break; +#define BIT(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ + arg->member = *(char*)(tags) ? 1 : 0; \ + break; +#define STRING(pn,pr,member,len) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ + arg->member ## _len = dlen; \ + memcpy(arg->member,tags,dlen); \ + break; +#include "linux/drbd_nl.h" + +// Generate the struct to tag_list functions +#define PACKET(name, number, fields) \ +unsigned short* \ +name ## _to_tags (drbd_dev *mdev, struct name * arg, unsigned short* tags) \ +{ \ + fields \ + return tags; \ +} + +#define INTEGER(pn,pr,member) \ + *tags++ = pn | pr | TT_INTEGER; \ + *tags++ = sizeof(int); \ + *(int*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(int)); +#define INT64(pn,pr,member) \ + *tags++ = pn | pr | TT_INT64; \ + *tags++ = sizeof(u64); \ + *(u64*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(u64)); +#define BIT(pn,pr,member) \ + *tags++ = pn | pr | TT_BIT; \ + *tags++ = sizeof(char); \ + *(char*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(char)); +#define STRING(pn,pr,member,len) \ + *tags++ = pn | pr | TT_STRING; \ + *tags++ = arg->member ## _len; \ + memcpy(tags,arg->member, arg->member ## _len); \ + tags = (unsigned short*)((char*)tags + arg->member ## _len); +#include "linux/drbd_nl.h" + +extern void drbd_init_set_defaults(drbd_dev *mdev); +void drbd_bcast_ev_helper(drbd_dev *mdev, char* helper_name); +void drbd_nl_send_reply(struct cn_msg *, int); + +char *nl_packet_name(int packet_type) { +// Generate packet type strings +#define PACKET(name, number, fields) \ + [ P_ ## name ] = # name, +#define INTEGER Argh! +#define BIT Argh! +#define INT64 Argh! +#define STRING Argh! + + static char *nl_tag_name[P_nl_after_last_packet] = { +#include "linux/drbd_nl.h" + }; + + return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? + nl_tag_name[packet_type] : "*Unknown*"; +} + +void nl_trace_packet(void *data) { + struct cn_msg *req = data; + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; + + printk(KERN_INFO DEVICE_NAME "%d: " + "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->drbd_minor, + nl_packet_name(nlp->packet_type), + nlp->packet_type, + req->seq, req->ack, req->len); +} + +void nl_trace_reply(void *data) { + struct cn_msg *req = data; + struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply*)req->data; + + printk(KERN_INFO DEVICE_NAME "%d: " + "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->minor, + nlp->packet_type==P_nl_after_last_packet? 
+ "Empty-Reply" : nl_packet_name(nlp->packet_type), + nlp->packet_type, + req->seq, req->ack, req->len); +} + +int drbd_khelper(drbd_dev *mdev, char* cmd) +{ + char mb[12]; + char *argv[] = {"/sbin/drbdadm", cmd, mb, NULL }; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL }; + + snprintf(mb,12,"minor-%d",mdev_to_minor(mdev)); + + drbd_bcast_ev_helper(mdev,cmd); + return call_usermodehelper("/sbin/drbdadm",argv,envp,1); +} + +drbd_disks_t drbd_try_outdate_peer(drbd_dev *mdev) +{ + int r; + drbd_disks_t nps; + enum fencing_policy fp; + + D_ASSERT(mdev->state.pdsk == DUnknown); + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + D_ASSERT( fp > DontCare ); + + if( fp == Stonith ) drbd_request_state(mdev,NS(susp,1)); + + r=drbd_khelper(mdev,"outdate-peer"); + + switch( (r>>8) & 0xff ) { + case 3: /* peer is inconsistent */ + nps = Inconsistent; + break; + case 4: /* peer is outdated */ + nps = Outdated; + break; + case 5: /* peer was down, we will(have) create(d) a new UUID anyways... */ + /* If we would be more strict, we would return DUnknown here. */ + nps = Outdated; + break; + case 6: /* Peer is primary, voluntarily outdate myself */ + WARN("Peer is primary, outdating myself.\n"); + nps = DUnknown; + drbd_request_state(mdev,NS(disk,Outdated)); + break; + case 7: + if( fp != Stonith ) { + ERR("outdate-peer() = 7 && fencing != Stonith !!!\n"); + } + nps = Outdated; + break; + default: + /* The script is broken ... */ + nps = DUnknown; + drbd_request_state(mdev,NS(disk,Outdated)); + ERR("outdate-peer helper broken, returned %d \n",(r>>8)&0xff); + return nps; + } + + INFO("outdate-peer helper returned %d \n",(r>>8)&0xff); + return nps; +} + + +int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) +{ + int r=0,forced = 0, try=0; + drbd_state_t mask, val; + drbd_disks_t nps; + + if ( new_role == Primary ) { + request_ping(mdev); // Detect a dead peer ASAP + } + + mask.i = 0; mask.role = role_mask; + val.i = 0; val.role = new_role; + + while (try++ < 3) { + r = _drbd_request_state(mdev,mask,val,0); + if( r == SS_NoUpToDateDisk && force && + ( mdev->state.disk == Inconsistent || + mdev->state.disk == Outdated ) ) { + mask.disk = disk_mask; + val.disk = UpToDate; + forced = 1; + continue; + } + + if( r == SS_NoUpToDateDisk && + mdev->state.disk == Consistent ) { + D_ASSERT(mdev->state.pdsk == DUnknown); + nps = drbd_try_outdate_peer(mdev); + + if(nps == Outdated) { + val.disk = UpToDate; + mask.disk = disk_mask; + } + + val.pdsk = nps; + mask.pdsk = disk_mask; + + continue; + } + + if ( r == SS_NothingToDo ) goto fail; + if ( r == SS_PrimaryNOP ) { + nps = drbd_try_outdate_peer(mdev); + + if ( force && nps > Outdated ) { + WARN("Forced into split brain situation!\n"); + nps = Outdated; + } + + mask.pdsk = disk_mask; + val.pdsk = nps; + + continue; + } + if( r == SS_TwoPrimaries ) { + // Maybe the peer is detected as dead very soon... + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); + if(try == 1) try++; // only a single retry in this case. + continue; + } + if ( r < SS_Success ) { + r = drbd_request_state(mdev,mask,val); // Be verbose. 
+			if( r < SS_Success ) goto fail;
+		}
+		break;
+	}
+
+	if(forced) WARN("Forced to consider local data as UpToDate!\n");
+
+	drbd_sync_me(mdev);
+
+	/* Wait until nothing is on the fly :) */
+	if ( wait_event_interruptible( mdev->misc_wait,
+			atomic_read(&mdev->ap_pending_cnt) == 0 ) ) {
+		r = GotSignal;
+		goto fail;
+	}
+
+	/* FIXME RACE here: if our direct user is not using bd_claim (i.e.
+	 * not a filesystem) since cstate might still be >= Connected, new
+	 * ap requests may come in and increase ap_pending_cnt again!
+	 * but that means someone is misusing DRBD...
+	 * */
+
+	if (new_role == Secondary) {
+		set_disk_ro(mdev->vdisk, TRUE );
+	} else {
+		if(inc_net(mdev)) {
+			mdev->net_conf->want_lose = 0;
+			dec_net(mdev);
+		}
+		set_disk_ro(mdev->vdisk, FALSE );
+		/* why?? what for??
+		mdev->this_bdev->bd_disk = mdev->vdisk;
+		*/
+
+		if ( ( ( mdev->state.conn < Connected ||
+			 mdev->state.pdsk <= Failed ) &&
+		       mdev->bc->md.uuid[Bitmap] == 0) || forced ) {
+			drbd_uuid_new_current(mdev);
+		}
+	}
+
+	if((new_role == Secondary) && inc_local(mdev) ) {
+		drbd_al_to_on_disk_bm(mdev);
+		dec_local(mdev);
+	}
+
+	if (mdev->state.conn >= WFReportParams) {
+		/* if this was forced, we should consider sync */
+		if(forced) drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+
+	drbd_md_sync(mdev);
+
+	return r;
+
+ fail:
+	return r;
+}
+
+
+STATIC int drbd_nl_primary(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	struct primary primary_args;
+
+	memset(&primary_args, 0, sizeof(struct primary));
+	if(!primary_from_tags(mdev,nlp->tag_list,&primary_args)) {
+		reply->ret_code=UnknownMandatoryTag;
+		return 0;
+	}
+
+	reply->ret_code = drbd_set_role(mdev, Primary, primary_args.overwrite_peer);
+
+	return 0;
+}
+
+STATIC int drbd_nl_secondary(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_set_role(mdev, Secondary, 0);
+
+	return 0;
+}
+
+/* initializes the md.*_offset members, so we are able to find
+ * the on disk meta data */
+STATIC void drbd_md_set_sector_offsets(drbd_dev *mdev,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	switch(bdev->dc.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_RESERVED_SECT;
+		bdev->md.md_offset = drbd_md_ss__(mdev,bdev);
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.md_offset = 0;
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		bdev->md.md_offset = drbd_md_ss__(mdev,bdev);
+		/* al size is still fixed */
+		bdev->md.al_offset = -MD_AL_MAX_SIZE;
+		//LGE FIXME max size check missing.
+		/* we need (slightly less than) ~ this much bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect,BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect,8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_BM_OFFSET;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
+		break;
+	}
+}
+
+char* ppsize(char* buf, unsigned long long size)
+{
+	// Needs 9 bytes at max.
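+#if 0
+	/* Editor's illustration, not part of the patch: a stand-alone copy
+	 * of ppsize() with sample values.  The input is a size in KB;
+	 * dividing by 1024 until the value drops below five digits walks up
+	 * the unit table, so the 10-byte buffer the callers pass always
+	 * suffices.
+	 */
+	#include <stdio.h>
+
+	static char *ppsize_demo(char *buf, unsigned long long size /* KB */)
+	{
+		static const char units[] = { 'K','M','G','T','P','E' };
+		int base = 0;
+		while (size >= 10000) {
+			size >>= 10;	/* next larger unit */
+			base++;
+		}
+		sprintf(buf, "%lu %cB", (unsigned long)size, units[base]);
+		return buf;
+	}
+
+	int main(void)
+	{
+		char buf[10];
+		printf("%s\n", ppsize_demo(buf, 512));		/* "512 KB"  */
+		printf("%s\n", ppsize_demo(buf, 2097152));	/* "2048 MB" */
+		return 0;
+	}
+#endif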
+ static char units[] = { 'K','M','G','T','P','E' }; + int base = 0; + while (size >= 10000 ) { + size = size >> 10; + base++; + } + sprintf(buf,"%lu %cB",(long)size,units[base]); + + return buf; +} + +/* You should call drbd_md_sync() after calling this. + */ +int drbd_determin_dev_size(struct Drbd_Conf* mdev) +{ + sector_t prev_first_sect, prev_size; // previous meta location + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + int rv=0; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + prev_first_sect = drbd_md_first_sector(mdev->bc); + prev_size = mdev->bc->md.md_size_sect; + la_size = mdev->bc->md.la_size_sect; + + // TODO: should only be some assert here, not (re)init... + drbd_md_set_sector_offsets(mdev,mdev->bc); + + size = drbd_new_dev_size(mdev,mdev->bc); + + if( drbd_get_capacity(mdev->this_bdev) != size || + drbd_bm_capacity(mdev) != size ) { + int err; + err = drbd_bm_resize(mdev,size); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + ERR("OUT OF MEMORY! Could not allocate bitmap! Set device size => 0\n"); + } else { + /* FIXME this is problematic, + * if we in fact are smaller now! */ + ERR("BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = err; + } + // racy, see comments above. + drbd_set_my_capacity(mdev,size); + mdev->bc->md.la_size_sect = size; + INFO("size = %s (%llu KB)\n",ppsize(ppb,size>>1), + (unsigned long long)size>>1); + } + if (rv < 0) goto out; + + la_size_changed = (la_size != mdev->bc->md.la_size_sect); + + //LGE: flexible device size!! is this the right thing to test? + md_moved = prev_first_sect != drbd_md_first_sector(mdev->bc) + || prev_size != mdev->bc->md.md_size_sect; + + if ( md_moved ) { + WARN("Moving meta-data.\n"); + /* assert: (flexible) internal meta data */ + } + + if ( la_size_changed || md_moved ) { + if( inc_local_if_state(mdev,Attaching) ) { + drbd_al_shrink(mdev); // All extents inactive. + rv = drbd_bm_write(mdev); // write bitmap + // Write mdev->la_size to on disk. + drbd_md_mark_dirty(mdev); + dec_local(mdev); + } + } + out: + lc_unlock(mdev->act_log); + + return rv; +} + +sector_t +drbd_new_dev_size(struct Drbd_Conf* mdev, struct drbd_backing_dev *bdev) +{ + sector_t p_size = mdev->p_size; // partner's disk size. + sector_t la_size = bdev->md.la_size_sect; // last agreed size. + sector_t m_size; // my size + sector_t u_size = bdev->dc.disk_size; // size requested by user. + sector_t size=0; + + m_size = drbd_get_max_capacity(bdev); + + if(p_size && m_size) { + size=min_t(sector_t,p_size,m_size); + } else { + if(la_size) { + size=la_size; + if(m_size && m_size < size) size=m_size; + if(p_size && p_size < size) size=p_size; + } else { + if(m_size) size=m_size; + if(p_size) size=p_size; + } + } + + if(size == 0) { + ERR("Both nodes diskless!\n"); + } + + if(u_size) { + if(u_size<<1 > size) { + ERR("Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size, (unsigned long)size>>1); + } else { + size = u_size<<1; + } + } + + return size; +} + +/** + * drbd_check_al_size: + * checks that the al lru is of requested size, and if neccessary tries to + * allocate a new one. returns -EBUSY if current al lru is still used, + * -ENOMEM when allocation failed, and 0 on success. You should call + * drbd_md_sync() after you called this function. 
+ */
+STATIC int drbd_check_al_size(drbd_dev *mdev)
+{
+	struct lru_cache *n,*t;
+	struct lc_element *e;
+	unsigned int in_use;
+	int i;
+
+	ERR_IF(mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+	if ( mdev->act_log &&
+	     mdev->act_log->nr_elements == mdev->sync_conf.al_extents )
+		return 0;
+
+	in_use = 0;
+	t = mdev->act_log;
+	n = lc_alloc("act_log", mdev->sync_conf.al_extents,
+		     sizeof(struct lc_element), mdev);
+
+	if (n==NULL) {
+		ERR("Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&mdev->al_lock);
+	if (t) {
+		for (i=0; i < t->nr_elements; i++) {
+			e = lc_entry(t,i);
+			if (e->refcnt)
+				ERR("refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use) {
+		mdev->act_log = n;
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	if (in_use) {
+		ERR("Activity log still in use!\n");
+		lc_free(n);
+		return -EBUSY;
+	} else {
+		if (t) lc_free(t);
+	}
+	drbd_md_mark_dirty(mdev); //we changed mdev->act_log->nr_elements
+	return 0;
+}
+
+void drbd_setup_queue_param(drbd_dev *mdev, unsigned int max_seg_s)
+{
+	request_queue_t * const q = mdev->rq_queue;
+	request_queue_t * const b = mdev->bc->backing_bdev->bd_disk->queue;
+	//unsigned int old_max_seg_s = q->max_segment_size;
+
+	if (b->merge_bvec_fn && !mdev->bc->dc.use_bmbv)
+		max_seg_s = PAGE_SIZE;
+
+	max_seg_s = min(b->max_sectors * b->hardsect_size, max_seg_s);
+
+	MTRACE(TraceTypeRq,TraceLvlSummary,
+	       DUMPI(b->max_sectors);
+	       DUMPI(b->max_phys_segments);
+	       DUMPI(b->max_hw_segments);
+	       DUMPI(b->max_segment_size);
+	       DUMPI(b->hardsect_size);
+	       DUMPI(b->seg_boundary_mask);
+	       );
+
+	q->max_sectors = max_seg_s >> 9;
+	q->max_phys_segments = max_seg_s >> PAGE_SHIFT;
+	q->max_hw_segments = max_seg_s >> PAGE_SHIFT;
+	q->max_segment_size = max_seg_s;
+	q->hardsect_size = 512;
+	q->seg_boundary_mask = PAGE_SIZE-1;
+	blk_queue_stack_limits(q, b);
+
+	// KERNEL BUG. in ll_rw_blk.c
+	// t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
+	// should be
+	// t->max_segment_size = min_not_zero(...,...)
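+#if 0
+	/* Editor's illustration, not part of the patch: why min() is wrong
+	 * here.  A queue reporting max_segment_size == 0 means "no limit",
+	 * so a plain min() propagates the 0 and effectively disables the
+	 * stacked queue; min_not_zero() keeps the real limit.  That is what
+	 * the workaround below compensates for (demo macro name invented):
+	 */
+	#include <stdio.h>
+
+	#define demo_min_not_zero(a,b) \
+		((a) == 0 ? (b) : ((b) == 0 ? (a) : ((a) < (b) ? (a) : (b))))
+
+	int main(void)
+	{
+		unsigned int upper = 65536, lower = 0;	/* 0 == unlimited */
+		unsigned int broken = upper < lower ? upper : lower;
+		unsigned int fixed  = demo_min_not_zero(upper, lower);
+		printf("min()=%u min_not_zero()=%u\n", broken, fixed);
+		return 0;
+	}
+#endif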
+ + // workaround here: + if(q->max_segment_size == 0) q->max_segment_size = max_seg_s; + + MTRACE(TraceTypeRq,TraceLvlSummary, + DUMPI(q->max_sectors); + DUMPI(q->max_phys_segments); + DUMPI(q->max_hw_segments); + DUMPI(q->max_segment_size); + DUMPI(q->hardsect_size); + DUMPI(q->seg_boundary_mask); + ); + + if(b->merge_bvec_fn) { + WARN("Backing device's merge_bvec_fn() = %p\n", + b->merge_bvec_fn); + } + INFO("max_segment_size ( = BIO size ) = %u\n", q->max_segment_size); + + if( q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + INFO("Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } +} + +/* does always return 0; + * interesting return code is in reply->ret_code */ +STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + enum ret_codes retcode; + struct drbd_backing_dev* nbc=NULL; // new_backing_conf + struct inode *inode, *inode2; + struct lru_cache* resync_lru = NULL; + drbd_state_t ns,os; + int rv; + + /* if you want to reconfigure, please tear down first */ + if (mdev->state.disk > Diskless) { + retcode=HaveDiskConfig; + goto fail; + } + + nbc = kmalloc(sizeof(struct drbd_backing_dev),GFP_KERNEL); + if(!nbc) { + retcode=KMallocFailed; + goto fail; + } + + if( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_local(mdev) ) { + memcpy(&nbc->dc,&mdev->bc->dc,sizeof(struct disk_conf)); + dec_local(mdev); + } else { + memset(&nbc->dc,0,sizeof(struct disk_conf)); + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; + nbc->dc.fencing = DRBD_FENCING_DEF; + } + + if(!disk_conf_from_tags(mdev,nlp->tag_list,&nbc->dc)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + nbc->lo_file = NULL; + nbc->md_file = NULL; + + if ( nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode=LDMDInvalid; + goto fail; + } + + nbc->lo_file = filp_open(nbc->dc.backing_dev,O_RDWR,0); + if (IS_ERR(nbc->lo_file)) { + ERR("open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, + PTR_ERR(nbc->lo_file)); + nbc->lo_file=NULL; + retcode=LDNameInvalid; + goto fail; + } + + inode = nbc->lo_file->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode=LDNoBlockDev; + goto fail; + } + + nbc->md_file = filp_open(nbc->dc.meta_dev,O_RDWR,0); + if (IS_ERR(nbc->md_file)) { + ERR("open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, + PTR_ERR(nbc->md_file)); + nbc->md_file=NULL; + retcode=MDNameInvalid; + goto fail; + } + + inode2 = nbc->md_file->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode=MDNoBlockDev; + goto fail; + } + + nbc->backing_bdev = inode->i_bdev; + if (BD_CLAIM(nbc->backing_bdev, mdev)) { + printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", + nbc->backing_bdev, mdev, + nbc->backing_bdev->bd_holder, + nbc->backing_bdev->bd_contains->bd_holder, + nbc->backing_bdev->bd_holders); + retcode=LDMounted; + goto fail; + } + + resync_lru = lc_alloc("resync",31, sizeof(struct bm_extent),mdev); + if(!resync_lru) { + retcode=KMallocFailed; + goto fail; + } + + nbc->md_bdev = inode2->i_bdev; + if (BD_CLAIM(nbc->md_bdev, + (nbc->dc.meta_dev_idx==DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx==DRBD_MD_INDEX_FLEX_INT) ? 
+ (void *)mdev : (void*) drbd_m_holder )) { + retcode=MDMounted; + goto release_bdev_fail; + } + + if ( (nbc->backing_bdev==nbc->md_bdev) != + (nbc->dc.meta_dev_idx==DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx==DRBD_MD_INDEX_FLEX_INT) ) { + retcode=LDMDInvalid; + goto release_bdev2_fail; + } + + if ((drbd_get_capacity(nbc->backing_bdev)>>1) < nbc->dc.disk_size) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail; + } + +// warning LGE checks below no longer valid +// --- rewrite +#if 0 + if (drbd_get_capacity(nbc->backing_bdev) >= (sector_t)DRBD_MAX_SECTORS) { + retcode = LDDeviceTooLarge; + goto release_bdev2_fail; + } + + if ( nbc->dc.meta_dev_idx == -1 ) i = 1; + else i = nbc->dc.meta_dev_idx+1; + + /* for internal, we need to check agains <= (then we have a drbd with + * zero size, but meta data...) to be on the safe side, I require 32MB + * minimal data storage area for drbd with internal meta data (thats + * 160 total). if someone wants to use that small devices, she can use + * drbd 0.6 anyways... + * + * FIXME this is arbitrary and needs to be reconsidered as soon as we + * move to flexible size meta data. + */ + if( drbd_get_capacity(nbc->md_bdev) < 2*MD_RESERVED_SIZE*i + + (nbc->dc.meta_dev_idx == -1) ? (1<<16) : 0 ) + { + retcode = MDDeviceTooSmall; + goto release_bdev2_fail; + } +#endif +// -- up to here + + // Make sure the new disk is big enough + if (drbd_get_capacity(nbc->backing_bdev) < + drbd_get_capacity(mdev->this_bdev) ) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail; + } + + if((retcode = drbd_request_state(mdev,NS(disk,Attaching))) < SS_Success ) { + goto release_bdev2_fail; + } + + drbd_md_set_sector_offsets(mdev,nbc); + + retcode = drbd_md_read(mdev,nbc); + if ( retcode != NoError ) { + goto force_diskless; + } + + // Since we are diskless, fix the AL first... + if (drbd_check_al_size(mdev)) { + retcode = KMallocFailed; + goto force_diskless; + } + + // Prevent shrinking of consistent devices ! + if(drbd_md_test_flag(nbc,MDF_Consistent) && + drbd_new_dev_size(mdev,nbc) < nbc->md.la_size_sect) { + retcode = LDDeviceTooSmall; + goto force_diskless; + } + + if(!drbd_al_read_log(mdev,nbc)) { + retcode = MDIOError; + goto force_diskless; + } + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now mdev takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(mdev->bc == NULL); + mdev->bc = nbc; + mdev->resync = resync_lru; + nbc = NULL; + resync_lru = NULL; + + if(drbd_md_test_flag(mdev->bc,MDF_PrimaryInd)) { + set_bit(CRASHED_PRIMARY, &mdev->flags); + } else { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + } + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + + drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + /* + * FIXME currently broken. + * drbd_set_recv_tcq(mdev,drbd_queue_order_type(mdev)==QUEUE_ORDERED_TAG); + */ + + /* If I am currently not Primary, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been Primary before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded Primary), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. 
+	 */
+	clear_bit(USE_DEGR_WFC_T,&mdev->flags);
+	if ( mdev->state.role != Primary &&
+	     drbd_md_test_flag(mdev->bc,MDF_PrimaryInd) &&
+	     !drbd_md_test_flag(mdev->bc,MDF_ConnectedInd) ) {
+		set_bit(USE_DEGR_WFC_T,&mdev->flags);
+	}
+
+	drbd_bm_lock(mdev); // racy...
+	drbd_determin_dev_size(mdev);
+
+	if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) {
+		INFO("Assuming that all blocks are out of sync (aka FullSync)\n");
+		drbd_bm_set_all(mdev);
+		if (unlikely(drbd_bm_write(mdev) < 0)) {
+			retcode = MDIOError;
+			goto unlock_bm;
+		}
+		drbd_md_clear_flag(mdev,MDF_FullSync);
+	} else {
+		if (unlikely(drbd_bm_read(mdev) < 0)) {
+			retcode = MDIOError;
+			goto unlock_bm;
+		}
+	}
+
+	if(test_bit(CRASHED_PRIMARY, &mdev->flags)) {
+		drbd_al_apply_to_bm(mdev);
+		drbd_al_to_on_disk_bm(mdev);
+	}
+	/* else {
+		FIXME wipe out on disk al!
+	} */
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	ns.i = os.i;
+	/* If MDF_Consistent is not set go into inconsistent state,
+	   otherwise investigate MDF_WasUpToDate...
+	   If MDF_WasUpToDate is not set go into Outdated disk state,
+	   otherwise into Consistent state.
+	*/
+	if(drbd_md_test_flag(mdev->bc,MDF_Consistent)) {
+		if(drbd_md_test_flag(mdev->bc,MDF_WasUpToDate)) {
+			ns.disk = Consistent;
+		} else {
+			ns.disk = Outdated;
+		}
+	} else {
+		ns.disk = Inconsistent;
+	}
+
+	if(drbd_md_test_flag(mdev->bc,MDF_PeerOutDated)) {
+		ns.pdsk = Outdated;
+	}
+
+	if( ns.disk == Consistent &&
+	    ( ns.pdsk == Outdated || mdev->bc->dc.fencing == DontCare ) ) {
+		ns.disk = UpToDate;
+	}
+
+	/* All tests on MDF_PrimaryInd, MDF_ConnectedInd,
+	   MDF_Consistent and MDF_WasUpToDate must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	/* In case we are Connected postpone any decision on the new disk
+	   state after the negotiation phase.
+	 */
+	if(mdev->state.conn == Connected) {
+		mdev->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = Negotiating;
+	}
+
+	rv = _drbd_set_state(mdev, ns, ChgStateVerbose);
+	ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+	if (rv==SS_Success) after_state_ch(mdev,os,ns,ChgStateVerbose);
+
+	if (rv < SS_Success) {
+		goto unlock_bm;
+	}
+
+	drbd_bm_unlock(mdev);
+	drbd_md_sync(mdev);
+
+	reply->ret_code = retcode;
+	return 0;
+
+ unlock_bm:
+	drbd_bm_unlock(mdev);
+ force_diskless:
+	drbd_force_state(mdev,NS(disk,Diskless));
+	drbd_md_sync(mdev);
+ release_bdev2_fail:
+	if (nbc) BD_RELEASE(nbc->md_bdev);
+ release_bdev_fail:
+	if (nbc) BD_RELEASE(nbc->backing_bdev);
+ fail:
+	if (nbc) {
+		if (nbc->lo_file) fput(nbc->lo_file);
+		if (nbc->md_file) fput(nbc->md_file);
+		kfree(nbc);
+	}
+	if (resync_lru) lc_free(resync_lru);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_detach(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	drbd_sync_me(mdev);
+	reply->ret_code = drbd_request_state(mdev,NS(disk,Diskless));
+
+	return 0;
+}
+
+#define HMAC_NAME_L 20
+
+STATIC int drbd_nl_net_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp,
+			    struct drbd_nl_cfg_reply *reply)
+{
+	int i,ns;
+	enum ret_codes retcode;
+	struct net_conf *new_conf = NULL;
+	struct crypto_hash *tfm = NULL;
+	struct hlist_head *new_tl_hash = NULL;
+	struct hlist_head *new_ee_hash = NULL;
+	drbd_dev *odev;
+	char hmac_name[HMAC_NAME_L];
+
+	if (mdev->state.conn > StandAlone) {
+		retcode=HaveNetConfig;
+		goto fail;
+	}
+
+	new_conf = kmalloc(sizeof(struct net_conf),GFP_KERNEL);
+	if(!new_conf) {
+		retcode=KMallocFailed;
+		goto fail;
+	}
+
+	if( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_net(mdev)) {
+		memcpy(new_conf,mdev->net_conf,sizeof(struct net_conf));
+		dec_local(mdev);
+	} else {
+		memset(new_conf,0,sizeof(struct net_conf));
+		new_conf->timeout = DRBD_TIMEOUT_DEF;
+		new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
+		new_conf->ping_int = DRBD_PING_INT_DEF;
+		new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
+		new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
+		new_conf->unplug_watermark= DRBD_UNPLUG_WATERMARK_DEF;
+		new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
+		new_conf->ko_count = DRBD_KO_COUNT_DEF;
+		new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
+		new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
+		new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
+		new_conf->want_lose = 0;
+		new_conf->two_primaries = 0;
+		new_conf->wire_protocol = DRBD_PROT_C;
+		new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
+		new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
+	}
+
+	if (!net_conf_from_tags(mdev,nlp->tag_list,new_conf)) {
+		retcode=UnknownMandatoryTag;
+		goto fail;
+	}
+
+	if (new_conf->two_primaries && (new_conf->wire_protocol != DRBD_PROT_C)) {
+		retcode=ProtocolCRequired;
+		goto fail;
+	};
+
+	if( mdev->state.role == Primary && new_conf->want_lose ) {
+		retcode=DiscardNotAllowed;
+		goto fail;
+	}
+
+#define M_ADDR(A) (((struct sockaddr_in *)&A->my_addr)->sin_addr.s_addr)
+#define M_PORT(A) (((struct sockaddr_in *)&A->my_addr)->sin_port)
+#define O_ADDR(A) (((struct sockaddr_in *)&A->peer_addr)->sin_addr.s_addr)
+#define O_PORT(A) (((struct sockaddr_in *)&A->peer_addr)->sin_port)
+	retcode = NoError;
+	for(i=0;i<minor_count;i++) {
+		odev = minor_to_mdev(i);
+		if( !odev || odev == mdev ) continue;
+		if( inc_net(odev) ) {
+			if(M_ADDR(new_conf) == M_ADDR(odev->net_conf) &&
+			   M_PORT(new_conf) == M_PORT(odev->net_conf) ) {
+				retcode=LAAlreadyInUse;
+			}
+			if(O_ADDR(new_conf) == O_ADDR(odev->net_conf) &&
+			   O_PORT(new_conf) == O_PORT(odev->net_conf) ) {
+				retcode=OAAlreadyInUse;
+			}
+			dec_net(odev);
+			if(retcode != NoError) goto fail;
+		}
+	}
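+#if 0
+	/* Editor's illustration, not part of the patch: the loop above
+	 * rejects a configuration whose local (M_*) or peer (O_*) IPv4
+	 * address:port pair is already claimed by another minor.  The same
+	 * comparison on plain sockaddr_in values, as a user-space sketch:
+	 */
+	#include <stdio.h>
+	#include <string.h>
+	#include <arpa/inet.h>
+
+	static int same_endpoint(const struct sockaddr_in *a,
+				 const struct sockaddr_in *b)
+	{
+		return a->sin_addr.s_addr == b->sin_addr.s_addr &&
+		       a->sin_port == b->sin_port;
+	}
+
+	int main(void)
+	{
+		struct sockaddr_in a, b;
+		memset(&a, 0, sizeof(a)); memset(&b, 0, sizeof(b));
+		inet_pton(AF_INET, "10.0.0.1", &a.sin_addr);
+		inet_pton(AF_INET, "10.0.0.1", &b.sin_addr);
+		a.sin_port = htons(7788); b.sin_port = htons(7788);
+		printf("endpoint already in use: %s\n",
+		       same_endpoint(&a, &b) ? "yes" : "no");
+		return 0;
+	}
+#endif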
+#undef M_ADDR
+#undef M_PORT
+#undef O_ADDR
+#undef O_PORT
+
+	if( new_conf->cram_hmac_alg[0] != 0) {
+		snprintf(hmac_name,HMAC_NAME_L,"hmac(%s)",new_conf->cram_hmac_alg);
+		tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm)) {
+			tfm = NULL;
+			retcode=CRAMAlgNotAvail;
+			goto fail;
+		}
+
+		if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) != CRYPTO_ALG_TYPE_HASH ) {
+			retcode=CRAMAlgNotDigest;
+			goto fail;
+		}
+	}
+
+
+	ns = new_conf->max_epoch_size/8;
+	if (mdev->tl_hash_s != ns) {
+		new_tl_hash=kzalloc(ns*sizeof(void*), GFP_KERNEL);
+		if(!new_tl_hash) {
+			retcode=KMallocFailed;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_buffers/8;
+	if (new_conf->two_primaries && ( mdev->ee_hash_s != ns ) ) {
+		new_ee_hash=kzalloc(ns*sizeof(void*), GFP_KERNEL);
+		if(!new_ee_hash) {
+			retcode=KMallocFailed;
+			goto fail;
+		}
+	}
+
+	((char*)new_conf->shared_secret)[SHARED_SECRET_MAX-1]=0;
+
+#if 0
+FIXME LGE
+	/* for the connection loss logic in drbd_recv
+	 * I _need_ the resulting timeo in jiffies to be
+	 * non-zero and different
+	 *
+	 * XXX maybe rather store the value scaled to jiffies?
+	 * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT
+	 * and HZ > 10; which is unlikely to change...
+	 * Thus, if interrupted by a signal,
+	 * sock_{send,recv}msg returns -EINTR,
+	 * if the timeout expires, -EAGAIN.
+	 */
+	// unlikely: someone disabled the timeouts ...
+	// just put some huge values in there.
+	if (!new_conf->ping_int)
+		new_conf->ping_int = MAX_SCHEDULE_TIMEOUT/HZ;
+	if (!new_conf->timeout)
+		new_conf->timeout = MAX_SCHEDULE_TIMEOUT/HZ*10;
+	if (new_conf->ping_int*10 < new_conf->timeout)
+		new_conf->timeout = new_conf->ping_int*10/6;
+	if (new_conf->ping_int*10 == new_conf->timeout)
+		new_conf->ping_int = new_conf->ping_int+1;
+#endif
+
+	D_ASSERT(mdev->net_conf==NULL);
+	mdev->net_conf = new_conf;
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+
+	if(new_tl_hash) {
+		if (mdev->tl_hash) kfree(mdev->tl_hash);
+		mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
+		mdev->tl_hash = new_tl_hash;
+	}
+
+	if(new_ee_hash) {
+		if (mdev->ee_hash) kfree(mdev->ee_hash);
+		mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
+		mdev->ee_hash = new_ee_hash;
+	}
+
+	if ( mdev->cram_hmac_tfm ) {
+		crypto_free_hash(mdev->cram_hmac_tfm);
+	}
+	mdev->cram_hmac_tfm = tfm;
+
+	retcode = drbd_request_state(mdev,NS(conn,Unconnected));
+
+	reply->ret_code = retcode;
+	return 0;
+
+ fail:
+	if (tfm) crypto_free_hash(tfm);
+	if (new_tl_hash) kfree(new_tl_hash);
+	if (new_ee_hash) kfree(new_ee_hash);
+	if (new_conf) kfree(new_conf);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+STATIC int drbd_nl_disconnect(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	retcode = _drbd_request_state(mdev,NS(conn,Disconnecting),0); // silently.
+
+	if ( retcode == SS_NothingToDo ) goto done;
+	else if ( retcode == SS_AlreadyStandAlone ) goto done;
+	else if ( retcode == SS_PrimaryNOP ) {
+		// Our state checking code wants to see the peer outdated.
+		retcode = drbd_request_state(mdev,NS2(conn,Disconnecting,
+						      pdsk,Outdated));
+	} else if (retcode == SS_CW_FailedByPeer) {
+		// The peer probably wants to see us outdated.
+		retcode = _drbd_request_state(mdev,NS2(conn,Disconnecting,
+						       disk,Outdated),0);
+		if( retcode == SS_CanNotOutdateDL ) {
+			// We are diskless and our peer wants to outdate us.
+			// So, simply go away, and let the peer try to
+			// outdate us with its 'outdate-peer' handler later.
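+#if 0
+	/* Editor's illustration, not part of the patch: drbd_nl_net_conf()
+	 * above builds the crypto transform name "hmac(<alg>)" with snprintf
+	 * into a fixed 20-byte buffer.  Stand-alone demo, including the
+	 * silent truncation snprintf performs on an over-long algorithm
+	 * name (the algorithm strings are just examples):
+	 */
+	#include <stdio.h>
+
+	#define DEMO_HMAC_NAME_L 20
+
+	int main(void)
+	{
+		char hmac_name[DEMO_HMAC_NAME_L];
+
+		snprintf(hmac_name, DEMO_HMAC_NAME_L, "hmac(%s)", "sha1");
+		printf("%s\n", hmac_name);	/* "hmac(sha1)" */
+
+		/* anything past 13 algorithm characters is cut off: */
+		snprintf(hmac_name, DEMO_HMAC_NAME_L, "hmac(%s)",
+			 "extremely-long-digest");
+		printf("%s\n", hmac_name);	/* "hmac(extremely-long" */
+		return 0;
+	}
+#endif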
+ retcode = drbd_request_state(mdev,NS(conn,StandAlone)); + } + } + + if( retcode < SS_Success ) goto fail; + + if( wait_event_interruptible( mdev->misc_wait, + mdev->state.conn==StandAlone) ) { + retcode = GotSignal; + goto fail; + } + + done: + retcode = NoError; + fail: + drbd_md_sync(mdev); + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_resize(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct resize rs; + int retcode=NoError; + + memset(&rs, 0, sizeof(struct resize)); + if (!resize_from_tags(mdev,nlp->tag_list,&rs)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + if (mdev->state.conn > Connected) { + retcode = NoResizeDuringResync; + goto fail; + } + + if ( mdev->state.role == Secondary && + mdev->state.peer == Secondary) { + retcode = APrimaryNodeNeeded; + goto fail; + } + + if(!inc_local(mdev)) { + retcode = HaveNoDiskConfig; + goto fail; + } + + mdev->bc->dc.disk_size = (sector_t)rs.resize_size; + drbd_bm_lock(mdev); + drbd_determin_dev_size(mdev); + drbd_md_sync(mdev); + drbd_bm_unlock(mdev); + dec_local(mdev); + if (mdev->state.conn == Connected) { + drbd_send_uuids(mdev); // to start sync... + drbd_send_sizes(mdev); + } + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_syncer_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + struct syncer_conf sc; + drbd_dev *odev; + int err; + + memcpy(&sc,&mdev->sync_conf,sizeof(struct syncer_conf)); + + if(nlp->flags & DRBD_NL_SET_DEFAULTS) { + sc.rate = DRBD_RATE_DEF; + sc.after = DRBD_AFTER_DEF; + sc.al_extents = DRBD_AL_EXTENTS_DEF; + } + + if (!syncer_conf_from_tags(mdev,nlp->tag_list,&sc)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + if( sc.after != -1) { + if( sc.after < -1 || minor_to_mdev(sc.after) == NULL ) { + retcode=SyncAfterInvalid; + goto fail; + } + odev = minor_to_mdev(sc.after); // check against loops in + while(1) { + if( odev == mdev ) { + retcode=SyncAfterCycle; + goto fail; + } + if( odev->sync_conf.after == -1 ) break; // no cycles. 
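+#if 0
+	/* Editor's illustration, not part of the patch: the surrounding
+	 * while(1) walks the "resync after" chain and rejects a setting
+	 * that would lead back to this device.  The same walk over a plain
+	 * array, where after[i] == -1 ends the chain:
+	 */
+	#include <stdio.h>
+
+	int main(void)
+	{
+		int after[4] = { -1, 0, 1, 2 };	/* 3 -> 2 -> 1 -> 0 -> end */
+		int self = 0, cur;
+
+		/* proposed change: let minor 0 sync after minor 3 */
+		for (cur = 3; ; cur = after[cur]) {
+			if (cur == self) {
+				printf("rejected: dependency cycle\n");
+				return 1;
+			}
+			if (after[cur] == -1)
+				break;	/* no cycles */
+		}
+		printf("accepted\n");
+		return 0;
+	}
+#endif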
+ odev = minor_to_mdev(odev->sync_conf.after); + } + } + + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; // arbitrary minimum +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if(sc.al_extents > AL_MAX) { + ERR("sc.al_extents > %d\n",AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + mdev->sync_conf = sc; + + if(inc_local(mdev)) { + err = drbd_check_al_size(mdev); + dec_local(mdev); + drbd_md_sync(mdev); + + if (err) { + retcode = KMallocFailed; + goto fail; + } + } + + if (mdev->state.conn >= Connected) + drbd_send_sync_param(mdev,&sc); + + drbd_alter_sa(mdev, sc.after); + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_invalidate(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS2(conn,StartingSyncT, + disk,Inconsistent)); + return 0; +} + +STATIC int drbd_nl_invalidate_peer(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + + reply->ret_code = drbd_request_state(mdev,NS2(conn,StartingSyncS, + pdsk,Inconsistent)); + + return 0; +} + +STATIC int drbd_nl_pause_sync(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + + if(drbd_request_state(mdev,NS(user_isp,1)) == SS_NothingToDo) + retcode = PauseFlagAlreadySet; + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_resume_sync(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + + if(drbd_request_state(mdev,NS(user_isp,0)) == SS_NothingToDo) + retcode = PauseFlagAlreadyClear; + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_suspend_io(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS(susp,1)); + + return 0; +} + +STATIC int drbd_nl_resume_io(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS(susp,0)); + return 0; +} + +STATIC int drbd_nl_outdate(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + drbd_state_t os,ns; + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if( mdev->state.disk < Outdated ) { + retcode = -999; + } else { + retcode = _drbd_set_state(_NS(mdev,disk,Outdated),ChgStateVerbose); + } + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + if (retcode==SS_Success) after_state_ch(mdev,os,ns, ChgStateVerbose); + + if( retcode == -999 ) { + retcode = DiskLowerThanOutdated; + goto fail; + } + + drbd_md_sync(mdev); + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_get_config(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if(inc_local(mdev)) { + tl = disk_conf_to_tags(mdev,&mdev->bc->dc,tl); + dec_local(mdev); + } + + if(inc_net(mdev)) { + tl = net_conf_to_tags(mdev,mdev->net_conf,tl); + dec_net(mdev); + } + tl = syncer_conf_to_tags(mdev,&mdev->sync_conf,tl); + + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - (char*)reply->tag_list); +} + +STATIC int drbd_nl_get_state(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + tl = get_state_to_tags(mdev,(struct get_state*)&mdev->state,tl); + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - 
(char*)reply->tag_list); +} + +STATIC int drbd_nl_get_uuids(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if(inc_local(mdev)) { + // This is a hand crafted add tag ;) + *tl++ = T_uuids; + *tl++ = UUID_SIZE*sizeof(u64); + memcpy(tl,mdev->bc->md.uuid,UUID_SIZE*sizeof(u64)); + tl=(unsigned short*)((char*)tl + UUID_SIZE*sizeof(u64)); + dec_local(mdev); + *tl++ = T_uuids_flags; + *tl++ = sizeof(int); + memcpy(tl,&mdev->bc->md.flags,sizeof(int)); + tl=(unsigned short*)((char*)tl + sizeof(int)); + } + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - (char*)reply->tag_list); +} + + +STATIC int drbd_nl_get_timeout_flag(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + // This is a hand crafted add tag ;) + *tl++ = T_use_degraded; + *tl++ = sizeof(char); + *((char*)tl) = test_bit(USE_DEGR_WFC_T,&mdev->flags) ? 1 : 0 ; + tl=(unsigned short*)((char*)tl + sizeof(char)); + *tl++ = TT_END; + + return (int)((char*)tl - (char*)reply->tag_list); +} + +STATIC drbd_dev *ensure_mdev(struct drbd_nl_cfg_req *nlp) +{ + drbd_dev *mdev; + + mdev = minor_to_mdev(nlp->drbd_minor); + + if(!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { + mdev = drbd_new_device(nlp->drbd_minor); + + spin_lock_irq(&drbd_pp_lock); + if( minor_table[nlp->drbd_minor] == NULL) { + minor_table[nlp->drbd_minor] = mdev; + mdev = NULL; + } + spin_unlock_irq(&drbd_pp_lock); + + if(mdev) { + if(mdev->app_reads_hash) kfree(mdev->app_reads_hash); + if(mdev->md_io_page) __free_page(mdev->md_io_page); + kfree(mdev); + mdev = NULL; + } + + mdev = minor_to_mdev(nlp->drbd_minor); + } + + return mdev; +} + +struct cn_handler_struct { + int (*function)(drbd_dev *, + struct drbd_nl_cfg_req *, + struct drbd_nl_cfg_reply* ); + int reply_body_size; +}; + +static struct cn_handler_struct cnd_table[] = { + [ P_primary ] = { &drbd_nl_primary, 0 }, + [ P_secondary ] = { &drbd_nl_secondary, 0 }, + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, + [ P_detach ] = { &drbd_nl_detach, 0 }, + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, + [ P_resize ] = { &drbd_nl_resize, 0 }, + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, + [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, + [ P_outdate ] = { &drbd_nl_outdate, 0 }, + [ P_get_config ] = { &drbd_nl_get_config, + sizeof(struct syncer_conf_tag_len_struct) + + sizeof(struct disk_conf_tag_len_struct) + + sizeof(struct net_conf_tag_len_struct) }, + [ P_get_state ] = { &drbd_nl_get_state, + sizeof(struct get_state_tag_len_struct) }, + [ P_get_uuids ] = { &drbd_nl_get_uuids, + sizeof(struct get_uuids_tag_len_struct) }, + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, + sizeof(struct get_timeout_flag_tag_len_struct)}, + +}; + +void drbd_connector_callback(void *data) +{ + struct cn_msg *req = data; + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; + struct cn_handler_struct *cm; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply* reply; + drbd_dev *mdev; + int retcode,rr; + int reply_size = sizeof(struct cn_msg) + + sizeof(struct drbd_nl_cfg_reply) + + sizeof(short int); + + if(!try_module_get(THIS_MODULE)) { + 
+		printk(KERN_ERR DEVICE_NAME "try_module_get() failed!\n");
+		return;
+	}
+
+	if( !(mdev = ensure_mdev(nlp)) ) {
+		retcode=MinorNotKnown;
+		goto fail;
+	}
+
+	TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_packet(data););
+
+	if( nlp->packet_type >= P_nl_after_last_packet ) {
+		retcode=UnknownNetLinkPacket;
+		goto fail;
+	}
+
+	cm = cnd_table + nlp->packet_type;
+	reply_size += cm->reply_body_size;
+
+	if( !(cn_reply = kmalloc(reply_size,GFP_KERNEL)) ) {
+		retcode=KMallocFailed;
+		goto fail;
+	}
+	reply = (struct drbd_nl_cfg_reply*) cn_reply->data;
+
+	reply->packet_type = cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
+	reply->minor = nlp->drbd_minor;
+	reply->ret_code = NoError; // Might be modified by cm->function.
+	// reply->tag_list; might be modified by cm->function.
+
+	rr = cm->function(mdev,nlp,reply);
+
+	cn_reply->id = req->id;
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
+	cn_reply->flags = 0;
+
+	TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply););
+
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+	if(rr && rr != -ESRCH) {
+		printk(KERN_INFO DEVICE_NAME " cn_netlink_send()=%d\n",rr);
+	}
+	kfree(cn_reply);
+	module_put(THIS_MODULE);
+	return;
+ fail:
+	drbd_nl_send_reply(req, retcode);
+	module_put(THIS_MODULE);
+}
+
+atomic_t drbd_nl_seq = ATOMIC_INIT(2); // two.
+
+void drbd_bcast_state(drbd_dev *mdev)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct get_state_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	// WARN("drbd_bcast_state() got called\n");
+
+	tl = get_state_to_tags(mdev,(struct get_state*)&mdev->state,tl);
+	*tl++ = TT_END; /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
+	cn_reply->ack = 0; // not used here.
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char*)tl - (char*)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_get_state;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NoError;
+
+	TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply););
+
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+}
+
+void drbd_bcast_ev_helper(drbd_dev *mdev, char* helper_name)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct call_helper_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+	int str_len;
+
+	// WARN("drbd_bcast_state() got called\n");
+
+	str_len = strlen(helper_name)+1;
+	*tl++ = T_helper;
+	*tl++ = str_len;
+	memcpy(tl,helper_name,str_len);
+	tl=(unsigned short*)((char*)tl + str_len);
+	*tl++ = TT_END; /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
+	cn_reply->ack = 0; // not used here.
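+#if 0
+	/* Editor's illustration, not part of the patch: the helpers here
+	 * hand-craft one tag entry -- a 16-bit tag number, a 16-bit payload
+	 * length, the payload bytes, then a terminator -- the same layout
+	 * the *_from_tags() decoders walk.  Stand-alone round trip (the tag
+	 * and terminator values are invented for the demo):
+	 */
+	#include <stdio.h>
+	#include <string.h>
+
+	#define DEMO_TT_END	0xffff
+	#define DEMO_T_HELPER	1
+
+	int main(void)
+	{
+		unsigned short buf[32], *tl = buf, tag, dlen;
+		const char payload[] = "degrade";	/* 8 bytes incl. NUL */
+
+		/* encode: tag, length, payload, terminator */
+		*tl++ = DEMO_T_HELPER;
+		*tl++ = sizeof(payload);
+		memcpy(tl, payload, sizeof(payload));
+		tl = (unsigned short *)((char *)tl + sizeof(payload));
+		*tl++ = DEMO_TT_END;
+
+		/* decode: walk tags until the terminator, skip by length */
+		tl = buf;
+		while ((tag = *tl++) != DEMO_TT_END) {
+			dlen = *tl++;
+			printf("tag %u, len %u, payload \"%s\"\n",
+			       tag, dlen, (char *)tl);
+			tl = (unsigned short *)((char *)tl + dlen);
+		}
+		printf("encoded %d bytes\n", (int)((char *)tl - (char *)buf));
+		return 0;
+	}
+#endif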
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_call_helper; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NoError; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); +} + +#ifdef NETLINK_ROUTE6 +int __init cn_init(void); +void __exit cn_fini(void); +#endif + +int __init drbd_nl_init() +{ + static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; + int err; + +#ifdef NETLINK_ROUTE6 + /* pre 2.6.16 */ + err = cn_init(); + if(err) return err; +#endif + err = cn_add_callback(&cn_id_drbd,"cn_drbd",&drbd_connector_callback); + if(err) { + printk(KERN_ERR DEVICE_NAME "cn_drbd failed to register\n"); + return err; + } + + return 0; +} + +void drbd_nl_cleanup() +{ + static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; + + cn_del_callback(&cn_id_drbd); + +#ifdef NETLINK_ROUTE6 + /* pre 2.6.16 */ + cn_fini(); +#endif +} + +void drbd_nl_send_reply( struct cn_msg *req, + int ret_code) +{ + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + int rr; + + cn_reply->id = req->id; + + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); + cn_reply->flags = 0; + + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; + reply->ret_code = ret_code; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if(rr && rr != -ESRCH) { + printk(KERN_INFO DEVICE_NAME " cn_netlink_send()=%d\n",rr); + } +} + diff -uprN linux-2.6.18/drivers/block/drbd/drbd_proc.c linux-2.6.18.ovz/drivers/block/drbd/drbd_proc.c --- linux-2.6.18/drivers/block/drbd/drbd_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_proc.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,267 @@ +/* +-*- linux-c -*- + drbd_proc.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "lru_cache.h" /* for lc_sprintf_stats */ + +STATIC int drbd_proc_open(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +STATIC void drbd_syncer_progress(struct Drbd_Conf* mdev, struct seq_file *seq) +{ + unsigned long res , db, dt, dbdt, rt, rs_left; + + /* the whole sector_div thingy was wrong (did overflow, + * did not use correctly typed parameters), and is not even + * neccessary as long as rs_total and drbd_bm_total_weight + * are both unsigned long. + * + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (rs_left > mdev->rs_total) { + /* doh. logic bug somewhere. + * for now, just try to prevent in-kernel buffer overflow. + */ + ERR("logic bug? rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + rs_left, mdev->rs_total, mdev->rs_failed); + res = 1000; + } else { + res = (rs_left >> 10)*1000/((mdev->rs_total >> 10) + 1); + } + { + int i, y = res/50, x = 20-y; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + res = 1000L - res; + seq_printf(seq,"sync'ed:%3lu.%lu%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) { + seq_printf(seq,"(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left) >> 10, + (unsigned long) Bit2KB(mdev->rs_total) >> 10 ); + } else { + seq_printf(seq,"(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total) ); + } + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. 
*/ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq," K/sec\n"); +} + +#if 0 +STATIC void resync_dump_detail(struct seq_file *seq, struct lc_element * e) +{ + struct bm_extent *bme = (struct bm_extent *)e; + + seq_printf(seq,"%5d %s %s\n",bme->rs_left, + bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", + bme->flags & BME_LOCKED ? "LOCKED" : "------" + ); +} +#endif + +STATIC int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i,hole=0; + const char *sn; + drbd_dev *mdev; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d)\n%s\n", + API_VERSION,PRO_VERSION, drbd_buildtag()); + + /* + cs .. connection state + st .. node state (local/remote) + ld .. local data consistentency + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + pe .. pending (waiting for ack) + ua .. unack'd (still need to send ack) + al .. access log write count + */ + + for (i = 0; i < minor_count; i++) { + mdev = minor_to_mdev(i); + if(!mdev) { + hole=1; + continue; + } + if( hole ) { + hole=0; + seq_printf( seq, "\n"); + } + + sn = conns_to_name(mdev->state.conn); + + if ( mdev->state.conn == StandAlone && + mdev->state.disk == Diskless) { + seq_printf( seq, "%2d: cs:Unconfigured\n", i); + } else { + seq_printf( seq, + "%2d: cs:%s st:%s/%s ds:%s/%s %c %c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d\n", + i, sn, + roles_to_name(mdev->state.role), + roles_to_name(mdev->state.peer), + disks_to_name(mdev->state.disk), + disks_to_name(mdev->state.pdsk), + (mdev->net_conf == NULL ? ' ' : + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), + mdev->state.susp ? 's' : 'r', + mdev->state.aftr_isp ? 'a' : '-', + mdev->state.peer_isp ? 'p' : '-', + mdev->state.user_isp ? 
'u' : '-', + mdev->send_cnt/2, + mdev->recv_cnt/2, + mdev->writ_cnt/2, + mdev->read_cnt/2, + mdev->al_writ_cnt, + mdev->bm_writ_cnt, + atomic_read(&mdev->local_cnt), + atomic_read(&mdev->ap_pending_cnt) + + atomic_read(&mdev->rs_pending_cnt), + atomic_read(&mdev->unacked_cnt), + atomic_read(&mdev->ap_bio_cnt) + ); + } + if ( mdev->state.conn == SyncSource || + mdev->state.conn == SyncTarget ) { + drbd_syncer_progress(mdev,seq); + } + if(mdev->resync) { + lc_printf_stats(seq,mdev->resync); + } + if(mdev->act_log) { + lc_printf_stats(seq,mdev->act_log); + } +#if 0 + if(mdev->resync) { + lc_dump(mdev->resync,seq,"rs_left", + resync_dump_detail); + } +#endif + + } + + return 0; +} + +STATIC int drbd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_seq_show, PDE(inode)->data); +} + +/* PROC FS stuff end */ diff -uprN linux-2.6.18/drivers/block/drbd/drbd_receiver.c linux-2.6.18.ovz/drivers/block/drbd/drbd_receiver.c --- linux-2.6.18/drivers/block/drbd/drbd_receiver.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_receiver.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,3437 @@ +/* +-*- linux-c -*- + drbd_receiver.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" + +#if defined(__arch_um__) && !defined(HAVE_UML_TO_VIRT) +static inline void *to_virt(unsigned long phys) +{ + return((void *) uml_physmem + phys); +} +#endif + +#ifdef DBG_ASSERTS +void drbd_assert_breakpoint(drbd_dev *mdev, char *exp, + char *file, int line) +{ + ERR("ASSERT( %s ) in %s:%d\n", exp, file, line); +} +#endif + + +#if 0 +#define CHECK_LIST_LIMIT 1000 +void check_list(drbd_dev *mdev,struct list_head *list,char *t) +{ + struct list_head *le,*la; + int forward=0,backward=0; + + le=list; + do { + la=le; + le=le->next; + if( le->prev != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + mdev_to_minor(mdev),t); + break; + } + if( forward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s forward > 1000\n", + mdev_to_minor(mdev),t); + break; + } + } while(le != list); + + le=list; + do { + la=le; + le=le->prev; + if( le->next != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + mdev_to_minor(mdev),t); + break; + } + if( backward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s backward > 1000\n", + mdev_to_minor(mdev),t); + break; + } + } while(le != list); + + if(forward != backward) { + printk(KERN_ERR DEVICE_NAME "%d: forward=%d, backward=%d\n", + mdev_to_minor(mdev),forward,backward); + } +} +#endif + +#define GFP_TRY ( __GFP_HIGHMEM | __GFP_NOWARN ) + +/** + * drbd_pp_alloc: Returns a page. Fails only if a signal comes in. + */ +STATIC struct page * drbd_pp_alloc(drbd_dev *mdev, unsigned int gfp_mask) +{ + unsigned long flags=0; + struct page *page; + DEFINE_WAIT(wait); + + /* FIXME Add some useful watermark again to "kick_lo", if pages get + * used up too quickly. The watermark that had been in place here did + * not make sense. + */ + + spin_lock_irqsave(&drbd_pp_lock,flags); + /* This lock needs to lock out irq because we might call drbd_pp_free() + from IRQ context. + FIXME but why irq _save_ ? + this is only called from drbd_alloc_ee, + and that is strictly process context! */ + if ( (page = drbd_pp_pool) ) { + drbd_pp_pool = (struct page*)page_private(page); + drbd_pp_vacant--; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + if (page) goto got_page; + + drbd_kick_lo(mdev); + + for (;;) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + /* try the pool again, maybe the drbd_kick_lo set some free */ + spin_lock_irqsave(&drbd_pp_lock,flags); + if ( (page = drbd_pp_pool) ) { + drbd_pp_pool = (struct page*)page_private(page); + drbd_pp_vacant--; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + + if (page) break; + + /* hm. pool was empty. try to allocate from kernel. + * don't wait, if none is available, though. + */ + if ( atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers ) { + if( (page = alloc_page(GFP_TRY)) ) + break; + } + + /* doh. still no page. + * either used up the configured maximum number, + * or we are low on memory. + * wait for someone to return a page into the pool. + * unless, of course, someone signalled us.
+ */ + if (signal_pending(current)) { + WARN("drbd_pp_alloc interrupted!\n"); + finish_wait(&drbd_pp_wait, &wait); + return NULL; + } + drbd_kick_lo(mdev); + schedule(); + } + finish_wait(&drbd_pp_wait, &wait); + + got_page: + atomic_inc(&mdev->pp_in_use); + return page; +} + +STATIC void drbd_pp_free(drbd_dev *mdev,struct page *page) +{ + unsigned long flags=0; + int free_it; + + spin_lock_irqsave(&drbd_pp_lock,flags); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + free_it = 1; + } else { + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + drbd_pp_vacant++; + free_it = 0; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + + atomic_dec(&mdev->pp_in_use); + + if(free_it) __free_page(page); + + /* + * FIXME + * typically there are no waiters. + * we should try to avoid any unnecessary call to wake_up. + */ + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + drbd_free_ee() + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_done_ee() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + unsigned int gfp_mask) +{ + request_queue_t *q; + struct Tl_epoch_entry* e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + unsigned int ds; + int i; + + e = mempool_alloc(drbd_ee_mempool, gfp_mask); + if (!e) { + ERR("alloc_ee: Allocation of an EE failed\n"); + return NULL; + } + + bio = bio_alloc(GFP_KERNEL, div_ceil(data_size,PAGE_SIZE)); + if (!bio) { + ERR("alloc_ee: Allocation of a bio failed\n"); + goto fail1; + } + + bio->bi_bdev = mdev->bc->backing_bdev; + bio->bi_sector = sector; + + ds = data_size; + while(ds) { + page = drbd_pp_alloc(mdev, gfp_mask); + if (!page) { + ERR("alloc_ee: Allocation of a page failed\n"); + goto fail2; + } + if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { + drbd_pp_free(mdev,page); + ERR("alloc_ee: bio_add_page(s=%llu," + "data_size=%u,ds=%u) failed\n", + (unsigned long long)sector, data_size, ds); + + q = bdev_get_queue(bio->bi_bdev); + if (q->merge_bvec_fn) { + ERR("merge_bvec_fn() = %d\n", + q->merge_bvec_fn(q, bio, + &bio->bi_io_vec[bio->bi_vcnt])); + } + + /* dump more of the bio. 
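/* Editor's sketch (not part of the patch): drbd_pp_pool above is a LIFO
 * free list threaded through page_private(), capped by a vacancy limit in
 * drbd_pp_free(). A user-space analogue of that push/pop-with-cap scheme;
 * the names and the POOL_MAX value are hypothetical (cf. the
 * (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count cap in the original): */
#include <stdlib.h>

struct pp_page {
	struct pp_page *next;   /* plays the role of page_private() linkage */
	char data[4096];
};

static struct pp_page *pp_pool;   /* head of the free list */
static int pp_vacant;             /* pages currently parked in the pool */
#define POOL_MAX 128              /* hypothetical vacancy cap */

static struct pp_page *pp_alloc(void)
{
	struct pp_page *p = pp_pool;
	if (p) {                      /* fast path: pop from the pool */
		pp_pool = p->next;
		pp_vacant--;
		return p;
	}
	return malloc(sizeof(*p));    /* pool empty: fall back to the allocator */
}

static void pp_free(struct pp_page *p)
{
	if (pp_vacant > POOL_MAX) {   /* pool full enough: really free it */
		free(p);
		return;
	}
	p->next = pp_pool;            /* otherwise push back for reuse */
	pp_pool = p;
	pp_vacant++;
}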
*/ + DUMPI(bio->bi_max_vecs); + DUMPI(bio->bi_vcnt); + DUMPI(bio->bi_size); + DUMPI(bio->bi_phys_segments); + DUMPI(bio->bi_hw_segments); + + goto fail2; + break; + } + ds -= min_t(int, ds, PAGE_SIZE); + } + + D_ASSERT( data_size == bio->bi_size); + + bio->bi_private = e; + e->mdev = mdev; + e->sector = sector; + e->size = bio->bi_size; + + e->private_bio = bio; + e->block_id = id; + INIT_HLIST_NODE(&e->colision); + e->barrier_nr = 0; + e->barrier_nr2 = 0; + e->flags = 0; + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("allocated EE sec=%llus size=%u ee=%p\n", + (unsigned long long)sector,data_size,e); + ); + + return e; + + fail2: + __bio_for_each_segment(bvec, bio, i, 0) { + drbd_pp_free(mdev,bvec->bv_page); + } + bio_put(bio); + fail1: + mempool_free(e, drbd_ee_mempool); + + return NULL; +} + +void drbd_free_ee(drbd_dev *mdev, struct Tl_epoch_entry* e) +{ + struct bio *bio=e->private_bio; + struct bio_vec *bvec; + int i; + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Free EE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + + __bio_for_each_segment(bvec, bio, i, 0) { + drbd_pp_free(mdev,bvec->bv_page); + } + + bio_put(bio); + + D_ASSERT(hlist_unhashed(&e->colision)); + + mempool_free(e, drbd_ee_mempool); +} + +/* currently on module unload only */ +int drbd_release_ee(drbd_dev *mdev,struct list_head* list) +{ + int count=0; + struct Tl_epoch_entry* e; + struct list_head *le; + + spin_lock_irq(&mdev->req_lock); + while(!list_empty(list)) { + le = list->next; + e = list_entry(le, struct Tl_epoch_entry, w.list); + drbd_free_ee(mdev,e); + count++; + } + spin_unlock_irq(&mdev->req_lock); + + return count; +} + + +STATIC void reclaim_net_ee(drbd_dev *mdev) +{ + struct Tl_epoch_entry *e; + struct list_head *le,*tle; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct Tl_epoch_entry, w.list); + if( drbd_bio_has_active_page(e->private_bio) ) break; + list_del(le); + drbd_free_ee(mdev,e); + } +} + + +/* + * This function is called from _asender only_ + * but see also comments in _req_mod(,barrier_acked) + * and receive_Barrier_no_tcq. + * + * Move entries from net_ee to done_ee, if ready. + * Grab done_ee, call all callbacks, free the entries. + * The callbacks typically send out ACKs. + */ +STATIC int drbd_process_done_ee(drbd_dev *mdev) +{ + LIST_HEAD(work_list); + struct Tl_epoch_entry *e, *t; + int ok=1; + int do_clear_bit = test_bit(WRITE_ACK_PENDING,&mdev->flags); + + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev); + list_splice_init(&mdev->done_ee,&work_list); + spin_unlock_irq(&mdev->req_lock); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_discard_ack. + * all ignore the last argument. 
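/* Editor's note: div_ceil(data_size, PAGE_SIZE) in drbd_alloc_ee() above
 * sizes the bio's iovec array by rounding up. The usual idiom, as a
 * self-contained sketch (assumption: both arguments are positive): */
static inline unsigned int div_ceil_sketch(unsigned int n, unsigned int d)
{
	return (n + d - 1) / d;   /* e.g. div_ceil_sketch(9000, 4096) == 3 */
}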
+ */ + list_for_each_entry_safe(e, t, &work_list, w.list) { + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Process EE on done_ee sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + // list_del not necessary, next/prev members not touched + if (e->w.cb(mdev,&e->w,0) == 0) ok = 0; + drbd_free_ee(mdev,e); + } + if (do_clear_bit) + clear_bit(WRITE_ACK_PENDING,&mdev->flags); + wake_up(&mdev->ee_wait); + + return ok; +} + + + +/* clean-up helper for drbd_disconnect */ +void _drbd_clear_done_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry *e; + int n = 0; + + MUST_HOLD(&mdev->req_lock); + + reclaim_net_ee(mdev); + + while(!list_empty(&mdev->done_ee)) { + le = mdev->done_ee.next; + list_del(le); + e = list_entry(le, struct Tl_epoch_entry, w.list); + if(mdev->net_conf->wire_protocol == DRBD_PROT_C || + is_syncer_block_id(e->block_id)) { + ++n; + } + if(!hlist_unhashed(&e->colision)) hlist_del_init(&e->colision); + drbd_free_ee(mdev,e); + } + + sub_unacked(mdev, n); +} + +void _drbd_wait_ee_list_empty(drbd_dev *mdev,struct list_head *head) +{ + DEFINE_WAIT(wait); + MUST_HOLD(&mdev->req_lock); + + /* avoids spin_lock/unlock and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&mdev->ee_wait,&wait,TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + drbd_kick_lo(mdev); + schedule(); + finish_wait(&mdev->ee_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } +} + +void drbd_wait_ee_list_empty(drbd_dev *mdev,struct list_head *head) +{ + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, head); + spin_unlock_irq(&mdev->req_lock); +} + +STATIC struct socket* drbd_accept(drbd_dev *mdev,struct socket* sock) +{ + struct socket *newsock; + int err = 0; + + err = sock->ops->listen(sock, 5); + if (err) + goto out; + + if (sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock)) + goto out; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + err = newsock->ops->accept(sock, newsock, 0); + if (err < 0) + goto out_release; + + return newsock; + + out_release: + sock_release(newsock); + out: + if(err != -EAGAIN && err != -EINTR) + ERR("accept failed! 
%d\n", err); + return 0; +} + +STATIC int drbd_recv_short(drbd_dev *mdev, struct socket *sock, + void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); + + set_fs(oldfs); + + return rv; +} + +int drbd_recv(drbd_dev *mdev,void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for(;;) { + rv = sock_recvmsg(mdev->data.socket,&msg,size,msg.msg_flags); + if (rv == size) break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + INFO("sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + ERR("sock_recvmsg returned %d\n",rv); + break; + } else if (rv == 0) { + INFO("sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + // D_ASSERT(signal_pending(current)); + break; + } + }; + + set_fs(oldfs); + + if(rv != size) drbd_force_state(mdev,NS(conn,BrokenPipe)); + + return rv; +} + +STATIC struct socket *drbd_try_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock; + struct sockaddr_in src_in; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + if(!inc_net(mdev)) return NULL; + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + for the outgoing connections. + This is needed for multihomed hosts and to be + able to use lo: interfaces for drbd. + Make sure to use 0 as portnumber, so linux selects + a free one dynamically. 
+ */ + memcpy (&src_in, &(mdev->net_conf->my_addr), sizeof(struct sockaddr_in)); + src_in.sin_port = 0; + + err = sock->ops->bind(sock, + (struct sockaddr * ) &src_in, + sizeof (struct sockaddr_in)); + if (err) { + ERR("Unable to bind source sock (%d)\n", err); + sock_release(sock); + sock = NULL; + dec_net(mdev); + return sock; + } + + err = sock->ops->connect(sock, + (struct sockaddr *)mdev->net_conf->peer_addr, + mdev->net_conf->peer_addr_len, 0); + + if (err) { + sock_release(sock); + sock = NULL; + } + + dec_net(mdev); + return sock; +} + +STATIC struct socket *drbd_wait_for_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock,*sock2; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock2); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + if(!inc_net(mdev)) return NULL; + + sock2->sk->sk_reuse = 1; /* SO_REUSEADDR */ + sock2->sk->sk_rcvtimeo = + sock2->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + err = sock2->ops->bind(sock2, + (struct sockaddr *) mdev->net_conf->my_addr, + mdev->net_conf->my_addr_len); + dec_net(mdev); + + if (err) { + ERR("Unable to bind sock2 (%d)\n", err); + sock_release(sock2); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return NULL; + } + + sock = drbd_accept(mdev,sock2); + sock_release(sock2); + + return sock; +} + +STATIC int drbd_do_handshake(drbd_dev *mdev); +STATIC int drbd_do_auth(drbd_dev *mdev); + +STATIC int drbd_send_fp(drbd_dev *mdev,struct socket *sock,Drbd_Packet_Cmd cmd) +{ + Drbd_Header *h = (Drbd_Header *) &mdev->data.sbuf.head; + + return _drbd_send_cmd(mdev,sock,cmd,h,sizeof(*h),0); +} + +STATIC Drbd_Packet_Cmd drbd_recv_fp(drbd_dev *mdev,struct socket *sock) +{ + Drbd_Header *h = (Drbd_Header *) &mdev->data.sbuf.head; + int rr; + + rr = drbd_recv_short(mdev, sock, h, sizeof(*h)); + + if( rr==sizeof(*h) && h->magic==BE_DRBD_MAGIC ) { + return be16_to_cpu(h->command); + } + + return 0xffff; +} + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +int drbd_connect(drbd_dev *mdev) +{ + struct socket *s, *sock,*msock; + int try,h; + + D_ASSERT(mdev->state.conn >= Unconnected); + D_ASSERT(!mdev->data.socket); + + if(drbd_request_state(mdev,NS(conn,WFConnection)) < SS_Success ) return 0; + clear_bit(DISCARD_CONCURRENT, &mdev->flags); + + sock = NULL; + msock = NULL; + + do { + for(try=0;;) { // 3 tries, this should take less than a second! 
+ s=drbd_try_connect(mdev); + if(s || ++try >= 3 ) break; + // give the other side time to call bind() & listen() + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + + if(s) { + if( !sock ) { + if( drbd_send_fp(mdev, s, HandShakeS) ) { + sock = s; + s = NULL; + } + } else if( !msock ) { + if( drbd_send_fp(mdev, s, HandShakeM) ) { + msock = s; + s = NULL; + } + } else { + ERR("Logic error in drbd_connect()\n"); + return -1; + } + if(s) { + ERR("Error during sending initial packet.\n"); + sock_release(s); + } + } + + if(sock && msock) break; + + s=drbd_wait_for_connect(mdev); + if(s) { + switch(drbd_recv_fp(mdev,s)) { + case HandShakeS: + if(sock) sock_release(sock); + sock = s; + break; + case HandShakeM: + if(msock) sock_release(msock); + msock = s; + set_bit(DISCARD_CONCURRENT, &mdev->flags); + break; + default: + WARN("Error receiving initial packet\n"); + sock_release(s); + } + } + + if(mdev->state.conn <= Disconnecting) return -1; + if(signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) { + if(sock) sock_release(sock); + if(msock) sock_release(msock); + return -1; + } + } + + } while( !sock || !msock ); + + msock->sk->sk_reuse=1; /* SO_REUSEADDR */ + sock->sk->sk_reuse=1; /* SO_REUSEADDR */ + + sock->sk->sk_allocation = GFP_NOIO; + msock->sk->sk_allocation = GFP_NOIO; + + sock->sk->sk_priority=TC_PRIO_BULK; + // FIXME fold to limits. should be done in drbd_ioctl + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size; + /* NOT YET ... + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the HandShake timeout, wich is hardcoded for now: */ + sock->sk->sk_sndtimeo = + sock->sk->sk_rcvtimeo = 2*HZ; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; + + msock->sk->sk_priority=TC_PRIO_INTERACTIVE; + msock->sk->sk_sndbuf = 2*32767; + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + if(drbd_request_state(mdev,NS(conn,WFReportParams)) < SS_Success) return 0; + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) return h; + + if ( mdev->cram_hmac_tfm ) { + if (!drbd_do_auth(mdev)) { + ERR("Authentication of peer failed\n"); + return 0; + } + } + + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + atomic_set(&mdev->packet_seq,0); + mdev->peer_seq=0; + + drbd_thread_start(&mdev->asender); + + drbd_send_protocol(mdev); + drbd_send_sync_param(mdev,&mdev->sync_conf); + drbd_send_sizes(mdev); + drbd_send_uuids(mdev); + drbd_send_state(mdev); + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + + return 1; +} + +STATIC int drbd_recv_header(drbd_dev *mdev, Drbd_Header *h) +{ + int r; + + r = drbd_recv(mdev,h,sizeof(*h)); + + if (unlikely( r != sizeof(*h) )) { + ERR("short read expecting header on sock: r=%d\n",r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? 
on data m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +#if 0 +STATIC int receive_Barrier_tcq(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size=0; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if(list_empty(&mdev->active_ee)) { + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + } else if (mdev->last_write_w_barrier) { + mdev->last_write_w_barrier->barrier_nr2 = be32_to_cpu(p->barrier); + } else { + mdev->next_barrier_nr = be32_to_cpu(p->barrier); + } + spin_unlock_irq(&mdev->ee_lock); + + if(epoch_size) { + rv = drbd_send_b_ack(mdev, p->barrier, epoch_size); + dec_unacked(mdev); + } + + return rv; +} +#endif + +STATIC int receive_Barrier_no_tcq(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + if (mdev->net_conf->wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev,&mdev->active_ee); + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + spin_unlock_irq(&mdev->req_lock); + + /* FIXME CAUTION! receiver thread sending via msock. + * to make sure this BarrierAck will not be received before the asender + * had a chance to send all the write acks corresponding to this epoch, + * wait_for that bit to clear... */ + set_bit(WRITE_ACK_PENDING,&mdev->flags); + wake_asender(mdev); + rv = wait_event_interruptible(mdev->ee_wait, + !test_bit(WRITE_ACK_PENDING,&mdev->flags)); + + if (rv == 0 && mdev->state.conn >= Connected) + rv = drbd_send_b_ack(mdev, p->barrier, epoch_size); + else + rv = 0; + dec_unacked(mdev); + + return rv; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +STATIC struct Tl_epoch_entry * +read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) +{ + struct Tl_epoch_entry *e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int ds,i,rr; + + e = drbd_alloc_ee(mdev,id,sector,data_size,GFP_KERNEL); + if(!e) return 0; + bio = e->private_bio; + ds = data_size; + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + rr = drbd_recv(mdev,kmap(page),min_t(int,ds,PAGE_SIZE)); + kunmap(page); + if( rr != min_t(int,ds,PAGE_SIZE) ) { + drbd_free_ee(mdev,e); + WARN("short read receiving data: read %d expected %d\n", + rr, min_t(int,ds,PAGE_SIZE)); + return 0; + } + ds -= rr; + } + + mdev->recv_cnt+=data_size>>9; + return e; +} + +/* drbd_drain_block() just takes a data block out of the socket input + * buffer and discards ist. 
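/* Editor's sketch (not part of the patch): drbd_recv_header() above reads a
 * fixed-size header, checks a magic value, and converts the big-endian
 * command/length fields in place. The same shape in self-contained
 * user-space code; the struct layout and magic constant here are
 * illustrative assumptions, not DRBD's actual wire format: */
#include <stdint.h>
#include <arpa/inet.h>

#define SKETCH_MAGIC 0x12345678u   /* hypothetical magic value */

struct wire_header {
	uint32_t magic;    /* big endian on the wire */
	uint16_t command;  /* big endian on the wire */
	uint16_t length;   /* big endian on the wire */
};

static int parse_header(struct wire_header *h)
{
	if (ntohl(h->magic) != SKETCH_MAGIC)
		return 0;                      /* corrupt stream: caller drops link */
	h->command = ntohs(h->command);    /* convert once, in place */
	h->length = ntohs(h->length);
	return 1;
}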
+ */ +STATIC int +drbd_drain_block(drbd_dev *mdev, int data_size) +{ + struct page *page; + int rr, rv=1; + void* data; + + page = drbd_pp_alloc(mdev, GFP_KERNEL); + + data=kmap(page); + while(data_size) { + rr = drbd_recv(mdev,data,min_t(int,data_size,PAGE_SIZE)); + if( rr != min_t(int,data_size,PAGE_SIZE) ) { + rv = 0; + WARN("short read receiving data: read %d expected %d\n", + rr, min_t(int,data_size,PAGE_SIZE)); + goto out; + } + + data_size -= rr; + } + kunmap(page); + out: + drbd_pp_free(mdev,page); + return rv; +} + +/* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ +static void maybe_kick_lo(drbd_dev *mdev) +{ + if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark ) { + /* FIXME hysteresis ?? */ + drbd_kick_lo(mdev); + } +} + +STATIC int recv_dless_read(drbd_dev *mdev, drbd_request_t *req, + sector_t sector, int data_size) +{ + struct bio_vec *bvec; + struct bio *bio; + int rr,i,expect; + + bio = req->master_bio; + D_ASSERT( sector == bio->bi_sector ); + + bio_for_each_segment(bvec, bio, i) { + expect = min_t(int,data_size,bvec->bv_len); + rr=drbd_recv(mdev, + kmap(bvec->bv_page)+bvec->bv_offset, + expect); + kunmap(bvec->bv_page); + if (rr != expect) { + WARN("short read receiving data reply: read %d expected %d\n", + rr, expect); + return 0; + } + data_size -= rr; + } + + D_ASSERT(data_size == 0); + /* FIXME recv_cnt accounting ?? */ + return 1; +} + +/* e_end_resync_block() is called via + * drbd_process_done_ee() by asender only */ +STATIC int e_end_resync_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = e->sector; + int ok; + + D_ASSERT(hlist_unhashed(&e->colision)); + + if (likely( drbd_bio_uptodate(e->private_bio) )) { + drbd_set_in_sync(mdev, sector, e->size); + ok = drbd_send_ack(mdev,RSWriteAck,e); + } else { + // Record failure to sync + drbd_rs_failed_io(mdev, sector, e->size); + + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev, FALSE); + } + dec_unacked(mdev); + + return ok; +} + +STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector,int data_size) +{ + struct Tl_epoch_entry *e; + + e = read_in_block(mdev,ID_SYNCER,sector,data_size); + if(!e) return FALSE; + + dec_rs_pending(mdev); + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_resync_block; + + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list,&mdev->sync_ee); + spin_unlock_irq(&mdev->req_lock); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE (RS)WRITE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + drbd_generic_make_request(mdev,WRITE,DRBD_FAULT_RS_WR,e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; +} + +STATIC int receive_DataReply(drbd_dev *mdev,Drbd_Header* h) +{ + drbd_request_t *req; + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, + * and no more than DRBD_MAX_SEGMENT_SIZE. + * is this too restrictive? 
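/* Editor's note: the receive paths above apply three sanity checks to
 * data_size: non-zero, a multiple of the 512-byte sector size, and no
 * larger than one segment. As a self-contained sketch (32768 stands in for
 * DRBD_MAX_SEGMENT_SIZE, an assumption): */
static int data_size_ok(unsigned int size)
{
	if (size == 0)
		return 0;
	if (size & 0x1ff)    /* not a multiple of 512-byte sectors */
		return 0;
	if (size > 32768)    /* larger than one segment */
		return 0;
	return 1;
}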
*/ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev,p->block_id, sector); + spin_unlock_irq(&mdev->req_lock); + if (unlikely(!req)) { + ERR("Got a corrupt block_id/sector pair(1).\n"); + return FALSE; + } + + /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + ok = recv_dless_read(mdev,req,sector,data_size); + + if (ok) req_mod(req, data_received, 0); + /* else: nothing. handled from drbd_disconnect... + * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return ok; +} + +STATIC int receive_RSDataReply(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, + * and no more than DRBD_MAX_SEGMENT_SIZE. + * is this too restrictive? */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + if(inc_local(mdev)) { + /* data is submitted to disk within recv_resync_read. + * corresponding dec_local done below on error, + * or in drbd_endio_write_sec. */ + /* FIXME paranoia: + * verify that the corresponding bit is set. + * in case we are Primary SyncTarget, + * verify there are no pending write request to that area. + */ + ok = recv_resync_read(mdev,sector,data_size); + if (!ok) dec_local(mdev); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write resync data to local disk.\n"); + + ok = drbd_drain_block(mdev,data_size); + + drbd_send_ack_dp(mdev,NegAck,p); + } + + return ok; +} + +/* e_end_block() is called via drbd_process_done_ee(). + * this means this function only runs in the asender thread + * + * for a broken example implementation of the TCQ barrier version of + * e_end_block see older revisions... + */ +STATIC int e_end_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = e->sector; + // unsigned int epoch_size; + int ok=1,pcmd; + + if(mdev->net_conf->wire_protocol == DRBD_PROT_C) { + if(likely(drbd_bio_uptodate(e->private_bio))) { + pcmd = (mdev->state.conn >= SyncSource && + mdev->state.conn <= PausedSyncT && + e->flags & EE_MAY_SET_IN_SYNC) ? + RSWriteAck : WriteAck; + ok &= drbd_send_ack(mdev,pcmd,e); + if(pcmd==RSWriteAck) + drbd_set_in_sync(mdev,sector,e->size); + } else { + /* FIXME I think we should send a NegAck regardless of + * which protocol is in effect. + * In which case we would need to make sure that any + * NegAck is sent. basically that means that drbd_process_done_ee + * may not list_del() the ee before this callback did run... + * maybe even move the list_del(e) in here... */ + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev, FALSE); + /* we expect it to be marked out of sync anyways... + * maybe assert this? 
*/ + } + dec_unacked(mdev); + } else if(unlikely(!drbd_bio_uptodate(e->private_bio))) { + ok = drbd_io_error(mdev, FALSE); + } + + /* we delete from the conflict detection hash _after_ we sent out the + * WriteAck / NegAck, to get the sequence number right. */ + if (mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + } else { + D_ASSERT(hlist_unhashed(&e->colision)); + } + + return ok; +} + +STATIC int e_send_discard_ack(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok=1; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ok = drbd_send_ack(mdev,DiscardAck,e); + + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + + dec_unacked(mdev); + + return ok; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a Data packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking Data packets. + * + * In case packet_seq is larger than mdev->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update mdev->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * FIXME verify that atomic_t guarantees 32bit wrap around, + * otherwise we have to play tricks with << ... + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ +static int drbd_wait_peer_seq(drbd_dev *mdev, const u32 packet_seq) +{ + DEFINE_WAIT(wait); + int ret = 0; + spin_lock(&mdev->peer_seq_lock); + for (;;) { + prepare_to_wait(&mdev->seq_wait,&wait,TASK_INTERRUPTIBLE); + if (seq_le(packet_seq,mdev->peer_seq+1)) + break; + spin_unlock(&mdev->peer_seq_lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + schedule(); + spin_lock(&mdev->peer_seq_lock); + } + finish_wait(&mdev->seq_wait, &wait); + if (mdev->peer_seq+1 == packet_seq) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + return ret; +} + +// mirrored write +STATIC int receive_Data(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + struct Tl_epoch_entry *e; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + int header_size, data_size; + unsigned int barrier_nr = 0; + unsigned int epoch_size = 0; + u32 dp_flags; + + // FIXME merge this code dups into some helper function + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + if(!inc_local(mdev)) { + /* data is submitted to disk at the end of this function. + * corresponding dec_local done either below (on error), + * or in drbd_endio_write_sec. 
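/* Editor's note: the seq_le() used by drbd_wait_peer_seq() above has to
 * order sequence numbers correctly across 32-bit wrap. Comparing the
 * signed difference does exactly that, assuming the in-flight window stays
 * below 2^31 (which the comment above argues it does by a wide margin): */
#include <stdint.h>

static int seq_le_sketch(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) <= 0;   /* true iff a is not "after" b */
}
/* e.g. seq_le_sketch(0xfffffffe, 2) is true: 2 lies just past the wrap. */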
*/ + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write mirrored data block to local disk.\n"); + spin_lock(&mdev->peer_seq_lock); + if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + + drbd_send_ack_dp(mdev,NegAck,p); + mdev->epoch_size++; // spin lock ? + return drbd_drain_block(mdev,data_size); + } + + sector = be64_to_cpu(p->sector); + e = read_in_block(mdev,p->block_id,sector,data_size); + if (!e) { + dec_local(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_block; + + dp_flags = be32_to_cpu(p->dp_flags); + if ( dp_flags & DP_HARDBARRIER ) { + e->private_bio->bi_rw |= BIO_RW_BARRIER; + } + if ( dp_flags & DP_RW_SYNC ) { + e->private_bio->bi_rw |= BIO_RW_SYNC; + } + if ( dp_flags & DP_MAY_SET_IN_SYNC ) { + e->flags |= EE_MAY_SET_IN_SYNC; + } + + /* I'm the receiver, I do hold a net_cnt reference. */ + if (!mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + } else { + /* don't get the req_lock yet, + * we may sleep in drbd_wait_peer_seq */ + const sector_t sector = e->sector; + const int size = e->size; + const int discard = test_bit(DISCARD_CONCURRENT,&mdev->flags); + DEFINE_WAIT(wait); + drbd_request_t *i; + struct hlist_node *n; + struct hlist_head *slot; + int first; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + BUG_ON(mdev->ee_hash == NULL); + BUG_ON(mdev->tl_hash == NULL); + + /* conflict detection and handling: + * 1. wait on the sequence number, + * in case this data packet overtook ACK packets. + * 2. check our hash tables for conflicting requests. + * we only need to walk the tl_hash, since an ee can not + * have a conflict with an other ee: on the submitting + * node, the corresponding req had already been conflicting, + * and a conflicting req is never sent. + * + * Note: for two_primaries, we are protocol C, + * so there cannot be any request that is DONE + * but still on the transfer log. + * + * unconditionally add to the ee_hash. + * + * if no conflicting request is found: + * submit. + * + * if any conflicting request is found that has not yet been acked, + * AND I have the "discard concurrent writes" flag: + * queue (via done_ee) the DiscardAck; OUT. + * + * if any conflicting request is found: + * block the receiver, waiting on misc_wait + * until no more conflicting requests are there, + * or we get interrupted (disconnect). + * + * we do not just write after local io completion of those + * requests, but only after req is done completely, i.e. + * we wait for the DiscardAck to arrive! + * + * then proceed normally, i.e. submit. + */ + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) + goto out_interrupted; + + spin_lock_irq(&mdev->req_lock); + + hlist_add_head(&e->colision,ee_hash_slot(mdev,sector)); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev,sector); + first = 1; + for(;;) { + int have_unacked = 0; + int have_conflict = 0; + prepare_to_wait(&mdev->misc_wait,&wait,TASK_INTERRUPTIBLE); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + if (first) { + /* only ALERT on first iteration, + * we may be woken up early... */ + ALERT("%s[%u] Concurrent local write detected!" 
+ " new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + } + if (i->rq_state & RQ_NET_PENDING) ++have_unacked; + ++have_conflict; + } + } +#undef OVERLAPS + if (!have_conflict) break; + + /* Discard Ack only for the _first_ iteration */ + if (first && discard && have_unacked) { + ALERT("Concurrent write! [DISCARD BY FLAG] sec=%llus\n", + (unsigned long long)sector); + inc_unacked(mdev); + mdev->epoch_size++; + e->w.cb = e_send_discard_ack; + list_add_tail(&e->w.list,&mdev->done_ee); + + spin_unlock_irq(&mdev->req_lock); + + /* we could probably send that DiscardAck ourselves, + * but I don't like the receiver using the msock */ + + dec_local(mdev); + wake_asender(mdev); + finish_wait(&mdev->misc_wait, &wait); + return TRUE; + } + + if (signal_pending(current)) { + hlist_del_init(&e->colision); + + spin_unlock_irq(&mdev->req_lock); + + finish_wait(&mdev->misc_wait, &wait); + goto out_interrupted; + } + + spin_unlock_irq(&mdev->req_lock); + if (first) { + first = 0; + ALERT("Concurrent write! [W AFTERWARDS] " + "sec=%llus\n",(unsigned long long)sector); + } else if (discard) { + /* we had none on the first iteration. + * there must be none now. */ + D_ASSERT(have_unacked == 0); + } + schedule(); + spin_lock_irq(&mdev->req_lock); + } + finish_wait(&mdev->misc_wait, &wait); + } + + /* when using TCQ: + * note that, when using tagged command queuing, we may + * have more than one reorder domain "active" at a time. + * + * THINK: + * do we have any guarantees that we get the completion + * events of the different reorder domains in order? + * or does the api only "guarantee" that the events + * _happened_ in order, but eventually the completion + * callbacks are shuffeled again? + * + * note that I wonder about the order in which the + * callbacks are run, I am reasonable confident that the + * actual completion happens in order. + * + * - can it happen that the tagged write completion is + * called even though not all of the writes before it + * have run their completion callback? + * - can it happen that some completion callback of some + * write after the tagged one is run, even though the + * callback of the tagged one itself is still pending? + * + * if this can happen, we either need to drop our "debug + * assertion" about the epoch size and just trust our code + * and the layers below us (nah, won't do that). + * + * or we need to replace the "active_ee" list by some sort + * of "transfer log" on the receiving side, too, which + * uses epoch counters per reorder domain. + */ + + /* when using tcq: + * if we got a barrier packet before, but at that time the active_ee + * was not yet empty, we just "remembered" this barrier request. + * + * if this is the first data packet since that barrier, maybe meanwhile + * all previously active writes have been completed? + * if so, send the b_ack right now + * (though, maybe rather move it into the e_end_block callback, + * where it would be sent as soon as possible). + * + * otherwise, tag the write with the barrier number, so it + * will trigger the b_ack before its own ack. 
+ */ + if (mdev->next_barrier_nr) { + /* only when using TCQ */ + if (list_empty(&mdev->active_ee)) { + barrier_nr = mdev->next_barrier_nr; + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + } else { + e->barrier_nr = mdev->next_barrier_nr; + } + e->private_bio->bi_rw |= BIO_RW_BARRIER; + mdev->next_barrier_nr = 0; + } + list_add(&e->w.list,&mdev->active_ee); + spin_unlock_irq(&mdev->req_lock); + + if (barrier_nr) { + /* only when using TCQ + * maybe rather move it into the e_end_block callback, + * where it would be sent as soon as possible). + */ + (void)drbd_send_b_ack(mdev, cpu_to_be32(barrier_nr), epoch_size); + } + + switch(mdev->net_conf->wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + break; + case DRBD_PROT_B: + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(mdev, RecvAck, e); + break; + case DRBD_PROT_A: + // nothing to do + break; + } + + if(mdev->state.pdsk == Diskless) { + // In case we have the only disk of the cluster, + drbd_set_out_of_sync(mdev,e->sector,e->size); + e->flags |= EE_CALL_AL_COMPLETE_IO; + drbd_al_begin_io(mdev, e->sector); + } + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE (DATA)WRITE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + /* FIXME drbd_al_begin_io in case we have two primaries... */ + drbd_generic_make_request(mdev,WRITE,DRBD_FAULT_DT_WR,e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; + + out_interrupted: + /* yes, the epoch_size now is imbalanced. + * but we drop the connection anyways, so we don't have a chance to + * receive a barrier... atomic_inc(&mdev->epoch_size); */ + dec_local(mdev); + drbd_free_ee(mdev,e); + return FALSE; +} + +STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct Tl_epoch_entry *e; + int size; + unsigned int fault_type; + Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector,size); + return FALSE; + } + if ( sector + (size>>9) > capacity) { + ERR("%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector,size); + return FALSE; + } + + if(!inc_local_if_state(mdev, UpToDate)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not satisfy peer's read request, no local data.\n"); + drbd_send_ack_rp(mdev,h->command == DataRequest ? NegDReply : + NegRSDReply ,p); + return TRUE; + } + + e = drbd_alloc_ee(mdev,p->block_id,sector,size,GFP_KERNEL); + if (!e) { + dec_local(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_read_sec; + + switch (h->command) { + case DataRequest: + e->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + break; + case RSDataRequest: + e->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchrously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... 
+ */ + if (!drbd_rs_begin_io(mdev,sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + dec_local(mdev); + drbd_free_ee(mdev,e); + return 0; + } + break; + default:; /* avoid compiler warning */ + fault_type = DRBD_FAULT_MAX; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list,&mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + inc_unacked(mdev); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE READ sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + /* FIXME actually, it could be a READA originating from the peer ... */ + drbd_generic_make_request(mdev,READ,fault_type,e->private_bio); + maybe_kick_lo(mdev); + + return TRUE; +} + +STATIC int drbd_asb_recover_0p(drbd_dev *mdev) +{ + int self, peer, rv=-100; + unsigned long ch_self, ch_peer; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + ch_peer = mdev->p_uuid[UUID_SIZE]; + ch_self = mdev->comm_bm_set; + + switch ( mdev->net_conf->after_sb_0p ) { + case Consensus: + case DiscardSecondary: + case CallHelper: + ERR("Configuration error.\n"); + break; + case Disconnect: + break; + case DiscardYoungerPri: + if (self == 0 && peer == 1) { rv = -1; break; } + if (self == 1 && peer == 0) { rv = 1; break; } + /* Else fall through to one of the other strategies... */ + case DiscardOlderPri: + if (self == 0 && peer == 1) { rv = 1; break; } + if (self == 1 && peer == 0) { rv = -1; break; } + /* Else fall through to one of the other strategies... */ + WARN("Discard younger/older primary did not find a decision\n" + "Using discard-least-changes instead\n"); + case DiscardZeroChg: + if( ch_peer == 0 && ch_self == 0) { + rv=test_bit(DISCARD_CONCURRENT,&mdev->flags) ? -1 : 1; + break; + } else { + if ( ch_peer == 0 ) { rv = 1; break; } + if ( ch_self == 0 ) { rv = -1; break; } + } + if( mdev->net_conf->after_sb_0p == DiscardZeroChg ) break; + case DiscardLeastChg: + if ( ch_self < ch_peer ) rv = -1; + else if ( ch_self > ch_peer ) rv = 1; + else /* ( ch_self == ch_peer ) */ { + // Well, then use something else. + rv=test_bit(DISCARD_CONCURRENT,&mdev->flags) ? -1 : 1; + } + break; + case DiscardLocal: + rv = -1; + break; + case DiscardRemote: + rv = 1; + } + + return rv; +} + +STATIC int drbd_asb_recover_1p(drbd_dev *mdev) +{ + int self, peer, hg, rv=-100; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + switch ( mdev->net_conf->after_sb_1p ) { + case DiscardYoungerPri: + case DiscardOlderPri: + case DiscardLeastChg: + case DiscardLocal: + case DiscardRemote: + ERR("Configuration error.\n"); + break; + case Disconnect: + break; + case Consensus: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 && mdev->state.role==Secondary) rv=hg; + if( hg == 1 && mdev->state.role==Primary) rv=hg; + break; + case Violently: + rv = drbd_asb_recover_0p(mdev); + break; + case DiscardSecondary: + return mdev->state.role==Primary ?
1 : -1; + case CallHelper: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 && mdev->state.role==Primary) { + self = drbd_set_role(mdev,Secondary,0); + if (self != SS_Success) { + drbd_khelper(mdev,"pri-lost-after-sb"); + } else { + WARN("Sucessfully gave up primary role.\n"); + rv = hg; + } + } else rv = hg; + } + + return rv; +} + +STATIC int drbd_asb_recover_2p(drbd_dev *mdev) +{ + int self, peer, hg, rv=-100; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + switch ( mdev->net_conf->after_sb_2p ) { + case DiscardYoungerPri: + case DiscardOlderPri: + case DiscardLeastChg: + case DiscardLocal: + case DiscardRemote: + case Consensus: + case DiscardSecondary: + ERR("Configuration error.\n"); + break; + case Violently: + rv = drbd_asb_recover_0p(mdev); + break; + case Disconnect: + break; + case CallHelper: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 ) { + self = drbd_set_role(mdev,Secondary,0); + if (self != SS_Success) { + drbd_khelper(mdev,"pri-lost-after-sb"); + } else { + WARN("Sucessfully gave up primary role.\n"); + rv = hg; + } + } else rv = hg; + } + + return rv; +} + +STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid) +{ + INFO("%s %016llX:%016llX:%016llX:%016llX\n", + text, + uuid[Current], + uuid[Bitmap], + uuid[History_start], + uuid[History_end]); +} + +/* + 100 after split brain try auto recover + 2 SyncSource set BitMap + 1 SyncSource use BitMap + 0 no Sync + -1 SyncTarget use BitMap + -2 SyncTarget set BitMap + -100 after split brain, disconnect +-1000 unrelated data + */ +STATIC int drbd_uuid_compare(drbd_dev *mdev, int *rule_nr) +{ + u64 self, peer; + int i,j; + + self = mdev->bc->md.uuid[Current] & ~((u64)1); + peer = mdev->p_uuid[Current] & ~((u64)1); + + *rule_nr = 1; + if (self == UUID_JUST_CREATED && + peer == UUID_JUST_CREATED) return 0; + + *rule_nr = 2; + if ( (self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) return -2; + + *rule_nr = 3; + if ( self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0) ) return 2; + + *rule_nr = 4; + if (self == peer) { // Common power [off|failure] + int rct,dc; // roles at crash time + + rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + + ( mdev->p_uuid[UUID_FLAGS] & 2 ); + // lowest bit is set when we were primary + // next bit (weight 2) is set when peer was primary + + MTRACE(TraceTypeUuid,TraceLvlMetrics, DUMPI(rct); ); + + switch(rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(DISCARD_CONCURRENT,&mdev->flags); + MTRACE(TraceTypeUuid,TraceLvlMetrics, DUMPI(dc); ); + return dc ? 
-1 : 1; + } + } + + *rule_nr = 5; + peer = mdev->p_uuid[Bitmap] & ~((u64)1); + if (self == peer) return -1; + + *rule_nr = 6; + for ( i=History_start ; i<=History_end ; i++ ) { + peer = mdev->p_uuid[i] & ~((u64)1); + if (self == peer) return -2; + } + + *rule_nr = 7; + self = mdev->bc->md.uuid[Bitmap] & ~((u64)1); + peer = mdev->p_uuid[Current] & ~((u64)1); + if (self == peer) return 1; + + *rule_nr = 8; + for ( i=History_start ; i<=History_end ; i++ ) { + self = mdev->bc->md.uuid[i] & ~((u64)1); + if (self == peer) return 2; + } + + *rule_nr = 9; + self = mdev->bc->md.uuid[Bitmap] & ~((u64)1); + peer = mdev->p_uuid[Bitmap] & ~((u64)1); + if (self == peer && self != ((u64)0) ) return 100; + + *rule_nr = 10; + for ( i=History_start ; i<=History_end ; i++ ) { + self = mdev->p_uuid[i] & ~((u64)1); + for ( j=History_start ; j<=History_end ; j++ ) { + peer = mdev->p_uuid[j] & ~((u64)1); + if (self == peer) return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + conn_mask (-1) on failure. + */ +STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, + drbd_disks_t peer_disk) +{ + int hg,rule_nr; + drbd_conns_t rv = conn_mask; + drbd_disks_t mydisk; + + mydisk = mdev->state.disk; + if( mydisk == Negotiating ) mydisk = mdev->new_state_tmp.disk; + + hg = drbd_uuid_compare(mdev,&rule_nr); + + MTRACE(TraceTypeUuid,TraceLvlSummary, + INFO("drbd_sync_handshake:\n"); + drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); + drbd_uuid_dump(mdev,"peer",mdev->p_uuid); + INFO("uuid_compare()=%d by rule %d\n",hg,rule_nr); + ); + + if (hg == -1000) { + ALERT("Unrelated data, dropping connection!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if( (mydisk==Inconsistent && peer_disk>Inconsistent) || + (peer_disk==Inconsistent && mydisk>Inconsistent) ) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > Inconsistent ? 1 : -1; + if(f) hg=hg*2; + INFO("Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp) ) { + int pcount = (mdev->state.role==Primary) + (peer_role==Primary); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(mdev); + break; + case 1: + hg = drbd_asb_recover_1p(mdev); + break; + case 2: + hg = drbd_asb_recover_2p(mdev); + break; + } + if ( abs(hg) < 100 ) { + WARN("Split-Brain detected, %d primaries, automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? "peer":"this"); + if(forced) { + WARN("Doing a full sync, since" + " UUIDs where ambiguous.\n"); + drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); + drbd_uuid_dump(mdev,"peer",mdev->p_uuid); + hg=hg*2; + } + } + } + + if ( hg == -100 ) { + if(mdev->net_conf->want_lose && !(mdev->p_uuid[UUID_FLAGS]&1)){ + hg = -1; + } + if(!mdev->net_conf->want_lose && (mdev->p_uuid[UUID_FLAGS]&1)){ + hg = 1; + } + + if ( abs(hg) < 100 ) { + WARN("Split-Brain detected, manually solved. Sync from %s node\n", + (hg < 0) ? "peer":"this"); + } + } + + if (hg == -100) { + ALERT("Split-Brain detected, dropping connection!\n"); + drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); + drbd_uuid_dump(mdev,"peer",mdev->p_uuid); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if (hg > 0 && mydisk <= Inconsistent ) { + ERR("I shall become SyncSource, but I am inconsistent!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if (hg < 0 && // by intention we do not use mydisk here. 
+ mdev->state.role == Primary && mdev->state.disk >= Consistent ) { + switch(mdev->net_conf->rr_conflict) { + case CallHelper: + drbd_khelper(mdev,"pri-lost"); + // fall through + case Disconnect: + ERR("I shall become SyncTarget, but I am primary!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + case Violently: + WARN("Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (abs(hg) >= 2) { + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_set_all(mdev); + + if (unlikely(drbd_bm_write(mdev) < 0)) { + return conn_mask; + } + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + } + + if (hg > 0) { // become sync source. + rv = WFBitMapS; + } else if (hg < 0) { // become sync target + rv = WFBitMapT; + } else { + rv = Connected; + if(drbd_bm_total_weight(mdev)) { + INFO("No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(mdev)); + } + } + + drbd_bm_recount_bits(mdev); + + return rv; +} + +/* returns 1 if invalid */ +STATIC int cmp_after_sb(enum after_sb_handler peer, enum after_sb_handler self) +{ + // DiscardRemote - DiscardLocal is valid + if( (peer == DiscardRemote && self == DiscardLocal) || + (self == DiscardRemote && peer == DiscardLocal) ) return 0; + + // any other things with DiscardRemote or DiscardLocal are invalid + if( peer == DiscardRemote || peer == DiscardLocal || + self == DiscardRemote || self == DiscardLocal ) return 1; + + // everything else is valid if they are equal on both sides. + if( peer == self ) return 0; + + // everything es is invalid. + return 1; +} + +STATIC int receive_protocol(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Protocol_Packet *p = (Drbd_Protocol_Packet*)h; + + int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_want_lose, p_two_primaries; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_want_lose = be32_to_cpu(p->want_lose); + p_two_primaries = be32_to_cpu(p->two_primaries); + + if( p_proto != mdev->net_conf->wire_protocol) { + ERR("incompatible communication protocols\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p) ) { + ERR("incompatible after-sb-0pri settings\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p) ) { + ERR("incompatible after-sb-1pri settings\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p) ) { + ERR("incompatible after-sb-2pri settings\n"); + goto disconnect; + } + + if( p_want_lose && mdev->net_conf->want_lose ) { + ERR("both sides have the 'want_lose' flag set\n"); + goto disconnect; + } + + if( p_two_primaries != mdev->net_conf->two_primaries ) { + ERR("incompatible setting of the two-primaries options\n"); + goto disconnect; + } + + return TRUE; + + disconnect: + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; +} + +STATIC int receive_SyncParam(drbd_dev *mdev,Drbd_Header *h) +{ + int ok = TRUE; + Drbd_SyncParam_Packet *p = (Drbd_SyncParam_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + // XXX harmless race with ioctl ... 
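/* Editor's note: cmp_after_sb() above accepts exactly one asymmetric pair,
 * DiscardRemote on one side with DiscardLocal on the other; any other mix
 * involving those two values is invalid, and everything else must match
 * exactly. A quick self-contained restatement of that rule (reduced enum,
 * and note this helper returns 1 for "compatible", the inverse sense of
 * cmp_after_sb()): */
enum asb { ASB_DISCONNECT, ASB_DISCARD_LOCAL, ASB_DISCARD_REMOTE };

static int asb_compatible(enum asb peer, enum asb self)
{
	if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
	    (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
		return 1;                          /* the one valid mixed pair */
	if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
	    self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
		return 0;                          /* any other mix is invalid */
	return peer == self;                       /* otherwise: must agree */
}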
+ mdev->sync_conf.rate = be32_to_cpu(p->rate); + + return ok; +} + +STATIC void drbd_setup_order_type(drbd_dev *mdev, int peer) +{ +#if 0 + int self = drbd_queue_order_type(mdev); + int type; + + static char *order_txt[] = { + [QUEUE_ORDERED_NONE] = "none - oldIDE", + [QUEUE_ORDERED_FLUSH] = "flush - IDE", + [QUEUE_ORDERED_TAG] = "tag - TCQ", + }; + + if(self == QUEUE_ORDERED_NONE || + peer == QUEUE_ORDERED_NONE) { + type = QUEUE_ORDERED_NONE; + } else if (self == QUEUE_ORDERED_FLUSH || + peer == QUEUE_ORDERED_FLUSH) { + type = QUEUE_ORDERED_FLUSH; + } else if(self == QUEUE_ORDERED_TAG || + peer == QUEUE_ORDERED_TAG) { + type = QUEUE_ORDERED_TAG; + } else { + D_ASSERT(0); + type = QUEUE_ORDERED_NONE; + } + + if (type != self ) { + INFO("Exposing an order type of '%s' to the kernel\n", + order_txt[type]); + blk_queue_ordered(mdev->rq_queue,type); + } +#endif +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(drbd_dev *mdev, const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) return; + d = (a > b) ? (a - b) : (b - a); + if ( d > (a>>3) || d > (b>>3)) { + WARN("Considerable difference in %s: %llus vs. %llus\n", s, + (unsigned long long)a, (unsigned long long)b); + } +} + +STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Sizes_Packet *p = (Drbd_Sizes_Packet*)h; + unsigned int max_seg_s; + sector_t p_size, p_usize, my_usize; + drbd_conns_t nconn; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_size=be64_to_cpu(p->d_size); + p_usize=be64_to_cpu(p->u_size); + + if(p_size == 0 && mdev->state.disk == Diskless ) { + ERR("some backing storage is needed\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; + } + +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if(inc_local(mdev)) { + warn_if_differ_considerably(mdev, "lower level device sizes", + p_size, drbd_get_capacity(mdev->bc->backing_bdev)); + warn_if_differ_considerably(mdev, "user requested size", + p_usize, mdev->bc->dc.disk_size); + + if (mdev->state.conn == WFReportParams) { + /* this is first connect, or an otherwise expected + param exchange. choose the minimum */ + p_usize=min_not_zero((sector_t)mdev->bc->dc.disk_size, + p_usize); + } + + my_usize = mdev->bc->dc.disk_size; + + if( mdev->bc->dc.disk_size != p_usize ) { + mdev->bc->dc.disk_size = p_usize; + INFO("Peer sets u_size to %lu KB\n", + (unsigned long)mdev->bc->dc.disk_size); + } + + // Never shrink a device with usable data. + if(drbd_new_dev_size(mdev,mdev->bc) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= Outdated ) { + dec_local(mdev); + ERR("The peer's disk size is too small!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + mdev->bc->dc.disk_size = my_usize; + return FALSE; + } + dec_local(mdev); + } +#undef min_not_zero + + mdev->p_size=p_size; + if(inc_local(mdev)) { + drbd_bm_lock(mdev); // { + /* + * you may get a flip-flop connection established/connection loss, + * in case both really have different usize uppon first connect! + * try to solve it thus: + ***/ + + drbd_determin_dev_size(mdev); + drbd_bm_unlock(mdev); // } + dec_local(mdev); + } else { + // I am diskless, need to accept the peer's size. 
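/* Editor's note: the min_not_zero() macro used in receive_sizes() above
 * expands without outer parentheses and evaluates its arguments more than
 * once, so it is only safe in the simple assignment it appears in. A
 * sketch of a safer functional equivalent for sector counts: */
static unsigned long long min_not_zero_sketch(unsigned long long l,
                                              unsigned long long r)
{
	if (l == 0)
		return r;
	if (r == 0)
		return l;
	return l < r ? l : r;   /* both non-zero: plain minimum */
}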
+ drbd_set_my_capacity(mdev,p_size);
+ }
+
+ if (mdev->p_uuid && mdev->state.conn <= Connected && inc_local(mdev)) {
+ nconn=drbd_sync_handshake(mdev,mdev->state.peer,mdev->state.pdsk);
+ dec_local(mdev);
+
+ if(nconn == conn_mask) return FALSE;
+
+ if(drbd_request_state(mdev,NS(conn,nconn)) < SS_Success) {
+ drbd_force_state(mdev,NS(conn,Disconnecting));
+ return FALSE;
+ }
+ }
+
+ if(inc_local(mdev)) {
+ max_seg_s = be32_to_cpu(p->max_segment_size);
+ if( max_seg_s != mdev->rq_queue->max_segment_size ) {
+ drbd_setup_queue_param(mdev, max_seg_s);
+ }
+
+ drbd_setup_order_type(mdev,be32_to_cpu(p->queue_order_type));
+ dec_local(mdev);
+ }
+
+ if (mdev->state.conn > WFReportParams ) {
+ if( be64_to_cpu(p->c_size) !=
+ drbd_get_capacity(mdev->this_bdev) ) {
+ // we have different sizes, probably the peer
+ // needs to know my new size...
+ drbd_send_sizes(mdev);
+ }
+ }
+
+ return TRUE;
+}
+
+STATIC int receive_uuids(drbd_dev *mdev, Drbd_Header *h)
+{
+ Drbd_GenCnt_Packet *p = (Drbd_GenCnt_Packet*)h;
+ u64 *p_uuid;
+ int i;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ p_uuid = kmalloc(sizeof(u64)*EXT_UUID_SIZE, GFP_KERNEL);
+ if (p_uuid == NULL)
+ return FALSE;
+
+ for (i = Current; i < EXT_UUID_SIZE; i++) {
+ p_uuid[i] = be64_to_cpu(p->uuid[i]);
+ }
+
+ if ( mdev->p_uuid ) kfree(mdev->p_uuid);
+ mdev->p_uuid = p_uuid;
+
+ return TRUE;
+}
+
+/**
+ * convert_state:
+ * Switches the view of the state: maps the state as seen by the peer
+ * onto our own view (role/peer and disk/pdsk swap places).
+ */
+STATIC drbd_state_t convert_state(drbd_state_t ps)
+{
+ drbd_state_t ms;
+
+ static drbd_conns_t c_tab[] = {
+ [Connected] = Connected,
+
+ [StartingSyncS] = StartingSyncT,
+ [StartingSyncT] = StartingSyncS,
+ [Disconnecting] = TearDown, // NetworkFailure,
+
+ [conn_mask] = conn_mask,
+ };
+
+ ms.i = ps.i;
+
+ ms.conn = c_tab[ps.conn];
+ ms.peer = ps.role;
+ ms.role = ps.peer;
+ ms.pdsk = ps.disk;
+ ms.disk = ps.pdsk;
+ ms.peer_isp = ( ps.aftr_isp | ps.user_isp );
+
+ return ms;
+}
+
+STATIC int receive_req_state(drbd_dev *mdev, Drbd_Header *h)
+{
+ Drbd_Req_State_Packet *p = (Drbd_Req_State_Packet*)h;
+ drbd_state_t mask,val;
+ int rv;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_lock(mdev);
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = drbd_change_state(mdev,ChgStateVerbose,mask,val);
+
+ if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_unlock(mdev);
+
+ drbd_send_sr_reply(mdev,rv);
+ drbd_md_sync(mdev);
+
+ return TRUE;
+}
+
+STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h)
+{
+ Drbd_State_Packet *p = (Drbd_State_Packet*)h;
+ drbd_conns_t nconn;
+ drbd_state_t os,ns,peer_state;
+ int rv;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ nconn = mdev->state.conn;
+ if (nconn == WFReportParams ) nconn = Connected;
+
+ peer_state.i = be32_to_cpu(p->state);
+
+ if (mdev->p_uuid && mdev->state.conn <= Connected &&
+ inc_local_if_state(mdev,Negotiating) &&
+ peer_state.disk >= Negotiating) {
+ nconn=drbd_sync_handshake(mdev,peer_state.role,peer_state.disk);
+ dec_local(mdev);
+
+ if(nconn == conn_mask) return FALSE;
+ }
+
+ if (mdev->state.conn > WFReportParams ) {
+ if( nconn > Connected && peer_state.conn <= Connected) {
+ // we want resync, peer has not yet decided to sync...
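+ // resend our UUIDs and state, so the peer can redo its own
+ // sync handshake and hopefully reach the same decision.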
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev);
+ }
+ else if (nconn == Connected && peer_state.disk == Negotiating) {
+ // peer is waiting for us to respond...
+ drbd_send_state(mdev);
+ }
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ os = mdev->state;
+ ns.i = mdev->state.i;
+ ns.conn = nconn;
+ ns.peer = peer_state.role;
+ ns.pdsk = peer_state.disk;
+ ns.peer_isp = ( peer_state.aftr_isp | peer_state.user_isp );
+ if((nconn == Connected || nconn == WFBitMapS) &&
+ ns.disk == Negotiating ) ns.disk = UpToDate;
+ if((nconn == Connected || nconn == WFBitMapT) &&
+ ns.pdsk == Negotiating ) ns.pdsk = UpToDate;
+ rv = _drbd_set_state(mdev,ns,ChgStateVerbose | ChgStateHard);
+ spin_unlock_irq(&mdev->req_lock);
+ if (rv==SS_Success) {
+ after_state_ch(mdev,os,ns,ChgStateVerbose | ChgStateHard);
+ }
+
+ if(rv < SS_Success) {
+ drbd_force_state(mdev,NS(conn,Disconnecting));
+ return FALSE;
+ }
+
+ mdev->net_conf->want_lose = 0;
+
+ /* FIXME assertion for (gencounts do not diverge) */
+ drbd_md_sync(mdev); // update connected indicator, la_size, ...
+
+ return TRUE;
+}
+
+STATIC int receive_sync_uuid(drbd_dev *mdev, Drbd_Header *h)
+{
+ Drbd_SyncUUID_Packet *p = (Drbd_SyncUUID_Packet*)h;
+
+ wait_event( mdev->misc_wait,
+ mdev->state.conn < Connected || mdev->state.conn == WFSyncUUID);
+
+ // D_ASSERT( mdev->state.conn == WFSyncUUID );
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ drbd_uuid_set(mdev,Current,be64_to_cpu(p->uuid));
+ _drbd_uuid_set(mdev,Bitmap,0UL);
+
+ drbd_start_resync(mdev,SyncTarget);
+
+ return TRUE;
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+ it does not matter if we process it in 32 bit chunks or 64 bit
+ chunks as long as it is little endian. (Understand it as a byte stream,
+ beginning with the lowest byte...) If we used big endian
+ we would need to process it from the highest address to the lowest,
+ in order to be agnostic to the 32 vs 64 bits issue.
+
+ returns 0 on failure, 1 if we successfully received it.
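+
+ Example: bits 0..7 live in the lowest byte. On little endian, the
+ first 32 bit word and the first 64 bit word both begin with that
+ byte, so the chunk size is irrelevant; on big endian, word 0 would
+ begin with the word's _highest_ byte, breaking the byte-stream view.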
*/ +STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) +{ + size_t bm_words, bm_i, want, num_words; + unsigned long *buffer; + int ok=FALSE; + + drbd_bm_lock(mdev); // { + + bm_words = drbd_bm_words(mdev); + bm_i = 0; + buffer = vmalloc(BM_PACKET_WORDS*sizeof(long)); + + while (1) { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + ERR_IF(want != h->length) goto out; + if (want==0) break; + if (drbd_recv(mdev, buffer, want) != want) + goto out; + + drbd_bm_merge_lel(mdev, bm_i, num_words, buffer); + bm_i += num_words; + + if (!drbd_recv_header(mdev,h)) + goto out; + D_ASSERT(h->command == ReportBitMap); + } + + if (mdev->state.conn == WFBitMapS) { + drbd_start_resync(mdev,SyncSource); + } else if (mdev->state.conn == WFBitMapT) { + ok = drbd_send_bitmap(mdev); + if (!ok) goto out; + ok = drbd_request_state(mdev,NS(conn,WFSyncUUID)); + D_ASSERT( ok == SS_Success ); + } else { + ERR("unexpected cstate (%s) in receive_bitmap\n", + conns_to_name(mdev->state.conn)); + } + + ok=TRUE; + out: + drbd_bm_unlock(mdev); // } + vfree(buffer); + return ok; +} + +STATIC int receive_skip(drbd_dev *mdev,Drbd_Header *h) +{ + // TODO zero copy sink :) + static char sink[128]; + int size,want,r; + + WARN("skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length ); + + size = h->length; + while (size > 0) { + want = min_t(int,size,sizeof(sink)); + r = drbd_recv(mdev,sink,want); + ERR_IF(r < 0) break; + size -= r; + } + return (size == 0); +} + +STATIC int receive_UnplugRemote(drbd_dev *mdev, Drbd_Header *h) +{ + if (mdev->state.disk >= Inconsistent) drbd_kick_lo(mdev); + return TRUE; // cannot fail. +} + +typedef int (*drbd_cmd_handler_f)(drbd_dev*,Drbd_Header*); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [Data] = receive_Data, + [DataReply] = receive_DataReply, + [RSDataReply] = receive_RSDataReply, + [RecvAck] = NULL, // via msock: got_RecvAck, + [WriteAck] = NULL, // via msock: got_WriteAck, + [Barrier] = receive_Barrier_no_tcq, + [BarrierAck] = NULL, // via msock: got_BarrierAck, + [ReportBitMap] = receive_bitmap, + [Ping] = NULL, // via msock: got_Ping, + [PingAck] = NULL, // via msock: got_PingAck, + [UnplugRemote] = receive_UnplugRemote, + [DataRequest] = receive_DataRequest, + [RSDataRequest] = receive_DataRequest, //receive_RSDataRequest, + [SyncParam] = receive_SyncParam, + [ReportProtocol] = receive_protocol, + [ReportUUIDs] = receive_uuids, + [ReportSizes] = receive_sizes, + [ReportState] = receive_state, + [StateChgRequest] = receive_req_state, + [ReportSyncUUID] = receive_sync_uuid, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler = NULL; + +#if 0 + /* FIXME lge thinks the implementation of barrier handling via + * tcq is currently broken */ +void drbd_set_recv_tcq(drbd_dev * mdev, int tcq_enabled) +{ +// warning LGE "FIXME make drbd_cmd_handler a member of mdev" + if(tcq_enabled && + drbd_default_handler[Barrier] != receive_Barrier_tcq) { + INFO("Enabling TCQ for barrier processing on backend.\n"); + drbd_default_handler[Barrier] = receive_Barrier_tcq; + } + + if(!tcq_enabled && + drbd_default_handler[Barrier] != receive_Barrier_usual) { + INFO("Using conventional (non TCQ) barrier processing" + " on backend.\n"); + drbd_default_handler[Barrier] = receive_Barrier_usual; + } +} +#endif + +STATIC void drbdd(drbd_dev *mdev) +{ + drbd_cmd_handler_f handler; + Drbd_Header *header = &mdev->data.rbuf.head; + + while 
(get_t_state(&mdev->receiver) == Running) {
+ if (!drbd_recv_header(mdev,header))
+ break;
+
+ if (header->command < MAX_CMD)
+ handler = drbd_cmd_handler[header->command];
+ else if (MayIgnore < header->command && header->command < MAX_OPT_CMD)
+ handler = drbd_opt_cmd_handler[header->command-MayIgnore];
+ else if (header->command > MAX_OPT_CMD)
+ handler = receive_skip;
+ else
+ handler = NULL;
+
+ if (unlikely(!handler)) {
+ ERR("unknown packet type %d, l: %d!\n",
+ header->command, header->length);
+ drbd_force_state(mdev,NS(conn,ProtocolError));
+ break;
+ }
+ if (unlikely(!handler(mdev,header))) {
+ ERR("error receiving %s, l: %d!\n",
+ cmdname(header->command), header->length);
+ drbd_force_state(mdev,NS(conn,ProtocolError));
+ break;
+ }
+
+ dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
+ }
+}
+
+/* FIXME how should freeze-io be handled? */
+STATIC void drbd_fail_pending_reads(drbd_dev *mdev)
+{
+ struct hlist_head *slot;
+ struct hlist_node *n;
+ drbd_request_t * req;
+ struct list_head *le;
+ LIST_HEAD(workset);
+ int i;
+
+ /*
+ * Application READ requests
+ */
+ spin_lock_irq(&mdev->req_lock);
+ for(i=0;i<APP_R_HSIZE;i++) {
+ slot = mdev->app_reads_hash+i;
+ hlist_for_each_entry(req, n, slot, colision) {
+ list_add(&req->w.list, &workset);
+ }
+ }
+ memset(mdev->app_reads_hash,0,APP_R_HSIZE*sizeof(void*));
+
+ while(!list_empty(&workset)) {
+ le = workset.next;
+ req = list_entry(le, drbd_request_t, w.list);
+ list_del(le);
+
+ _req_mod(req, connection_lost_while_pending, 0);
+ }
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+STATIC void drbd_disconnect(drbd_dev *mdev)
+{
+ struct drbd_work prev_work_done;
+ enum fencing_policy fp;
+ drbd_state_t os,ns;
+ int rv=SS_UnknownError;
+
+ D_ASSERT(mdev->state.conn < Connected);
+ /* FIXME verify that:
+ * the state change magic prevents us from becoming >= Connected again
+ * while we are still cleaning up.
+ */
+
+ /* asender does not clean up anything. it must not interfere, either */
+ drbd_thread_stop(&mdev->asender);
+
+ fp = DontCare;
+ if(inc_local(mdev)) {
+ fp = mdev->bc->dc.fencing;
+ dec_local(mdev);
+ }
+
+ down(&mdev->data.mutex);
+ drbd_free_sock(mdev);
+ up(&mdev->data.mutex);
+
+ spin_lock_irq(&mdev->req_lock);
+ _drbd_wait_ee_list_empty(mdev,&mdev->active_ee);
+ _drbd_wait_ee_list_empty(mdev,&mdev->sync_ee);
+ _drbd_clear_done_ee(mdev);
+ _drbd_wait_ee_list_empty(mdev,&mdev->read_ee);
+ reclaim_net_ee(mdev);
+ spin_unlock_irq(&mdev->req_lock);
+
+ /* FIXME: fail pending reads?
+ * when we are configured for freeze io,
+ * we could retry them once we un-freeze. */
+ drbd_fail_pending_reads(mdev);
+
+ /* We do not have data structures that would allow us to
+ get the rs_pending_cnt down to 0 again.
+ * On SyncTarget we do not have any data structures describing
+ the pending RSDataRequest's we have sent.
+ * On SyncSource there is no data structure that tracks
+ the RSDataReply blocks that we sent to the SyncTarget.
+ And no, it is not the sum of the reference counts in the
+ resync_LRU. The resync_LRU tracks the whole operation including
+ the disk-IO, while the rs_pending_cnt only tracks the blocks
+ on the fly.
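+ Hence the best we can do here is cancel all resync activity
+ and reset those counters by hand, which is what follows below.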
*/ + drbd_rs_cancel_all(mdev); + mdev->rs_total=0; + mdev->rs_failed=0; + atomic_set(&mdev->rs_pending_cnt,0); + wake_up(&mdev->misc_wait); + + /* make sure syncer is stopped and w_resume_next_sg queued */ + del_timer_sync(&mdev->resync_timer); + set_bit(STOP_SYNC_TIMER,&mdev->flags); + resync_timer_fn((unsigned long)mdev); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + set_bit(WORK_PENDING,&mdev->flags); + prev_work_done.cb = w_prev_work_done; + drbd_queue_work(&mdev->data.work,&prev_work_done); + wait_event(mdev->misc_wait, !test_bit(WORK_PENDING,&mdev->flags)); + + if ( mdev->p_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } + + /* queue cleanup for the worker. + * FIXME this should go into after_state_ch */ + if (!mdev->state.susp) + tl_clear(mdev); + + INFO("Connection closed\n"); + + drbd_md_sync(mdev); + + if ( mdev->state.role == Primary ) { + if( fp >= Resource && + mdev->state.pdsk >= DUnknown ) { + drbd_disks_t nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev,NS(pdsk,nps)); + } + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if ( os.conn >= Unconnected ) { + // Do not restart in case we are Disconnecting + ns = os; + ns.conn = Unconnected; + rv=_drbd_set_state(mdev,ns,ChgStateVerbose); + } + spin_unlock_irq(&mdev->req_lock); + if (rv == SS_Success) { + after_state_ch(mdev,os,ns,ChgStateVerbose); + } + + if(os.conn == Disconnecting) { + wait_event( mdev->misc_wait,atomic_read(&mdev->net_cnt) == 0 ); + if(mdev->ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + } + + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + } + if(mdev->cram_hmac_tfm) { + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + } + kfree(mdev->net_conf); + mdev->net_conf=NULL; + drbd_request_state(mdev, NS(conn,StandAlone)); + } + + /* they do trigger all the time. + * hm. why won't tcp release the page references, + * we already released the socket!? + D_ASSERT(atomic_read(&mdev->pp_in_use) == 0); + D_ASSERT(list_empty(&mdev->net_ee)); + */ + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + mdev->epoch_size = 0; +} + +/* + * we hereby assure that we always support the drbd dialects + * PRO_VERSION and (PRO_VERSION -1), allowing for rolling upgrades + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. + */ +int drbd_send_handshake(drbd_dev *mdev) +{ + // ASSERT current == mdev->receiver ... + Drbd_HandShake_Packet *p = &mdev->data.sbuf.HandShake; + int ok; + + if (down_interruptible(&mdev->data.mutex)) { + ERR("interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + /* FIXME do we need to verify this here? 
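+ * (drbd_disconnect() frees the socket while holding data.mutex,
+ * so after taking the mutex above, a NULL check is all we need)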
 */
+ if (mdev->data.socket == NULL) {
+ up(&mdev->data.mutex);
+ return 0;
+ }
+
+ memset(p,0,sizeof(*p));
+ p->protocol_version = cpu_to_be32(PRO_VERSION);
+ ok = _drbd_send_cmd( mdev, mdev->data.socket, HandShake,
+ (Drbd_Header *)p, sizeof(*p), 0 );
+ up(&mdev->data.mutex);
+ return ok;
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks a different language,
+ * no point in trying again, please go standalone.
+ */
+STATIC int drbd_do_handshake(drbd_dev *mdev)
+{
+ // ASSERT current == mdev->receiver ...
+ Drbd_HandShake_Packet *p = &mdev->data.rbuf.HandShake;
+ const int expect = sizeof(Drbd_HandShake_Packet)-sizeof(Drbd_Header);
+ int rv;
+
+ rv = drbd_send_handshake(mdev);
+ if (!rv) goto break_c_loop;
+
+ rv = drbd_recv_header(mdev,&p->head);
+ if (!rv) goto break_c_loop;
+
+ if (p->head.command != HandShake) {
+ ERR( "expected HandShake packet, received: %s (0x%04x)\n",
+ cmdname(p->head.command), p->head.command );
+ return -1;
+ }
+
+ if (p->head.length != expect) {
+ ERR( "expected HandShake length: %u, received: %u\n",
+ expect, p->head.length );
+ return -1;
+ }
+
+ rv = drbd_recv(mdev, &p->head.payload, expect);
+
+ if (rv != expect) {
+ ERR("short read receiving handshake packet: l=%u\n", rv);
+ return 0;
+ }
+
+ dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
+
+ p->protocol_version = be32_to_cpu(p->protocol_version);
+
+ if ( p->protocol_version == PRO_VERSION ||
+ p->protocol_version == (PRO_VERSION+1) ) {
+ if (p->protocol_version == (PRO_VERSION+1)) {
+ WARN( "You should upgrade me! "
+ "Peer wants protocol version: %u\n",
+ p->protocol_version );
+ }
+ INFO( "Handshake successful: DRBD Network Protocol version %u\n",
+ PRO_VERSION );
+ } /* else if ( p->protocol_version == (PRO_VERSION-1) ) {
+ // not yet; but next time :)
+ INFO( "Handshake successful: DRBD Protocol version %u\n",
+ (PRO_VERSION-1) );
+ ... do some remapping of defaults and jump tables here ...
+ } */ else {
+ ERR( "incompatible DRBD dialects: "
+ "I support %u, peer wants %u\n",
+ PRO_VERSION, p->protocol_version );
+ return -1;
+ }
+
+ return 1;
+
+ break_c_loop:
+ WARN( "My msock connect got accepted onto peer's sock!\n");
+ /* In case a tcp connection set-up takes longer than
+ connect-int, we might get into the situation that this
+ node's msock gets connected to the peer's sock!
+
+ To break out of this endless loop behaviour, we need to
+ wait until the peer's msock connect tries are over. (1 Second)
+
+ Additionally we wait connect-int/2 to hit with our next
+ connect try exactly in the peer's window of expectation. */
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ + (mdev->net_conf->try_connect_int*HZ)/2);
+
+ return 0;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+STATIC int drbd_do_auth(drbd_dev *mdev)
+{
+ ERR( "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
+ ERR( "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ return 0;
+}
+#else
+#define CHALLENGE_LEN 64
+STATIC int drbd_do_auth(drbd_dev *mdev)
+{
+ char my_challenge[CHALLENGE_LEN]; /* 64 Bytes...
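+ The exchange below is a symmetric challenge-response; roughly:
+ send AuthChallenge(my_challenge)
+ recv AuthChallenge(peers_ch)
+ send AuthResponse(HMAC(secret, peers_ch))
+ recv AuthResponse(response)
+ rv = (response == HMAC(secret, my_challenge))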
 */
+ struct scatterlist sg;
+ char *response = NULL;
+ char *right_response = NULL;
+ char *peers_ch = NULL;
+ Drbd_Header p;
+ unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+ unsigned int resp_size;
+ struct hash_desc desc;
+ int rv;
+
+ desc.tfm=mdev->cram_hmac_tfm;
+ desc.flags=0;
+
+ rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
+ (u8*)mdev->net_conf->shared_secret, key_len);
+ if(rv) {
+ ERR("crypto_hash_setkey() failed with %d\n",rv);
+ rv = 0;
+ goto fail;
+ }
+
+ get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+ rv = drbd_send_cmd2(mdev,AuthChallenge,my_challenge,CHALLENGE_LEN);
+ if (!rv) goto fail;
+
+ rv = drbd_recv_header(mdev,&p);
+ if (!rv) goto fail;
+
+ if (p.command != AuthChallenge) {
+ ERR( "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(p.command), p.command );
+ rv = 0;
+ goto fail;
+ }
+
+ if (p.length > CHALLENGE_LEN*2 ) {
+ ERR( "AuthChallenge payload too big.\n");
+ rv = 0;
+ goto fail;
+ }
+
+ peers_ch = kmalloc(p.length,GFP_KERNEL);
+ if(peers_ch == NULL) {
+ ERR("kmalloc of peers_ch failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_recv(mdev, peers_ch, p.length);
+
+ if (rv != p.length) {
+ ERR("short read AuthChallenge: l=%u\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+ response = kmalloc(resp_size,GFP_KERNEL);
+ if(response == NULL) {
+ ERR("kmalloc of response failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ sg.page = virt_to_page(peers_ch);
+ sg.offset = offset_in_page(peers_ch);
+ sg.length = p.length;
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+ if(rv) {
+ ERR( "crypto_hash_digest() failed with %d\n",rv);
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_send_cmd2(mdev,AuthResponse,response,resp_size);
+ if (!rv) goto fail;
+
+ rv = drbd_recv_header(mdev,&p);
+ if (!rv) goto fail;
+
+ if (p.command != AuthResponse) {
+ ERR( "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(p.command), p.command );
+ rv = 0;
+ goto fail;
+ }
+
+ if (p.length != resp_size ) {
+ ERR( "AuthResponse payload of wrong size\n" );
+ rv = 0;
+ goto fail;
+ }
+
+ rv = drbd_recv(mdev, response , resp_size);
+
+ if (rv != resp_size) {
+ ERR("short read receiving AuthResponse: l=%u\n", rv);
+ rv = 0;
+ goto fail;
+ }
+
+ right_response = kmalloc(resp_size,GFP_KERNEL);
+ if(right_response == NULL) {
+ ERR("kmalloc of right_response failed\n");
+ rv = 0;
+ goto fail;
+ }
+
+ sg.page = virt_to_page(my_challenge);
+ sg.offset = offset_in_page(my_challenge);
+ sg.length = CHALLENGE_LEN;
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+ if(rv) {
+ ERR( "crypto_hash_digest() failed with %d\n",rv);
+ rv = 0;
+ goto fail;
+ }
+
+ rv = !
memcmp(response,right_response,resp_size);
+
+ if(rv) {
+ INFO("Peer authenticated using %d bytes of '%s' HMAC\n",
+ resp_size,mdev->net_conf->cram_hmac_alg);
+ }
+
+ fail:
+ if(peers_ch) kfree(peers_ch);
+ if(response) kfree(response);
+ if(right_response) kfree(right_response);
+
+ return rv;
+}
+#endif
+
+int drbdd_init(struct Drbd_thread *thi)
+{
+ drbd_dev *mdev = thi->mdev;
+ int minor = mdev_to_minor(mdev);
+ int h;
+
+ sprintf(current->comm, "drbd%d_receiver", minor);
+
+ INFO("receiver (re)started\n");
+
+ do {
+ h = drbd_connect(mdev);
+ if (h == 0) {
+ drbd_disconnect(mdev);
+ schedule_timeout(HZ);
+ }
+ if( h < 0 ) {
+ WARN("Discarding network configuration.\n");
+ drbd_force_state(mdev,NS(conn,Disconnecting));
+ }
+ } while ( h == 0 );
+
+ if( h > 0 ) {
+ if(inc_net(mdev)) {
+ drbdd(mdev);
+ dec_net(mdev);
+ }
+ }
+
+ drbd_disconnect(mdev);
+
+ // Ensure that the thread state fits our connection state.
+ if( mdev->state.conn == Unconnected ) {
+ ERR_IF( mdev->receiver.t_state != Restarting )
+ drbd_thread_restart_nowait(&mdev->receiver);
+ } else if( mdev->state.conn == StandAlone ) {
+ ERR_IF( mdev->receiver.t_state != Exiting )
+ drbd_thread_stop_nowait(&mdev->receiver);
+ }
+
+ INFO("receiver terminated\n");
+ return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+STATIC int got_RqSReply(drbd_dev *mdev, Drbd_Header* h)
+{
+ Drbd_RqS_Reply_Packet *p = (Drbd_RqS_Reply_Packet*)h;
+
+ int retcode = be32_to_cpu(p->retcode);
+
+ if(retcode >= SS_Success) {
+ set_bit(CL_ST_CHG_SUCCESS,&mdev->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL,&mdev->flags);
+ ERR("Requested state change failed by peer: %s\n",
+ set_st_err_name(retcode));
+ }
+ wake_up(&mdev->state_wait);
+
+ return TRUE;
+}
+
+STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h)
+{
+ return drbd_send_ping_ack(mdev);
+}
+
+STATIC int got_PingAck(drbd_dev *mdev, Drbd_Header* h)
+{
+ // restore idle timeout
+ mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+ return TRUE;
+}
+
+STATIC int got_BlockAck(drbd_dev *mdev, Drbd_Header* h)
+{
+ drbd_request_t *req;
+ Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+
+ update_peer_seq(mdev,be32_to_cpu(p->seq_num));
+
+ if( is_syncer_block_id(p->block_id)) {
+ drbd_set_in_sync(mdev,sector,blksize);
+ dec_rs_pending(mdev);
+ } else {
+ spin_lock_irq(&mdev->req_lock);
+ req = _ack_id_to_req(mdev, p->block_id, sector);
+
+ if (unlikely(!req)) {
+ spin_unlock_irq(&mdev->req_lock);
+ ERR("Got a corrupt block_id/sector pair(2).\n");
+ return FALSE;
+ }
+
+ switch (be16_to_cpu(h->command)) {
+ case RSWriteAck:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ _req_mod(req,write_acked_by_peer_and_sis,0);
+ break;
+ case WriteAck:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ _req_mod(req,write_acked_by_peer,0);
+ break;
+ case RecvAck:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
+ _req_mod(req,recv_acked_by_peer,0);
+ break;
+ case DiscardAck:
+ D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+ ALERT("Got DiscardAck packet %llus +%u!"
+ " DRBD is not a random data generator!\n", + (unsigned long long)req->sector, req->size); + _req_mod(req, conflict_discarded_by_peer, 0); + break; + default: + D_ASSERT(0); + } + spin_unlock_irq(&mdev->req_lock); + } + /* dec_ap_pending is handled within _req_mod */ + + return TRUE; +} + +STATIC int got_NegAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + drbd_request_t *req; + + if (DRBD_ratelimit(5*HZ,5)) + WARN("Got NegAck packet. Peer is in troubles?\n"); + + update_peer_seq(mdev,be32_to_cpu(p->seq_num)); + + if(is_syncer_block_id(p->block_id)) { + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + + dec_rs_pending(mdev); + + drbd_rs_failed_io(mdev, sector, size); + } else { + req = _ack_id_to_req(mdev, p->block_id, sector); + + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + ERR("Got a corrupt block_id/sector pair(2).\n"); + return FALSE; + } + + req_mod(req, neg_acked, 0); + } + + return TRUE; +} + +STATIC int got_NegDReply(drbd_dev *mdev, Drbd_Header* h) +{ + drbd_request_t *req; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev,p->block_id, sector); + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + ERR("Got a corrupt block_id/sector pair(3).\n"); + return FALSE; + } + + /* FIXME explicitly warn if protocol != C */ + + ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n", + (unsigned long long)sector,be32_to_cpu(p->blksize)); + + _req_mod(req, neg_acked, 0); + spin_unlock_irq(&mdev->req_lock); + +// warning LGE "ugly and wrong" + drbd_khelper(mdev,"pri-on-incon-degr"); + + return TRUE; +} + +STATIC int got_NegRSDReply(drbd_dev *mdev, Drbd_Header* h) +{ + sector_t sector; + int size; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + D_ASSERT(p->block_id == ID_SYNCER); + + dec_rs_pending(mdev); + + if(inc_local_if_state(mdev,Failed)) { + drbd_rs_complete_io(mdev,sector); + drbd_rs_failed_io(mdev, sector, size); + dec_local(mdev); + } + + return TRUE; +} + +STATIC int got_BarrierAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BarrierAck_Packet *p = (Drbd_BarrierAck_Packet*)h; + + tl_release(mdev,p->barrier,be32_to_cpu(p->set_size)); + dec_ap_pending(mdev); + + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(drbd_dev *mdev, Drbd_Header* h); +}; + +int drbd_asender(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + Drbd_Header *h = &mdev->meta.rbuf.head; + + int rv,len; + void *buf = h; + int received = 0; + int expect = sizeof(Drbd_Header); + int cmd = -1; + int empty; + + static struct asender_cmd asender_tbl[] = { + [Ping] ={ sizeof(Drbd_Header), got_Ping }, + [PingAck] ={ sizeof(Drbd_Header), got_PingAck }, + [RecvAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [WriteAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [RSWriteAck]={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [DiscardAck]={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [NegAck] ={ sizeof(Drbd_BlockAck_Packet), got_NegAck }, + [NegDReply] ={ sizeof(Drbd_BlockAck_Packet), got_NegDReply }, + [NegRSDReply]={sizeof(Drbd_BlockAck_Packet), got_NegRSDReply}, + [BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck }, + [StateChgReply]={sizeof(Drbd_RqS_Reply_Packet),got_RqSReply }, + }; + + sprintf(current->comm, 
"drbd%d_asender", mdev_to_minor(mdev)); + + current->policy = SCHED_RR; /* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto err; + mdev->meta.socket->sk->sk_rcvtimeo = + mdev->net_conf->ping_timeo*HZ/10; + } + + while(1) { + if (!drbd_process_done_ee(mdev)) { + ERR("process_done_ee() = NOT_OK\n"); + goto err; + } + set_bit(SIGNAL_ASENDER, &mdev->flags); + spin_lock_irq(&mdev->req_lock); + empty = list_empty(&mdev->done_ee); + spin_unlock_irq(&mdev->req_lock); + if(empty) break; + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); + } + drbd_tcp_flush(mdev->meta.socket); + + rv = drbd_recv_short(mdev, mdev->meta.socket, + buf,expect-received); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + flush_signals(current); + + drbd_tcp_cork(mdev->meta.socket); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + ERR("meta connection shut down by peer.\n"); + goto err; + } else if (rv == -EAGAIN) { + if( mdev->meta.socket->sk->sk_rcvtimeo == + mdev->net_conf->ping_timeo*HZ/10 ) { + ERR("PingAck did not arrive in time.\n"); + goto err; + } + set_bit(SEND_PING,&mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + ERR("sock_recvmsg returned %d\n", rv); + goto err; + } + + if (received == expect && cmd == -1 ) { + cmd = be16_to_cpu(h->command); + len = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto err; + } + expect = asender_tbl[cmd].pkt_size; + ERR_IF(len != expect-sizeof(Drbd_Header)) { + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + DUMPI(expect); + } + } + if(received == expect) { + D_ASSERT(cmd != -1); + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + if(!asender_tbl[cmd].process(mdev,h)) goto err; + + buf = h; + received = 0; + expect = sizeof(Drbd_Header); + cmd = -1; + } + } //while + + if(0) { + err: + clear_bit(SIGNAL_ASENDER, &mdev->flags); + if (mdev->state.conn >= Connected) + drbd_force_state(mdev,NS(conn,NetworkFailure)); + } + + D_ASSERT(mdev->state.conn < Connected); + INFO("asender terminated\n"); + + return 0; +} diff -uprN linux-2.6.18/drivers/block/drbd/drbd_req.c linux-2.6.18.ovz/drivers/block/drbd/drbd_req.c --- linux-2.6.18/drivers/block/drbd/drbd_req.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/block/drbd/drbd_req.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1169 @@ +/* +-*- linux-c -*- + drbd_req.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . 
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+/* outside of the ifdef
+ * because of the _print_rq_state(,FIXME) in barrier_acked */
+void _print_rq_state(drbd_request_t *req, const char *txt)
+{
+ const unsigned long s = req->rq_state;
+ drbd_dev *mdev = req->mdev;
+ const int rw = (req->master_bio == NULL ||
+ bio_data_dir(req->master_bio) == WRITE) ?
+ 'W' : 'R';
+
+ INFO("%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n",
+ txt, req, rw,
+ s & RQ_LOCAL_PENDING ? 'p' : '-',
+ s & RQ_LOCAL_COMPLETED ? 'c' : '-',
+ s & RQ_LOCAL_OK ? 'o' : '-',
+ s & RQ_NET_PENDING ? 'p' : '-',
+ s & RQ_NET_QUEUED ? 'q' : '-',
+ s & RQ_NET_SENT ? 's' : '-',
+ s & RQ_NET_DONE ? 'd' : '-',
+ s & RQ_NET_OK ? 'o' : '-',
+ req->epoch,
+ (unsigned long long)req->sector,
+ req->size,
+ conns_to_name(mdev->state.conn));
+}
+
+//#define VERBOSE_REQUEST_CODE
+#if defined(VERBOSE_REQUEST_CODE) || defined(ENABLE_DYNAMIC_TRACE)
+void _print_req_mod(drbd_request_t *req,drbd_req_event_t what)
+{
+ drbd_dev *mdev = req->mdev;
+ const int rw = (req->master_bio == NULL ||
+ bio_data_dir(req->master_bio) == WRITE) ?
+ 'W' : 'R';
+
+ static const char *rq_event_names[] = {
+ [created] = "created",
+ [to_be_send] = "to_be_send",
+ [to_be_submitted] = "to_be_submitted",
+ [queue_for_net_write] = "queue_for_net_write",
+ [queue_for_net_read] = "queue_for_net_read",
+ [send_canceled] = "send_canceled",
+ [send_failed] = "send_failed",
+ [handed_over_to_network] = "handed_over_to_network",
+ [connection_lost_while_pending] = "connection_lost_while_pending",
+ [recv_acked_by_peer] = "recv_acked_by_peer",
+ [write_acked_by_peer] = "write_acked_by_peer",
+ [neg_acked] = "neg_acked",
+ [conflict_discarded_by_peer] = "conflict_discarded_by_peer",
+ [barrier_acked] = "barrier_acked",
+ [data_received] = "data_received",
+ [read_completed_with_error] = "read_completed_with_error",
+ [write_completed_with_error] = "write_completed_with_error",
+ [completed_ok] = "completed_ok",
+ };
+
+ INFO("_req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]);
+}
+
+# ifdef ENABLE_DYNAMIC_TRACE
+# define print_rq_state(R,T) MTRACE(TraceTypeRq,TraceLvlMetrics,_print_rq_state(R,T);)
+# define print_req_mod(T,W) MTRACE(TraceTypeRq,TraceLvlMetrics,_print_req_mod(T,W);)
+# else
+# define print_rq_state(R,T) _print_rq_state(R,T)
+# define print_req_mod(T,W) _print_req_mod(T,W)
+# endif
+
+#else
+#define print_rq_state(R,T)
+#define print_req_mod(T,W)
+#endif
+
+static void _req_is_done(drbd_dev *mdev, drbd_request_t *req, const int rw)
+{
+ const unsigned long s = req->rq_state;
+ /* if it was a write, we may have to set the corresponding
+ * bit(s) out-of-sync first. If it had a local part, we need to
+ * release the reference to the activity log. */
+ if (rw == WRITE) {
+ /* remove it from the transfer log.
+ * well, only if it had been there in the first
+ * place... if it had not (local only or conflicting
+ * and never sent), it should still be "empty" as
+ * initialised in drbd_req_new(), so we can list_del() it
+ * here unconditionally */
+ list_del(&req->tl_requests);
+ /* Set out-of-sync unless both OK flags are set
+ * (local only or remote failed).
+ * Other places where we set out-of-sync:
+ * READ with local io-error */
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) {
+ drbd_set_out_of_sync(mdev,req->sector,req->size);
+ }
+
+ if( (s & RQ_NET_OK) && (s & RQ_LOCAL_OK) &&
+ (s & RQ_NET_SIS) ) {
+ drbd_set_in_sync(mdev,req->sector,req->size);
+ }
+
+ /* one might be tempted to move the drbd_al_complete_io
+ * to the local io completion callback drbd_endio_pri.
+ * but, if this was a mirror write, we may only
+ * drbd_al_complete_io after this is RQ_NET_DONE,
+ * otherwise the extent could be dropped from the al
+ * before it has actually been written on the peer.
+ * if we crash before our peer knows about the request,
+ * but after the extent has been dropped from the al,
+ * we would forget to resync the corresponding extent.
+ */
+ if (s & RQ_LOCAL_MASK) {
+ if (inc_local_if_state(mdev,Failed)) {
+ drbd_al_complete_io(mdev, req->sector);
+ dec_local(mdev);
+ } else {
+ WARN("Should have called drbd_al_complete_io(, %llu), "
+ "but my Disk seems to have failed:(\n",
+ (unsigned long long) req->sector);
+ }
+ }
+ }
+
+ /* if it was a local io error, we want to notify our
+ * peer about that, and see if we need to
+ * detach the disk and stuff.
+ * to avoid allocating some special work
+ * struct, reuse the request. */
+
+ /* THINK
+ * why do we not do this when we detect the error,
+ * but delay it until it is "done", i.e. possibly
+ * until the next barrier ack? */
+
+ if (rw == WRITE &&
+ (( s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+ if (!(req->w.list.next == LIST_POISON1 ||
+ list_empty(&req->w.list))) {
+ /* DEBUG ASSERT only; if this triggers, we
+ * probably corrupt the worker list here */
+ DUMPP(req->w.list.next);
+ DUMPP(req->w.list.prev);
+ }
+ req->w.cb = w_io_error;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ /* drbd_req_free() is done in w_io_error */
+ } else {
+ drbd_req_free(req);
+ }
+}
+
+static void _about_to_complete_local_write(drbd_dev *mdev, drbd_request_t *req)
+{
+ const unsigned long s = req->rq_state;
+ drbd_request_t *i;
+ struct Tl_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ /* before we can signal completion to the upper layers,
+ * we may need to close the current epoch */
+ if (req->epoch == mdev->newest_barrier->br_number)
+ set_bit(ISSUE_BARRIER,&mdev->flags);
+
+ /* we need to do the conflict detection stuff,
+ * if we have the ee_hash (two_primaries) and
+ * this has been on the network */
+ if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
+ const sector_t sector = req->sector;
+ const int size = req->size;
+
+ /* ASSERT:
+ * there must be no conflicting requests, since
+ * they must have been failed on the spot */
+#define OVERLAPS overlaps(sector, size, i->sector, i->size)
+ slot = tl_hash_slot(mdev,sector);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ ALERT("LOGIC BUG: completed: %p %llus +%u; other: %p %llus +%u\n",
+ req, (unsigned long long)sector, size,
+ i, (unsigned long long)i->sector, i->size);
+ }
+ }
+
+ /* maybe "wake" those conflicting epoch entries
+ * that wait for this request to finish.
+ *
+ * currently, there can be only _one_ such ee
+ * (well, or some more, which would be pending
+ * DiscardAck not yet sent by the asender...),
+ * since we block the receiver thread upon the
+ * first conflict detection, which will wait on
+ * misc_wait. maybe we want to assert that?
+ *
+ * anyways, if we found one,
+ * we just have to do a wake_up. */
+#undef OVERLAPS
+#define OVERLAPS overlaps(sector, size, e->sector, e->size)
+ slot = ee_hash_slot(mdev,req->sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS) {
+ wake_up(&mdev->misc_wait);
+ break;
+ }
+ }
+ }
+#undef OVERLAPS
+}
+
+static void _complete_master_bio(drbd_dev *mdev, drbd_request_t *req, int error)
+{
+ dump_bio(mdev,req->master_bio,1);
+ bio_endio(req->master_bio, req->master_bio->bi_size, error);
+ req->master_bio = NULL;
+ dec_ap_bio(mdev);
+}
+
+void _req_may_be_done(drbd_request_t *req, int error)
+{
+ const unsigned long s = req->rq_state;
+ drbd_dev *mdev = req->mdev;
+ int rw;
+
+ print_rq_state(req, "_req_may_be_done");
+ MUST_HOLD(&mdev->req_lock)
+
+ /* we must not complete the master bio, while it is
+ * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+ * not yet acknowledged by the peer
+ * not yet completed by the local io subsystem
+ * these flags may get cleared in any order by
+ * the worker,
+ * the receiver,
+ * the bio_endio completion callbacks.
+ */
+ if (s & RQ_NET_QUEUED) return;
+ if (s & RQ_NET_PENDING) return;
+ if (s & RQ_LOCAL_PENDING) return;
+
+ if (req->master_bio) {
+ /* this is data_received (remote read)
+ * or protocol C WriteAck
+ * or protocol B RecvAck
+ * or protocol A "handed_over_to_network" (SendAck)
+ * or canceled or failed,
+ * or killed from the transfer log due to connection loss.
+ */
+
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put it the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ */
+ int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ rw = bio_data_dir(req->master_bio);
+
+ /* remove the request from the conflict detection
+ * respective block_id verification hash */
+ if (!hlist_unhashed(&req->colision)) hlist_del(&req->colision);
+ else D_ASSERT((s & RQ_NET_MASK) == 0);
+
+ if (rw == WRITE) {
+ /* for writes we need to do some extra housekeeping */
+ _about_to_complete_local_write(mdev,req);
+ }
+
+ /* FIXME not yet implemented...
+ * in case we got "suspended" (on_disconnect: freeze io)
+ * we may not yet complete the request...
+ * though, this is probably best handled elsewhere by not
+ * walking the transfer log until "unfreeze", so we won't end
+ * up here anyways during the freeze ...
+ * then again, if it is a READ, it is not in the TL at all.
+ * is it still legal to complete a READ during freeze? */
+
+ _complete_master_bio(mdev,req,
+ ok ? 0 : ( error ? error : -EIO ) );
+ } else {
+ /* only WRITE requests can end up here without a master_bio */
+ rw = WRITE;
+ }
+
+ if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+ /* this is disconnected (local only) operation,
+ * or protocol C WriteAck,
+ * or protocol A or B BarrierAck,
+ * or killed from the transfer log due to connection loss. */
+ _req_is_done(mdev,req,rw);
+ }
+ /* else: network part and not DONE yet. that is
+ * protocol A or B, barrier ack still pending...
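+ * the request then stays in the transfer log until its epoch is
+ * barrier-acked, or until tl_clear() removes it on connection loss.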
 */
+}
+
+/*
+ * checks whether there was an overlapping request
+ * or ee already registered.
+ *
+ * if so, return 1, in which case this request is completed on the spot,
+ * without ever being submitted or sent.
+ *
+ * return 0 if it is ok to submit this request.
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+STATIC int _req_conflicts(drbd_request_t *req)
+{
+ drbd_dev *mdev = req->mdev;
+ const sector_t sector = req->sector;
+ const int size = req->size;
+ drbd_request_t *i;
+ struct Tl_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ MUST_HOLD(&mdev->req_lock);
+ D_ASSERT(hlist_unhashed(&req->colision));
+
+ /* FIXME should this inc_net/dec_net
+ * rather be done in drbd_make_request_common? */
+ if (!inc_net(mdev))
+ return 0;
+
+ /* BUG_ON */
+ ERR_IF (mdev->tl_hash_s == 0)
+ goto out_no_conflict;
+ BUG_ON(mdev->tl_hash == NULL);
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+ slot = tl_hash_slot(mdev,sector);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ ALERT("%s[%u] Concurrent local write detected!"
+ " [DISCARD L] new: %llus +%u; pending: %llus +%u\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)i->sector, i->size);
+ goto out_conflict;
+ }
+ }
+
+ if(mdev->ee_hash_s) {
+ /* now, check for overlapping requests with remote origin */
+ BUG_ON(mdev->ee_hash == NULL);
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+ slot = ee_hash_slot(mdev,sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS) {
+ ALERT("%s[%u] Concurrent remote write detected!"
+ " [DISCARD L] new: %llus +%u; pending: %llus +%u\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)e->sector, e->size);
+ goto out_conflict;
+ }
+ }
+ }
+#undef OVERLAPS
+
+ out_no_conflict:
+ /* this is like it should be, and what we expected.
+ * our users do behave after all... */
+ dec_net(mdev);
+ return 0;
+
+ out_conflict:
+ dec_net(mdev);
+ return 1;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ * enforces that it is all in this one place, where it is easier to audit,
+ * it makes it obvious that whatever "event" "happens" to a request should
+ * happen "atomically" within the req_lock,
+ * and it enforces that we have to think in a very structured manner
+ * about the "events" that may happen to a request during its life time ...
+ *
+ * Though I think it is likely that we break this again into many
+ * static inline void _req_mod_ ## what (req) ...
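+ *
+ * For orientation, a protocol C mirror write typically sees,
+ * in this order:
+ * to_be_send, to_be_submitted, queue_for_net_write,
+ * handed_over_to_network, completed_ok (local bio),
+ * write_acked_by_peer, and finally barrier_acked.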
+ */
+void _req_mod(drbd_request_t *req, drbd_req_event_t what, int error)
+{
+ drbd_dev *mdev = req->mdev;
+ MUST_HOLD(&mdev->req_lock);
+
+ if (error && ( bio_rw(req->master_bio) != READA ) ) {
+ ERR("got an _req_mod() errno of %d\n",error);
+ }
+
+ print_req_mod(req,what);
+
+ switch(what) {
+ default:
+ ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__ );
+ return;
+
+ /* does not happen...
+ * initialization done in drbd_req_new
+ case created:
+ break;
+ */
+
+ case to_be_send: /* via network */
+ /* reached via drbd_make_request_common
+ * and from FIXME w_read_retry_remote */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+ break;
+
+ case to_be_submitted: /* locally */
+ /* reached via drbd_make_request_common */
+ D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+ req->rq_state |= RQ_LOCAL_PENDING;
+ break;
+
+ /* FIXME these *_completed_* are basically the same.
+ * can probably be merged with some if (what == xy) */
+
+ case completed_ok:
+ if (bio_data_dir(req->private_bio) == WRITE)
+ mdev->writ_cnt += req->size>>9;
+ else
+ mdev->read_cnt += req->size>>9;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+
+ req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ _req_may_be_done(req,error);
+ dec_local(mdev);
+ break;
+
+ case write_completed_with_error:
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ ALERT("Local WRITE failed sec=%llus size=%u\n",
+ (unsigned long long)req->sector, req->size);
+ /* and now: check how to handle local io error.
+ * FIXME see comment below in read_completed_with_error */
+ __drbd_chk_io_error(mdev,FALSE);
+ _req_may_be_done(req,error);
+ dec_local(mdev);
+ break;
+
+ case read_completed_with_error:
+ if (bio_rw(req->master_bio) != READA) {
+ drbd_set_out_of_sync(mdev,req->sector,req->size);
+ }
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+ if (bio_rw(req->master_bio) == READA) {
+ /* it is legal to fail READA */
+ _req_may_be_done(req,error);
+ break;
+ }
+ /* else */
+ ALERT("Local READ failed sec=%llus size=%u\n",
+ (unsigned long long)req->sector, req->size);
+ /* _req_mod(req,to_be_send); oops, recursion in static inline */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+
+ /* and now: check how to handle local io error.
+ *
+ * FIXME we should not handle WRITE and READ io errors
+ * the same. When we retry the READ, and then write
+ * the answer, that might succeed because modern drives
+ * would relocate the sectors. We'd need to keep our
+ * private bio then, and round the offset and size so
+ * we get back enough data to be able to clear the bits again.
+ */
+ __drbd_chk_io_error(mdev,FALSE);
+ /* fall through: _req_mod(req,queue_for_net_read); */
+
+ case queue_for_net_read:
+ /* READ or READA, and
+ * no local disk,
+ * or target area marked as invalid,
+ * or just got an io-error. */
+ /* from drbd_make_request_common
+ * or from bio_endio during read io-error recovery */
+
+ /* so we can verify the handle in the answer packet
+ * corresponding hlist_del is in _req_may_be_done() */
+ hlist_add_head(&req->colision, ar_hash_slot(mdev,req->sector));
+
+ set_bit(UNPLUG_REMOTE,&mdev->flags); /* why?
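+ presumably: the peer should unplug its lower-level queue,
+ so our read request gets serviced without waiting for the
+ next natural unplug on that node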
 */
+
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+ ? w_read_retry_remote
+ : w_send_read_req;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case queue_for_net_write:
+ /* assert something? */
+ /* from drbd_make_request_common only */
+
+ hlist_add_head(&req->colision,tl_hash_slot(mdev,req->sector));
+ /* corresponding hlist_del is in _req_may_be_done() */
+
+ /* NOTE
+ * In case the req ended up on the transfer log before being
+ * queued on the worker, it could lead to this request being
+ * missed during cleanup after connection loss.
+ * So we have to do both operations here,
+ * within the same lock that protects the transfer log.
+ *
+ * _req_add_to_epoch(req); this has to be after the
+ * _maybe_start_new_epoch(req); which happened in
+ * drbd_make_request_common, because we now may set the bit
+ * again ourselves to close the current epoch.
+ *
+ * Add req to the (now) current epoch (barrier). */
+
+ /* see drbd_make_request_common just after it grabs the req_lock */
+ D_ASSERT(test_bit(ISSUE_BARRIER, &mdev->flags) == 0);
+
+ req->epoch = mdev->newest_barrier->br_number;
+ list_add_tail(&req->tl_requests,&mdev->newest_barrier->requests);
+
+ /* mark the current epoch as closed,
+ * in case it outgrew the limit */
+ if( ++mdev->newest_barrier->n_req >= mdev->net_conf->max_epoch_size )
+ set_bit(ISSUE_BARRIER,&mdev->flags);
+
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = w_send_dblock;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ /* FIXME
+ * to implement freeze-io,
+ * we may not finish the request just yet.
+ */
+ case send_canceled:
+ /* for the request, this is the same thing */
+ case send_failed:
+ /* real cleanup will be done from tl_clear. just update flags so
+ * it is no longer marked as on the worker queue */
+ req->rq_state &= ~RQ_NET_QUEUED;
+ /* if we did it right, tl_clear should be scheduled only after this,
+ * so this should not be necessary! */
+ _req_may_be_done(req,error);
+ break;
+
+ case handed_over_to_network:
+ /* assert something? */
+ if ( bio_data_dir(req->master_bio) == WRITE &&
+ mdev->net_conf->wire_protocol == DRBD_PROT_A ) {
+ /* this is what is dangerous about protocol A:
+ * pretend it was successfully written on the peer.
+ * FIXME in case we get a local io-error in
+ * protocol != C, we might want to defer completion
+ * until we get the barrier ack, and send a NegAck
+ * in case the other node had an io-error, too...
+ * That way we would at least not report "success"
+ * if it was not written at all. */
+ if (req->rq_state & RQ_NET_PENDING) {
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= RQ_NET_OK;
+ } /* else: neg-ack was faster... */
+ /* it is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss */
+ }
+ req->rq_state &= ~RQ_NET_QUEUED;
+ req->rq_state |= RQ_NET_SENT;
+ /* because _drbd_send_zc_bio could sleep, and may want to
+ * dereference the bio even after the "write_acked_by_peer" and
+ * "completed_ok" events came in, once we return from
+ * _drbd_send_zc_bio (drbd_send_dblock), we have to check
+ * whether it is done already, and end it. */
+ _req_may_be_done(req,error);
+ break;
+
+ case connection_lost_while_pending:
+ /* transfer log cleanup after connection loss */
+ /* assert something?
 */
+ if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_DONE;
+ /* if it is still queued, we may not complete it here.
+ * it will be canceled soon.
+ * FIXME we should change the code so this can not happen. */
+ if (!(req->rq_state & RQ_NET_QUEUED))
+ _req_may_be_done(req,error);
+ break;
+
+ case write_acked_by_peer_and_sis:
+ req->rq_state |= RQ_NET_SIS;
+ case conflict_discarded_by_peer:
+ /* interestingly, this is the same thing! */
+ case write_acked_by_peer:
+ /* assert something? */
+ /* protocol C; successfully written on peer */
+ req->rq_state |= RQ_NET_DONE;
+ /* rest is the same as for: */
+ case recv_acked_by_peer:
+ /* protocol B; pretends to be successfully written on peer.
+ * see also notes above in handed_over_to_network about
+ * protocol != C */
+ req->rq_state |= RQ_NET_OK;
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ _req_may_be_done(req,error);
+ break;
+
+ case neg_acked:
+ /* assert something? */
+ if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ /* FIXME THINK! is it DONE now, or is it not? */
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req,error);
+ /* else: done by handed_over_to_network */
+ break;
+
+ case barrier_acked:
+ /* can even happen for protocol C,
+ * when local io is still pending.
+ * in which case it does nothing. */
+ if (req->rq_state & RQ_NET_PENDING) {
+ /* barrier came in before all requests have been acked.
+ * this is bad, because if the connection is lost now,
+ * we won't be able to clean them up... */
+ _print_rq_state(req, "FIXME (barrier_acked but pending)");
+ }
+ D_ASSERT(req->rq_state & RQ_NET_SENT);
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req,error);
+ break;
+
+ case data_received:
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+ _req_may_be_done(req,error);
+ break;
+ }
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ * BUT we are still/already IN SYNC for this area.
+ * since size may be bigger than BM_BLOCK_SIZE,
+ * we may need to check several bits.
+ */
+STATIC int drbd_may_do_local_read(drbd_dev *mdev, sector_t sector, int size)
+{
+ unsigned long sbnr,ebnr,bnr;
+ sector_t esector, nr_sectors;
+
+ if (mdev->state.disk == UpToDate) return 1;
+ if (mdev->state.disk >= Outdated) return 0;
+ if (mdev->state.disk < Inconsistent) return 0;
+ // state.disk == Inconsistent: we will have a look at the BitMap
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ esector = sector + (size>>9) -1;
+
+ D_ASSERT(sector < nr_sectors);
+ D_ASSERT(esector < nr_sectors);
+
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+
+ for (bnr = sbnr; bnr <= ebnr; bnr++) {
+ if (drbd_bm_test_bit(mdev,bnr)) return 0;
+ }
+ return 1;
+}
+
+/*
+ * general note:
+ * looking at the state (conn, disk, susp, pdsk) outside of the spinlock that
+ * protects the state changes is inherently racy.
+ *
+ * FIXME verify this rationale why we may do so anyways:
+ *
+ * I think it "should" be like this:
+ * as soon as we have a "ap_bio_cnt" reference we may test for "bad" states,
+ * because the transition from "bad" to "good" states may only happen while no
+ * application request is on the fly, so once we are positive about a "bad"
+ * state, we know it won't get better during the lifetime of this request.
+ *
+ * In case we think we are ok, but "asynchronously" some interrupt or other thread
+ * marks some operation as impossible, we are still ok, since we would just try
+ * anyways, and then see that it does not work there and then.
+ */
+
+STATIC int
+drbd_make_request_common(drbd_dev *mdev, int rw, int size,
+ sector_t sector, struct bio *bio)
+{
+ struct drbd_barrier *b = NULL;
+ drbd_request_t *req;
+ int local, remote;
+ int err = -EIO;
+
+ /* allocate outside of all locks; get a "reference count" (ap_bio_cnt)
+ * to avoid races with the disconnect/reconnect code. */
+ inc_ap_bio(mdev);
+ req = drbd_req_new(mdev,bio);
+ if (!req) {
+ dec_ap_bio(mdev);
+ /* only pass the error to the upper layers.
+ * if the user cannot handle io errors, that's not our business. */
+ ERR("could not kmalloc() req\n");
+ bio_endio(bio, bio->bi_size, -ENOMEM);
+ return 0;
+ }
+
+ dump_bio(mdev,bio,0);
+
+ local = inc_local(mdev);
+ if (!local) {
+ bio_put(req->private_bio); /* or we get a bio leak */
+ req->private_bio = NULL;
+ }
+ if (rw == WRITE) {
+ remote = 1;
+ } else {
+ /* READ || READA */
+ if (local) {
+ if (!drbd_may_do_local_read(mdev,sector,size)) {
+ /* we could kick the syncer to
+ * sync this extent asap, wait for
+ * it, then continue locally.
+ * Or just issue the request remotely.
+ */
+ /* FIXME
+ * I think we have a RACE here. We request
+ * something from the peer, then later some
+ * write starts ... and finishes *before*
+ * the answer to the read comes in, because
+ * the ACK for the WRITE goes over
+ * meta-socket ...
+ * Maybe we need to properly lock reads
+ * against the syncer, too. But if we have
+ * some user issuing writes on an area that
+ * he has pending reads on, _he_ is really
+ * broken anyways, and would get "undefined
+ * results" on _any_ io stack, even just the
+ * local io stack.
+ */
+
+/* XXX SHARED DISK mode
+ * think this over again for two primaries */
+
+ local = 0;
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+ }
+ }
+ remote = !local && mdev->state.pdsk >= UpToDate;//Consistent;
+ }
+
+ /* If we have a disk, but a READA request is mapped to remote,
+ * we are Primary, Inconsistent, SyncTarget.
+ * Just fail that READA request right here.
+ *
+ * THINK: maybe fail all READA when not local?
+ * or make this configurable...
+ * if network is slow, READA won't do any good.
+ */
+ if (rw == READA && mdev->state.disk >= Inconsistent && !local) {
+ err = -EWOULDBLOCK;
+ goto fail_and_free_req;
+ }
+
+ /* For WRITES going to the local disk, grab a reference on the target extent.
+ * This waits for any resync activity in the corresponding resync
+ * extent to finish, and, if necessary, pulls in the target extent into
+ * the activity log, which involves further disk io because of transactional
+ * on-disk meta data updates.
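+ * (the activity log tracks recently written extents; after a
+ * primary crash only those extents need to be resynced, at the
+ * cost of this extra meta data write on first use of an extent)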
+	 */
+	if (rw == WRITE && local)
+		drbd_al_begin_io(mdev, sector);
+
+	remote = remote && (mdev->state.pdsk == UpToDate ||
+			    ( mdev->state.pdsk == Inconsistent &&
+			      mdev->state.conn >= Connected ) );
+
+	if (!(local || remote)) {
+		ERR("IO ERROR: neither local nor remote disk\n");
+		goto fail_and_free_req;
+	}
+
+	/* For WRITE requests, we have to make sure that we have an
+	 * unused_spare_barrier, in case we need to start a new epoch.
+	 * I try to be smart and avoid always pre-allocating "just in case",
+	 * but there is a race between testing the bit and pointer outside the
+	 * spinlock, and grabbing the spinlock.
+	 * if we lost that race, we retry. */
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_barrier == NULL &&
+	    test_bit(ISSUE_BARRIER,&mdev->flags))
+	{
+  allocate_barrier:
+		b = kmalloc(sizeof(struct drbd_barrier),GFP_NOIO);
+		if(!b) {
+			ERR("Failed to alloc barrier.");
+			err = -ENOMEM;
+			goto fail_and_free_req;
+		}
+	}
+
+	/* GOOD, everything prepared, grab the spin_lock */
+	spin_lock_irq(&mdev->req_lock);
+
+	/* FIXME race with drbd_disconnect and tl_clear? */
+	if (remote) {
+		remote = (mdev->state.pdsk == UpToDate ||
+			    ( mdev->state.pdsk == Inconsistent &&
+			      mdev->state.conn >= Connected ) );
+		if (!remote) {
+			WARN("lost connection while grabbing the req_lock!\n");
+		}
+		if (!(local || remote)) {
+			ERR("IO ERROR: neither local nor remote disk\n");
+			spin_unlock_irq(&mdev->req_lock);
+			goto fail_and_free_req;
+		}
+	}
+
+	if (b && mdev->unused_spare_barrier == NULL) {
+		mdev->unused_spare_barrier = b;
+		b = NULL;
+	}
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_barrier == NULL &&
+	    test_bit(ISSUE_BARRIER,&mdev->flags)) {
+		/* someone closed the current epoch
+		 * while we were grabbing the spinlock */
+		spin_unlock_irq(&mdev->req_lock);
+		goto allocate_barrier;
+	}
+
+
+	/* _maybe_start_new_epoch(mdev);
+	 * If we need to generate a write barrier packet, we have to add the
+	 * new epoch (barrier) object, and queue the barrier packet for
+	 * sending, and queue the req's data after it _within the same lock_,
+	 * otherwise we have race conditions where the reorder domains could
+	 * be mixed up.
+	 *
+	 * Even read requests may start a new epoch and queue the corresponding
+	 * barrier packet.  To get the write ordering right, we only have to
+	 * make sure that, if this is a write request and it triggered a
+	 * barrier packet, this request is queued within the same spinlock. */
+	if (remote && mdev->unused_spare_barrier &&
+	    test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) {
+		struct drbd_barrier *b = mdev->unused_spare_barrier;
+		b = _tl_add_barrier(mdev,b);
+		mdev->unused_spare_barrier = NULL;
+		b->w.cb =  w_send_barrier;
+		/* inc_ap_pending done here, so we won't
+		 * get imbalanced on connection loss.
+		 * dec_ap_pending will be done in got_BarrierAck
+		 * or (on connection loss) in tl_clear. */
+		inc_ap_pending(mdev);
+		drbd_queue_work(&mdev->data.work, &b->w);
+	} else {
+		D_ASSERT(!(remote && rw == WRITE &&
+			   test_bit(ISSUE_BARRIER,&mdev->flags)));
+	}
+
+	/* NOTE
+	 * Actually, 'local' may be wrong here already, since we may have
+	 * failed to write to the meta data, and may become wrong anytime
+	 * because of a local io-error for some other request, which would
+	 * lead to us "detaching" the local disk.
+	 *
+	 * 'remote' may become wrong any time because the network could fail.
+	 *
+	 * This is a harmless race condition, though, since it is handled
+	 * correctly at the appropriate places; so it just defers the failure
+	 * of the respective operation.
+	 */
+
+	/* mark them early for readability.
+	 * this just sets some state flags. */
+	if (remote) _req_mod(req, to_be_send, 0);
+	if (local)  _req_mod(req, to_be_submitted, 0);
+
+	/* check this request on the collision detection hash tables.
+	 * if we have a conflict, just complete it here.
+	 * THINK do we want to check reads, too? (I don't think so...) */
+	if (rw == WRITE && _req_conflicts(req)) {
+		/* this is a conflicting request.
+		 * even though it may have been only _partially_
+		 * overlapping with one of the currently pending requests,
+		 * without even submitting or sending it, we will
+		 * pretend that it was successfully served right now.
+		 */
+		if (local) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			drbd_al_complete_io(mdev, req->sector);
+			dec_local(mdev);
+			local = 0;
+		}
+		if (remote) dec_ap_pending(mdev);
+		dump_bio(mdev,req->master_bio,1);
+		/* THINK: do we want to fail it (-EIO), or pretend success? */
+		bio_endio(req->master_bio, req->master_bio->bi_size, 0);
+		req->master_bio = NULL;
+		dec_ap_bio(mdev);
+		drbd_req_free(req);
+		local = remote = 0;
+	}
+
+	/* NOTE remote first: to get the concurrent write detection right,
+	 * we must register the request before start of local IO. */
+	if (remote) {
+		/* either WRITE and Connected,
+		 * or READ, and no local disk,
+		 * or READ, but not in sync.
+		 */
+		if (rw == WRITE) _req_mod(req,queue_for_net_write, 0);
+		else		 _req_mod(req,queue_for_net_read, 0);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+	if (b) kfree(b); /* if someone else has beaten us to it... */
+
+	if (local) {
+		/* FIXME what ref count do we have to ensure the backing_bdev
+		 * was not detached below us? */
+		req->private_bio->bi_bdev = mdev->bc->backing_bdev;
+
+		if (FAULT_ACTIVE(mdev, rw==WRITE ? DRBD_FAULT_DT_WR :
+				       ( rw==READ ? DRBD_FAULT_DT_RD :
+						    DRBD_FAULT_DT_RA ) ))
+			bio_endio(req->private_bio, req->private_bio->bi_size, -EIO);
+		else
+			generic_make_request(req->private_bio);
+	}
+
+	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
+	 * we plug after submit, so we won't miss an unplug event */
+	drbd_plug_device(mdev);
+
+	return 0;
+
+  fail_and_free_req:
+	if (b) kfree(b);
+	bio_endio(bio, bio->bi_size, err);
+	drbd_req_free(req);
+	return 0;
+}
+
+/* helper function for drbd_make_request
+ * if we can determine just by the mdev (state) that this request will fail,
+ * return 1
+ * otherwise return 0
+ */
+static int drbd_fail_request_early(drbd_dev* mdev, int is_write)
+{
+	// Unconfigured
+	if (mdev->state.conn == Disconnecting &&
+	    mdev->state.disk == Diskless)
+		return 1;
+
+	if (mdev->state.role != Primary &&
+	    ( !allow_oos || is_write) ) {
+		if (DRBD_ratelimit(5*HZ,5)) {
+			ERR("Process %s[%u] tried to %s; since we are not in Primary state, we cannot allow this\n",
+			    current->comm, current->pid, is_write ? "WRITE" : "READ");
+		}
+		return 1;
+	}
+
+	/*
+	 * Paranoia: we might have been primary, but sync target, or
+	 * even diskless, then lost the connection.
+	 * This should have been handled (panic? suspend?) somewhere
+	 * else.  But maybe it was not, so check again here.
+	 * Caution: as long as we do not have a read/write lock on mdev,
+	 * to serialize state changes, this is racy, since we may lose
+	 * the connection *after* we test for the cstate.
+	 */
+	if ( mdev->state.disk < UpToDate &&
+	     mdev->state.conn < Connected) {
+		if (DRBD_ratelimit(5*HZ,5)) {
+			ERR("Sorry, I have no access to good data anymore.\n");
+		}
+		/*
+		 * FIXME suspend, loop waiting on cstate wait?
+		 */
+		return 1;
+	}
+
+	return 0;
+}
+
+int drbd_make_request_26(request_queue_t *q, struct bio *bio)
+{
+	unsigned int s_enr,e_enr;
+	struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata;
+
+	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
+		bio_endio(bio, bio->bi_size, -EPERM);
+		return 0;
+	}
+
+	/* Currently our BARRIER code is disabled. */
+	if(unlikely(bio_barrier(bio))) {
+		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+		return 0;
+	}
+
+	/*
+	 * what we "blindly" assume:
+	 */
+	D_ASSERT(bio->bi_size > 0);
+	D_ASSERT( (bio->bi_size & 0x1ff) == 0);
+	// D_ASSERT(bio->bi_size <= q->max_segment_size); // wrong.
+	D_ASSERT(bio->bi_idx == 0);
+
+	/* to make some things easier, force alignment of requests within the
+	 * granularity of our hash tables */
+	s_enr = bio->bi_sector >> HT_SHIFT;
+	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+
+	if(unlikely(s_enr != e_enr)) {
+		if (bio->bi_vcnt != 1 || bio->bi_idx != 0) {
+			/* rather error out here than BUG in bio_split */
+			ERR("bio would need to, but cannot, be split: "
+			    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
+			    bio->bi_vcnt, bio->bi_idx, bio->bi_size, bio->bi_sector);
+			bio_endio(bio, bio->bi_size, -EINVAL);
+			return 0;
+		} else {
+			/* This bio crosses some boundary, so we have to split it. */
+			struct bio_pair *bp;
+			/* works for the "do not cross hash slot boundaries" case
+			 * e.g. sector 262269, size 4096
+			 * s_enr = 262269 >> 6 = 4097
+			 * e_enr = (262269+8-1) >> 6 = 4098
+			 * HT_SHIFT = 6
+			 * sps = 64, mask = 63
+			 * first_sectors = 64 - (262269 & 63) = 3
+			 */
+			const sector_t sect = bio->bi_sector;
+			const int sps = 1<<HT_SHIFT; /* sectors per slot */
+			const int mask = sps -1;
+			const sector_t first_sectors = sps - (sect & mask);
+			bp = bio_split(bio, bio_split_pool, first_sectors);
+			drbd_make_request_26(q,&bp->bio1);
+			drbd_make_request_26(q,&bp->bio2);
+			bio_pair_release(bp);
+			return 0;
+		}}
+
+	return drbd_make_request_common(mdev,bio_rw(bio),bio->bi_size,
+					bio->bi_sector,bio);
+}
+
+/* This is called by bio_add_page().  With this function we reduce
+ * the number of BIOs that span over multiple AL_EXTENTs.
+ *
+ * we do the calculation within the lower 32bit of the byte offsets,
+ * since we don't care for actual offset, but only check whether it
+ * would cross "activity log extent" boundaries.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset.  so the resulting bio may still
+ * cross extent boundaries.  those are dealt with (bio_split) in
+ * drbd_make_request_26.
+ */
+/* FIXME for two_primaries,
+ * we should use DRBD_MAX_SEGMENT_SIZE instead of AL_EXTENT_SIZE */
+int drbd_merge_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *bvec)
+{
+	struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata;
+	unsigned int bio_offset = (unsigned int)bio->bi_sector << 9; // 32 bit
+	unsigned int bio_size = bio->bi_size;
+	int limit, backing_limit;
+
+#if 1
+	limit = DRBD_MAX_SEGMENT_SIZE - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+#else
+	limit = AL_EXTENT_SIZE - ((bio_offset & (AL_EXTENT_SIZE-1)) + bio_size);
+#endif
+	if (limit < 0) limit = 0;
+	if (bio_size == 0) {
+		if (limit <= bvec->bv_len) limit = bvec->bv_len;
+	} else if (limit && inc_local(mdev)) {
+		request_queue_t * const b = mdev->bc->backing_bdev->bd_disk->queue;
+		if(b->merge_bvec_fn && mdev->bc->dc.use_bmbv) {
+			backing_limit = b->merge_bvec_fn(b,bio,bvec);
+			limit = min(limit,backing_limit);
+		}
+		dec_local(mdev);
+	}
+	return limit;
+}
diff -uprN linux-2.6.18/drivers/block/drbd/drbd_req.h linux-2.6.18.ovz/drivers/block/drbd/drbd_req.h
--- linux-2.6.18/drivers/block/drbd/drbd_req.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/block/drbd/drbd_req.h	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,320 @@
+/*
+   drbd_req.h
+   Kernel module for 2.6.x Kernels
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2007, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2007, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2007, Philipp Reisner <philipp.reisner@linbit.com>.
+
+   DRBD is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   DRBD is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type drbd_request_t do only exist on a Primary node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    sent via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  in case we have a network connection.
+ *  FIXME I believe that for consistency we should place even READ requests
+ *  on these lists, so we can moan when we detect that the other node is
+ *  writing to an area that we currently read from (when this happens, our
+ *  users are broken).
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may be handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either successfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ *
+ * See also documentation/drbd-request-state-overview.dot
+ * (dot -Tps2 documentation/drbd-request-state-overview.dot | display -)
+ */
+
+typedef enum {
+	created,
+	to_be_send,
+	to_be_submitted,
+
+	/* XXX yes, now I am inconsistent...
+	 * these two are not "events" but "actions"
+	 * oh, well... */
+	queue_for_net_write,
+	queue_for_net_read,
+
+	send_canceled,
+	send_failed,
+	handed_over_to_network,
+	connection_lost_while_pending,
+	recv_acked_by_peer,
+	write_acked_by_peer,
+	write_acked_by_peer_and_sis, // and set_in_sync
+	conflict_discarded_by_peer,
+	neg_acked,
+	barrier_acked, /* in protocol A and B */
+	data_received, /* (remote read) */
+
+	read_completed_with_error,
+	write_completed_with_error,
+	completed_ok,
+} drbd_req_event_t;
+
+/* encoding of request states for now.  we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 210
+	 * 000: no local possible
+	 * 001: to be submitted
+	 *    UNUSED, we could map: 011: submitted, completion still pending
+	 * 110: completed ok
+	 * 010: completed with error
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+
+	/* 76543
+	 * 00000: no network possible
+	 * 00001: to be sent
+	 * 00011: to be sent, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write_acked (C),
+	 *        data_received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+	 * in (B,A) the request object may still linger on the transaction log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log.  Currently we need this flag to avoid conflicts
+	 * between the worker canceling the request and tl_clear_barrier
+	 * killing it from the transfer log.  We should restructure the code
+	 * so this conflict does no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
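+	 * that is, the send() itself completed; whether the peer actually
+	 * received the data is a different question.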
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same.  before we drop it
+	 * we must ensure that we can tell a request with a network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * in (C) this happens when WriteAck is received,
+	 * in (B,A) when the corresponding BarrierAck is received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, it's for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) /* 0x1f8 */
+
+/* epoch entries */
+static inline struct hlist_head* ee_hash_slot(drbd_dev *mdev, sector_t sector)
+{
+	BUG_ON(mdev->ee_hash_s == 0);
+	return mdev->ee_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
+}
+
+/* transfer log (drbd_request objects) */
+static inline struct hlist_head* tl_hash_slot(drbd_dev *mdev, sector_t sector)
+{
+	BUG_ON(mdev->tl_hash_s == 0);
+	return mdev->tl_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline drbd_request_t* _ack_id_to_req(drbd_dev *mdev,u64 id, sector_t sector)
+{
+	struct hlist_head *slot = tl_hash_slot(mdev,sector);
+	struct hlist_node *n;
+	drbd_request_t * req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			if (req->sector != sector) {
+				ERR("_ack_id_to_req: found req %p but it has "
+				    "wrong sector (%llus versus %llus)\n", req,
+				    (unsigned long long)req->sector,
+				    (unsigned long long)sector);
+				break;
+			}
+			return req;
+		}
+	}
+	ERR("_ack_id_to_req: failed to find req %p, sector %llus in list\n",
+	    (void*)(unsigned long)id, (unsigned long long)sector);
+	return NULL;
+}
+
+/* application reads (drbd_request objects) */
+static struct hlist_head* ar_hash_slot(drbd_dev *mdev, sector_t sector)
+{
+	return mdev->app_reads_hash
+		+ ((unsigned int)(sector) % APP_R_HSIZE);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline drbd_request_t* _ar_id_to_req(drbd_dev *mdev,u64 id, sector_t sector)
+{
+	struct hlist_head *slot = ar_hash_slot(mdev,sector);
+	struct hlist_node *n;
+	drbd_request_t * req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			D_ASSERT(req->sector == sector);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static inline drbd_request_t* drbd_req_new(drbd_dev *mdev, struct bio *bio_src)
+{
+	struct bio *bio;
+	drbd_request_t *req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	if (likely(req)) {
+		bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+		req->rq_state = 0;
+		req->mdev = mdev;
+		req->master_bio = bio_src;
+		req->private_bio = bio;
+		req->epoch = 0;
+		req->sector = bio->bi_sector;
+		req->size = bio->bi_size;
+		INIT_HLIST_NODE(&req->colision);
+		INIT_LIST_HEAD(&req->tl_requests);
+
+		bio->bi_private = req;
+		bio->bi_end_io = drbd_endio_pri;
+		bio->bi_next = 0;
+	}
+	return req;
+}
+
+static inline void drbd_req_free(drbd_request_t *req)
+{
+	mempool_free(req,drbd_request_mempool);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return !( ( s1 + (l1>>9) <= s2 ) || ( s1 >= s2 + (l2>>9) ) );
+}
+
+/* apparently too large to be inlined...
+ * moved to drbd_req.c */
+extern void _req_may_be_done(drbd_request_t *req, int error);
+extern void _req_mod(drbd_request_t *req, drbd_req_event_t what, int error);
+
+/* If you need it irqsave, do it yourself! */
+static inline void req_mod(drbd_request_t *req, drbd_req_event_t what, int error)
+{
+	drbd_dev *mdev = req->mdev;
+	spin_lock_irq(&mdev->req_lock);
+	_req_mod(req,what,error);
+	spin_unlock_irq(&mdev->req_lock);
+}
+#endif
diff -uprN linux-2.6.18/drivers/block/drbd/drbd_strings.c linux-2.6.18.ovz/drivers/block/drbd/drbd_strings.c
--- linux-2.6.18/drivers/block/drbd/drbd_strings.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/block/drbd/drbd_strings.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+  drbd.h
+  Kernel module for 2.6.x Kernels
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2007, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2007, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2007, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+
+static const char *drbd_conn_s_names[] = {
+	[StandAlone]     = "StandAlone",
+	[Disconnecting]  = "Disconnecting",
+	[Unconnected]    = "Unconnected",
+	[Timeout]        = "Timeout",
+	[BrokenPipe]     = "BrokenPipe",
+	[NetworkFailure] = "NetworkFailure",
+	[ProtocolError]  = "ProtocolError",
+	[WFConnection]   = "WFConnection",
+	[WFReportParams] = "WFReportParams",
+	[TearDown]       = "TearDown",
+	[Connected]      = "Connected",
+	[StartingSyncS]  = "StartingSyncS",
+	[StartingSyncT]  = "StartingSyncT",
+	[WFBitMapS]      = "WFBitMapS",
+	[WFBitMapT]      = "WFBitMapT",
+	[WFSyncUUID]     = "WFSyncUUID",
+	[SyncSource]     = "SyncSource",
+	[SyncTarget]     = "SyncTarget",
+	[PausedSyncS]    = "PausedSyncS",
+	[PausedSyncT]    = "PausedSyncT"
+};
+
+static const char *drbd_role_s_names[] = {
+	[Primary]   = "Primary",
+	[Secondary] = "Secondary",
+	[Unknown]   = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+	[Diskless]     = "Diskless",
+	[Attaching]    = "Attaching",
+	[Failed]       = "Failed",
+	[Negotiating]  = "Negotiating",
+	[Inconsistent] = "Inconsistent",
+	[Outdated]     = "Outdated",
+	[DUnknown]     = "DUnknown",
+	[Consistent]   = "Consistent",
+	[UpToDate]     = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+	[-SS_TwoPrimaries] = "Multiple primaries not allowed by config",
+	[-SS_NoUpToDateDisk] =
+		"Refusing to be Primary without at least one UpToDate disk",
+	[-SS_BothInconsistent] = "Refusing to be inconsistent on both nodes",
+	[-SS_SyncingDiskless] = "Refusing to be syncing and diskless",
+	[-SS_ConnectedOutdates] = "Refusing to be Outdated while Connected",
+	[-SS_PrimaryNOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_ResyncRunning] = "Can not start resync since it is already active",
+	[-SS_AlreadyStandAlone] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FailedByPeer] = "State change was refused by peer node",
+	[-SS_CanNotOutdateDL] = "Can not outdate a diskless device",
+	[-SS_DeviceInUse] = "Device is held open by someone"
+};
+
+const char* conns_to_name(drbd_conns_t s) {
+	/* enums are unsigned... */
+	return s > PausedSyncT  ? "TOO_LARGE"
+		: drbd_conn_s_names[s];
+}
+
+const char* roles_to_name(drbd_role_t s) {
+	return s > Secondary    ? "TOO_LARGE"
+		: drbd_role_s_names[s];
+}
+
+const char* disks_to_name(drbd_disks_t s) {
+	return s > UpToDate     ? "TOO_LARGE"
+		: drbd_disk_s_names[s];
+}
+
+const char* set_st_err_name(set_st_err_t err) {
+	return err < SS_DeviceInUse  ? "TOO_SMALL" :
+	       err > SS_TwoPrimaries ? "TOO_LARGE"
+		: drbd_state_sw_errors[-err];
+}
diff -uprN linux-2.6.18/drivers/block/drbd/drbd_worker.c linux-2.6.18.ovz/drivers/block/drbd/drbd_worker.c
--- linux-2.6.18/drivers/block/drbd/drbd_worker.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/block/drbd/drbd_worker.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,1012 @@
+/*
+-*- linux-c -*-
+   drbd_worker.c
+   Kernel module for 2.6.x Kernels
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2007, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2007, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2007, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/drbd_config.h>
+#include <linux/mm_inline.h> // for the page_count macro on RH/Fedora
+#include <linux/slab.h>
+#include <linux/random.h>
+
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+/* I choose to have all block layer end_io handlers defined here.
+
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+int drbd_md_io_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+	if (bio->bi_size) return 1;
+
+	complete((struct completion*)bio->bi_private);
+	return 0;
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+int drbd_endio_read_sec(struct bio *bio, unsigned int bytes_done, int error)
+{
+	unsigned long flags=0;
+	struct Tl_epoch_entry *e=NULL;
+	struct Drbd_Conf* mdev;
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	/* We are called each time a part of the bio is finished, but
+	 * we are only interested when the whole bio is finished, therefore
+	 * return as long as bio->bio_size is positive. */
+	if (bio->bi_size) return 1;
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	spin_lock_irqsave(&mdev->req_lock,flags);
+	mdev->read_cnt += e->size >> 9;
+	list_del(&e->w.list);
+	if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait);
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+	drbd_chk_io_error(mdev,error,FALSE);
+	drbd_queue_work(&mdev->data.work,&e->w);
+	dec_local(mdev);
+
+	MTRACE(TraceTypeEE,TraceLvlAll,
+	       INFO("Moved EE (READ) to worker sec=%llus size=%u ee=%p\n",
+		    (unsigned long long)e->sector,e->size,e);
+	       );
+	return 0;
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+int drbd_endio_write_sec(struct bio *bio, unsigned int bytes_done, int error)
+{
+	unsigned long flags=0;
+	struct Tl_epoch_entry *e=NULL;
+	drbd_dev *mdev;
+	sector_t e_sector;
+	int do_wake;
+	int is_syncer_req;
+	int do_al_complete_io;
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	// see above
+	if (bio->bi_size) return 1;
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	spin_lock_irqsave(&mdev->req_lock,flags);
+	mdev->writ_cnt += e->size >> 9;
+	is_syncer_req = is_syncer_block_id(e->block_id);
+
+	/* after we moved e to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock) */
+	e_sector = e->sector;
+	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+
+	list_del(&e->w.list); /* has been on active_ee or sync_ee */
+	list_add_tail(&e->w.list,&mdev->done_ee);
+
+	MTRACE(TraceTypeEE,TraceLvlAll,
+	       INFO("Moved EE (WRITE) to done_ee sec=%llus size=%u ee=%p\n",
+		    (unsigned long long)e->sector,e->size,e);
+	       );
+
+	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
+	 * neither did we wake possibly waiting conflicting requests.
+	 * done from "drbd_process_done_ee" within the appropriate w.cb
+	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+
+	if(!is_syncer_req) mdev->epoch_size++;
+
+	do_wake = is_syncer_req
+		? list_empty(&mdev->sync_ee)
+		: list_empty(&mdev->active_ee);
+
+	if (error) __drbd_chk_io_error(mdev,FALSE);
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+	if (is_syncer_req) drbd_rs_complete_io(mdev,e_sector);
+
+	if (do_wake) wake_up(&mdev->ee_wait);
+
+	if (do_al_complete_io) drbd_al_complete_io(mdev,e_sector);
+
+	wake_asender(mdev);
+	dec_local(mdev);
+
+	return 0;
+}
+
+/* read, readA or write requests on Primary coming from drbd_make_request
+ */
+int drbd_endio_pri(struct bio *bio, unsigned int bytes_done, int error)
+{
+	unsigned long flags;
+	drbd_request_t *req=bio->bi_private;
+	drbd_dev *mdev = req->mdev;
+	drbd_req_event_t what;
+
+	// see above
+	if (bio->bi_size) return 1;
+
+	/* to avoid recursion in _req_mod */
+	what = error
+	       ? (bio_data_dir(bio) == WRITE)
+		 ? write_completed_with_error
+		 : read_completed_with_error
+	       : completed_ok;
+	spin_lock_irqsave(&mdev->req_lock,flags);
+	_req_mod(req, what, error);
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+	return 0;
+}
+
+int w_io_error(drbd_dev* mdev, struct drbd_work* w,int cancel)
+{
+	drbd_request_t *req = (drbd_request_t*)w;
+	int ok;
+
+	/* FIXME send a "set_out_of_sync" packet to the peer
+	 * in the PassOn case...
+	 * in the Detach (or Panic) case, we (try to) send
+	 * a "we are diskless" param packet anyways, and the peer
+	 * will then set the FullSync bit in the meta data ...
+	 */
+	// NOTE: mdev->bc can be NULL by the time we get here!
+	//D_ASSERT(mdev->bc->dc.on_io_error != PassOn);
+
+	/* the only way this callback is scheduled is from _req_may_be_done,
+	 * when it is done and had a local write error, see comments there */
+	drbd_req_free(req);
+
+	if(unlikely(cancel)) return 1;
+
+	ok = drbd_io_error(mdev, FALSE);
+	if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n");
+	return ok;
+}
+
+int w_read_retry_remote(drbd_dev* mdev, struct drbd_work* w,int cancel)
+{
+	drbd_request_t *req = (drbd_request_t*)w;
+
+	spin_lock_irq(&mdev->req_lock);
+	if ( cancel ||
+	     mdev->state.conn < Connected ||
+	     mdev->state.pdsk <= Inconsistent ) {
+		_req_mod(req, send_canceled, 0); /* FIXME freeze? ... */
+		spin_unlock_irq(&mdev->req_lock);
+		drbd_khelper(mdev,"pri-on-incon-degr"); /* FIXME REALLY? */
+		ALERT("WE ARE LOST. Local IO failure, no peer.\n");
+		return 1;
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	/* FIXME this is ugly.  we should not detach for read io-error,
+	 * but try to WRITE the DataReply to the failed location,
+	 * to give the disk the chance to relocate that block */
+	drbd_io_error(mdev,FALSE); /* tries to schedule a detach and notifies peer */
+	return w_send_read_req(mdev,w,0);
+}
+
+int w_resync_inactive(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	ERR_IF(cancel) return 1;
+	ERR("resync inactive, but callback triggered??\n");
+	return 1; // Simply ignore this!
+}
+
+void resync_timer_fn(unsigned long data)
+{
+	unsigned long flags;
+	drbd_dev* mdev = (drbd_dev*) data;
+	int queue;
+
+	spin_lock_irqsave(&mdev->req_lock,flags);
+
+	if(likely(!test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags))) {
+		queue=1;
+		mdev->resync_work.cb = w_make_resync_request;
+	} else {
+		queue=0;
+		mdev->resync_work.cb = w_resync_inactive;
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+	/* harmless race: list_empty outside data.work.q_lock */
+	if(list_empty(&mdev->resync_work.list) && queue) {
+		drbd_queue_work(&mdev->data.work,&mdev->resync_work);
+	}
+}
+
+#define SLEEP_TIME (HZ/10)
+
+int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel)
+{
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	int max_segment_size = mdev->rq_queue->max_segment_size;
+	int number,i,size;
+	int align;
+
+	PARANOIA_BUG_ON(w != &mdev->resync_work);
+
+	if(unlikely(cancel)) return 1;
+
+	if(unlikely(mdev->state.conn < Connected)) {
+		ERR("Confused in w_make_resync_request()! cstate < Connected");
+		return 0;
+	}
+
+	if (mdev->state.conn != SyncTarget) {
+		ERR("%s in w_make_resync_request\n", conns_to_name(mdev->state.conn));
+	}
+
+	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+
+	if (atomic_read(&mdev->rs_pending_cnt)>number) {
+		goto requeue;
+	}
+	number -= atomic_read(&mdev->rs_pending_cnt);
+
+	if(!inc_local(mdev)) {
+		/* Since we only need to access mdev->rsync a
+		   inc_local_if_state(mdev,Failed) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		ERR("Disk broke down during resync!\n");
+		mdev->resync_work.cb = w_resync_inactive;
+		return 1;
+	}
+
+	for(i=0;i<number;i++) {
+	next_sector:
+		size = BM_BLOCK_SIZE;
+		bit = drbd_bm_find_next(mdev);
+
+		if (bit == -1UL) {
+			mdev->resync_work.cb = w_resync_inactive;
+			dec_local(mdev);
+			return 1;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_try_rs_begin_io(mdev, sector)) {
+			drbd_bm_set_find(mdev,bit);
+			goto requeue;
+		}
+
+		if (unlikely(drbd_bm_test_bit(mdev,bit) == 0 )) {
+			//INFO("Block got synced while in drbd_rs_begin_io()\n");
+			drbd_rs_complete_io(mdev,sector);
+			goto next_sector;
+		}
+
+#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 *
+		 * we _do_ care about the agreed-upon q->max_segment_size
+		 * here, as splitting up the requests on the other side is more
+		 * difficult.  the consequence is, that on lvm and md and other
+		 * "indirect" devices, this is dead code, since
+		 * q->max_segment_size will be PAGE_SIZE.
+		 */
+		align=1;
+		for (;;) {
+			if (size + BM_BLOCK_SIZE > max_segment_size)
+				break;
+
+			// Be always aligned
+			if (sector & ((1<<(align+3))-1) )
+				break;
+
+			// do not cross extent boundaries
+			if (( (bit+1) & BM_BLOCKS_PER_BM_EXT_MASK ) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; ( b == 0 ) would get the out-of-band
+			 * only accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if ( drbd_bm_test_bit(mdev,bit+1) != 1 )
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if( (BM_BLOCK_SIZE<<align) <= size) align++;
+			i++;
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			drbd_bm_set_find(mdev,bit+1);
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity) size = (capacity-sector)<<9;
+		inc_rs_pending(mdev);
+		if(!drbd_send_drequest(mdev,RSDataRequest,
+				       sector,size,ID_SYNCER)) {
+			ERR("drbd_send_drequest() failed, aborting...\n");
+			dec_rs_pending(mdev);
+			dec_local(mdev);
+			return 0;
+		}
+	}
+
+	if(drbd_bm_rs_done(mdev)) {
+		/* last syncer _request_ was sent,
+		 * but the RSDataReply not yet received.  sync will end (and
+		 * next sync group will resume), as soon as we receive the last
+		 * resync data block, and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		mdev->resync_work.cb = w_resync_inactive;
+		dec_local(mdev);
+		return 1;
+	}
+
+ requeue:
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	dec_local(mdev);
+	return 1;
+}
+
+int w_resync_finished(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+
+	drbd_bm_lock(mdev);
+	drbd_resync_finished(mdev);
+	drbd_bm_unlock(mdev);
+
+	return 1;
+}
+
+int drbd_resync_finished(drbd_dev* mdev)
+{
+	unsigned long db,dt,dbdt;
+	int dstate, pdstate;
+	struct drbd_work *w;
+
+	// Remove all elements from the resync LRU.  Since future actions
+	// might set bits in the (main) bitmap, then the entries in the
+	// resync LRU would be wrong.
+	if(drbd_rs_del_all(mdev)) {
+		// In case this is not possible now, most probably because
+		// there are RSDataReply Packets lingering on the worker's
+		// queue (or even the read operations for those packets
+		// is not finished by now).  Retry in 100ms.
+
+		drbd_kick_lo(mdev);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ / 10);
+		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
+		if(w) {
+			w->cb = w_resync_finished;
+			drbd_queue_work(&mdev->data.work,w);
+			return 1;
+		}
+		ERR("Warning: failed to drbd_rs_del_all() and to kmalloc(w).\n");
+	}
+
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0) dt=1;
+	db = mdev->rs_total;
+	dbdt = Bit2KB(db/dt);
+	mdev->rs_paused /= HZ;
+	INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
+
+	D_ASSERT((drbd_bm_total_weight(mdev)-mdev->rs_failed) == 0);
+
+	if (mdev->rs_failed) {
+		INFO(" %lu failed blocks\n",mdev->rs_failed);
+
+		if (mdev->state.conn == SyncTarget ||
+		    mdev->state.conn == PausedSyncT) {
+			dstate  = Inconsistent;
+			pdstate = UpToDate;
+		} else {
+			dstate  = UpToDate;
+			pdstate = Inconsistent;
+		}
+	} else {
+		dstate = pdstate = UpToDate;
+
+		if (mdev->state.conn == SyncTarget ||
+		    mdev->state.conn == PausedSyncT) {
+			if( mdev->p_uuid ) {
+				int i;
+				for ( i=Bitmap ; i<=History_end ; i++ ) {
+					_drbd_uuid_set(mdev,i,mdev->p_uuid[i]);
+				}
+				drbd_uuid_set(mdev,Bitmap,mdev->bc->md.uuid[Current]);
+				_drbd_uuid_set(mdev,Current,mdev->p_uuid[Current]);
+			} else {
+				ERR("mdev->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		drbd_uuid_set_bm(mdev,0UL);
+
+		if ( mdev->p_uuid ) {
+			// Now the two UUID sets are equal, update what we
+			// know of the peer.
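+			// (a later handshake will then find equal UUID sets
+			//  and can skip a full resync)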
+			int i;
+			for ( i=Current ; i<=History_end ; i++ ) {
+				mdev->p_uuid[i]=mdev->bc->md.uuid[i];
+			}
+		}
+	}
+
+	mdev->rs_total  = 0;
+	mdev->rs_failed = 0;
+	mdev->rs_paused = 0;
+
+	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags)) {
+		WARN("Writing the whole bitmap, due to failed kmalloc\n");
+		drbd_bm_write(mdev);
+	}
+
+	drbd_bm_recount_bits(mdev);
+
+	drbd_request_state(mdev,NS3(conn,Connected,
+				    disk,dstate,
+				    pdsk,pdstate));
+
+	drbd_md_sync(mdev);
+
+	return 1;
+}
+
+/**
+ * w_e_end_data_req: Send the answer (DataReply) in response to a DataRequest.
+ */
+int w_e_end_data_req(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w;
+	int ok;
+
+	if(unlikely(cancel)) {
+		drbd_free_ee(mdev,e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if(likely(drbd_bio_uptodate(e->private_bio))) {
+		ok=drbd_send_block(mdev, DataReply, e);
+	} else {
+		if (DRBD_ratelimit(5*HZ,5))
+			ERR("Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok=drbd_send_ack(mdev,NegDReply,e);
+
+		/* FIXME we should not detach for read io-errors, in particular
+		 * not now: when the peer asked us for our data, we are likely
+		 * the only remaining disk... */
+		drbd_io_error(mdev,FALSE);
+	}
+
+	dec_unacked(mdev);
+
+	spin_lock_irq(&mdev->req_lock);
+	if( drbd_bio_has_active_page(e->private_bio) ) {
+		/* This might happen if sendpage() has not finished */
+		list_add_tail(&e->w.list,&mdev->net_ee);
+	} else {
+		drbd_free_ee(mdev,e);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if(unlikely(!ok)) ERR("drbd_send_block() failed\n");
+	return ok;
+}
+
+/**
+ * w_e_end_rsdata_req: Send the answer (RSDataReply) to an RSDataRequest.
+ */
+int w_e_end_rsdata_req(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w;
+	int ok;
+
+	if(unlikely(cancel)) {
+		drbd_free_ee(mdev,e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if(inc_local_if_state(mdev,Failed)) {
+		drbd_rs_complete_io(mdev,e->sector);
+		dec_local(mdev);
+	}
+
+	if(likely(drbd_bio_uptodate(e->private_bio))) {
+		if (likely( mdev->state.pdsk >= Inconsistent )) {
+			inc_rs_pending(mdev);
+			ok=drbd_send_block(mdev, RSDataReply, e);
+		} else {
+			if (DRBD_ratelimit(5*HZ,5))
+				ERR("Not sending RSDataReply, partner DISKLESS!\n");
+			ok=1;
+		}
+	} else {
+		if (DRBD_ratelimit(5*HZ,5))
+			ERR("Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok=drbd_send_ack(mdev,NegRSDReply,e);
+
+		drbd_io_error(mdev, FALSE);
+
+		// update resync data with failure
+		drbd_rs_failed_io(mdev, e->sector, e->size);
+	}
+
+	dec_unacked(mdev);
+
+	spin_lock_irq(&mdev->req_lock);
+	if( drbd_bio_has_active_page(e->private_bio) ) {
+		/* This might happen if sendpage() has not finished */
+		list_add_tail(&e->w.list,&mdev->net_ee);
+	} else {
+		drbd_free_ee(mdev,e);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if(unlikely(!ok)) ERR("drbd_send_block() failed\n");
+	return ok;
+}
+
+int w_prev_work_done(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	clear_bit(WORK_PENDING,&mdev->flags);
+	wake_up(&mdev->misc_wait);
+	return 1;
+}
+
+int w_send_barrier(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_barrier *b = (struct drbd_barrier *)w;
+	Drbd_Barrier_Packet *p = &mdev->data.sbuf.Barrier;
+	int ok=1;
+
+	/* really avoid racing with tl_clear.  w.cb may have been referenced
+	 * just before it was reassigned and requeued, so double check that.
+	 * actually, this race was harmless, since we only try to send the
+	 * barrier packet here, and otherwise do nothing with the object.
+	 * but compare with the head of w_clear_epoch */
+	spin_lock_irq(&mdev->req_lock);
+	if (w->cb != w_send_barrier || mdev->state.conn < Connected)
+		cancel = 1;
+	spin_unlock_irq(&mdev->req_lock);
+	if (cancel)
+		return 1;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+	p->barrier = b->br_number;
+	/* inc_ap_pending was done where this was queued.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in w_clear_epoch. */
+	ok = _drbd_send_cmd(mdev,mdev->data.socket,Barrier,(Drbd_Header*)p,sizeof(*p),0);
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int w_send_write_hint(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	if (cancel) return 1;
+	return drbd_send_short_cmd(mdev,UnplugRemote);
+}
+
+/**
+ * w_send_dblock: Send a mirrored write request.
+ */
+int w_send_dblock(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	drbd_request_t *req = (drbd_request_t *)w;
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled, 0);
+		return 1;
+	}
+
+	ok = drbd_send_dblock(mdev,req);
+	req_mod(req,ok ? handed_over_to_network : send_failed, 0);
+
+	return ok;
+}
+
+/**
+ * w_send_read_req: Send a read request.
+ */
+int w_send_read_req(drbd_dev *mdev, struct drbd_work *w, int cancel)
+{
+	drbd_request_t *req = (drbd_request_t *)w;
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled, 0);
+		return 1;
+	}
+
+	ok = drbd_send_drequest(mdev, DataRequest, req->sector, req->size,
+				(unsigned long)req);
+
+	if(ok) {
+		req_mod(req, handed_over_to_network, 0);
+	} else {
+		/* ?? we set Timeout or BrokenPipe in drbd_send() */
+		if (mdev->state.conn >= Connected)
+			drbd_force_state(mdev,NS(conn,NetworkFailure));
+		/* req_mod(req, send_failed); we should not fail it here,
+		 * we might have to "freeze" on disconnect.
+		 * handled by req_mod(req, connection_lost_while_pending);
+		 * in drbd_fail_pending_reads soon enough. */
+	}
+
+	return ok;
+}
+
+STATIC void drbd_global_lock(void)
+{
+	drbd_dev *mdev;
+	int i;
+
+	local_irq_disable();
+	for (i=0; i < minor_count; i++) {
+		if(!(mdev = minor_to_mdev(i))) continue;
+		spin_lock(&mdev->req_lock);
+	}
+}
+
+STATIC void drbd_global_unlock(void)
+{
+	drbd_dev *mdev;
+	int i;
+
+	for (i=0; i < minor_count; i++) {
+		if(!(mdev = minor_to_mdev(i))) continue;
+		spin_unlock(&mdev->req_lock);
+	}
+	local_irq_enable();
+}
+
+STATIC int _drbd_may_sync_now(drbd_dev *mdev)
+{
+	drbd_dev *odev = mdev;
+
+	while(1) {
+		if( odev->sync_conf.after == -1 ) return 1;
+		odev = minor_to_mdev(odev->sync_conf.after);
+		ERR_IF(!odev) return 1;
+		if( (odev->state.conn >= SyncSource &&
+		     odev->state.conn <= PausedSyncT) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp ) return 0;
+	}
+}
+
+/**
+ * _drbd_pause_after:
+ * Finds all devices that may not resync now, and causes them to
+ * pause their resynchronisation.
+ * Called from process context only ( ioctl and after_state_ch ).
+ */
+STATIC int _drbd_pause_after(drbd_dev *mdev)
+{
+	drbd_dev *odev;
+	int i, rv = 0;
+
+	for (i=0; i < minor_count; i++) {
+		if( !(odev = minor_to_mdev(i)) ) continue;
+		if (! _drbd_may_sync_now(odev)) {
+			rv |= ( _drbd_set_state(_NS(odev,aftr_isp,1),
+						ChgStateHard|ScheduleAfter)
+				!= SS_NothingToDo ) ;
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * _drbd_resume_next:
+ * Finds all devices that can resume the resynchronisation
+ * process, and causes them to resume.
+ * Called from process context only ( ioctl and worker ).
+ */
+STATIC int _drbd_resume_next(drbd_dev *mdev)
+{
+	drbd_dev *odev;
+	int i, rv = 0;
+
+	for (i=0; i < minor_count; i++) {
+		if( !(odev = minor_to_mdev(i)) ) continue;
+		if ( odev->state.aftr_isp ) {
+			if (_drbd_may_sync_now(odev)) {
+				rv |= ( _drbd_set_state(_NS(odev,aftr_isp,0),
+							ChgStateHard|ScheduleAfter)
+					!= SS_NothingToDo ) ;
+			}
+		}
+	}
+	return rv;
+}
+
+void resume_next_sg(drbd_dev* mdev)
+{
+	drbd_global_lock();
+	_drbd_resume_next(mdev);
+	drbd_global_unlock();
+}
+
+void suspend_other_sg(drbd_dev* mdev)
+{
+	drbd_global_lock();
+	_drbd_pause_after(mdev);
+	drbd_global_unlock();
+}
+
+void drbd_alter_sa(drbd_dev *mdev, int na)
+{
+	int changes;
+
+	drbd_global_lock();
+	mdev->sync_conf.after = na;
+
+	do {
+		changes  = _drbd_pause_after(mdev);
+		changes |= _drbd_resume_next(mdev);
+	} while (changes);
+
+	drbd_global_unlock();
+}
+
+/**
+ * drbd_start_resync:
+ * @side: Either SyncSource or SyncTarget
+ * Start the resync process.  Called from process context only,
+ * either ioctl or drbd_receiver.
+ * Note, this function might bring you directly into one of the
+ * PausedSync* states.
+ */
+void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side)
+{
+	drbd_state_t os,ns;
+	int r=0;
+
+	MTRACE(TraceTypeResync, TraceLvlSummary,
+	       INFO("Resync starting: side=%s\n",
+		    side==SyncTarget?"SyncTarget":"SyncSource");
+	       );
+
+	drbd_bm_recount_bits(mdev);
+
+	/* In case a previous resync run was aborted by an IO error... */
+	drbd_rs_cancel_all(mdev);
+
+	if(side == SyncTarget) {
+		drbd_bm_reset_find(mdev);
+	} else /* side == SyncSource */ {
+		u64 uuid;
+
+		get_random_bytes(&uuid, sizeof(u64));
+		drbd_uuid_set(mdev, Bitmap, uuid);
+		drbd_send_sync_uuid(mdev,uuid);
+
+		D_ASSERT(mdev->state.disk == UpToDate);
+	}
+
+	drbd_global_lock();
+	ns = os = mdev->state;
+
+	ns.aftr_isp = !_drbd_may_sync_now(mdev);
+
+	ns.conn = side;
+
+	if(side == SyncTarget) {
+		ns.disk = Inconsistent;
+	} else /* side == SyncSource */ {
+		ns.pdsk = Inconsistent;
+	}
+
+	r = _drbd_set_state(mdev,ns,ChgStateVerbose);
+	ns = mdev->state;
+
+	if ( r == SS_Success ) {
+		mdev->rs_total     =
+		mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+		mdev->rs_failed    = 0;
+		mdev->rs_paused    = 0;
+		mdev->rs_start     =
+		mdev->rs_mark_time = jiffies;
+		_drbd_pause_after(mdev);
+	}
+	drbd_global_unlock();
+
+	if ( r == SS_Success ) {
+		after_state_ch(mdev,os,ns,ChgStateVerbose);
+
+		INFO("Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     conns_to_name(ns.conn),
+		     (unsigned long) mdev->rs_total << (BM_BLOCK_SIZE_B-10),
+		     (unsigned long) mdev->rs_total);
+
+		if ( mdev->rs_total == 0 ) {
+			drbd_resync_finished(mdev);
+			return;
+		}
+
+		if( ns.conn == SyncTarget ) {
+			D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags));
+			mod_timer(&mdev->resync_timer,jiffies);
+		}
+
+		drbd_md_sync(mdev);
+	}
+}
+
+int drbd_worker(struct Drbd_thread *thi)
+{
+	drbd_dev *mdev = thi->mdev;
+	struct drbd_work *w = 0;
+	LIST_HEAD(work_list);
+	int intr=0,i;
+
+	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+
+	while (get_t_state(thi) == Running) {
+
+		if(down_trylock(&mdev->data.work.s)) {
+			down(&mdev->data.mutex);
+			if(mdev->data.socket) drbd_tcp_flush(mdev->data.socket);
+			up(&mdev->data.mutex);
+
+			intr = down_interruptible(&mdev->data.work.s);
+
+			down(&mdev->data.mutex);
+			if(mdev->data.socket) drbd_tcp_cork(mdev->data.socket);
+			up(&mdev->data.mutex);
+		}
+
+		if (intr) {
+			D_ASSERT(intr == -EINTR);
+			flush_signals(current);
+			ERR_IF (get_t_state(thi) == Running)
+				continue;
+			break;
+		}
+
+		if (get_t_state(thi) != Running) break;
+		/* With this break, we have done a down() but not consumed
+		   the entry from the list.  The cleanup code takes care of
+		   this... */
+
+		w = 0;
+		spin_lock_irq(&mdev->data.work.q_lock);
+		ERR_IF(list_empty(&mdev->data.work.q)) {
+			/* something terribly wrong in our logic.
+			 * we were able to down() the semaphore,
+			 * but the list is empty... doh.
+			 *
+			 * what is the best thing to do now?
+			 * try again from scratch, restarting the receiver,
+			 * asender, whatnot? could break even more ugly,
+			 * e.g. when we are primary, but no good local data.
+			 *
+			 * I'll try to get away just starting over this loop.
+			 */
+			spin_unlock_irq(&mdev->data.work.q_lock);
+			continue;
+		}
+		w = list_entry(mdev->data.work.q.next,struct drbd_work,list);
+		list_del_init(&w->list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		if(!w->cb(mdev,w, mdev->state.conn < Connected )) {
+			//WARN("worker: a callback failed! \n");
+			if (mdev->state.conn >= Connected)
+				drbd_force_state(mdev,NS(conn,NetworkFailure));
+		}
+	}
+
+	spin_lock_irq(&mdev->data.work.q_lock);
+	i = 0;
+	while (!list_empty(&mdev->data.work.q)) {
+		list_splice_init(&mdev->data.work.q,&work_list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		while(!list_empty(&work_list)) {
+			w = list_entry(work_list.next, struct drbd_work,list);
+			list_del_init(&w->list);
+			w->cb(mdev,w,1);
+			i++; /* dead debugging code */
+		}
+
+		spin_lock_irq(&mdev->data.work.q_lock);
+	}
+	sema_init(&mdev->data.work.s,0);
+	/* DANGEROUS race: if someone did queue his work within the spinlock,
+	 * but up() ed outside the spinlock, we could get an up() on the
+	 * semaphore without corresponding list entry.
+	 * So don't do that.
+	 */
+	spin_unlock_irq(&mdev->data.work.q_lock);
+	/* FIXME verify that there absolutely can not be any more work
+	 * on the queue now...
+	 * if so, the comment above is no longer true, but historic
+	 * from the times when the worker did not live as long as the
+	 * device.. */
+
+	D_ASSERT( mdev->state.disk == Diskless && mdev->state.conn == StandAlone );
+	drbd_mdev_cleanup(mdev);
+	module_put(THIS_MODULE);
+
+	INFO("worker terminated\n");
+
+	return 0;
+}
diff -uprN linux-2.6.18/drivers/block/drbd/lru_cache.c linux-2.6.18.ovz/drivers/block/drbd/lru_cache.c
--- linux-2.6.18/drivers/block/drbd/lru_cache.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/block/drbd/lru_cache.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,370 @@
+/*
+-*- linux-c -*-
+   lru_cache.c
+   Kernel module for 2.6.x Kernels
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2007, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2007, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2007, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h> // for memset
+#include <linux/seq_file.h> // for seq_printf
+#include "lru_cache.h"
+
+#define STATIC static
+
+// this is developers' aid only!
+#define PARANOIA_ENTRY() BUG_ON(test_and_set_bit(__LC_PARANOIA,&lc->flags))
+#define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0)
+#define RETURN(x...)     do { PARANOIA_LEAVE(); return x ; } while (0)
+
+/**
+ * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the
+ * struct lru_cache, and the hash table slots.
+ * returns pointer to a newly initialized lru_cache object with said parameters.
+ */
+struct lru_cache* lc_alloc(const char *name, unsigned int e_count,
+			   size_t e_size, void *private_p)
+{
+	unsigned long bytes;
+	struct lru_cache  *lc;
+	struct lc_element *e;
+	int i;
+
+	BUG_ON(!e_count);
+	e_size = max(sizeof(struct lc_element),e_size);
+	bytes  = e_size+sizeof(struct hlist_head);
+	bytes *= e_count;
+	bytes += sizeof(struct lru_cache);
+	lc = vmalloc(bytes);
+	if (lc) {
+		memset(lc, 0, bytes);
+		INIT_LIST_HEAD(&lc->in_use);
+		INIT_LIST_HEAD(&lc->lru);
+		INIT_LIST_HEAD(&lc->free);
+		lc->element_size = e_size;
+		lc->nr_elements  = e_count;
+		lc->new_number   = -1;
+		lc->lc_private   = private_p;
+		lc->name         = name;
+		for(i=0;i<e_count;i++) {
+			e = lc_entry(lc,i);
+			e->lc_number = LC_FREE;
+			list_add(&e->list,&lc->free);
+			// memset(,0,) did the rest of init for us
+		}
+	}
+	return lc;
+}
+
+/**
+ * lc_free: Frees memory allocated by lc_alloc.
+ * @lc: The lru_cache object
+ */
+void lc_free(struct lru_cache* lc)
+{
+	vfree(lc);
+}
+
+size_t	lc_printf_stats(struct seq_file *seq, struct lru_cache* lc)
+{
+	/* NOTE:
+	 * total calls to lc_get are
+	 * starving + hits + misses
+	 * misses include "dirty" count (update from another thread in
+	 * progress) and "changed", when this in fact led to a successful
+	 * update of the cache.
+	 */
+	return seq_printf(seq,"\t%s: used:%u/%u "
+		"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
+		lc->name, lc->used, lc->nr_elements,
+		lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
+}
+
+static unsigned int lc_hash_fn(struct lru_cache* lc, unsigned int enr)
+{
+	return enr % lc->nr_elements;
+}
+
+
+/**
+ * lc_find: Returns the pointer to an element, if the element is present
+ * in the hash table.  In case it is not this function returns NULL.
+ * @lc: The lru_cache object
+ * @enr: element number
+ */
+struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr)
+{
+	struct hlist_node *n;
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+	hlist_for_each_entry(e, n, lc->slot + lc_hash_fn(lc, enr), colision) {
+		if (e->lc_number == enr) return e;
+	}
+	return NULL;
+}
+
+STATIC struct lc_element * lc_evict(struct lru_cache* lc)
+{
+	struct list_head  *n;
+	struct lc_element *e;
+
+	if (list_empty(&lc->lru)) return 0;
+
+	n=lc->lru.prev;
+	e=list_entry(n, struct lc_element,list);
+
+	list_del(&e->list);
+	hlist_del(&e->colision);
+	return e;
+}
+
+/**
+ * lc_del: Removes an element from the cache (and therefore adds the
+ * element's storage to the free list)
+ *
+ * @lc: The lru_cache object
+ * @e: The element to remove
+ */
+void lc_del(struct lru_cache* lc, struct lc_element *e)
+{
+	// FIXME what to do with refcnt != 0 ?
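+	// (for now we simply BUG_ON() a nonzero refcnt, see below)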
+	PARANOIA_ENTRY();
+	BUG_ON(e->refcnt);
+	list_del(&e->list);
+	hlist_del_init(&e->colision);
+	e->lc_number = LC_FREE;
+	e->refcnt = 0;
+	list_add(&e->list,&lc->free);
+	RETURN();
+}
+
+STATIC struct lc_element* lc_get_unused_element(struct lru_cache* lc)
+{
+	struct list_head *n;
+
+	if (list_empty(&lc->free)) return lc_evict(lc);
+
+	n=lc->free.next;
+	list_del(n);
+	return list_entry(n, struct lc_element,list);
+}
+
+STATIC int lc_unused_element_available(struct lru_cache* lc)
+{
+	if (!list_empty(&lc->free)) return 1; // something on the free list
+	if (!list_empty(&lc->lru))  return 1; // something to evict
+
+	return 0;
+}
+
+
+/**
+ * lc_get: Finds an element in the cache, increases its usage count,
+ * "touches" and returns it.
+ * In case the requested number is not present, it needs to be added to the
+ * cache.  Therefore it is possible that another element becomes evicted from
+ * the cache.  In either case, the user is notified so he is able to e.g. keep
+ * a persistent log of the cache changes, and therefore the objects in use.
+ *
+ * Return values:
+ *  NULL    if the requested element number was not in the cache, and no
+ *          unused element could be recycled
+ *  pointer to the element with the REQUESTED element number
+ *          In this case, it can be used right away
+ *
+ *  pointer to an UNUSED element with some different element number.
+ *          In this case, the cache is marked dirty, and the returned element
+ *          pointer is removed from the lru list and hash collision chains.
+ *          The user now should do whatever housekeeping is necessary.  Then
+ *          he needs to call lc_changed(lc,element_pointer), to finish the
+ *          change.
+ *
+ * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
+ *       any cache set change.
+ *
+ * @lc: The lru_cache object
+ * @enr: element number
+ */
+struct lc_element* lc_get(struct lru_cache* lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+
+	PARANOIA_ENTRY();
+	if ( lc->flags & LC_STARVING ) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if( e->refcnt++ == 0) lc->used++;
+		list_move(&e->list,&lc->in_use); // Not evictable...
+		RETURN(e);
+	}
+
+	++lc->misses;
+
+	/* In case there is nothing available and we can not kick out
+	 * the LRU element, we have to wait ...
+	 */
+	if(!lc_unused_element_available(lc)) {
+		__set_bit(__LC_STARVING,&lc->flags);
+		RETURN(NULL);
+	}
+
+	/* it was not present in the cache, find an unused element,
+	 * which then is replaced.
+	 * we need to update the cache; serialize on lc->flags & LC_DIRTY
+	 */
+	if (test_and_set_bit(__LC_DIRTY,&lc->flags)) {
+		++lc->dirty;
+		RETURN(NULL);
+	}
+
+	e = lc_get_unused_element(lc);
+	BUG_ON(!e);
+
+	clear_bit(__LC_STARVING,&lc->flags);
+	BUG_ON(++e->refcnt != 1);
+	lc->used++;
+
+	lc->changing_element = e;
+	lc->new_number = enr;
+
+	RETURN(e);
+}
+
+/* similar to lc_get,
+ * but only gets a new reference on an existing element.
+ * you either get the requested element, or NULL.
+ */
+struct lc_element* lc_try_get(struct lru_cache* lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+
+	PARANOIA_ENTRY();
+	if ( lc->flags & LC_STARVING ) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if( e->refcnt++ == 0) lc->used++;
+		list_move(&e->list,&lc->in_use); // Not evictable...
+	}
+	RETURN(e);
+}
+
+void lc_changed(struct lru_cache* lc, struct lc_element* e)
+{
+	PARANOIA_ENTRY();
+	BUG_ON(e != lc->changing_element);
+	++lc->changed;
+	e->lc_number = lc->new_number;
+	list_add(&e->list,&lc->in_use);
+	hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc, lc->new_number) );
+	lc->changing_element = NULL;
+	lc->new_number = -1;
+	clear_bit(__LC_DIRTY,&lc->flags);
+	smp_mb__after_clear_bit();
+	PARANOIA_LEAVE();
+}
+
+
+unsigned int lc_put(struct lru_cache* lc, struct lc_element* e)
+{
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+	BUG_ON(!e);
+
+	PARANOIA_ENTRY();
+	BUG_ON(e->refcnt == 0);
+	BUG_ON(e == lc->changing_element);
+	if ( --e->refcnt == 0) {
+		list_move(&e->list,&lc->lru); // move it to the front of LRU.
+		lc->used--;
+		clear_bit(__LC_STARVING,&lc->flags);
+		smp_mb__after_clear_bit();
+	}
+	RETURN(e->refcnt);
+}
+
+
+/**
+ * lc_set: Sets an element in the cache. You might use this function to
+ * setup the cache. It is expected that the elements are properly initialized.
+ * @lc: The lru_cache object
+ * @enr: element number
+ * @index: The element's position in the cache
+ */
+void lc_set(struct lru_cache* lc, unsigned int enr, int index)
+{
+	struct lc_element *e;
+
+	if ( index < 0 || index >= lc->nr_elements ) return;
+
+	e = lc_entry(lc,index);
+	e->lc_number = enr;
+
+	hlist_del_init(&e->colision);
+	hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc,enr) );
+	list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
+}
+
+#if 0
+/**
+ * lc_dump: Dump a complete LRU cache to seq in textual form.
+ */
+void lc_dump(struct lru_cache* lc, struct seq_file *seq, char* utext,
+	     void (*detail) (struct seq_file *, struct lc_element *) )
+{
+	unsigned int nr_elements = lc->nr_elements;
+	struct lc_element *e;
+	int i;
+
+	seq_printf(seq,"\tnn: lc_number refcnt %s\n ",utext);
+	for(i=0;i<nr_elements;i++) {
+		e = lc_entry(lc,i);
+		if( e->lc_number == LC_FREE ) {
+			seq_printf(seq,"\t%2d: FREE\n",i );
+		} else {
+			seq_printf(seq,"\t%2d: %4u %4u ", i,
+				   e->lc_number,
+				   e->refcnt );
+			detail(seq,e);
+		}
+	}
+}
+
+#endif
diff -uprN linux-2.6.18/drivers/block/drbd/lru_cache.h linux-2.6.18.ovz/drivers/block/drbd/lru_cache.h
--- linux-2.6.18/drivers/block/drbd/lru_cache.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/block/drbd/lru_cache.h	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,147 @@
+/*
+-*- linux-c -*-
+   lru_cache.h
+   Kernel module for 2.6.x Kernels
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2007, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2007, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2007, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+/*
+   The lru_cache describes a big set of objects that are addressed
+   by an index number (=lc_number). Only a small fraction of this set
+   is present in the cache.
+   (You set the size of the cache during lc_alloc)
+   Once created, the api consists of
+     lc_find(,nr) -- finds the object with the given number, if present
+     lc_get(,nr)  -- finds the object and increases the usage count
+	if not present, actions are taken to make sure that
+	the cache is updated, the user is notified of this by a callback.
+	Return value is NULL in this case.
+	As soon as the user informs the cache that it has been updated,
+	the next lc_get on that very object number will be successful.
+     lc_put(,lc_element*)
+     -- decreases the usage count of this object, and returns the new value.
+
+   NOTE: It is the USER'S responsibility to make sure that calls do not happen concurrently.
+ */
+
+#ifndef LRU_CACHE_H
+#define LRU_CACHE_H
+
+#include <linux/list.h>
+#ifndef HLIST_HEAD_INIT
+# include "hlist.h"
+#endif
+
+#include <linux/bitops.h>
+
+/* FIXME
+ * I want these structs opaque outside of lru_cache.c
+ */
+
+struct lc_element {
+	struct hlist_node colision;
+	struct list_head list;		// LRU list or free list
+	unsigned int refcnt;
+	unsigned int lc_number;
+};
+
+struct lru_cache {
+	struct list_head lru;
+	struct list_head free;
+	struct list_head in_use;
+	size_t element_size;
+	unsigned int nr_elements;
+	unsigned int new_number;
+
+	/* here may or may not be a pad... */
+
+	unsigned int used;
+	unsigned long flags;
+	unsigned long hits, misses, starving, dirty, changed;
+	struct lc_element *changing_element; // just for paranoia
+
+	void *lc_private;
+	const char *name;
+
+	struct hlist_head slot[0];
+	// hash collision chains here, then element storage.
+};
+
+
+// flag-bits for lru_cache
+enum {
+	__LC_PARANOIA,
+	__LC_DIRTY,
+	__LC_STARVING,
+};
+#define LC_PARANOIA (1<<__LC_PARANOIA)
+#define LC_DIRTY    (1<<__LC_DIRTY)
+#define LC_STARVING (1<<__LC_STARVING)
+
+extern struct lru_cache* lc_alloc(const char *name, unsigned int e_count,
+				  size_t e_size, void *private_p);
+extern void lc_free(struct lru_cache* lc);
+extern void lc_set (struct lru_cache* lc, unsigned int enr, int index);
+extern void lc_del (struct lru_cache* lc, struct lc_element *element);
+
+extern struct lc_element* lc_try_get(struct lru_cache* lc, unsigned int enr);
+extern struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr);
+extern struct lc_element* lc_get (struct lru_cache* lc, unsigned int enr);
+extern unsigned int       lc_put (struct lru_cache* lc, struct lc_element* e);
+extern void lc_changed(struct lru_cache* lc, struct lc_element* e);
+
+struct seq_file;
+extern size_t lc_printf_stats(struct seq_file *seq, struct lru_cache* lc);
+
+void lc_dump(struct lru_cache* lc, struct seq_file *seq, char* utext,
+	     void (*detail) (struct seq_file *, struct lc_element *) );
+
+/* This can be used to stop lc_get from changing the set of active elements.
+ * Note that the reference counts and order on the lru list may still change.
+ * returns true if we acquired the lock.
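+ *
+ * A minimal usage sketch (hypothetical caller; the API's general rule that
+ * calls must not happen concurrently still applies):
+ *
+ *	if (lc_try_lock(lc)) {
+ *		... the active set cannot change here; lc_get() misses
+ *		    will return NULL until we unlock ...
+ *		lc_unlock(lc);
+ *	}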
+ */ +static inline int lc_try_lock(struct lru_cache* lc) +{ + return !test_and_set_bit(__LC_DIRTY,&lc->flags); +} + +static inline void lc_unlock(struct lru_cache* lc) +{ + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); +} + +static inline int lc_is_used(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element* e = lc_find(lc,enr); + return (e && e->refcnt); +} + +#define LC_FREE (-1U) + +#define lc_e_base(lc) ((char*) ( (lc)->slot + (lc)->nr_elements ) ) +#define lc_entry(lc,i) ((struct lc_element*) \ + (lc_e_base(lc) + (i)*(lc)->element_size)) +#define lc_index_of(lc,e) (((char*)(e) - lc_e_base(lc))/(lc)->element_size) + +#endif diff -uprN linux-2.6.18/drivers/block/loop.c linux-2.6.18.ovz/drivers/block/loop.c --- linux-2.6.18/drivers/block/loop.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/block/loop.c 2007-06-13 06:55:05.000000000 -0400 @@ -519,7 +519,7 @@ static int loop_make_request(request_que spin_lock_irq(&lo->lo_lock); if (lo->lo_state != Lo_bound) - goto out; + goto out_not_bound; if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) goto out; lo->lo_pending++; @@ -531,6 +531,7 @@ static int loop_make_request(request_que out: if (lo->lo_pending == 0) complete(&lo->lo_bh_done); +out_not_bound: spin_unlock_irq(&lo->lo_lock); bio_io_error(old_bio, old_bio->bi_size); return 0; diff -uprN linux-2.6.18/drivers/char/agp/generic.c linux-2.6.18.ovz/drivers/char/agp/generic.c --- linux-2.6.18/drivers/char/agp/generic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/agp/generic.c 2007-06-13 06:55:05.000000000 -0400 @@ -1042,7 +1042,7 @@ void *agp_generic_alloc_page(struct agp_ { struct page * page; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL | GFP_DMA32); if (page == NULL) return NULL; diff -uprN linux-2.6.18/drivers/char/agp/intel-agp.c linux-2.6.18.ovz/drivers/char/agp/intel-agp.c --- linux-2.6.18/drivers/char/agp/intel-agp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/agp/intel-agp.c 2007-06-13 06:55:05.000000000 -0400 @@ -160,7 +160,7 @@ static void *i8xx_alloc_pages(void) { struct page * page; - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2); if (page == NULL) return NULL; diff -uprN linux-2.6.18/drivers/char/hw_random/intel-rng.c linux-2.6.18.ovz/drivers/char/hw_random/intel-rng.c --- linux-2.6.18/drivers/char/hw_random/intel-rng.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/hw_random/intel-rng.c 2007-06-13 06:55:05.000000000 -0400 @@ -50,6 +50,43 @@ #define INTEL_RNG_ADDR_LEN 3 /* + * LPC bridge PCI config space registers + */ +#define FWH_DEC_EN1_REG_OLD 0xe3 +#define FWH_DEC_EN1_REG_NEW 0xd9 /* high byte of 16-bit register */ +#define FWH_F8_EN_MASK 0x80 + +#define BIOS_CNTL_REG_OLD 0x4e +#define BIOS_CNTL_REG_NEW 0xdc +#define BIOS_CNTL_WRITE_ENABLE_MASK 0x01 +#define BIOS_CNTL_LOCK_ENABLE_MASK 0x02 + +/* + * Magic address at which Intel Firmware Hubs get accessed + */ +#define INTEL_FWH_ADDR 0xffff0000 +#define INTEL_FWH_ADDR_LEN 2 + +/* + * Intel Firmware Hub command codes (write to any address inside the device) + */ +#define INTEL_FWH_RESET_CMD 0xff /* aka READ_ARRAY */ +#define INTEL_FWH_READ_ID_CMD 0x90 + +/* + * Intel Firmware Hub Read ID command result addresses + */ +#define INTEL_FWH_MANUFACTURER_CODE_ADDRESS 0x000000 +#define INTEL_FWH_DEVICE_CODE_ADDRESS 0x000001 + +/* + * Intel Firmware Hub Read ID command result values + */ +#define INTEL_FWH_MANUFACTURER_CODE 0x89 +#define 
INTEL_FWH_DEVICE_CODE_8M 0xac +#define INTEL_FWH_DEVICE_CODE_4M 0xad + +/* * Data for PCI driver interface * * This data only exists for exporting the supported @@ -58,12 +95,50 @@ * want to register another driver on the same PCI id. */ static const struct pci_device_id pci_tbl[] = { - { 0x8086, 0x2418, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, - { 0x8086, 0x2428, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, - { 0x8086, 0x2430, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, - { 0x8086, 0x2448, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, - { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, - { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, +/* AA + { 0x8086, 0x2418, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ + { 0x8086, 0x2410, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* AA */ +/* AB + { 0x8086, 0x2428, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ + { 0x8086, 0x2420, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* AB */ +/* ?? + { 0x8086, 0x2430, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ +/* BAM, CAM, DBM, FBM, GxM + { 0x8086, 0x2448, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ + { 0x8086, 0x244c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* BAM */ + { 0x8086, 0x248c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* CAM */ + { 0x8086, 0x24cc, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* DBM */ + { 0x8086, 0x2641, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* FBM */ + { 0x8086, 0x27b9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* GxM */ + { 0x8086, 0x27bd, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* GxM DH */ +/* BA, CA, DB, Ex, 6300, Fx, 631x/632x, Gx + { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ + { 0x8086, 0x2440, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* BA */ + { 0x8086, 0x2480, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* CA */ + { 0x8086, 0x24c0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* DB */ + { 0x8086, 0x24d0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* Ex */ + { 0x8086, 0x25a1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 6300 */ + { 0x8086, 0x2640, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* Fx */ + { 0x8086, 0x2670, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2671, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2672, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2673, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2674, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2675, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2676, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2677, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2678, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x2679, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x267f, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* 631x/632x */ + { 0x8086, 0x27b8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* Gx */ +/* E + { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, */ + { 0x8086, 0x2450, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, /* E */ { 0, }, /* terminate list */ }; MODULE_DEVICE_TABLE(pci, pci_tbl); @@ -138,22 +213,115 @@ static struct hwrng intel_rng = { }; +#ifdef CONFIG_SMP +static char __initdata waitflag; + +static void __init intel_init_wait(void *unused) +{ + while (waitflag) + cpu_relax(); +} 
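+
+/* A note on the scheme used by mod_init() below: every other CPU is parked
+ * in the cpu_relax() loop above while the Firmware Hub Read ID cycle runs,
+ * because the BIOS code/data briefly disappears from its normal mapping
+ * during that probe. */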
+#endif + static int __init mod_init(void) { int err = -ENODEV; + unsigned i; + struct pci_dev *dev = NULL; void __iomem *mem; - u8 hw_status; + unsigned long flags; + u8 bios_cntl_off, fwh_dec_en1_off; + u8 bios_cntl_val = 0xff, fwh_dec_en1_val = 0xff; + u8 hw_status, mfc, dvc; + + for (i = 0; !dev && pci_tbl[i].vendor; ++i) + dev = pci_get_device(pci_tbl[i].vendor, pci_tbl[i].device, NULL); - if (!pci_dev_present(pci_tbl)) + if (!dev) goto out; /* Device not found. */ + /* Check for Intel 82802 */ + if (dev->device < 0x2640) { + fwh_dec_en1_off = FWH_DEC_EN1_REG_OLD; + bios_cntl_off = BIOS_CNTL_REG_OLD; + } else { + fwh_dec_en1_off = FWH_DEC_EN1_REG_NEW; + bios_cntl_off = BIOS_CNTL_REG_NEW; + } + + pci_read_config_byte(dev, fwh_dec_en1_off, &fwh_dec_en1_val); + pci_read_config_byte(dev, bios_cntl_off, &bios_cntl_val); + + mem = ioremap_nocache(INTEL_FWH_ADDR, INTEL_FWH_ADDR_LEN); + if (mem == NULL) { + pci_dev_put(dev); + err = -EBUSY; + goto out; + } + + /* + * Since the BIOS code/data is going to disappear from its normal + * location with the Read ID command, all activity on the system + * must be stopped until the state is back to normal. + */ +#ifdef CONFIG_SMP + set_mb(waitflag, 1); + if (smp_call_function(intel_init_wait, NULL, 1, 0) != 0) { + set_mb(waitflag, 0); + pci_dev_put(dev); + printk(KERN_ERR PFX "cannot run on all processors\n"); + err = -EAGAIN; + goto err_unmap; + } +#endif + local_irq_save(flags); + + if (!(fwh_dec_en1_val & FWH_F8_EN_MASK)) + pci_write_config_byte(dev, + fwh_dec_en1_off, + fwh_dec_en1_val | FWH_F8_EN_MASK); + if (!(bios_cntl_val & + (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK))) + pci_write_config_byte(dev, + bios_cntl_off, + bios_cntl_val | BIOS_CNTL_WRITE_ENABLE_MASK); + + writeb(INTEL_FWH_RESET_CMD, mem); + writeb(INTEL_FWH_READ_ID_CMD, mem); + mfc = readb(mem + INTEL_FWH_MANUFACTURER_CODE_ADDRESS); + dvc = readb(mem + INTEL_FWH_DEVICE_CODE_ADDRESS); + writeb(INTEL_FWH_RESET_CMD, mem); + + if (!(bios_cntl_val & + (BIOS_CNTL_LOCK_ENABLE_MASK|BIOS_CNTL_WRITE_ENABLE_MASK))) + pci_write_config_byte(dev, bios_cntl_off, bios_cntl_val); + if (!(fwh_dec_en1_val & FWH_F8_EN_MASK)) + pci_write_config_byte(dev, fwh_dec_en1_off, fwh_dec_en1_val); + + local_irq_restore(flags); +#ifdef CONFIG_SMP + /* Tell other CPUs to resume. 
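+	 * Writing 0 with set_mb() is what releases the CPUs parked in
+	 * intel_init_wait() above.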
*/ + set_mb(waitflag, 0); +#endif + + iounmap(mem); + pci_dev_put(dev); + + if (mfc != INTEL_FWH_MANUFACTURER_CODE || + (dvc != INTEL_FWH_DEVICE_CODE_8M && + dvc != INTEL_FWH_DEVICE_CODE_4M)) { + printk(KERN_ERR PFX "FWH not detected\n"); + err = -ENODEV; + goto out; + } + err = -ENOMEM; mem = ioremap(INTEL_RNG_ADDR, INTEL_RNG_ADDR_LEN); if (!mem) goto out; intel_rng.priv = (unsigned long)mem; - /* Check for Intel 82802 */ + /* Check for Random Number Generator */ err = -ENODEV; hw_status = hwstatus_get(mem); if ((hw_status & INTEL_RNG_PRESENT) == 0) diff -uprN linux-2.6.18/drivers/char/ipmi/ipmi_si_intf.c linux-2.6.18.ovz/drivers/char/ipmi/ipmi_si_intf.c --- linux-2.6.18/drivers/char/ipmi/ipmi_si_intf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/ipmi/ipmi_si_intf.c 2007-06-13 06:55:05.000000000 -0400 @@ -1845,7 +1845,7 @@ static int ipmi_pci_resume(struct pci_de static struct pci_device_id ipmi_pci_devices[] = { { PCI_DEVICE(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID) }, - { PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE) } + { PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE_MASK) } }; MODULE_DEVICE_TABLE(pci, ipmi_pci_devices); diff -uprN linux-2.6.18/drivers/char/isicom.c linux-2.6.18.ovz/drivers/char/isicom.c --- linux-2.6.18/drivers/char/isicom.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/isicom.c 2007-06-13 06:55:05.000000000 -0400 @@ -1062,11 +1062,12 @@ static void isicom_shutdown_port(struct static void isicom_close(struct tty_struct *tty, struct file *filp) { struct isi_port *port = tty->driver_data; - struct isi_board *card = port->card; + struct isi_board *card; unsigned long flags; if (!port) return; + card = port->card; if (isicom_paranoia_check(port, tty->name, "isicom_close")) return; diff -uprN linux-2.6.18/drivers/char/mem.c linux-2.6.18.ovz/drivers/char/mem.c --- linux-2.6.18/drivers/char/mem.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/mem.c 2007-06-13 06:55:05.000000000 -0400 @@ -616,7 +616,8 @@ static inline size_t read_zero_pagealign count = size; zap_page_range(vma, addr, count, NULL); - zeromap_page_range(vma, addr, count, PAGE_COPY); + if (zeromap_page_range(vma, addr, count, PAGE_COPY)) + break; size -= count; buf += count; @@ -683,11 +684,14 @@ out: static int mmap_zero(struct file * file, struct vm_area_struct * vma) { + int err; + if (vma->vm_flags & VM_SHARED) return shmem_zero_setup(vma); - if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; + err = zeromap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + BUG_ON(err == -EEXIST); + return err; } #else /* CONFIG_MMU */ static ssize_t read_zero(struct file * file, char * buf, diff -uprN linux-2.6.18/drivers/char/pty.c linux-2.6.18.ovz/drivers/char/pty.c --- linux-2.6.18/drivers/char/pty.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/pty.c 2007-06-13 06:55:06.000000000 -0400 @@ -30,16 +30,30 @@ #include #include +#include + /* These are global because they are accessed in tty_io.c */ #ifdef CONFIG_UNIX98_PTYS struct tty_driver *ptm_driver; -static struct tty_driver *pts_driver; +struct tty_driver *pts_driver; +EXPORT_SYMBOL(ptm_driver); +EXPORT_SYMBOL(pts_driver); + +void prepare_pty(void) +{ +#ifdef CONFIG_VE + get_ve0()->ptm_driver = ptm_driver; + /* don't clean ptm_driver and co. 
here, they are used in vecalls.c */ +#endif +} #endif static void pty_close(struct tty_struct * tty, struct file * filp) { if (!tty) return; + + ub_pty_uncharge(tty); if (tty->driver->subtype == PTY_TYPE_MASTER) { if (tty->count > 1) printk("master pty_close: count = %d!!\n", tty->count); @@ -59,8 +73,12 @@ static void pty_close(struct tty_struct if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS - if (tty->driver == ptm_driver) + if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { + struct ve_struct *old_env; + old_env = set_exec_env(tty->owner_env); devpts_pty_kill(tty->index); + (void)set_exec_env(old_env); + } #endif tty_vhangup(tty->link); } @@ -210,6 +228,10 @@ static int pty_open(struct tty_struct *t if (tty->link->count != 1) goto out; + retval = -ENOMEM; + if (ub_pty_charge(tty)) + goto out; + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); @@ -237,7 +259,9 @@ static struct tty_operations pty_ops = { /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS -static struct tty_driver *pty_driver, *pty_slave_driver; +struct tty_driver *pty_driver, *pty_slave_driver; +EXPORT_SYMBOL(pty_driver); +EXPORT_SYMBOL(pty_slave_driver); static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) @@ -392,6 +416,7 @@ static void __init unix98_pty_init(void) panic("Couldn't register Unix98 pts driver"); pty_table[1].data = &ptm_driver->refcount; + prepare_pty(); } #else static inline void unix98_pty_init(void) { } diff -uprN linux-2.6.18/drivers/char/random.c linux-2.6.18.ovz/drivers/char/random.c --- linux-2.6.18/drivers/char/random.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/random.c 2007-06-13 06:55:06.000000000 -0400 @@ -757,7 +757,7 @@ static size_t account(struct entropy_sto static void extract_buf(struct entropy_store *r, __u8 *out) { - int i, x; + int i; __u32 data[16], buf[5 + SHA_WORKSPACE_WORDS]; sha_init(buf); @@ -769,9 +769,11 @@ static void extract_buf(struct entropy_s * attempts to find previous ouputs), unless the hash * function can be inverted. */ - for (i = 0, x = 0; i < r->poolinfo->poolwords; i += 16, x+=2) { - sha_transform(buf, (__u8 *)r->pool+i, buf + 5); - add_entropy_words(r, &buf[x % 5], 1); + for (i = 0; i < r->poolinfo->poolwords; i += 16) { + /* hash blocks of 16 words = 512 bits */ + sha_transform(buf, (__u8 *)(r->pool + i), buf + 5); + /* feed back portion of the resulting hash */ + add_entropy_words(r, &buf[i % 5], 1); } /* @@ -779,7 +781,7 @@ static void extract_buf(struct entropy_s * portion of the pool while mixing, and hash one * final time. 
*/ - __add_entropy_words(r, &buf[x % 5], 1, data); + __add_entropy_words(r, &buf[i % 5], 1, data); sha_transform(buf, (__u8 *)data, buf + 5); /* @@ -887,8 +889,8 @@ static void init_std_data(struct entropy do_gettimeofday(&tv); add_entropy_words(r, (__u32 *)&tv, sizeof(tv)/4); - add_entropy_words(r, (__u32 *)&system_utsname, - sizeof(system_utsname)/4); + add_entropy_words(r, (__u32 *)utsname(), + sizeof(*(utsname()))/4); } static int __init rand_initialize(void) @@ -1017,37 +1019,44 @@ random_poll(struct file *file, poll_tabl return mask; } -static ssize_t -random_write(struct file * file, const char __user * buffer, - size_t count, loff_t *ppos) +static int +write_pool(struct entropy_store *r, const char __user *buffer, size_t count) { - int ret = 0; size_t bytes; __u32 buf[16]; const char __user *p = buffer; - size_t c = count; - while (c > 0) { - bytes = min(c, sizeof(buf)); + while (count > 0) { + bytes = min(count, sizeof(buf)); + if (copy_from_user(&buf, p, bytes)) + return -EFAULT; - bytes -= copy_from_user(&buf, p, bytes); - if (!bytes) { - ret = -EFAULT; - break; - } - c -= bytes; + count -= bytes; p += bytes; - add_entropy_words(&input_pool, buf, (bytes + 3) / 4); - } - if (p == buffer) { - return (ssize_t)ret; - } else { - struct inode *inode = file->f_dentry->d_inode; - inode->i_mtime = current_fs_time(inode->i_sb); - mark_inode_dirty(inode); - return (ssize_t)(p - buffer); + add_entropy_words(r, buf, (bytes + 3) / 4); } + + return 0; +} + +static ssize_t +random_write(struct file * file, const char __user * buffer, + size_t count, loff_t *ppos) +{ + size_t ret; + struct inode *inode = file->f_dentry->d_inode; + + ret = write_pool(&blocking_pool, buffer, count); + if (ret) + return ret; + ret = write_pool(&nonblocking_pool, buffer, count); + if (ret) + return ret; + + inode->i_mtime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + return (ssize_t)count; } static int @@ -1086,8 +1095,8 @@ random_ioctl(struct inode * inode, struc return -EINVAL; if (get_user(size, p++)) return -EFAULT; - retval = random_write(file, (const char __user *) p, - size, &file->f_pos); + retval = write_pool(&input_pool, (const char __user *)p, + size); if (retval < 0) return retval; credit_entropy_store(&input_pool, ent_count); diff -uprN linux-2.6.18/drivers/char/rtc.c linux-2.6.18.ovz/drivers/char/rtc.c --- linux-2.6.18/drivers/char/rtc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/rtc.c 2007-06-13 06:55:06.000000000 -0400 @@ -209,11 +209,12 @@ static const unsigned char days_in_mo[] */ static inline unsigned char rtc_is_updating(void) { + unsigned long flags; unsigned char uip; - spin_lock_irq(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return uip; } diff -uprN linux-2.6.18/drivers/char/sysrq.c linux-2.6.18.ovz/drivers/char/sysrq.c --- linux-2.6.18/drivers/char/sysrq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/sysrq.c 2007-06-13 06:55:06.000000000 -0400 @@ -35,6 +35,8 @@ #include #include #include +#include +#include #include @@ -166,8 +168,13 @@ static struct sysrq_key_op sysrq_showloc static void sysrq_handle_showregs(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { + bust_spinlocks(1); if (pt_regs) show_regs(pt_regs); + bust_spinlocks(0); +#if defined(__i386__) || defined(__x86_64__) + smp_nmi_call_function(smp_show_regs, NULL, 0); +#endif } static struct sysrq_key_op sysrq_showregs_op = { 
 	.handler	= sysrq_handle_showregs,
@@ -192,6 +199,7 @@ static void sysrq_handle_showmem(int key
 					struct tty_struct *tty)
 {
 	show_mem();
+	show_slab_info();
 }
 static struct sysrq_key_op sysrq_showmem_op = {
 	.handler	= sysrq_handle_showmem,
@@ -207,7 +215,7 @@ static void send_sig_all(int sig)
 {
 	struct task_struct *p;
 
-	for_each_process(p) {
+	for_each_process_all(p) {
 		if (p->mm && p->pid != 1)
 			/* Not swapper, init nor kernel thread */
 			force_sig(sig, p);
@@ -259,6 +267,19 @@ static struct sysrq_key_op sysrq_kill_op
 	.enable_mask	= SYSRQ_ENABLE_SIGNAL,
 };
 
+#ifdef CONFIG_SCHED_VCPU
+static void sysrq_handle_vschedstate(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	show_vsched();
+}
+static struct sysrq_key_op sysrq_vschedstate_op = {
+	.handler	= sysrq_handle_vschedstate,
+	.help_msg	= "vsched_stAte",
+	.action_msg	= "Show Vsched",
+};
+#endif
+
 static void sysrq_handle_unrt(int key, struct pt_regs *pt_regs,
 				struct tty_struct *tty)
 {
@@ -274,7 +295,274 @@ static struct sysrq_key_op sysrq_unrt_op
 /* Key Operations table and lock */
 static DEFINE_SPINLOCK(sysrq_key_table_lock);
 
-static struct sysrq_key_op *sysrq_key_table[36] = {
+#define SYSRQ_KEY_TABLE_LENGTH 37
+static struct sysrq_key_op **sysrq_key_table;
+static struct sysrq_key_op *sysrq_default_key_table[];
+
+#ifdef CONFIG_SYSRQ_DEBUG
+#define SYSRQ_NAMELEN_MAX 64
+#define SYSRQ_DUMP_LINES 32
+
+static struct sysrq_key_op *sysrq_debug_key_table[];
+static struct sysrq_key_op *sysrq_input_key_table[];
+static unsigned long *dump_address;
+static int orig_console_loglevel;
+static void (*sysrq_input_return)(char *) = NULL;
+
+static void dump_mem(void)
+{
+	unsigned long value[4];
+	mm_segment_t old_fs;
+	int line, err;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = 0;
+
+	for (line = 0; line < SYSRQ_DUMP_LINES; line++) {
+		err |= __get_user(value[0], dump_address++);
+		err |= __get_user(value[1], dump_address++);
+		err |= __get_user(value[2], dump_address++);
+		err |= __get_user(value[3], dump_address++);
+		if (err) {
+			printk("Invalid address %p\n", dump_address - 4);
+			break;
+		}
+#if BITS_PER_LONG == 32
+		printk("0x%p: %08lx %08lx %08lx %08lx\n",
+				dump_address - 4,
+				value[0], value[1], value[2], value[3]);
+#else
+		printk("0x%p: %016lx %016lx %016lx %016lx\n",
+				dump_address - 4,
+				value[0], value[1], value[2], value[3]);
+#endif
+	}
+	set_fs(old_fs);
+}
+
+static void write_mem(unsigned long val)
+{
+	mm_segment_t old_fs;
+	unsigned long old_val;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	if (__get_user(old_val, dump_address)) {
+		printk("Invalid address %p\n", dump_address);
+		goto out;
+	}
+
+#if BITS_PER_LONG == 32
+	printk("Changing [%p] from %08lx to %08lx\n",
+			dump_address, old_val, val);
+#else
+	printk("Changing [%p] from %016lx to %016lx\n",
+			dump_address, old_val, val);
+#endif
+	__put_user(val, dump_address);
+out:
+	set_fs(old_fs);
+}
+
+static void handle_read(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	static int pos;
+	static int upper_case;
+	static char str[SYSRQ_NAMELEN_MAX];
+
+	if (key == 0) {
+		/* actually 0 is not shift only...
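+		 * but the code below only uses it as shift: remember it so
+		 * the next key can be upper-cased ('-' -> '_', 'a' -> 'A')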
+		 */
+		upper_case = 1;
+		return;
+	}
+
+	if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) {
+		/* enter */
+		sysrq_key_table = sysrq_debug_key_table;
+		str[pos] = '\0';
+		pos = upper_case = 0;
+		printk("\n");
+		if (sysrq_input_return == NULL)
+			printk("No return handler!!!\n");
+		else
+			sysrq_input_return(str);
+		return;
+	};
+
+	/* check for allowed symbols */
+	if (key == '-') {
+		if (upper_case)
+			key = '_';
+		goto correct;
+	};
+	if (key >= 'a' && key <= 'z') {
+		if (upper_case)
+			key = key - 'a' + 'A';
+		goto correct;
+	};
+	if (key >= '0' && key <= '9')
+		goto correct;
+
+	upper_case = 0;
+	return;
+
+correct:
+	str[pos] = key;
+	printk("%c", (char)key);
+	pos++;
+	upper_case = 0;
+}
+
+static struct sysrq_key_op input_read = {
+	.handler	= handle_read,
+	.help_msg	= "",
+	.action_msg	= NULL,
+};
+
+static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+	[0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read,
+};
+
+static void return_dump_mem(char *str)
+{
+	unsigned long address;
+	char *end;
+
+	address = simple_strtoul(str, &end, 0);
+	if (*end != '\0') {
+		printk("Bad address [%s]\n", str);
+		return;
+	}
+
+	dump_address = (unsigned long *)address;
+	dump_mem();
+}
+
+static void handle_dump_mem(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	sysrq_input_return = return_dump_mem;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_dump_mem = {
+	.handler	= handle_dump_mem,
+	.help_msg	= "Dump",
+	.action_msg	= "Enter address:",
+};
+
+static void return_resolve(char *str)
+{
+	unsigned long address;
+
+	address = kallsyms_lookup_name(str);
+	printk("%s : %lx\n", str, address);
+	if (address) {
+		dump_address = (unsigned long *)address;
+		printk("Now you can dump it via X\n");
+	}
+}
+
+static void handle_resolve(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	sysrq_input_return = return_resolve;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_resolve = {
+	.handler	= handle_resolve,
+	.help_msg	= "Resolve",
+	.action_msg	= "Enter symbol name:",
+};
+
+static void return_write_mem(char *str)
+{
+	unsigned long address;
+	unsigned long value;
+	char *end;
+
+	address = simple_strtoul(str, &end, 0);
+	if (*end != '-') {
+		printk("Bad address in %s\n", str);
+		return;
+	}
+	value = simple_strtoul(end + 1, &end, 0);
+	if (*end != '\0') {
+		printk("Bad value in %s\n", str);
+		return;
+	}
+
+	dump_address = (unsigned long *)address;
+	write_mem(value);
+}
+
+static void handle_write_mem(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	sysrq_input_return = return_write_mem;
+	sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_write_mem = {
+	.handler	= handle_write_mem,
+	.help_msg	= "Writemem",
+	.action_msg	= "Enter address-value:",
+};
+
+static void handle_next(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	dump_mem();
+}
+
+static struct sysrq_key_op debug_next = {
+	.handler	= handle_next,
+	.help_msg	= "neXt",
+	.action_msg	= "continuing",
+};
+
+static void handle_quit(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	sysrq_key_table = sysrq_default_key_table;
+	console_loglevel = orig_console_loglevel;
+}
+
+static struct sysrq_key_op debug_quit = {
+	.handler	= handle_quit,
+	.help_msg	= "Quit",
+	.action_msg	= "Thank you for using the debugger",
+};
+
+static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+	[13] = &debug_dump_mem,		/* d */
+	[26] = &debug_quit,		/* q */
+	[27] = &debug_resolve,		/* r */
+	[32] = &debug_write_mem,	/* w */
+	[33] = &debug_next,		/* x */
+};
+
+static void sysrq_handle_debug(int key, struct pt_regs *pt_regs,
+		struct tty_struct *tty)
+{
+	orig_console_loglevel = console_loglevel;
+	console_loglevel = 8;
+	sysrq_key_table = sysrq_debug_key_table;
+	printk("Welcome to sysrq debugging mode\n"
+			"Press H for help\n");
+}
+
+static struct sysrq_key_op sysrq_debug_op = {
+	.handler	= sysrq_handle_debug,
+	.help_msg	= "debuG",
+	.action_msg	= "Select desired action",
+};
+#endif
+
+static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
 	&sysrq_loglevel_op,		/* 0 */
 	&sysrq_loglevel_op,		/* 1 */
 	&sysrq_loglevel_op,		/* 2 */
@@ -290,13 +578,21 @@
 	 * Don't use for system provided sysrqs, it is handled specially on
 	 * sparc and will never arrive
 	 */
+#ifdef CONFIG_SCHED_VCPU
+	&sysrq_vschedstate_op,		/* a */
+#else
 	NULL,				/* a */
+#endif
 	&sysrq_reboot_op,		/* b */
 	&sysrq_crashdump_op,		/* c */
 	&sysrq_showlocks_op,		/* d */
 	&sysrq_term_op,			/* e */
 	&sysrq_moom_op,			/* f */
+#ifdef CONFIG_SYSRQ_DEBUG
+	&sysrq_debug_op,		/* g */
+#else
 	NULL,				/* g */
+#endif
 	NULL,				/* h */
 	&sysrq_kill_op,			/* i */
 	NULL,				/* j */
@@ -317,9 +613,12 @@
 	NULL,				/* w */
 	NULL,				/* x */
 	NULL,				/* y */
-	NULL				/* z */
+	NULL,				/* z */
+	NULL,				/* for debugger */
 };
 
+static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table;
+
 /* key2index calculation, -1 on invalid index */
 static int sysrq_key_table_key2index(int key)
 {
@@ -329,6 +628,10 @@ static int sysrq_key_table_key2index(int
 		retval = key - '0';
 	else if ((key >= 'a') && (key <= 'z'))
 		retval = key + 10 - 'a';
+#ifdef CONFIG_SYSRQ_DEBUG
+	else if (key == 0 || key == 0x0d || key == '-')
+		retval = SYSRQ_KEY_TABLE_LENGTH - 1;
+#endif
 	else
 		retval = -1;
 	return retval;
@@ -371,7 +674,6 @@ void __handle_sysrq(int key, struct pt_r
 	spin_lock_irqsave(&sysrq_key_table_lock, flags);
 	orig_log_level = console_loglevel;
 	console_loglevel = 7;
-	printk(KERN_INFO "SysRq : ");
 
 	op_p = __sysrq_get_key_op(key);
 	if (op_p) {
@@ -381,16 +683,17 @@ void __handle_sysrq(int key, struct pt_r
 		 */
 		if (!check_mask || sysrq_enabled == 1 ||
 		    (sysrq_enabled & op_p->enable_mask)) {
-			printk("%s\n", op_p->action_msg);
+			if (op_p->action_msg)
+				printk("SysRq: %s\n", op_p->action_msg);
 			console_loglevel = orig_log_level;
 			op_p->handler(key, pt_regs, tty);
 		} else {
 			printk("This sysrq operation is disabled.\n");
 		}
 	} else {
-		printk("HELP : ");
+		printk("SysRq HELP : ");
 		/* Only print the help msg once per handler */
-		for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
+		for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) {
 			if (sysrq_key_table[i]) {
 				int j;
 
diff -uprN linux-2.6.18/drivers/char/tty_io.c linux-2.6.18.ovz/drivers/char/tty_io.c
--- linux-2.6.18/drivers/char/tty_io.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/drivers/char/tty_io.c	2007-06-13 06:55:06.000000000 -0400
@@ -103,6 +103,7 @@
 #include
 #include
+#include
 
 #undef TTY_DEBUG_HANGUP
@@ -120,11 +121,16 @@ struct termios tty_std_termios = { /* fo
 
 EXPORT_SYMBOL(tty_std_termios);
 
+/* this lock protects the tty_drivers list; these pretty guys do no locking */
+rwlock_t tty_driver_guard = RW_LOCK_UNLOCKED;
+EXPORT_SYMBOL(tty_driver_guard);
+
 /* This list gets poked at by procfs and various bits of boot up code.
    This could do with some rationalisation such as pulling the tty proc
    function into this file */
 
 LIST_HEAD(tty_drivers);	/* linked list of tty drivers */
+EXPORT_SYMBOL(tty_drivers);
 
 /* Semaphore to protect creating and releasing a tty. This is
    shared with vt.c for deeply disgusting hack reasons */
@@ -134,6 +140,15 @@ DEFINE_MUTEX(tty_mutex);
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
 extern int pty_limit;		/* Config limit on Unix98 ptys */
 static DEFINE_IDR(allocated_ptys);
+#ifdef CONFIG_VE
+#define __ve_allocated_ptys(ve)	(*((ve)->allocated_ptys))
+#define ve_allocated_ptys	__ve_allocated_ptys(get_exec_env())
+#define ve_ptm_driver		(get_exec_env()->ptm_driver)
+#else
+#define __ve_allocated_ptys(ve)	allocated_ptys
+#define ve_allocated_ptys	allocated_ptys
+#define ve_ptm_driver		ptm_driver
+#endif
 static DECLARE_MUTEX(allocated_ptys_lock);
 static int ptmx_open(struct inode *, struct file *);
 #endif
@@ -163,11 +178,22 @@ static void release_mem(struct tty_struc
  * FIXME: use kzalloc
  */
 
+void prepare_tty(void)
+{
+#ifdef CONFIG_VE
+	get_ve0()->allocated_ptys = &allocated_ptys;
+	/*
+	 * in this case, tty_register_driver() sets up
+	 * owner_env correctly right from the bootup
+	 */
+#endif
+}
+
 static struct tty_struct *alloc_tty_struct(void)
 {
 	struct tty_struct *tty;
 
-	tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL);
+	tty = ub_kmalloc(sizeof(struct tty_struct), GFP_KERNEL);
 	if (tty)
 		memset(tty, 0, sizeof(struct tty_struct));
 	return tty;
 }
@@ -1101,14 +1127,37 @@ static struct tty_driver *get_tty_driver
 {
 	struct tty_driver *p;
 
+	read_lock(&tty_driver_guard);
 	list_for_each_entry(p, &tty_drivers, tty_drivers) {
 		dev_t base = MKDEV(p->major, p->minor_start);
 		if (device < base || device >= base + p->num)
 			continue;
 		*index = device - base;
-		return p;
+#ifdef CONFIG_VE
+		if (in_interrupt())
+			goto found;
+		if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR
+#ifdef CONFIG_UNIX98_PTYS
+		    && (p->major<UNIX98_PTY_MASTER_MAJOR ||
+			p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) &&
+		       (p->major<UNIX98_PTY_SLAVE_MAJOR ||
+			p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1)
+#endif
+		   )
+			goto found;
+		if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env()))
+			goto found;
+		if (!ve_accessible_strict(p->owner_env, get_exec_env()))
+			continue;
+#endif
+		goto found;
 	}
+	read_unlock(&tty_driver_guard);
 	return NULL;
+
+found:
+	read_unlock(&tty_driver_guard);
+	return p;
 }
 
 /**
@@ -1359,7 +1408,7 @@ static void do_tty_hangup(void *data)
 
 	read_lock(&tasklist_lock);
 	if (tty->session > 0) {
-		do_each_task_pid(tty->session, PIDTYPE_SID, p) {
+		do_each_task_pid_all(tty->session, PIDTYPE_SID, p) {
 			if (p->signal->tty == tty)
 				p->signal->tty = NULL;
 			if (!p->signal->leader)
@@ -1368,7 +1417,7 @@ static void do_tty_hangup(void *data)
 			group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
 				p->signal->tty_old_pgrp = tty->pgrp;
-		} while_each_task_pid(tty->session, PIDTYPE_SID, p);
+		} while_each_task_pid_all(tty->session, PIDTYPE_SID, p);
 	}
 	read_unlock(&tasklist_lock);
 
@@ -1519,9 +1568,9 @@ void disassociate_ctty(int on_exit)
 
 	/* Now clear signal->tty under the lock */
 	read_lock(&tasklist_lock);
-	do_each_task_pid(current->signal->session, PIDTYPE_SID, p) {
+	do_each_task_pid_all(current->signal->session, PIDTYPE_SID, p) {
 		p->signal->tty = NULL;
-	} while_each_task_pid(current->signal->session, PIDTYPE_SID, p);
+	} while_each_task_pid_all(current->signal->session, PIDTYPE_SID, p);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&tty_mutex);
 	unlock_kernel();
@@ -1859,22 +1908,30 @@ static void tty_line_name(struct tty_dri
*/ static int init_dev(struct tty_driver *driver, int idx, - struct tty_struct **ret_tty) + struct tty_struct *i_tty, struct tty_struct **ret_tty) { struct tty_struct *tty, *o_tty; struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; + struct ve_struct * owner; int retval = 0; - /* check whether we're reopening an existing tty */ - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - tty = devpts_get_tty(idx); - if (tty && driver->subtype == PTY_TYPE_MASTER) - tty = tty->link; - } else { - tty = driver->ttys[idx]; + owner = driver->owner_env; + + if (i_tty) + tty = i_tty; + else { + /* check whether we're reopening an existing tty */ + if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { + tty = devpts_get_tty(idx); + if (tty && driver->subtype == PTY_TYPE_MASTER) + tty = tty->link; + } else { + tty = driver->ttys[idx]; + } } - if (tty) goto fast_track; + if (tty) + goto fast_track; /* * First time open is complex, especially for PTY devices. @@ -1900,6 +1957,7 @@ static int init_dev(struct tty_driver *d tty->driver = driver; tty->index = idx; tty_line_name(driver, idx, tty->name); + tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { tp_loc = &tty->termios; @@ -1910,7 +1968,7 @@ static int init_dev(struct tty_driver *d } if (!*tp_loc) { - tp = (struct termios *) kmalloc(sizeof(struct termios), + tp = (struct termios *) ub_kmalloc(sizeof(struct termios), GFP_KERNEL); if (!tp) goto free_mem_out; @@ -1918,7 +1976,7 @@ static int init_dev(struct tty_driver *d } if (!*ltp_loc) { - ltp = (struct termios *) kmalloc(sizeof(struct termios), + ltp = (struct termios *) ub_kmalloc(sizeof(struct termios), GFP_KERNEL); if (!ltp) goto free_mem_out; @@ -1933,6 +1991,7 @@ static int init_dev(struct tty_driver *d o_tty->driver = driver->other; o_tty->index = idx; tty_line_name(driver->other, idx, o_tty->name); + o_tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { o_tp_loc = &o_tty->termios; @@ -1944,7 +2003,7 @@ static int init_dev(struct tty_driver *d if (!*o_tp_loc) { o_tp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); + ub_kmalloc(sizeof(struct termios), GFP_KERNEL); if (!o_tp) goto free_mem_out; *o_tp = driver->other->init_termios; @@ -1952,7 +2011,7 @@ static int init_dev(struct tty_driver *d if (!*o_ltp_loc) { o_ltp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); + ub_kmalloc(sizeof(struct termios), GFP_KERNEL); if (!o_ltp) goto free_mem_out; memset(o_ltp, 0, sizeof(struct termios)); @@ -1970,6 +2029,10 @@ static int init_dev(struct tty_driver *d *o_ltp_loc = o_ltp; o_tty->termios = *o_tp_loc; o_tty->termios_locked = *o_ltp_loc; +#ifdef CONFIG_VE + if (driver->other->refcount == 0) + (void)get_ve(owner); +#endif driver->other->refcount++; if (driver->subtype == PTY_TYPE_MASTER) o_tty->count++; @@ -1994,6 +2057,10 @@ static int init_dev(struct tty_driver *d *ltp_loc = ltp; tty->termios = *tp_loc; tty->termios_locked = *ltp_loc; +#ifdef CONFIG_VE + if (driver->refcount == 0) + (void)get_ve(owner); +#endif driver->refcount++; tty->count++; @@ -2114,6 +2181,8 @@ static void release_mem(struct tty_struc } o_tty->magic = 0; o_tty->driver->refcount--; + if (o_tty->driver->refcount == 0) + put_ve(o_tty->owner_env); file_list_lock(); list_del_init(&o_tty->tty_files); file_list_unlock(); @@ -2136,6 +2205,8 @@ static void release_mem(struct tty_struc tty->magic = 0; tty->driver->refcount--; + if (tty->driver->refcount == 0) + put_ve(tty->owner_env); file_list_lock(); list_del_init(&tty->tty_files); 
file_list_unlock(); @@ -2159,7 +2230,10 @@ static void release_dev(struct file * fi int idx; char buf[64]; unsigned long flags; - +#ifdef CONFIG_UNIX98_PTYS + struct idr *idr_alloced; +#endif + tty = (struct tty_struct *)filp->private_data; if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) return; @@ -2173,6 +2247,9 @@ static void release_dev(struct file * fi tty->driver->subtype == PTY_TYPE_MASTER); devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; o_tty = tty->link; +#ifdef CONFIG_UNIX98_PTYS + idr_alloced = &__ve_allocated_ptys(tty->owner_env); +#endif #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { @@ -2345,13 +2422,13 @@ static void release_dev(struct file * fi struct task_struct *p; read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { + do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); + } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); if (o_tty) - do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { + do_each_task_pid_all(o_tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; - } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); + } while_each_task_pid_all(o_tty->session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); } @@ -2425,7 +2502,7 @@ static void release_dev(struct file * fi /* Make this pty number available for reallocation */ if (devpts) { down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, idx); + idr_remove(idr_alloced, idx); up(&allocated_ptys_lock); } #endif @@ -2455,7 +2532,7 @@ static void release_dev(struct file * fi static int tty_open(struct inode * inode, struct file * filp) { - struct tty_struct *tty; + struct tty_struct *tty, *c_tty; int noctty, retval; struct tty_driver *driver; int index; @@ -2468,6 +2545,7 @@ retry_open: noctty = filp->f_flags & O_NOCTTY; index = -1; retval = 0; + c_tty = NULL; mutex_lock(&tty_mutex); @@ -2478,6 +2556,7 @@ retry_open: } driver = current->signal->tty->driver; index = current->signal->tty->index; + c_tty = current->signal->tty; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ goto got_driver; @@ -2485,6 +2564,12 @@ retry_open: #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR,0)) { extern struct tty_driver *console_driver; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_driver; index = fg_console; noctty = 1; @@ -2492,6 +2577,12 @@ retry_open: } #endif if (device == MKDEV(TTYAUX_MAJOR,1)) { +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_device(&index); if (driver) { /* Don't let /dev/console block */ @@ -2509,7 +2600,7 @@ retry_open: return -ENODEV; } got_driver: - retval = init_dev(driver, index, &tty); + retval = init_dev(driver, index, c_tty, &tty); mutex_unlock(&tty_mutex); if (retval) return retval; @@ -2590,11 +2681,11 @@ static int ptmx_open(struct inode * inod /* find a device that is not in use. 
*/ down(&allocated_ptys_lock); - if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { + if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { up(&allocated_ptys_lock); return -ENOMEM; } - idr_ret = idr_get_new(&allocated_ptys, NULL, &index); + idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); if (idr_ret < 0) { up(&allocated_ptys_lock); if (idr_ret == -EAGAIN) @@ -2602,14 +2693,14 @@ static int ptmx_open(struct inode * inod return -EIO; } if (index >= pty_limit) { - idr_remove(&allocated_ptys, index); + idr_remove(&ve_allocated_ptys, index); up(&allocated_ptys_lock); return -EIO; } up(&allocated_ptys_lock); mutex_lock(&tty_mutex); - retval = init_dev(ptm_driver, index, &tty); + retval = init_dev(ve_ptm_driver, index, NULL, &tty); mutex_unlock(&tty_mutex); if (retval) @@ -2624,7 +2715,7 @@ static int ptmx_open(struct inode * inod goto out1; check_tty_count(tty, "tty_open"); - retval = ptm_driver->open(tty, filp); + retval = ve_ptm_driver->open(tty, filp); if (!retval) return 0; out1: @@ -2632,7 +2723,7 @@ out1: return retval; out: down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, index); + idr_remove(&ve_allocated_ptys, index); up(&allocated_ptys_lock); return retval; } @@ -2822,6 +2913,8 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!ve_is_super(get_exec_env())) + return -EACCES; if (file->f_op->write == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); @@ -2910,9 +3003,9 @@ static int tiocsctty(struct tty_struct * */ read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { + do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); + } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); read_unlock(&tasklist_lock); } else return -EPERM; @@ -2946,7 +3039,7 @@ static int tiocgpgrp(struct tty_struct * */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; - return put_user(real_tty->pgrp, p); + return put_user(pid_to_vpid(real_tty->pgrp), p); } /** @@ -2980,6 +3073,9 @@ static int tiocspgrp(struct tty_struct * return -EFAULT; if (pgrp < 0) return -EINVAL; + pgrp = vpid_to_pid(pgrp); + if (pgrp < 0) + return -EPERM; if (session_of_pgrp(pgrp) != current->signal->session) return -EPERM; real_tty->pgrp = pgrp; @@ -3008,7 +3104,7 @@ static int tiocgsid(struct tty_struct *t return -ENOTTY; if (real_tty->session <= 0) return -ENOTTY; - return put_user(real_tty->session, p); + return put_user(pid_to_vpid(real_tty->session), p); } /** @@ -3330,16 +3426,16 @@ static void __do_SAK(void *arg) read_lock(&tasklist_lock); /* Kill the entire session */ - do_each_task_pid(session, PIDTYPE_SID, p) { + do_each_task_pid_all(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): p->signal->session==tty->session\n", p->pid, p->comm); send_sig(SIGKILL, p, 1); - } while_each_task_pid(session, PIDTYPE_SID, p); + } while_each_task_pid_all(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the * tty open. 
*/ - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): p->signal->session==tty->session\n", @@ -3371,7 +3467,7 @@ static void __do_SAK(void *arg) spin_unlock(&p->files->file_lock); } task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); #endif } @@ -3770,8 +3866,11 @@ int tty_register_driver(struct tty_drive if (!driver->put_char) driver->put_char = tty_default_put_char; - + + driver->owner_env = get_exec_env(); + write_lock_irq(&tty_driver_guard); list_add(&driver->tty_drivers, &tty_drivers); + write_unlock_irq(&tty_driver_guard); if ( !(driver->flags & TTY_DRIVER_DYNAMIC_DEV) ) { for(i = 0; i < driver->num; i++) @@ -3798,7 +3897,9 @@ int tty_unregister_driver(struct tty_dri unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); + write_lock_irq(&tty_driver_guard); list_del(&driver->tty_drivers); + write_unlock_irq(&tty_driver_guard); /* * Free the termios and termios_locked structures because @@ -3917,6 +4018,44 @@ static int __init tty_init(void) vty_init(); #endif + prepare_tty(); return 0; } module_init(tty_init); + +#ifdef CONFIG_UNIX98_PTYS +struct class *init_ve_tty_class(void) +{ + struct class * ve_tty_class; + struct class_device * ve_ptmx_dev_class; + + ve_tty_class = class_create(THIS_MODULE, "tty"); + if (IS_ERR(ve_tty_class)) + return ve_tty_class; + + ve_ptmx_dev_class = class_device_create(ve_tty_class, NULL, + MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); + if (IS_ERR(ve_ptmx_dev_class)) { + class_destroy(ve_tty_class); + return (struct class *)ve_ptmx_dev_class; + } + + return ve_tty_class; +} + +void fini_ve_tty_class(struct class *ve_tty_class) +{ + class_device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2)); + class_destroy(ve_tty_class); +} +#else +struct class *init_ve_tty_class(void) +{ + return NULL; +} +void fini_ve_tty_class(struct class *ve_tty_class) +{ +} +#endif +EXPORT_SYMBOL(init_ve_tty_class); +EXPORT_SYMBOL(fini_ve_tty_class); diff -uprN linux-2.6.18/drivers/char/watchdog/sc1200wdt.c linux-2.6.18.ovz/drivers/char/watchdog/sc1200wdt.c --- linux-2.6.18/drivers/char/watchdog/sc1200wdt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/char/watchdog/sc1200wdt.c 2007-06-13 06:55:06.000000000 -0400 @@ -392,7 +392,7 @@ static int __init sc1200wdt_init(void) if (io == -1) { printk(KERN_ERR PFX "io parameter must be specified\n"); ret = -EINVAL; - goto out_clean; + goto out_pnp; } #if defined CONFIG_PNP @@ -405,7 +405,7 @@ static int __init sc1200wdt_init(void) if (!request_region(io, io_len, SC1200_MODULE_NAME)) { printk(KERN_ERR PFX "Unable to register IO port %#x\n", io); ret = -EBUSY; - goto out_clean; + goto out_pnp; } ret = sc1200wdt_probe(); @@ -435,6 +435,11 @@ out_rbt: out_io: release_region(io, io_len); +out_pnp: +#if defined CONFIG_PNP + if (isapnp) + pnp_unregister_driver(&scl200wdt_pnp_driver); +#endif goto out_clean; } diff -uprN linux-2.6.18/drivers/clocksource/scx200_hrt.c linux-2.6.18.ovz/drivers/clocksource/scx200_hrt.c --- linux-2.6.18/drivers/clocksource/scx200_hrt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/clocksource/scx200_hrt.c 2007-06-13 06:55:06.000000000 -0400 @@ -63,7 +63,7 @@ static struct clocksource cs_hrt = { static int __init init_hrt_clocksource(void) { - /* Make sure scx200 has initializedd the configuration block */ + /* Make sure scx200 has initialized the configuration block */ if (!scx200_cb_present()) return -ENODEV; @@ -76,7 +76,7 
@@ static int __init init_hrt_clocksource(v } /* write timer config */ - outb(HR_TMEN | (mhz27) ? HR_TMCLKSEL : 0, + outb(HR_TMEN | (mhz27 ? HR_TMCLKSEL : 0), scx200_cb_base + SCx200_TMCNFG_OFFSET); if (mhz27) { diff -uprN linux-2.6.18/drivers/cpufreq/cpufreq_stats.c linux-2.6.18.ovz/drivers/cpufreq/cpufreq_stats.c --- linux-2.6.18/drivers/cpufreq/cpufreq_stats.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/cpufreq/cpufreq_stats.c 2007-06-13 06:55:06.000000000 -0400 @@ -350,12 +350,10 @@ __init cpufreq_stats_init(void) } register_hotcpu_notifier(&cpufreq_stat_cpu_notifier); - lock_cpu_hotplug(); for_each_online_cpu(cpu) { cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE, (void *)(long)cpu); } - unlock_cpu_hotplug(); return 0; } static void diff -uprN linux-2.6.18/drivers/i2c/chips/ds1337.c linux-2.6.18.ovz/drivers/i2c/chips/ds1337.c --- linux-2.6.18/drivers/i2c/chips/ds1337.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/i2c/chips/ds1337.c 2007-06-13 06:55:06.000000000 -0400 @@ -347,13 +347,19 @@ static void ds1337_init_client(struct i2 if ((status & 0x80) || (control & 0x80)) { /* RTC not running */ - u8 buf[16]; + u8 buf[1+16]; /* First byte is interpreted as address */ struct i2c_msg msg[1]; dev_dbg(&client->dev, "%s: RTC not running!\n", __FUNCTION__); /* Initialize all, including STATUS and CONTROL to zero */ memset(buf, 0, sizeof(buf)); + + /* Write valid values in the date/time registers */ + buf[1+DS1337_REG_DAY] = 1; + buf[1+DS1337_REG_DATE] = 1; + buf[1+DS1337_REG_MONTH] = 1; + msg[0].addr = client->addr; msg[0].flags = 0; msg[0].len = sizeof(buf); diff -uprN linux-2.6.18/drivers/ide/pci/generic.c linux-2.6.18.ovz/drivers/ide/pci/generic.c --- linux-2.6.18/drivers/ide/pci/generic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/ide/pci/generic.c 2007-06-13 06:55:06.000000000 -0400 @@ -242,13 +242,17 @@ static int __devinit generic_init_one(st (!(PCI_FUNC(dev->devfn) & 1))) goto out; - if (dev->vendor == PCI_VENDOR_ID_JMICRON && PCI_FUNC(dev->devfn) != 1) - goto out; + if (dev->vendor == PCI_VENDOR_ID_JMICRON) { + if (dev->device != PCI_DEVICE_ID_JMICRON_JMB368 && PCI_FUNC(dev->devfn) != 1) + goto out; + } - pci_read_config_word(dev, PCI_COMMAND, &command); - if (!(command & PCI_COMMAND_IO)) { - printk(KERN_INFO "Skipping disabled %s IDE controller.\n", d->name); - goto out; + if (dev->vendor != PCI_VENDOR_ID_JMICRON) { + pci_read_config_word(dev, PCI_COMMAND, &command); + if (!(command & PCI_COMMAND_IO)) { + printk(KERN_INFO "Skipping disabled %s IDE controller.\n", d->name); + goto out; + } } ret = ide_setup_pci_device(dev, d); out: diff -uprN linux-2.6.18/drivers/ide/ppc/pmac.c linux-2.6.18.ovz/drivers/ide/ppc/pmac.c --- linux-2.6.18/drivers/ide/ppc/pmac.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/ide/ppc/pmac.c 2007-06-13 06:55:06.000000000 -0400 @@ -1326,7 +1326,7 @@ pmac_ide_macio_attach(struct macio_dev * if (macio_irq_count(mdev) == 0) { printk(KERN_WARNING "ide%d: no intrs for device %s, using 13\n", i, mdev->ofdev.node->full_name); - irq = 13; + irq = irq_create_mapping(NULL, 13); } else irq = macio_irq(mdev, 0); diff -uprN linux-2.6.18/drivers/ieee1394/ohci1394.c linux-2.6.18.ovz/drivers/ieee1394/ohci1394.c --- linux-2.6.18/drivers/ieee1394/ohci1394.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/ieee1394/ohci1394.c 2007-06-13 06:55:06.000000000 -0400 @@ -3218,6 +3218,19 @@ static int __devinit ohci1394_pci_probe( struct ti_ohci *ohci; /* shortcut to 
currently handled device */ resource_size_t ohci_base; +#ifdef CONFIG_PPC_PMAC + /* Necessary on some machines if ohci1394 was loaded/ unloaded before */ + if (machine_is(powermac)) { + struct device_node *of_node = pci_device_to_OF_node(dev); + + if (of_node) { + pmac_call_feature(PMAC_FTR_1394_CABLE_POWER, of_node, + 0, 1); + pmac_call_feature(PMAC_FTR_1394_ENABLE, of_node, 0, 1); + } + } +#endif /* CONFIG_PPC_PMAC */ + if (pci_enable_device(dev)) FAIL(-ENXIO, "Failed to enable OHCI hardware"); pci_set_master(dev); @@ -3506,11 +3519,9 @@ static void ohci1394_pci_remove(struct p #endif #ifdef CONFIG_PPC_PMAC - /* On UniNorth, power down the cable and turn off the chip - * clock when the module is removed to save power on - * laptops. Turning it back ON is done by the arch code when - * pci_enable_device() is called */ - { + /* On UniNorth, power down the cable and turn off the chip clock + * to save power on laptops */ + if (machine_is(powermac)) { struct device_node* of_node; of_node = pci_device_to_OF_node(ohci->dev); diff -uprN linux-2.6.18/drivers/infiniband/core/mad.c linux-2.6.18.ovz/drivers/infiniband/core/mad.c --- linux-2.6.18/drivers/infiniband/core/mad.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/core/mad.c 2007-06-13 06:55:06.000000000 -0400 @@ -1750,7 +1750,7 @@ ib_find_send_mad(struct ib_mad_agent_pri */ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) - return wr; + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } /* diff -uprN linux-2.6.18/drivers/infiniband/hw/ipath/ipath_verbs.c linux-2.6.18.ovz/drivers/infiniband/hw/ipath/ipath_verbs.c --- linux-2.6.18/drivers/infiniband/hw/ipath/ipath_verbs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/hw/ipath/ipath_verbs.c 2007-06-13 06:55:06.000000000 -0400 @@ -1139,7 +1139,7 @@ static void *ipath_register_ib_device(in dev->process_mad = ipath_process_mad; snprintf(dev->node_desc, sizeof(dev->node_desc), - IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename); + IPATH_IDSTR " %s kernel_SMA", init_utsname()->nodename); ret = ib_register_device(dev); if (ret) diff -uprN linux-2.6.18/drivers/infiniband/hw/mthca/mthca_cq.c linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_cq.c --- linux-2.6.18/drivers/infiniband/hw/mthca/mthca_cq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_cq.c 2007-06-13 06:55:06.000000000 -0400 @@ -39,6 +39,8 @@ #include #include +#include + #include #include "mthca_dev.h" @@ -210,6 +212,11 @@ static inline void update_cons_index(str mthca_write64(doorbell, dev->kar + MTHCA_CQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of CQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); } } diff -uprN linux-2.6.18/drivers/infiniband/hw/mthca/mthca_mad.c linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_mad.c --- linux-2.6.18/drivers/infiniband/hw/mthca/mthca_mad.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_mad.c 2007-06-13 06:55:06.000000000 -0400 @@ -119,7 +119,7 @@ static void smp_snoop(struct ib_device * mthca_update_rate(to_mdev(ibdev), port_num); update_sm_ah(to_mdev(ibdev), port_num, - be16_to_cpu(pinfo->lid), + be16_to_cpu(pinfo->sm_lid), pinfo->neighbormtu_mastersmsl & 0xf); event.device = ibdev; diff -uprN linux-2.6.18/drivers/infiniband/hw/mthca/mthca_qp.c linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_qp.c --- 
linux-2.6.18/drivers/infiniband/hw/mthca/mthca_qp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_qp.c 2007-06-13 06:55:06.000000000 -0400 @@ -39,6 +39,8 @@ #include #include +#include + #include #include #include @@ -1730,6 +1732,11 @@ out: mthca_write64(doorbell, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); } qp->sq.next_ind = ind; @@ -1849,6 +1856,12 @@ out: qp->rq.next_ind = ind; qp->rq.head += nreq; + /* + * Make sure doorbells don't leak out of RQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } @@ -2110,6 +2123,12 @@ out: MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } + /* + * Make sure doorbells don't leak out of SQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } diff -uprN linux-2.6.18/drivers/infiniband/hw/mthca/mthca_srq.c linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_srq.c --- linux-2.6.18/drivers/infiniband/hw/mthca/mthca_srq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/hw/mthca/mthca_srq.c 2007-06-13 06:55:06.000000000 -0400 @@ -35,6 +35,8 @@ #include #include +#include + #include "mthca_dev.h" #include "mthca_cmd.h" #include "mthca_memfree.h" @@ -593,6 +595,12 @@ int mthca_tavor_post_srq_recv(struct ib_ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } + /* + * Make sure doorbells don't leak out of SRQ spinlock and + * reach the HCA out of order: + */ + mmiowb(); + spin_unlock_irqrestore(&srq->lock, flags); return err; } diff -uprN linux-2.6.18/drivers/infiniband/ulp/ipoib/ipoib_ib.c linux-2.6.18.ovz/drivers/infiniband/ulp/ipoib/ipoib_ib.c --- linux-2.6.18/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-06-13 06:55:06.000000000 -0400 @@ -619,8 +619,10 @@ void ipoib_ib_dev_flush(void *_dev) * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up */ - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { ipoib_ib_dev_up(dev); + ipoib_mcast_restart_task(dev); + } mutex_lock(&priv->vlan_mutex); diff -uprN linux-2.6.18/drivers/infiniband/ulp/srp/ib_srp.c linux-2.6.18.ovz/drivers/infiniband/ulp/srp/ib_srp.c --- linux-2.6.18/drivers/infiniband/ulp/srp/ib_srp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/ulp/srp/ib_srp.c 2007-06-13 06:55:06.000000000 -0400 @@ -1851,7 +1851,7 @@ static void srp_add_one(struct ib_device */ srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); srp_dev->fmr_page_size = 1 << srp_dev->fmr_page_shift; - srp_dev->fmr_page_mask = ~((unsigned long) srp_dev->fmr_page_size - 1); + srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); INIT_LIST_HEAD(&srp_dev->dev_list); diff -uprN linux-2.6.18/drivers/infiniband/ulp/srp/ib_srp.h linux-2.6.18.ovz/drivers/infiniband/ulp/srp/ib_srp.h --- linux-2.6.18/drivers/infiniband/ulp/srp/ib_srp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/infiniband/ulp/srp/ib_srp.h 2007-06-13 06:55:06.000000000 -0400 @@ -87,7 +87,7 @@ struct srp_device { struct ib_fmr_pool *fmr_pool; int fmr_page_shift; int fmr_page_size; - unsigned long fmr_page_mask; + u64 fmr_page_mask; }; 
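
[Editor's note: the mthca_cq, mthca_qp and mthca_srq hunks in this stretch all add the same thing — an mmiowb() between a doorbell write and the release of the spinlock that serialized it. Without it, on platforms with posted MMIO writes (e.g. sn2), doorbells issued by two CPUs that took the lock in turn can still reach the HCA out of order. Below is a minimal sketch of the pattern, not the mthca code itself; the device structure, register offset and helper name are all hypothetical. Needs <linux/spinlock.h> and <asm/io.h>.]

	/* Hypothetical driver state; names are illustrative only. */
	struct mydev {
		spinlock_t doorbell_lock;
		void __iomem *regs;
	};

	#define MYDEV_DOORBELL	0x10	/* illustrative register offset */

	static void mydev_ring_doorbell(struct mydev *dev, u32 val)
	{
		unsigned long flags;

		spin_lock_irqsave(&dev->doorbell_lock, flags);
		writel(val, dev->regs + MYDEV_DOORBELL);
		/*
		 * Order the MMIO write with the unlock below, so that writes
		 * from successive lock holders arrive at the device in lock
		 * acquisition order.
		 */
		mmiowb();
		spin_unlock_irqrestore(&dev->doorbell_lock, flags);
	}
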
struct srp_host { diff -uprN linux-2.6.18/drivers/input/mouse/psmouse-base.c linux-2.6.18.ovz/drivers/input/mouse/psmouse-base.c --- linux-2.6.18/drivers/input/mouse/psmouse-base.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/input/mouse/psmouse-base.c 2007-06-13 06:55:06.000000000 -0400 @@ -1332,20 +1332,22 @@ ssize_t psmouse_attr_set_helper(struct d static ssize_t psmouse_show_int_attr(struct psmouse *psmouse, void *offset, char *buf) { - unsigned long *field = (unsigned long *)((char *)psmouse + (size_t)offset); + unsigned int *field = (unsigned int *)((char *)psmouse + (size_t)offset); - return sprintf(buf, "%lu\n", *field); + return sprintf(buf, "%u\n", *field); } static ssize_t psmouse_set_int_attr(struct psmouse *psmouse, void *offset, const char *buf, size_t count) { - unsigned long *field = (unsigned long *)((char *)psmouse + (size_t)offset); + unsigned int *field = (unsigned int *)((char *)psmouse + (size_t)offset); unsigned long value; char *rest; value = simple_strtoul(buf, &rest, 10); if (*rest) return -EINVAL; + if ((unsigned int)value != value) + return -EINVAL; *field = value; diff -uprN linux-2.6.18/drivers/isdn/capi/capidrv.c linux-2.6.18.ovz/drivers/isdn/capi/capidrv.c --- linux-2.6.18/drivers/isdn/capi/capidrv.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/capi/capidrv.c 2007-06-13 06:55:06.000000000 -0400 @@ -1907,7 +1907,8 @@ static int if_readstat(u8 __user *buf, i } for (p=buf, count=0; count < len; p++, count++) { - put_user(*card->q931_read++, p); + if (put_user(*card->q931_read++, p)) + return -EFAULT; if (card->q931_read > card->q931_end) card->q931_read = card->q931_buf; } diff -uprN linux-2.6.18/drivers/isdn/hisax/config.c linux-2.6.18.ovz/drivers/isdn/hisax/config.c --- linux-2.6.18/drivers/isdn/hisax/config.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/hisax/config.c 2007-06-13 06:55:06.000000000 -0400 @@ -631,7 +631,8 @@ static int HiSax_readstatus(u_char __use count = cs->status_end - cs->status_read + 1; if (count >= len) count = len; - copy_to_user(p, cs->status_read, count); + if (copy_to_user(p, cs->status_read, count)) + return -EFAULT; cs->status_read += count; if (cs->status_read > cs->status_end) cs->status_read = cs->status_buf; @@ -642,7 +643,8 @@ static int HiSax_readstatus(u_char __use cnt = HISAX_STATUS_BUFSIZE; else cnt = count; - copy_to_user(p, cs->status_read, cnt); + if (copy_to_user(p, cs->status_read, cnt)) + return -EFAULT; p += cnt; cs->status_read += cnt % HISAX_STATUS_BUFSIZE; count -= cnt; diff -uprN linux-2.6.18/drivers/isdn/i4l/isdn_common.c linux-2.6.18.ovz/drivers/isdn/i4l/isdn_common.c --- linux-2.6.18/drivers/isdn/i4l/isdn_common.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/i4l/isdn_common.c 2007-06-13 06:55:06.000000000 -0400 @@ -1134,9 +1134,12 @@ isdn_read(struct file *file, char __user if (dev->drv[drvidx]->interface->readstat) { if (count > dev->drv[drvidx]->stavail) count = dev->drv[drvidx]->stavail; - len = dev->drv[drvidx]->interface-> - readstat(buf, count, drvidx, - isdn_minor2chan(minor)); + len = dev->drv[drvidx]->interface->readstat(buf, count, + drvidx, isdn_minor2chan(minor)); + if (len < 0) { + retval = len; + goto out; + } } else { len = 0; } diff -uprN linux-2.6.18/drivers/isdn/icn/icn.c linux-2.6.18.ovz/drivers/isdn/icn/icn.c --- linux-2.6.18/drivers/isdn/icn/icn.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/icn/icn.c 2007-06-13 06:55:06.000000000 -0400 @@ -1010,7 +1010,8 @@ 
icn_readstatus(u_char __user *buf, int l for (p = buf, count = 0; count < len; p++, count++) { if (card->msg_buf_read == card->msg_buf_write) return count; - put_user(*card->msg_buf_read++, p); + if (put_user(*card->msg_buf_read++, p)) + return -EFAULT; if (card->msg_buf_read > card->msg_buf_end) card->msg_buf_read = card->msg_buf; } diff -uprN linux-2.6.18/drivers/isdn/isdnloop/isdnloop.c linux-2.6.18.ovz/drivers/isdn/isdnloop/isdnloop.c --- linux-2.6.18/drivers/isdn/isdnloop/isdnloop.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/isdnloop/isdnloop.c 2007-06-13 06:55:06.000000000 -0400 @@ -451,7 +451,8 @@ isdnloop_readstatus(u_char __user *buf, for (p = buf, count = 0; count < len; p++, count++) { if (card->msg_buf_read == card->msg_buf_write) return count; - put_user(*card->msg_buf_read++, p); + if (put_user(*card->msg_buf_read++, p)) + return -EFAULT; if (card->msg_buf_read > card->msg_buf_end) card->msg_buf_read = card->msg_buf; } diff -uprN linux-2.6.18/drivers/isdn/pcbit/drv.c linux-2.6.18.ovz/drivers/isdn/pcbit/drv.c --- linux-2.6.18/drivers/isdn/pcbit/drv.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/isdn/pcbit/drv.c 2007-06-13 06:55:06.000000000 -0400 @@ -725,23 +725,27 @@ static int pcbit_stat(u_char __user *buf if (stat_st < stat_end) { - copy_to_user(buf, statbuf + stat_st, len); + if (copy_to_user(buf, statbuf + stat_st, len)) + return -EFAULT; stat_st += len; } else { if (len > STATBUF_LEN - stat_st) { - copy_to_user(buf, statbuf + stat_st, - STATBUF_LEN - stat_st); - copy_to_user(buf, statbuf, - len - (STATBUF_LEN - stat_st)); + if (copy_to_user(buf, statbuf + stat_st, + STATBUF_LEN - stat_st)) + return -EFAULT; + if (copy_to_user(buf, statbuf, + len - (STATBUF_LEN - stat_st))) + return -EFAULT; stat_st = len - (STATBUF_LEN - stat_st); } else { - copy_to_user(buf, statbuf + stat_st, len); + if (copy_to_user(buf, statbuf + stat_st, len)) + return -EFAULT; stat_st += len; diff -uprN linux-2.6.18/drivers/macintosh/via-pmu-backlight.c linux-2.6.18.ovz/drivers/macintosh/via-pmu-backlight.c --- linux-2.6.18/drivers/macintosh/via-pmu-backlight.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/macintosh/via-pmu-backlight.c 2007-06-13 06:55:06.000000000 -0400 @@ -16,7 +16,7 @@ #define MAX_PMU_LEVEL 0xFF static struct backlight_properties pmu_backlight_data; -static spinlock_t pmu_backlight_lock; +static DEFINE_SPINLOCK(pmu_backlight_lock); static int sleeping; static u8 bl_curve[FB_BACKLIGHT_LEVELS]; diff -uprN linux-2.6.18/drivers/md/dm-crypt.c linux-2.6.18.ovz/drivers/md/dm-crypt.c --- linux-2.6.18/drivers/md/dm-crypt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/md/dm-crypt.c 2007-06-13 06:55:06.000000000 -0400 @@ -717,13 +717,15 @@ static int crypt_endio(struct bio *bio, if (bio->bi_size) return 1; + if (!bio_flagged(bio, BIO_UPTODATE) && !error) + error = -EIO; + bio_put(bio); /* * successful reads are decrypted by the worker thread */ - if ((bio_data_dir(bio) == READ) - && bio_flagged(bio, BIO_UPTODATE)) { + if (bio_data_dir(io->bio) == READ && !error) { kcryptd_queue_io(io); return 0; } diff -uprN linux-2.6.18/drivers/md/dm-snap.c linux-2.6.18.ovz/drivers/md/dm-snap.c --- linux-2.6.18/drivers/md/dm-snap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/md/dm-snap.c 2007-06-13 06:55:06.000000000 -0400 @@ -691,6 +691,7 @@ static void pending_complete(struct pend free_exception(e); + remove_exception(&pe->e); error_snapshot_bios(pe); goto out; } diff -uprN 
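
[Editor's note: the ISDN hunks above (capidrv, hisax, isdn_common, icn, isdnloop, pcbit) all fix one recurring bug: put_user() and copy_to_user() return nonzero when the destination user page is unwritable, and the old code silently ignored that. The isdn_common hunk closes the loop by making isdn_read() propagate the negative return. A self-contained sketch of the corrected shape, with a hypothetical message buffer rather than any of the drivers' real state; needs <asm/uaccess.h>.]

	/* Copy a driver message buffer out to user space. */
	static int example_readstat(u8 __user *buf, int len, const u8 *msg)
	{
		int count;

		for (count = 0; count < len; count++) {
			if (put_user(msg[count], buf + count))
				return -EFAULT;	/* fault: tell the caller */
		}
		return count;	/* bytes copied */
	}
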
linux-2.6.18/drivers/md/md.c linux-2.6.18.ovz/drivers/md/md.c --- linux-2.6.18/drivers/md/md.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/md/md.c 2007-06-13 06:55:06.000000000 -0400 @@ -1994,6 +1994,7 @@ static mdk_rdev_t *md_import_device(dev_ kobject_init(&rdev->kobj); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->sb_events = 0; @@ -3867,6 +3868,7 @@ static int hot_add_disk(mddev_t * mddev, } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) goto abort_export; diff -uprN linux-2.6.18/drivers/md/multipath.c linux-2.6.18.ovz/drivers/md/multipath.c --- linux-2.6.18/drivers/md/multipath.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/md/multipath.c 2007-06-13 06:55:06.000000000 -0400 @@ -480,7 +480,7 @@ static int multipath_run (mddev_t *mddev mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks = conf->working_disks; + mddev->degraded = conf->raid_disks - conf->working_disks; conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); diff -uprN linux-2.6.18/drivers/md/raid10.c linux-2.6.18.ovz/drivers/md/raid10.c --- linux-2.6.18/drivers/md/raid10.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/md/raid10.c 2007-06-13 06:55:06.000000000 -0400 @@ -2042,7 +2042,7 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; if (!disk->rdev || - !test_bit(In_sync, &rdev->flags)) { + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; } diff -uprN linux-2.6.18/drivers/media/Kconfig linux-2.6.18.ovz/drivers/media/Kconfig --- linux-2.6.18/drivers/media/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/Kconfig 2007-06-13 06:55:06.000000000 -0400 @@ -54,6 +54,7 @@ config VIDEO_V4L1_COMPAT config VIDEO_V4L2 bool + depends on VIDEO_DEV default y source "drivers/media/video/Kconfig" diff -uprN linux-2.6.18/drivers/media/dvb/b2c2/flexcop-fe-tuner.c linux-2.6.18.ovz/drivers/media/dvb/b2c2/flexcop-fe-tuner.c --- linux-2.6.18/drivers/media/dvb/b2c2/flexcop-fe-tuner.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/dvb/b2c2/flexcop-fe-tuner.c 2007-06-13 06:55:06.000000000 -0400 @@ -527,7 +527,7 @@ int flexcop_frontend_init(struct flexcop /* try the air atsc 2nd generation (nxt2002) */ if ((fc->fe = nxt200x_attach(&samsung_tbmv_config, &fc->i2c_adap)) != NULL) { fc->dev_type = FC_AIR_ATSC2; - dvb_pll_attach(fc->fe, 0x61, &fc->i2c_adap, &dvb_pll_samsung_tbmv); + dvb_pll_attach(fc->fe, 0x61, NULL, &dvb_pll_samsung_tbmv); info("found the nxt2002 at i2c address: 0x%02x",samsung_tbmv_config.demod_address); } else /* try the air atsc 3nd generation (lgdt3303) */ diff -uprN linux-2.6.18/drivers/media/dvb/dvb-core/dvb_net.c linux-2.6.18.ovz/drivers/media/dvb/dvb-core/dvb_net.c --- linux-2.6.18/drivers/media/dvb/dvb-core/dvb_net.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/dvb/dvb-core/dvb_net.c 2007-06-13 06:55:06.000000000 -0400 @@ -604,7 +604,7 @@ static void dvb_net_ule( struct net_devi { &utype, sizeof utype }, { priv->ule_skb->data, priv->ule_skb->len - 4 } }; - unsigned long ule_crc = ~0L, expected_crc; + u32 ule_crc = ~0L, expected_crc; if (priv->ule_dbit) { /* Set D-bit for CRC32 verification, * if it was set originally. 
*/ @@ -617,7 +617,7 @@ static void dvb_net_ule( struct net_devi *((u8 *)priv->ule_skb->tail - 2) << 8 | *((u8 *)priv->ule_skb->tail - 1); if (ule_crc != expected_crc) { - printk(KERN_WARNING "%lu: CRC32 check FAILED: %#lx / %#lx, SNDU len %d type %#x, ts_remain %d, next 2: %x.\n", + printk(KERN_WARNING "%lu: CRC32 check FAILED: %08x / %08x, SNDU len %d type %#x, ts_remain %d, next 2: %x.\n", priv->ts_count, ule_crc, expected_crc, priv->ule_sndu_len, priv->ule_sndu_type, ts_remain, ts_remain > 2 ? *(unsigned short *)from_where : 0); #ifdef ULE_DEBUG diff -uprN linux-2.6.18/drivers/media/dvb/frontends/cx24123.c linux-2.6.18.ovz/drivers/media/dvb/frontends/cx24123.c --- linux-2.6.18/drivers/media/dvb/frontends/cx24123.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/dvb/frontends/cx24123.c 2007-06-13 06:55:06.000000000 -0400 @@ -549,8 +549,8 @@ static int cx24123_pll_calculate(struct ndiv = ( ((p->frequency * vco_div * 10) / (2 * XTAL / 1000)) / 32) & 0x1ff; adiv = ( ((p->frequency * vco_div * 10) / (2 * XTAL / 1000)) % 32) & 0x1f; - if (adiv == 0) - ndiv++; + if (adiv == 0 && ndiv > 0) + ndiv--; /* control bits 11, refdiv 11, charge pump polarity 1, charge pump current, ndiv, adiv */ state->pllarg = (3 << 19) | (3 << 17) | (1 << 16) | (pump << 14) | (ndiv << 5) | adiv; diff -uprN linux-2.6.18/drivers/media/dvb/frontends/dvb-pll.c linux-2.6.18.ovz/drivers/media/dvb/frontends/dvb-pll.c --- linux-2.6.18/drivers/media/dvb/frontends/dvb-pll.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/dvb/frontends/dvb-pll.c 2007-06-13 06:55:06.000000000 -0400 @@ -493,6 +493,9 @@ static int dvb_pll_sleep(struct dvb_fron int i; int result; + if (priv->i2c == NULL) + return -EINVAL; + for (i = 0; i < priv->pll_desc->count; i++) { if (priv->pll_desc->entries[i].limit == 0) break; diff -uprN linux-2.6.18/drivers/media/dvb/frontends/lgdt330x.c linux-2.6.18.ovz/drivers/media/dvb/frontends/lgdt330x.c --- linux-2.6.18/drivers/media/dvb/frontends/lgdt330x.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/dvb/frontends/lgdt330x.c 2007-06-13 06:55:06.000000000 -0400 @@ -435,9 +435,6 @@ static int lgdt3302_read_status(struct d /* Test signal does not exist flag */ /* as well as the AGC lock flag. */ *status |= FE_HAS_SIGNAL; - } else { - /* Without a signal all other status bits are meaningless */ - return 0; } /* @@ -500,9 +497,6 @@ static int lgdt3303_read_status(struct d /* Test input signal does not exist flag */ /* as well as the AGC lock flag. */ *status |= FE_HAS_SIGNAL; - } else { - /* Without a signal all other status bits are meaningless */ - return 0; } /* Carrier Recovery Lock Status Register */ diff -uprN linux-2.6.18/drivers/media/video/cx88/cx88-cards.c linux-2.6.18.ovz/drivers/media/video/cx88/cx88-cards.c --- linux-2.6.18/drivers/media/video/cx88/cx88-cards.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/cx88/cx88-cards.c 2007-06-13 06:55:06.000000000 -0400 @@ -1465,7 +1465,7 @@ const unsigned int cx88_idcount = ARRAY_ /* ----------------------------------------------------------------------- */ /* some leadtek specific stuff */ -static void __devinit leadtek_eeprom(struct cx88_core *core, u8 *eeprom_data) +static void leadtek_eeprom(struct cx88_core *core, u8 *eeprom_data) { /* This is just for the "Winfast 2000XP Expert" board ATM; I don't have data on * any others. 
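
[Editor's note: the dvb_net hunk above narrows ule_crc and expected_crc from unsigned long to u32 and fixes the printk format to match. The point of the type change: on 64-bit machines, `unsigned long ule_crc = ~0L` starts the accumulator at 0xffffffffffffffff, which can never compare equal to a 32-bit CRC pulled from the packet trailer. A sketch of the idea using the kernel's generic crc32_be() helper — dvb_net itself uses an iovec-based variant, so this is illustrative only.]

	#include <linux/crc32.h>

	/* Verify a big-endian CRC32 trailer; u32 keeps ~0 at 0xffffffff. */
	static int crc_ok(const unsigned char *data, size_t len, u32 expected)
	{
		u32 crc = crc32_be(~0, data, len);

		return crc == expected;	/* a 32-bit compare on every arch */
	}
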
diff -uprN linux-2.6.18/drivers/media/video/cx88/cx88-dvb.c linux-2.6.18.ovz/drivers/media/video/cx88/cx88-dvb.c --- linux-2.6.18/drivers/media/video/cx88/cx88-dvb.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/cx88/cx88-dvb.c 2007-06-13 06:55:06.000000000 -0400 @@ -576,7 +576,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x60, - &dev->core->i2c_adap, + NULL, &dvb_pll_thomson_dtt7579); break; } @@ -587,7 +587,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x60, - &dev->core->i2c_adap, + NULL, &dvb_pll_thomson_dtt7579); } #endif @@ -600,7 +600,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x61, - &dev->core->i2c_adap, + NULL, &dvb_pll_thomson_dtt7579); break; } @@ -611,7 +611,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x61, - &dev->core->i2c_adap, + NULL, &dvb_pll_thomson_dtt7579); } #endif @@ -623,7 +623,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x61, - &dev->core->i2c_adap, + NULL, &dvb_pll_lg_z201); } break; @@ -634,7 +634,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x61, - &dev->core->i2c_adap, + NULL, &dvb_pll_unknown_1); } break; @@ -757,7 +757,7 @@ static int dvb_register(struct cx8802_de &dev->core->i2c_adap); if (dev->dvb.frontend != NULL) { dvb_pll_attach(dev->dvb.frontend, 0x61, - &dev->core->i2c_adap, + NULL, &dvb_pll_tuv1236d); } break; diff -uprN linux-2.6.18/drivers/media/video/ks0127.c linux-2.6.18.ovz/drivers/media/video/ks0127.c --- linux-2.6.18/drivers/media/video/ks0127.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/ks0127.c 2007-06-13 06:55:06.000000000 -0400 @@ -712,13 +712,13 @@ static int ks0127_command(struct i2c_cli *iarg = 0; status = ks0127_read(ks, KS_STAT); if (!(status & 0x20)) /* NOVID not set */ - *iarg = (*iarg & DECODER_STATUS_GOOD); + *iarg = (*iarg | DECODER_STATUS_GOOD); if ((status & 0x01)) /* CLOCK set */ - *iarg = (*iarg & DECODER_STATUS_COLOR); + *iarg = (*iarg | DECODER_STATUS_COLOR); if ((status & 0x08)) /* PALDET set */ - *iarg = (*iarg & DECODER_STATUS_PAL); + *iarg = (*iarg | DECODER_STATUS_PAL); else - *iarg = (*iarg & DECODER_STATUS_NTSC); + *iarg = (*iarg | DECODER_STATUS_NTSC); break; //Catch any unknown command diff -uprN linux-2.6.18/drivers/media/video/msp3400-driver.c linux-2.6.18.ovz/drivers/media/video/msp3400-driver.c --- linux-2.6.18/drivers/media/video/msp3400-driver.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/msp3400-driver.c 2007-06-13 06:55:06.000000000 -0400 @@ -904,6 +904,8 @@ static int msp_attach(struct i2c_adapter state->has_virtual_dolby_surround = msp_revision == 'G' && msp_prod_lo == 1; /* Has Virtual Dolby Surround & Dolby Pro Logic: only in msp34x2 */ state->has_dolby_pro_logic = msp_revision == 'G' && msp_prod_lo == 2; + /* The msp343xG supports BTSC only and cannot do Automatic Standard Detection. 
*/ + state->force_btsc = msp_family == 3 && msp_revision == 'G' && msp_prod_hi == 3; state->opmode = opmode; if (state->opmode == OPMODE_AUTO) { diff -uprN linux-2.6.18/drivers/media/video/msp3400-driver.h linux-2.6.18.ovz/drivers/media/video/msp3400-driver.h --- linux-2.6.18/drivers/media/video/msp3400-driver.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/msp3400-driver.h 2007-06-13 06:55:06.000000000 -0400 @@ -64,6 +64,7 @@ struct msp_state { u8 has_sound_processing; u8 has_virtual_dolby_surround; u8 has_dolby_pro_logic; + u8 force_btsc; int radio; int opmode; diff -uprN linux-2.6.18/drivers/media/video/msp3400-kthreads.c linux-2.6.18.ovz/drivers/media/video/msp3400-kthreads.c --- linux-2.6.18/drivers/media/video/msp3400-kthreads.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/msp3400-kthreads.c 2007-06-13 06:55:06.000000000 -0400 @@ -960,9 +960,10 @@ int msp34xxg_thread(void *data) /* setup the chip*/ msp34xxg_reset(client); - state->std = state->radio ? 0x40 : msp_standard; - /* start autodetect */ + state->std = state->radio ? 0x40 : + (state->force_btsc && msp_standard == 1) ? 32 : msp_standard; msp_write_dem(client, 0x20, state->std); + /* start autodetect */ if (state->std != 1) goto unmute; diff -uprN linux-2.6.18/drivers/media/video/pvrusb2/Kconfig linux-2.6.18.ovz/drivers/media/video/pvrusb2/Kconfig --- linux-2.6.18/drivers/media/video/pvrusb2/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/pvrusb2/Kconfig 2007-06-13 06:55:06.000000000 -0400 @@ -25,14 +25,9 @@ config VIDEO_PVRUSB2_24XXX form "24xxx" (leading prefix of "24" followed by 3 digits). To see if you may need this option, examine the white sticker on the underside of your device. Enabling this - option will not harm support for older devices, however it - is a separate option because of the experimental nature of - this new feature. + option will not harm support for older devices. - If you are in doubt, say N. - - Note: This feature is _very_ experimental. You have been - warned. + If you are in doubt, say Y. 
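
[Editor's note: looking back two files, the ks0127 hunk is a one-character logic bug worth spelling out. The driver means to accumulate DECODER_STATUS_* flags into *iarg, which starts at 0, but used `&`: 0 & flag is always 0, so no status bit was ever reported. Setting a bit needs OR. A tiny illustration — the constant value below is made up for the sketch; the real ones live in the V4L headers.]

	#define DECODER_STATUS_GOOD	0x01	/* value illustrative */

	static unsigned int set_status(unsigned int flags)
	{
		/* wrong: flags & DECODER_STATUS_GOOD is 0 when flags is 0,
		 * so the bit is never latched */

		/* right: OR accumulates the bit into the mask */
		return flags | DECODER_STATUS_GOOD;
	}
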
config VIDEO_PVRUSB2_SYSFS bool "pvrusb2 sysfs support (EXPERIMENTAL)" diff -uprN linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-ctrl.c linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-ctrl.c --- linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-ctrl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-ctrl.c 2007-06-13 06:55:06.000000000 -0400 @@ -43,12 +43,17 @@ int pvr2_ctrl_set_mask_value(struct pvr2 if (cptr->info->type == pvr2_ctl_bitmask) { mask &= cptr->info->def.type_bitmask.valid_bits; } else if (cptr->info->type == pvr2_ctl_int) { - if (val < cptr->info->def.type_int.min_value) { - break; + int lim; + lim = cptr->info->def.type_int.min_value; + if (cptr->info->get_min_value) { + cptr->info->get_min_value(cptr,&lim); } - if (val > cptr->info->def.type_int.max_value) { - break; + if (val < lim) break; + lim = cptr->info->def.type_int.max_value; + if (cptr->info->get_max_value) { + cptr->info->get_max_value(cptr,&lim); } + if (val > lim) break; } else if (cptr->info->type == pvr2_ctl_enum) { if (val >= cptr->info->def.type_enum.count) { break; @@ -91,7 +96,9 @@ int pvr2_ctrl_get_max(struct pvr2_ctrl * int ret = 0; if (!cptr) return 0; LOCK_TAKE(cptr->hdw->big_lock); do { - if (cptr->info->type == pvr2_ctl_int) { + if (cptr->info->get_max_value) { + cptr->info->get_max_value(cptr,&ret); + } else if (cptr->info->type == pvr2_ctl_int) { ret = cptr->info->def.type_int.max_value; } } while(0); LOCK_GIVE(cptr->hdw->big_lock); @@ -105,7 +112,9 @@ int pvr2_ctrl_get_min(struct pvr2_ctrl * int ret = 0; if (!cptr) return 0; LOCK_TAKE(cptr->hdw->big_lock); do { - if (cptr->info->type == pvr2_ctl_int) { + if (cptr->info->get_min_value) { + cptr->info->get_min_value(cptr,&ret); + } else if (cptr->info->type == pvr2_ctl_int) { ret = cptr->info->def.type_int.min_value; } } while(0); LOCK_GIVE(cptr->hdw->big_lock); diff -uprN linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h --- linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-hdw-internal.h 2007-06-13 06:55:06.000000000 -0400 @@ -107,6 +107,8 @@ struct pvr2_ctl_info { /* Control's implementation */ pvr2_ctlf_get_value get_value; /* Get its value */ + pvr2_ctlf_get_value get_min_value; /* Get minimum allowed value */ + pvr2_ctlf_get_value get_max_value; /* Get maximum allowed value */ pvr2_ctlf_set_value set_value; /* Set its value */ pvr2_ctlf_val_to_sym val_to_sym; /* Custom convert value->symbol */ pvr2_ctlf_sym_to_val sym_to_val; /* Custom convert symbol->value */ diff -uprN linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-hdw.c linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-hdw.c --- linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-hdw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-hdw.c 2007-06-13 06:55:06.000000000 -0400 @@ -362,6 +362,30 @@ static int ctrl_freq_set(struct pvr2_ctr return 0; } +#ifdef CONFIG_VIDEO_PVRUSB2_24XXX +static int ctrl_hres_max_get(struct pvr2_ctrl *cptr,int *vp) +{ + /* If we're dealing with a 24xxx device, force the horizontal + maximum to be 720 no matter what, since we can't get the device + to work properly with any other value. Otherwise just return + the normal value. 
*/ + *vp = cptr->info->def.type_int.max_value; + if (cptr->hdw->hdw_type == PVR2_HDW_TYPE_24XXX) *vp = 720; + return 0; +} + +static int ctrl_hres_min_get(struct pvr2_ctrl *cptr,int *vp) +{ + /* If we're dealing with a 24xxx device, force the horizontal + minimum to be 720 no matter what, since we can't get the device + to work properly with any other value. Otherwise just return + the normal value. */ + *vp = cptr->info->def.type_int.min_value; + if (cptr->hdw->hdw_type == PVR2_HDW_TYPE_24XXX) *vp = 720; + return 0; +} +#endif + static int ctrl_cx2341x_is_dirty(struct pvr2_ctrl *cptr) { return cptr->hdw->enc_stale != 0; @@ -720,6 +744,12 @@ static const struct pvr2_ctl_info contro .default_value = 720, DEFREF(res_hor), DEFINT(320,720), +#ifdef CONFIG_VIDEO_PVRUSB2_24XXX + /* Hook in check for clamp on horizontal resolution in + order to avoid unsolved problem involving cx25840. */ + .get_max_value = ctrl_hres_max_get, + .get_min_value = ctrl_hres_min_get, +#endif },{ .desc = "Vertical capture resolution", .name = "resolution_ver", diff -uprN linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-v4l2.c linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-v4l2.c --- linux-2.6.18/drivers/media/video/pvrusb2/pvrusb2-v4l2.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/pvrusb2/pvrusb2-v4l2.c 2007-06-13 06:55:06.000000000 -0400 @@ -22,6 +22,7 @@ #include #include +#include #include "pvrusb2-context.h" #include "pvrusb2-hdw.h" #include "pvrusb2.h" @@ -31,25 +32,21 @@ #include #include +/* Mike Isely 23-Sep-2006 - This function is prototyped + * only for V4L1 but is implemented regardless of the V4L1 compatibility + * option state. V4L2 has no replacement for this and we need it. For now + * copy the prototype here so we can avoid the compiler warning. */ +extern struct video_device* video_devdata(struct file*); + struct pvr2_v4l2_dev; struct pvr2_v4l2_fh; struct pvr2_v4l2; -/* V4L no longer provide the ability to set / get a private context pointer - (i.e. video_get_drvdata / video_set_drvdata), which means we have to - concoct our own context locating mechanism. Supposedly this is intended - to simplify driver implementation. It's not clear to me how that can - possibly be true. Our solution here is to maintain a lookup table of - our context instances, indexed by the minor device number of the V4L - device. See pvr2_v4l2_open() for some implications of this approach. 
*/ -static struct pvr2_v4l2_dev *devices[256]; -static DEFINE_MUTEX(device_lock); struct pvr2_v4l2_dev { struct pvr2_v4l2 *v4lp; struct video_device *vdev; struct pvr2_context_stream *stream; - int ctxt_idx; enum pvr2_config config; }; @@ -459,18 +456,26 @@ static int pvr2_v4l2_do_ioctl(struct ino ret = 0; switch(vf->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: { + int lmin,lmax; + struct pvr2_ctrl *hcp,*vcp; int h = vf->fmt.pix.height; int w = vf->fmt.pix.width; + hcp = pvr2_hdw_get_ctrl_by_id(hdw,PVR2_CID_HRES); + vcp = pvr2_hdw_get_ctrl_by_id(hdw,PVR2_CID_VRES); - if (h < 200) { - h = 200; - } else if (h > 625) { - h = 625; + lmin = pvr2_ctrl_get_min(hcp); + lmax = pvr2_ctrl_get_max(hcp); + if (w < lmin) { + w = lmin; + } else if (w > lmax) { + w = lmax; } - if (w < 320) { - w = 320; - } else if (w > 720) { - w = 720; + lmin = pvr2_ctrl_get_min(vcp); + lmax = pvr2_ctrl_get_max(vcp); + if (h < lmin) { + h = lmin; + } else if (h > lmax) { + h = lmax; } memcpy(vf, &pvr_format[PVR_FORMAT_PIX], @@ -479,14 +484,8 @@ static int pvr2_v4l2_do_ioctl(struct ino vf->fmt.pix.height = h; if (cmd == VIDIOC_S_FMT) { - pvr2_ctrl_set_value( - pvr2_hdw_get_ctrl_by_id(hdw, - PVR2_CID_HRES), - vf->fmt.pix.width); - pvr2_ctrl_set_value( - pvr2_hdw_get_ctrl_by_id(hdw, - PVR2_CID_VRES), - vf->fmt.pix.height); + pvr2_ctrl_set_value(hcp,vf->fmt.pix.width); + pvr2_ctrl_set_value(vcp,vf->fmt.pix.height); } } break; case V4L2_BUF_TYPE_VBI_CAPTURE: @@ -703,12 +702,6 @@ static void pvr2_v4l2_dev_destroy(struct { printk(KERN_INFO "pvrusb2: unregistering device video%d [%s]\n", dip->vdev->minor,pvr2_config_get_name(dip->config)); - if (dip->ctxt_idx >= 0) { - mutex_lock(&device_lock); - devices[dip->ctxt_idx] = NULL; - dip->ctxt_idx = -1; - mutex_unlock(&device_lock); - } video_unregister_device(dip->vdev); } @@ -800,33 +793,10 @@ static int pvr2_v4l2_open(struct inode * struct pvr2_v4l2 *vp; struct pvr2_hdw *hdw; - mutex_lock(&device_lock); - /* MCI 7-Jun-2006 Even though we're just doing what amounts to an - atomic read of the device mapping array here, we still need the - mutex. The problem is that there is a tiny race possible when - we register the device. We can't update the device mapping - array until after the device has been registered, owing to the - fact that we can't know the minor device number until after the - registration succeeds. And if another thread tries to open the - device in the window of time after registration but before the - map is updated, then it will get back an erroneous null pointer - and the open will result in a spurious failure. The only way to - prevent that is to (a) be inside the mutex here before we access - the array, and (b) cover the entire registration process later - on with this same mutex. Thus if we get inside the mutex here, - then we can be assured that the registration process actually - completed correctly. This is an unhappy complication from the - use of global data in a driver that lives in a preemptible - environment. It sure would be nice if the video device itself - had a means for storing and retrieving a local context pointer. - Oh wait. It did. But now it's gone. Silly me. 
*/ - { - unsigned int midx = iminor(file->f_dentry->d_inode); - if (midx < sizeof(devices)/sizeof(devices[0])) { - dip = devices[midx]; - } + { + struct video_device *vdev = video_devdata(file); + dip = (struct pvr2_v4l2_dev *)video_get_drvdata(vdev); } - mutex_unlock(&device_lock); if (!dip) return -ENODEV; /* Should be impossible but I'm paranoid */ @@ -1066,7 +1036,7 @@ static void pvr2_v4l2_dev_init(struct pv memcpy(dip->vdev,&vdev_template,sizeof(vdev_template)); dip->vdev->release = video_device_release; - mutex_lock(&device_lock); + video_set_drvdata(dip->vdev,dip); mindevnum = -1; unit_number = pvr2_hdw_get_unit_number(vp->channel.mc_head->hdw); @@ -1081,12 +1051,6 @@ static void pvr2_v4l2_dev_init(struct pv dip->vdev->minor,pvr2_config_get_name(dip->config)); } - if ((dip->vdev->minor < sizeof(devices)/sizeof(devices[0])) && - (devices[dip->vdev->minor] == NULL)) { - dip->ctxt_idx = dip->vdev->minor; - devices[dip->ctxt_idx] = dip; - } - mutex_unlock(&device_lock); pvr2_hdw_v4l_store_minor_number(vp->channel.mc_head->hdw, dip->vdev->minor); @@ -1100,7 +1064,6 @@ struct pvr2_v4l2 *pvr2_v4l2_create(struc vp = kmalloc(sizeof(*vp),GFP_KERNEL); if (!vp) return vp; memset(vp,0,sizeof(*vp)); - vp->video_dev.ctxt_idx = -1; pvr2_channel_init(&vp->channel,mnp); pvr2_trace(PVR2_TRACE_STRUCT,"Creating pvr2_v4l2 id=%p",vp); diff -uprN linux-2.6.18/drivers/media/video/saa7134/saa7134-dvb.c linux-2.6.18.ovz/drivers/media/video/saa7134/saa7134-dvb.c --- linux-2.6.18/drivers/media/video/saa7134/saa7134-dvb.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/saa7134/saa7134-dvb.c 2007-06-13 06:55:06.000000000 -0400 @@ -1158,13 +1158,13 @@ static int dvb_init(struct saa7134_dev * case SAA7134_BOARD_AVERMEDIA_AVERTVHD_A180: dev->dvb.frontend = nxt200x_attach(&avertvhda180, &dev->i2c_adap); if (dev->dvb.frontend) { - dvb_pll_attach(dev->dvb.frontend, 0x61, &dev->i2c_adap, &dvb_pll_tdhu2); + dvb_pll_attach(dev->dvb.frontend, 0x61, NULL, &dvb_pll_tdhu2); } break; case SAA7134_BOARD_KWORLD_ATSC110: dev->dvb.frontend = nxt200x_attach(&kworldatsc110, &dev->i2c_adap); if (dev->dvb.frontend) { - dvb_pll_attach(dev->dvb.frontend, 0x61, &dev->i2c_adap, &dvb_pll_tuv1236d); + dvb_pll_attach(dev->dvb.frontend, 0x61, NULL, &dvb_pll_tuv1236d); } break; #endif diff -uprN linux-2.6.18/drivers/media/video/tuner-simple.c linux-2.6.18.ovz/drivers/media/video/tuner-simple.c --- linux-2.6.18/drivers/media/video/tuner-simple.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/tuner-simple.c 2007-06-13 06:55:06.000000000 -0400 @@ -108,6 +108,7 @@ static int tuner_stereo(struct i2c_clien case TUNER_PHILIPS_FM1216ME_MK3: case TUNER_PHILIPS_FM1236_MK3: case TUNER_PHILIPS_FM1256_IH3: + case TUNER_LG_NTSC_TAPE: stereo = ((status & TUNER_SIGNAL) == TUNER_STEREO_MK3); break; default: @@ -419,6 +420,7 @@ static void default_set_radio_freq(struc case TUNER_PHILIPS_FM1216ME_MK3: case TUNER_PHILIPS_FM1236_MK3: case TUNER_PHILIPS_FMD1216ME_MK3: + case TUNER_LG_NTSC_TAPE: buffer[3] = 0x19; break; case TUNER_TNF_5335MF: diff -uprN linux-2.6.18/drivers/media/video/tuner-types.c linux-2.6.18.ovz/drivers/media/video/tuner-types.c --- linux-2.6.18/drivers/media/video/tuner-types.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/tuner-types.c 2007-06-13 06:55:06.000000000 -0400 @@ -671,16 +671,6 @@ static struct tuner_params tuner_panason }, }; -/* ------------ TUNER_LG_NTSC_TAPE - LGINNOTEK NTSC ------------ */ - -static struct tuner_params 
tuner_lg_ntsc_tape_params[] = { - { - .type = TUNER_PARAM_TYPE_NTSC, - .ranges = tuner_fm1236_mk3_ntsc_ranges, - .count = ARRAY_SIZE(tuner_fm1236_mk3_ntsc_ranges), - }, -}; - /* ------------ TUNER_TNF_8831BGFF - Philips PAL ------------ */ static struct tuner_range tuner_tnf_8831bgff_pal_ranges[] = { @@ -1331,8 +1321,8 @@ struct tunertype tuners[] = { }, [TUNER_LG_NTSC_TAPE] = { /* LGINNOTEK NTSC */ .name = "LG NTSC (TAPE series)", - .params = tuner_lg_ntsc_tape_params, - .count = ARRAY_SIZE(tuner_lg_ntsc_tape_params), + .params = tuner_fm1236_mk3_params, + .count = ARRAY_SIZE(tuner_fm1236_mk3_params), }, [TUNER_TNF_8831BGFF] = { /* Philips PAL */ .name = "Tenna TNF 8831 BGFF)", diff -uprN linux-2.6.18/drivers/media/video/tveeprom.c linux-2.6.18.ovz/drivers/media/video/tveeprom.c --- linux-2.6.18/drivers/media/video/tveeprom.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/tveeprom.c 2007-06-13 06:55:06.000000000 -0400 @@ -184,7 +184,7 @@ hauppauge_tuner[] = { TUNER_ABSENT, "Thompson DTT757"}, /* 80-89 */ { TUNER_ABSENT, "Philips FQ1216LME MK3"}, - { TUNER_ABSENT, "LG TAPC G701D"}, + { TUNER_LG_PAL_NEW_TAPC, "LG TAPC G701D"}, { TUNER_LG_NTSC_NEW_TAPC, "LG TAPC H791F"}, { TUNER_LG_PAL_NEW_TAPC, "TCL 2002MB 3"}, { TUNER_LG_PAL_NEW_TAPC, "TCL 2002MI 3"}, diff -uprN linux-2.6.18/drivers/media/video/usbvideo/quickcam_messenger.h linux-2.6.18.ovz/drivers/media/video/usbvideo/quickcam_messenger.h --- linux-2.6.18/drivers/media/video/usbvideo/quickcam_messenger.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/usbvideo/quickcam_messenger.h 2007-06-13 06:55:06.000000000 -0400 @@ -35,27 +35,13 @@ struct rgb { }; struct bayL0 { -#ifdef __BIG_ENDIAN - u8 r; - u8 g; -#elif __LITTLE_ENDIAN u8 g; u8 r; -#else -#error not byte order defined -#endif }; struct bayL1 { -#ifdef __BIG_ENDIAN - u8 g; - u8 b; -#elif __LITTLE_ENDIAN u8 b; u8 g; -#else -#error not byte order defined -#endif }; struct cam_size { diff -uprN linux-2.6.18/drivers/media/video/video-buf.c linux-2.6.18.ovz/drivers/media/video/video-buf.c --- linux-2.6.18/drivers/media/video/video-buf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/video-buf.c 2007-06-13 06:55:06.000000000 -0400 @@ -695,6 +695,7 @@ videobuf_qbuf(struct videobuf_queue *q, goto done; } if (buf->state == STATE_QUEUED || + buf->state == STATE_PREPARED || buf->state == STATE_ACTIVE) { dprintk(1,"qbuf: buffer is already queued or active.\n"); goto done; diff -uprN linux-2.6.18/drivers/media/video/videodev.c linux-2.6.18.ovz/drivers/media/video/videodev.c --- linux-2.6.18/drivers/media/video/videodev.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/media/video/videodev.c 2007-06-13 06:55:06.000000000 -0400 @@ -739,13 +739,13 @@ static int __video_do_ioctl(struct inode case VIDIOC_DQBUF: { struct v4l2_buffer *p=arg; - if (!vfd->vidioc_qbuf) + if (!vfd->vidioc_dqbuf) break; ret = check_fmt (vfd, p->type); if (ret) break; - ret=vfd->vidioc_qbuf(file, fh, p); + ret=vfd->vidioc_dqbuf(file, fh, p); if (!ret) dbgbuf(cmd,vfd,p); break; @@ -836,7 +836,7 @@ static int __video_do_ioctl(struct inode break; } - if (index<=0 || index >= vfd->tvnormsize) { + if (index < 0 || index >= vfd->tvnormsize) { ret=-EINVAL; break; } diff -uprN linux-2.6.18/drivers/message/fusion/mptsas.c linux-2.6.18.ovz/drivers/message/fusion/mptsas.c --- linux-2.6.18/drivers/message/fusion/mptsas.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/message/fusion/mptsas.c 2007-06-13 
06:55:08.000000000 -0400 @@ -852,6 +852,10 @@ static int mptsas_get_linkerrors(struct dma_addr_t dma_handle; int error; + /* FIXME: only have link errors on local phys */ + if (!scsi_is_sas_phy_local(phy)) + return -EINVAL; + hdr.PageVersion = MPI_SASPHY1_PAGEVERSION; hdr.ExtPageLength = 0; hdr.PageNumber = 1 /* page number 1*/; @@ -924,6 +928,10 @@ static int mptsas_phy_reset(struct sas_p unsigned long timeleft; int error = -ERESTARTSYS; + /* FIXME: fusion doesn't allow non-local phy reset */ + if (!scsi_is_sas_phy_local(phy)) + return -EINVAL; + /* not implemented for expanders */ if (phy->identify.target_port_protocols & SAS_PROTOCOL_SMP) return -ENXIO; @@ -1570,9 +1578,6 @@ static int mptsas_probe_one_phy(struct d if (!phy_info->phy) { - if (local) - phy->local_attached = 1; - error = sas_phy_add(phy); if (error) { sas_phy_free(phy); diff -uprN linux-2.6.18/drivers/net/Makefile linux-2.6.18.ovz/drivers/net/Makefile --- linux-2.6.18/drivers/net/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/Makefile 2007-06-13 06:55:06.000000000 -0400 @@ -21,6 +21,12 @@ gianfar_driver-objs := gianfar.o \ obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o ucc_geth_driver-objs := ucc_geth.o ucc_geth_phy.o +obj-$(CONFIG_VE_NETDEV) += vznetdev.o +vznetdev-objs := open_vznet.o venet_core.o + +obj-$(CONFIG_VE_ETHDEV) += vzethdev.o +vzethdev-objs := veth.o + # # link order important here # diff -uprN linux-2.6.18/drivers/net/bonding/bond_main.c linux-2.6.18.ovz/drivers/net/bonding/bond_main.c --- linux-2.6.18/drivers/net/bonding/bond_main.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/bonding/bond_main.c 2007-06-13 06:55:06.000000000 -0400 @@ -3547,7 +3547,7 @@ static int bond_do_ioctl(struct net_devi mii->val_out = 0; read_lock_bh(&bond->lock); read_lock(&bond->curr_slave_lock); - if (bond->curr_active_slave) { + if (netif_carrier_ok(bond->dev)) { mii->val_out = BMSR_LSTATUS; } read_unlock(&bond->curr_slave_lock); diff -uprN linux-2.6.18/drivers/net/e1000/e1000_main.c linux-2.6.18.ovz/drivers/net/e1000/e1000_main.c --- linux-2.6.18/drivers/net/e1000/e1000_main.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/e1000/e1000_main.c 2007-06-13 06:55:06.000000000 -0400 @@ -439,14 +439,15 @@ e1000_get_hw_control(struct e1000_adapte } } -int -e1000_up(struct e1000_adapter *adapter) +/** + * e1000_configure - configure the hardware for RX and TX + * @adapter = private board structure + **/ +static void e1000_configure(struct e1000_adapter *adapter) { struct net_device *netdev = adapter->netdev; int i; - /* hardware has been reset, we need to reload some things */ - e1000_set_multi(netdev); e1000_restore_vlan(adapter); @@ -464,11 +465,17 @@ e1000_up(struct e1000_adapter *adapter) } adapter->tx_queue_len = netdev->tx_queue_len; +} + +int e1000_up(struct e1000_adapter *adapter) +{ + /* hardware has been reset, we need to reload some things */ + e1000_configure(adapter); mod_timer(&adapter->watchdog_timer, jiffies); #ifdef CONFIG_E1000_NAPI - netif_poll_enable(netdev); + netif_poll_enable(adapter->netdev); #endif e1000_irq_enable(adapter); @@ -527,15 +534,15 @@ e1000_down(struct e1000_adapter *adapter { struct net_device *netdev = adapter->netdev; +#ifdef CONFIG_E1000_NAPI + netif_poll_disable(netdev); +#endif e1000_irq_disable(adapter); del_timer_sync(&adapter->tx_fifo_stall_timer); del_timer_sync(&adapter->watchdog_timer); del_timer_sync(&adapter->phy_info_timer); -#ifdef CONFIG_E1000_NAPI - netif_poll_disable(netdev); -#endif netdev->tx_queue_len = 
adapter->tx_queue_len; adapter->link_speed = 0; adapter->link_duplex = 0; @@ -1197,23 +1204,17 @@ e1000_open(struct net_device *netdev) return -EBUSY; /* allocate transmit descriptors */ - - if ((err = e1000_setup_all_tx_resources(adapter))) + err = e1000_setup_all_tx_resources(adapter); + if (err) goto err_setup_tx; /* allocate receive descriptors */ - - if ((err = e1000_setup_all_rx_resources(adapter))) - goto err_setup_rx; - - err = e1000_request_irq(adapter); + err = e1000_setup_all_rx_resources(adapter); if (err) - goto err_up; + goto err_setup_rx; e1000_power_up_phy(adapter); - if ((err = e1000_up(adapter))) - goto err_up; adapter->mng_vlan_id = E1000_MNG_VLAN_NONE; if ((adapter->hw.mng_cookie.status & E1000_MNG_DHCP_COOKIE_STATUS_VLAN_SUPPORT)) { @@ -1226,9 +1227,30 @@ e1000_open(struct net_device *netdev) e1000_check_mng_mode(&adapter->hw)) e1000_get_hw_control(adapter); + /* before we allocate an interrupt, we must be ready to handle it. + * Setting DEBUG_SHIRQ in the kernel makes it fire an interrupt + * as soon as we call pci_request_irq, so we have to setup our + * clean_rx handler before we do so. */ + e1000_configure(adapter); + + err = e1000_request_irq(adapter); + if (err) + goto err_req_irq; + +#ifdef CONFIG_E1000_NAPI + netif_poll_enable(netdev); +#endif + + e1000_irq_enable(adapter); + + /* fire a link status change interrupt to start the watchdog */ + E1000_WRITE_REG(&adapter->hw, ICS, E1000_ICS_LSC); + return E1000_SUCCESS; -err_up: +err_req_irq: + e1000_release_hw_control(adapter); + e1000_power_down_phy(adapter); e1000_free_all_rx_resources(adapter); err_setup_rx: e1000_free_all_tx_resources(adapter); @@ -4683,6 +4705,9 @@ e1000_suspend(struct pci_dev *pdev, pm_m if (adapter->hw.phy_type == e1000_phy_igp_3) e1000_phy_powerdown_workaround(&adapter->hw); + if (netif_running(netdev)) + e1000_free_irq(adapter); + /* Release control of h/w to f/w. If f/w is AMT enabled, this * would have already happened in close and is redundant. 
*/ e1000_release_hw_control(adapter); @@ -4710,6 +4735,10 @@ e1000_resume(struct pci_dev *pdev) pci_enable_wake(pdev, PCI_D3hot, 0); pci_enable_wake(pdev, PCI_D3cold, 0); + if (netif_running(netdev) && (ret_val = e1000_request_irq(adapter))) + return ret_val; + + e1000_power_up_phy(adapter); e1000_reset(adapter); E1000_WRITE_REG(&adapter->hw, WUS, ~0); diff -uprN linux-2.6.18/drivers/net/forcedeth.c linux-2.6.18.ovz/drivers/net/forcedeth.c --- linux-2.6.18/drivers/net/forcedeth.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/forcedeth.c 2007-06-13 06:55:06.000000000 -0400 @@ -2692,11 +2692,13 @@ static int nv_request_irq(struct net_dev } if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) { if ((ret = pci_enable_msi(np->pci_dev)) == 0) { + pci_intx(np->pci_dev, 0); np->msi_flags |= NV_MSI_ENABLED; if ((!intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq, IRQF_SHARED, dev->name, dev) != 0) || (intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq_test, IRQF_SHARED, dev->name, dev) != 0)) { printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret); pci_disable_msi(np->pci_dev); + pci_intx(np->pci_dev, 1); np->msi_flags &= ~NV_MSI_ENABLED; goto out_err; } @@ -2739,6 +2741,7 @@ static void nv_free_irq(struct net_devic free_irq(np->pci_dev->irq, dev); if (np->msi_flags & NV_MSI_ENABLED) { pci_disable_msi(np->pci_dev); + pci_intx(np->pci_dev, 1); np->msi_flags &= ~NV_MSI_ENABLED; } } diff -uprN linux-2.6.18/drivers/net/loopback.c linux-2.6.18.ovz/drivers/net/loopback.c --- linux-2.6.18/drivers/net/loopback.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/loopback.c 2007-06-13 06:55:07.000000000 -0400 @@ -59,6 +59,13 @@ #include static DEFINE_PER_CPU(struct net_device_stats, loopback_stats); +#ifdef CONFIG_VE +#define LOOPBACK_STATS(cpu) ((ve_is_super(get_exec_env())) ? 
\ + &per_cpu(loopback_stats, cpu) : \ + per_cpu_ptr(get_exec_env()->_loopback_stats, cpu)) +#else +#define LOOPBACK_STATS(cpu) &per_cpu(loopback_stats, cpu) +#endif #define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16) @@ -130,6 +137,11 @@ static int loopback_xmit(struct sk_buff { struct net_device_stats *lb_stats; + if (unlikely(get_exec_env()->disable_net)) { + kfree_skb(skb); + return 0; + } + skb_orphan(skb); skb->protocol = eth_type_trans(skb,dev); @@ -149,7 +161,7 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - lb_stats = &per_cpu(loopback_stats, get_cpu()); + lb_stats = LOOPBACK_STATS(get_cpu()); lb_stats->rx_bytes += skb->len; lb_stats->tx_bytes = lb_stats->rx_bytes; lb_stats->rx_packets++; @@ -175,7 +187,7 @@ static struct net_device_stats *get_stat for_each_possible_cpu(i) { struct net_device_stats *lb_stats; - lb_stats = &per_cpu(loopback_stats, i); + lb_stats = LOOPBACK_STATS(i); stats->rx_bytes += lb_stats->rx_bytes; stats->tx_bytes += lb_stats->tx_bytes; stats->rx_packets += lb_stats->rx_packets; @@ -196,6 +208,34 @@ static struct ethtool_ops loopback_ethto .set_tso = ethtool_op_set_tso, }; +static void loopback_destructor(struct net_device *dev) +{ + kfree(dev->priv); + dev->priv = NULL; +} + +struct net_device templ_loopback_dev = { + .name = "lo", + .mtu = (16 * 1024) + 20 + 20 + 12, + .hard_start_xmit = loopback_xmit, + .hard_header = eth_header, + .hard_header_cache = eth_header_cache, + .header_cache_update = eth_header_cache_update, + .hard_header_len = ETH_HLEN, /* 14 */ + .addr_len = ETH_ALEN, /* 6 */ + .tx_queue_len = 0, + .type = ARPHRD_LOOPBACK, /* 0x0001*/ + .rebuild_header = eth_rebuild_header, + .flags = IFF_LOOPBACK, + .features = NETIF_F_SG|NETIF_F_FRAGLIST + |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA + |NETIF_F_LLTX|NETIF_F_VIRTUAL, +}; + +#ifdef loopback_dev +#undef loopback_dev +#endif + struct net_device loopback_dev = { .name = "lo", .mtu = (16 * 1024) + 20 + 20 + 12, @@ -229,9 +269,13 @@ int __init loopback_init(void) memset(stats, 0, sizeof(struct net_device_stats)); loopback_dev.priv = stats; loopback_dev.get_stats = &get_stats; + loopback_dev.destructor = &loopback_destructor; } - +#ifdef CONFIG_VE + get_ve0()->_loopback_dev = &loopback_dev; +#endif return register_netdev(&loopback_dev); }; EXPORT_SYMBOL(loopback_dev); +EXPORT_SYMBOL(templ_loopback_dev); diff -uprN linux-2.6.18/drivers/net/lp486e.c linux-2.6.18.ovz/drivers/net/lp486e.c --- linux-2.6.18/drivers/net/lp486e.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/lp486e.c 2007-06-13 06:55:07.000000000 -0400 @@ -442,16 +442,16 @@ init_rx_bufs(struct net_device *dev, int if (rbd) { rbd->pad = 0; rbd->count = 0; - rbd->skb = dev_alloc_skb(RX_SKB_SIZE); + rbd->skb = dev_alloc_skb(RX_SKBSIZE); if (!rbd->skb) { printk("dev_alloc_skb failed"); } rbd->next = rfd->rbd; if (i) { rfd->rbd->prev = rbd; - rbd->size = RX_SKB_SIZE; + rbd->size = RX_SKBSIZE; } else { - rbd->size = (RX_SKB_SIZE | RBD_EL); + rbd->size = (RX_SKBSIZE | RBD_EL); lp->rbd_tail = rbd; } diff -uprN linux-2.6.18/drivers/net/mv643xx_eth.c linux-2.6.18.ovz/drivers/net/mv643xx_eth.c --- linux-2.6.18/drivers/net/mv643xx_eth.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/mv643xx_eth.c 2007-06-13 06:55:07.000000000 -0400 @@ -385,7 +385,7 @@ static int mv643xx_eth_receive_queue(str struct pkt_info pkt_info; while (budget-- > 0 && eth_port_receive(mp, &pkt_info) == ETH_OK) { - dma_unmap_single(NULL, pkt_info.buf_ptr, RX_SKB_SIZE, + dma_unmap_single(NULL, pkt_info.buf_ptr, 
ETH_RX_SKB_SIZE, DMA_FROM_DEVICE); mp->rx_desc_count--; received_packets++; diff -uprN linux-2.6.18/drivers/net/open_vznet.c linux-2.6.18.ovz/drivers/net/open_vznet.c --- linux-2.6.18/drivers/net/open_vznet.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/net/open_vznet.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,244 @@ +/* + * open_vznet.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Virtual Networking device used to change VE ownership on packets + */ + +#include +#include +#include + +#include +#include +#include +#include + +void veip_stop(struct ve_struct *ve) +{ + struct list_head *p, *tmp; + + write_lock_irq(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_safe(p, tmp, &ve->veip->ip_lh) { + struct ip_entry_struct *ptr; + ptr = list_entry(p, struct ip_entry_struct, ve_list); + ptr->active_env = NULL; + list_del(&ptr->ve_list); + list_del(&ptr->ip_hash); + kfree(ptr); + } + veip_put(ve->veip); + ve->veip = NULL; + if (!ve_is_super(ve)) + module_put(THIS_MODULE); +unlock: + write_unlock_irq(&veip_hash_lock); +} + +int veip_start(struct ve_struct *ve) +{ + int err, get; + + err = 0; + write_lock_irq(&veip_hash_lock); + get = ve->veip == NULL; + ve->veip = veip_findcreate(ve->veid); + if (ve->veip == NULL) + err = -ENOMEM; + write_unlock_irq(&veip_hash_lock); + if (err == 0 && get && !ve_is_super(ve)) + __module_get(THIS_MODULE); + return err; +} + +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry, *found; + int err; + + entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + + if (ve->veip == NULL) { + /* This can happen if we load venet AFTER ve was started */ + err = veip_start(ve); + if (err < 0) + goto out; + } + + write_lock_irq(&veip_hash_lock); + err = -EADDRINUSE; + found = venet_entry_lookup(addr); + if (found != NULL) + goto out_unlock; + + entry->active_env = ve; + entry->addr = *addr; + ip_entry_hash(entry, ve->veip); + + err = 0; + entry = NULL; +out_unlock: + write_unlock_irq(&veip_hash_lock); +out: + if (entry != NULL) + kfree(entry); + return err; +} + +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *found; + int err; + + err = -EADDRNOTAVAIL; + write_lock_irq(&veip_hash_lock); + found = venet_entry_lookup(addr); + if (found == NULL) + goto out; + if (found->active_env->veid != veid) + goto out; + + err = 0; + found->active_env = NULL; + + list_del(&found->ip_hash); + list_del(&found->ve_list); + kfree(found); +out: + write_unlock_irq(&veip_hash_lock); + return err; +} + +static int skb_extract_addr(struct sk_buff *skb, + struct ve_addr_struct *addr, int dir) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + addr->family = AF_INET; + addr->key[0] = 0; + addr->key[1] = 0; + addr->key[2] = 0; + addr->key[3] = (dir ? skb->nh.iph->daddr : skb->nh.iph->saddr); + return 0; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case __constant_htons(ETH_P_IPV6): + addr->family = AF_INET6; + memcpy(&addr->key, dir ? 
+ skb->nh.ipv6h->daddr.s6_addr32 : + skb->nh.ipv6h->saddr.s6_addr32, + sizeof(addr->key)); + return 0; +#endif + } + + return -EAFNOSUPPORT; +} + +static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir) +{ + struct ip_entry_struct *entry; + struct ve_addr_struct addr; + + if (skb_extract_addr(skb, &addr, dir) < 0) + return NULL; + + entry = venet_entry_lookup(&addr); + if (entry == NULL) + return NULL; + + return entry->active_env; +} + +int venet_change_skb_owner(struct sk_buff *skb) +{ + struct ve_struct *ve, *ve_old; + + ve_old = skb->owner_env; + + read_lock(&veip_hash_lock); + if (!ve_is_super(ve_old)) { + /* from VE to host */ + ve = venet_find_ve(skb, 0); + if (ve == NULL) + goto out_drop; + if (!ve_accessible_strict(ve, ve_old)) + goto out_source; + skb->owner_env = get_ve0(); + } else { + /* from host to VE */ + ve = venet_find_ve(skb, 1); + if (ve == NULL) + goto out_drop; + skb->owner_env = ve; + } + read_unlock(&veip_hash_lock); + + return 0; + +out_drop: + read_unlock(&veip_hash_lock); + return -ESRCH; + +out_source: + read_unlock(&veip_hash_lock); + if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) { + printk(KERN_WARNING "Dropped packet, source wrong " + "veid=%u src-IP=%u.%u.%u.%u " + "dst-IP=%u.%u.%u.%u\n", + skb->owner_env->veid, + NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(skb->nh.iph->daddr)); + } + return -EACCES; +} + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct ip_entry_struct *entry; + char s[40]; + + p = (struct list_head *)v; + if (p == ip_entry_hash_table) { + seq_puts(m, "Version: 2.5\n"); + return 0; + } + entry = list_entry(p, struct ip_entry_struct, ip_hash); + veaddr_print(s, sizeof(s), &entry->addr); + seq_printf(m, "%39s %10u\n", s, 0); + return 0; +} +#endif + +__exit void veip_cleanup(void) +{ + int i; + + write_lock_irq(&veip_hash_lock); + for (i = 0; i < VEIP_HASH_SZ; i++) + while (!list_empty(ip_entry_hash_table + i)) { + struct ip_entry_struct *entry; + + entry = list_first_entry(ip_entry_hash_table + i, + struct ip_entry_struct, ip_hash); + list_del(&entry->ip_hash); + kfree(entry); + } + write_unlock_irq(&veip_hash_lock); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); +MODULE_LICENSE("GPL v2"); diff -uprN linux-2.6.18/drivers/net/sky2.c linux-2.6.18.ovz/drivers/net/sky2.c --- linux-2.6.18/drivers/net/sky2.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/sky2.c 2007-06-13 06:55:07.000000000 -0400 @@ -106,6 +106,7 @@ static const struct pci_device_id sky2_i { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b00) }, /* DGE-560T */ + { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4001) }, /* DGE-550SX */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4340) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4341) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4342) }, @@ -117,10 +118,17 @@ static const struct pci_device_id sky2_i { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4350) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4353) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) }, { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4363) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4364) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4365) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4366) }, + { 
PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4367) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4368) }, + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4369) }, { 0 } }; @@ -670,7 +678,7 @@ static void sky2_mac_init(struct sky2_hw sky2_write16(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_OPER_ON); if (hw->chip_id == CHIP_ID_YUKON_EC_U) { - sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 768/8); + sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 512/8); sky2_write8(hw, SK_REG(port, RX_GMF_UP_THR), 1024/8); if (hw->dev[port]->mtu > ETH_DATA_LEN) { /* set Tx GMAC FIFO Almost Empty Threshold */ @@ -682,16 +690,10 @@ static void sky2_mac_init(struct sky2_hw } -/* Assign Ram Buffer allocation. - * start and end are in units of 4k bytes - * ram registers are in units of 64bit words - */ -static void sky2_ramset(struct sky2_hw *hw, u16 q, u8 startk, u8 endk) +/* Assign Ram Buffer allocation in units of 64bit (8 bytes) */ +static void sky2_ramset(struct sky2_hw *hw, u16 q, u32 start, u32 end) { - u32 start, end; - - start = startk * 4096/8; - end = (endk * 4096/8) - 1; + pr_debug(PFX "q %d %#x %#x\n", q, start, end); sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_RST_CLR); sky2_write32(hw, RB_ADDR(q, RB_START), start); @@ -700,7 +702,7 @@ static void sky2_ramset(struct sky2_hw * sky2_write32(hw, RB_ADDR(q, RB_RP), start); if (q == Q_R1 || q == Q_R2) { - u32 space = (endk - startk) * 4096/8; + u32 space = end - start + 1; u32 tp = space - space/4; /* On receive queue's set the thresholds @@ -1082,19 +1084,16 @@ static int sky2_up(struct net_device *de sky2_mac_init(hw, port); - /* Determine available ram buffer space (in 4K blocks). - * Note: not sure about the FE setting below yet - */ - if (hw->chip_id == CHIP_ID_YUKON_FE) - ramsize = 4; - else - ramsize = sky2_read8(hw, B2_E_0); + /* Determine available ram buffer space in qwords. */ + ramsize = sky2_read8(hw, B2_E_0) * 4096/8; - /* Give transmitter one third (rounded up) */ - rxspace = ramsize - (ramsize + 2) / 3; + if (ramsize > 6*1024/8) + rxspace = ramsize - (ramsize + 2) / 3; + else + rxspace = ramsize / 2; - sky2_ramset(hw, rxqaddr[port], 0, rxspace); - sky2_ramset(hw, txqaddr[port], rxspace, ramsize); + sky2_ramset(hw, rxqaddr[port], 0, rxspace-1); + sky2_ramset(hw, txqaddr[port], rxspace, ramsize-1); /* Make sure SyncQ is disabled */ sky2_write8(hw, RB_ADDR(port == 0 ? 
Q_XS1 : Q_XS2, RB_CTRL), @@ -1421,6 +1420,11 @@ static int sky2_down(struct net_device * /* Stop more packets from being queued */ netif_stop_queue(dev); + /* Disable port IRQ */ + imask = sky2_read32(hw, B0_IMSK); + imask &= ~portirq_msk[port]; + sky2_write32(hw, B0_IMSK, imask); + sky2_phy_reset(hw, port); /* Stop transmitter */ @@ -1464,11 +1468,6 @@ static int sky2_down(struct net_device * sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET); sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET); - /* Disable port IRQ */ - imask = sky2_read32(hw, B0_IMSK); - imask &= ~portirq_msk[port]; - sky2_write32(hw, B0_IMSK, imask); - /* turn off LED's */ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF); @@ -1679,13 +1678,13 @@ static void sky2_phy_intr(struct sky2_hw struct sky2_port *sky2 = netdev_priv(dev); u16 istatus, phystat; + if (!netif_running(dev)) + return; + spin_lock(&sky2->phy_lock); istatus = gm_phy_read(hw, port, PHY_MARV_INT_STAT); phystat = gm_phy_read(hw, port, PHY_MARV_PHY_STAT); - if (!netif_running(dev)) - goto out; - if (netif_msg_intr(sky2)) printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n", sky2->netdev->name, istatus, phystat); @@ -2737,6 +2736,14 @@ static int sky2_set_mac_address(struct n return 0; } +static void inline sky2_add_filter(u8 filter[8], const u8 *addr) +{ + u32 bit; + + bit = ether_crc(ETH_ALEN, addr) & 63; + filter[bit >> 3] |= 1 << (bit & 7); +} + static void sky2_set_multicast(struct net_device *dev) { struct sky2_port *sky2 = netdev_priv(dev); @@ -2745,6 +2752,7 @@ static void sky2_set_multicast(struct ne struct dev_mc_list *list = dev->mc_list; u16 reg; u8 filter[8]; + static const u8 pause_mc_addr[ETH_ALEN] = { 0x1, 0x80, 0xc2, 0x0, 0x0, 0x1 }; memset(filter, 0, sizeof(filter)); @@ -2755,16 +2763,17 @@ static void sky2_set_multicast(struct ne reg &= ~(GM_RXCR_UCF_ENA | GM_RXCR_MCF_ENA); else if ((dev->flags & IFF_ALLMULTI) || dev->mc_count > 16) /* all multicast */ memset(filter, 0xff, sizeof(filter)); - else if (dev->mc_count == 0) /* no multicast */ + else if (dev->mc_count == 0 && !sky2->rx_pause) reg &= ~GM_RXCR_MCF_ENA; else { int i; reg |= GM_RXCR_MCF_ENA; - for (i = 0; list && i < dev->mc_count; i++, list = list->next) { - u32 bit = ether_crc(ETH_ALEN, list->dmi_addr) & 0x3f; - filter[bit / 8] |= 1 << (bit % 8); - } + if (sky2->rx_pause) + sky2_add_filter(filter, pause_mc_addr); + + for (i = 0; list && i < dev->mc_count; i++, list = list->next) + sky2_add_filter(filter, list->dmi_addr); } gma_write16(hw, port, GM_MC_ADDR_H1, @@ -3200,6 +3209,8 @@ static int __devinit sky2_test_msi(struc struct pci_dev *pdev = hw->pdev; int err; + init_waitqueue_head (&hw->msi_wait); + sky2_write32(hw, B0_IMSK, Y2_IS_IRQ_SW); err = request_irq(pdev->irq, sky2_test_intr, IRQF_SHARED, DRV_NAME, hw); @@ -3209,18 +3220,15 @@ static int __devinit sky2_test_msi(struc return err; } - init_waitqueue_head (&hw->msi_wait); - sky2_write8(hw, B0_CTST, CS_ST_SW_IRQ); - wmb(); + sky2_read8(hw, B0_CTST); wait_event_timeout(hw->msi_wait, hw->msi_detected, HZ/10); if (!hw->msi_detected) { /* MSI test failed, go back to INTx mode */ - printk(KERN_WARNING PFX "%s: No interrupt was generated using MSI, " - "switching to INTx mode. 
Please report this failure to " - "the PCI maintainer and include system chipset information.\n", + printk(KERN_INFO PFX "%s: No interrupt generated using MSI, " + "switching to INTx mode.\n", pci_name(pdev)); err = -EOPNOTSUPP; @@ -3228,6 +3236,7 @@ static int __devinit sky2_test_msi(struc } sky2_write32(hw, B0_IMSK, 0); + sky2_read32(hw, B0_IMSK); free_irq(pdev->irq, hw); diff -uprN linux-2.6.18/drivers/net/sky2.h linux-2.6.18.ovz/drivers/net/sky2.h --- linux-2.6.18/drivers/net/sky2.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/sky2.h 2007-06-13 06:55:07.000000000 -0400 @@ -1566,7 +1566,7 @@ enum { GMR_FS_ANY_ERR = GMR_FS_RX_FF_OV | GMR_FS_CRC_ERR | GMR_FS_FRAGMENT | GMR_FS_LONG_ERR | - GMR_FS_MII_ERR | GMR_FS_BAD_FC | GMR_FS_GOOD_FC | + GMR_FS_MII_ERR | GMR_FS_GOOD_FC | GMR_FS_BAD_FC | GMR_FS_UN_SIZE | GMR_FS_JABBER, }; diff -uprN linux-2.6.18/drivers/net/sunhme.c linux-2.6.18.ovz/drivers/net/sunhme.c --- linux-2.6.18/drivers/net/sunhme.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/sunhme.c 2007-06-13 06:55:07.000000000 -0400 @@ -3012,6 +3012,11 @@ static int __devinit happy_meal_pci_prob #endif err = -ENODEV; + + if (pci_enable_device(pdev)) + goto err_out; + pci_set_master(pdev); + if (!strcmp(prom_name, "SUNW,qfe") || !strcmp(prom_name, "qfe")) { qp = quattro_pci_find(pdev); if (qp == NULL) diff -uprN linux-2.6.18/drivers/net/tg3.c linux-2.6.18.ovz/drivers/net/tg3.c --- linux-2.6.18/drivers/net/tg3.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/tg3.c 2007-06-13 06:55:07.000000000 -0400 @@ -6889,8 +6889,10 @@ static int tg3_open(struct net_device *d tg3_full_lock(tp, 0); err = tg3_set_power_state(tp, PCI_D0); - if (err) + if (err) { + tg3_full_unlock(tp); return err; + } tg3_disable_ints(tp); tp->tg3_flags &= ~TG3_FLAG_INIT_COMPLETE; diff -uprN linux-2.6.18/drivers/net/tun.c linux-2.6.18.ovz/drivers/net/tun.c --- linux-2.6.18/drivers/net/tun.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/tun.c 2007-06-13 06:55:07.000000000 -0400 @@ -61,6 +61,7 @@ #include #include +#include #ifdef TUN_DEBUG static int debug; @@ -89,6 +90,9 @@ static int tun_net_close(struct net_devi static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); +#if 0 + struct user_beancounter *ub; +#endif DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len); @@ -113,6 +117,24 @@ static int tun_net_xmit(struct sk_buff * } } + /* + * XXX this code is broken: + * See comment in dev_queue_xmit + */ +#if 0 + ub = netdev_bc(dev)->exec_ub; + if (ub && (skb_bc(skb)->charged == 0)) { + unsigned long charge; + charge = skb_charge_fullsize(skb); + if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1)) + goto drop; + get_beancounter(ub); + skb_bc(skb)->ub = ub; + skb_bc(skb)->charged = charge; + skb_bc(skb)->resource = UB_OTHERSOCKBUF; + } +#endif + /* Queue packet */ skb_queue_tail(&tun->readq, skb); dev->trans_start = jiffies; @@ -409,12 +431,14 @@ static ssize_t tun_chr_readv(struct file tun->dev->name, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); ret = tun_put_user(tun, skb, (struct iovec *) iv, len); + /* skb will be uncharged in kfree_skb() */ kfree_skb(skb); break; } else { DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n", tun->dev->name, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + /* skb will be uncharged in kfree_skb() */ kfree_skb(skb); continue; } @@ -450,6 +474,7 @@ static void tun_setup(struct net_device 
dev->get_stats = tun_net_stats; dev->ethtool_ops = &tun_ethtool_ops; dev->destructor = free_netdev; + dev->features |= NETIF_F_VIRTUAL; } static struct tun_struct *tun_get_by_name(const char *name) @@ -458,8 +483,9 @@ static struct tun_struct *tun_get_by_nam ASSERT_RTNL(); list_for_each_entry(tun, &tun_dev_list, list) { - if (!strncmp(tun->dev->name, name, IFNAMSIZ)) - return tun; + if (ve_accessible_strict(tun->dev->owner_env, get_exec_env()) && + !strncmp(tun->dev->name, name, IFNAMSIZ)) + return tun; } return NULL; @@ -478,7 +504,8 @@ static int tun_set_iff(struct file *file /* Check permissions */ if (tun->owner != -1 && - current->euid != tun->owner && !capable(CAP_NET_ADMIN)) + current->euid != tun->owner && + !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; } else if (__dev_get_by_name(ifr->ifr_name)) @@ -489,7 +516,7 @@ static int tun_set_iff(struct file *file err = -EINVAL; - if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; /* Set dev type */ @@ -603,6 +630,9 @@ static int tun_chr_ioctl(struct inode *i break; case TUNSETPERSIST: + /* prohibit persist mode inside VE */ + if (!ve_is_super(get_exec_env())) + return -EPERM; /* Disable/Enable persist mode */ if (arg) tun->flags |= TUN_PERSIST; diff -uprN linux-2.6.18/drivers/net/venet_core.c linux-2.6.18.ovz/drivers/net/venet_core.c --- linux-2.6.18/drivers/net/venet_core.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/net/venet_core.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,773 @@ +/* + * venet_core.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Common part for Virtuozzo virtual network devices + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure.

*/ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include + +struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; +rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(veip_lh); + +struct venet_stats { + struct net_device_stats stats; + struct net_device_stats *real_stats; +}; + +static inline struct net_device_stats * +venet_stats(struct net_device *dev, int cpu) +{ + struct venet_stats *stats; + stats = (struct venet_stats*)dev->priv; + return per_cpu_ptr(stats->real_stats, cpu); +} + + +#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) + +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) +{ + list_add(&entry->ip_hash, + ip_entry_hash_table + + ip_entry_hash_function(entry->addr.key[3])); + list_add(&entry->ve_list, &veip->ip_lh); +} + +void veip_put(struct veip_struct *veip) +{ + if (!list_empty(&veip->ip_lh)) + return; + if (!list_empty(&veip->src_lh)) + return; + if (!list_empty(&veip->dst_lh)) + return; + + list_del(&veip->list); + kfree(veip); +} + +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry; + + list_for_each_entry (entry, ip_entry_hash_table + + ip_entry_hash_function(addr->key[3]), ip_hash) + if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) + return entry; + return NULL; +} + +struct veip_struct *veip_find(envid_t veid) +{ + struct veip_struct *ptr; + + list_for_each_entry(ptr, &veip_lh, list) { + if (ptr->veid != veid) + continue; + return ptr; + } + return NULL; +} + +struct veip_struct *veip_findcreate(envid_t veid) +{ + struct veip_struct *ptr; + + ptr = veip_find(veid); + if (ptr != NULL) + return ptr; + + ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); + if (ptr == NULL) + return NULL; + memset(ptr, 0, sizeof(struct veip_struct)); + INIT_LIST_HEAD(&ptr->ip_lh); + INIT_LIST_HEAD(&ptr->src_lh); + INIT_LIST_HEAD(&ptr->dst_lh); + ptr->veid = veid; + list_add(&ptr->list, &veip_lh); + return ptr; +} + +static int convert_sockaddr(struct sockaddr *addr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in)) + break; + + err = 0; + sin = (struct sockaddr_in *)addr; + veaddr->family = AF_INET; + veaddr->key[0] = 0; + veaddr->key[1] = 0; + veaddr->key[2] = 0; + veaddr->key[3] = sin->sin_addr.s_addr; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in6)) + break; + + err = 0; + sin = (struct sockaddr_in6 *)addr; + veaddr->family = AF_INET6; + memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key)); + break; + } + default: + err = -EAFNOSUPPORT; + } + return err; +} + +int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + char addr[MAX_SOCK_ADDR]; + + err = move_addr_to_kernel(uaddr, addrlen, &addr); + if (err < 0) + goto out; + + err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr); +out: + return err; +} + +void veaddr_print(char *str, int len, struct ve_addr_struct *a) +{ + if (a->family == AF_INET) + snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3])); + else + snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x", + ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF, + ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF, + ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF, + ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF + ); +} + +/* + * Device functions + */ + +static int 
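Both address families share one hash table here: convert_sockaddr() zero-fills key[0..2] for IPv4 and stores the 32-bit address in key[3], while IPv6 fills all four words, so ip_entry_hash_function() can always hash key[3]. The mask form also implies VEIP_HASH_SZ is a power of two. A minimal sketch of the bucket computation; the table size and sample address below are illustrative, not taken from this patch:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define VEIP_HASH_SZ 256	/* assumption: a power of two, as the mask requires */

/* Mirrors ip_entry_hash_function(): hash the host-order low word. */
static unsigned bucket_of(uint32_t key3_be)
{
	return ntohl(key3_be) & (VEIP_HASH_SZ - 1);
}

int main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "10.0.0.77", &a);	/* illustrative address */
	/* For IPv4, key[3] holds sin_addr.s_addr and key[0..2] are zero;
	 * for IPv6, key[3] is the last 32 bits of sin6_addr. */
	printf("bucket %u of %u\n", bucket_of(a.s_addr), VEIP_HASH_SZ);
	return 0;
}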
venet_open(struct net_device *dev) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + return 0; +} + +static int venet_close(struct net_device *master) +{ + module_put(THIS_MODULE); + return 0; +} + +static void venet_destructor(struct net_device *dev) +{ + struct venet_stats *stats = (struct venet_stats *)dev->priv; + if (stats == NULL) + return; + free_percpu(stats->real_stats); + kfree(stats); + dev->priv = NULL; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). + */ +static int venet_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + int length; + + stats = venet_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + /* + * Optimise so buffers with skb->free=1 are not copied but + * instead are lobbed from tx queue to rx queue + */ + if (atomic_read(&skb->users) != 1) { + struct sk_buff *skb2 = skb; + skb = skb_clone(skb, GFP_ATOMIC); /* Clone the buffer */ + if (skb == NULL) { + kfree_skb(skb2); + goto out; + } + kfree_skb(skb2); + } else + skb_orphan(skb); + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + struct iphdr *iph; + iph = skb->nh.iph; + if (MULTICAST(iph->daddr)) + goto outf; + } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h; + ip6h = skb->nh.ipv6h; + if (ipv6_addr_is_multicast(&ip6h->daddr)) + goto outf; + } else { + goto outf; + } + + if (venet_change_skb_owner(skb) < 0) + goto outf; + + if (unlikely(skb->owner_env->disable_net)) + goto outf; + + rcv = skb->owner_env->_venet_dev; + if (!rcv) + /* VE going down */ + goto outf; + + dev_hold(rcv); + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + dev_put(rcv); + goto outf; + } + + skb->pkt_type = PACKET_HOST; + skb->dev = rcv; + + skb->mac.raw = skb->data; + memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); + + dst_release(skb->dst); + skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + + rcv_stats = venet_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + dev_put(rcv); + } + + return 0; + +outf: + kfree_skb(skb); + ++stats->tx_dropped; +out: + return 0; +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct venet_stats *stats; + + stats = (struct venet_stats *)dev->priv; + memset(&stats->stats, 0, sizeof(struct net_device_stats)); + for (i=0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = venet_stats(dev, i); + stats->stats.rx_bytes += dev_stats->rx_bytes; + stats->stats.tx_bytes += dev_stats->tx_bytes; + stats->stats.rx_packets += dev_stats->rx_packets; + stats->stats.tx_packets += dev_stats->tx_packets; + } + + return &stats->stats; +} + +/* Initialize the rest of the LOOPBACK device. 
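venet_xmit() and get_stats() above use the classic lockless per-CPU counter split: each transmit path bumps only the slot returned by per_cpu_ptr(stats->real_stats, smp_processor_id()), and the infrequent reader sums the slots of every possible CPU. A user-space analogue of that pattern; the fixed array stands in for alloc_percpu() storage and the counts are made up:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4	/* illustrative */

struct net_device_stats { uint64_t rx_bytes, tx_bytes, rx_packets, tx_packets; };

/* Writers touch only their own slot (no locking needed); the reader
 * sums all slots on demand, exactly as get_stats() walks per_cpu_ptr()
 * for each possible CPU. */
static struct net_device_stats percpu[NR_CPUS];

static struct net_device_stats sum_stats(void)
{
	struct net_device_stats total = { 0 };
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		total.rx_bytes   += percpu[i].rx_bytes;
		total.tx_bytes   += percpu[i].tx_bytes;
		total.rx_packets += percpu[i].rx_packets;
		total.tx_packets += percpu[i].tx_packets;
	}
	return total;
}

int main(void)
{
	percpu[0].tx_packets = 3;	/* e.g. venet_xmit() ran on CPU 0 */
	percpu[2].tx_packets = 5;	/* ... and on CPU 2 */
	printf("tx_packets=%llu\n",
	       (unsigned long long)sum_stats().tx_packets);
	return 0;
}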
*/ +int venet_init_dev(struct net_device *dev) +{ + struct venet_stats *stats; + + dev->hard_start_xmit = venet_xmit; + stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL); + if (stats == NULL) + goto fail; + stats->real_stats = alloc_percpu(struct net_device_stats); + if (stats->real_stats == NULL) + goto fail_free; + dev->priv = stats; + + dev->get_stats = get_stats; + dev->open = venet_open; + dev->stop = venet_close; + dev->destructor = venet_destructor; + + /* + * Fill in the generic fields of the device structure. + */ + dev->type = ARPHRD_VOID; + dev->hard_header_len = ETH_HLEN; + dev->mtu = 1500; /* eth_mtu */ + dev->tx_queue_len = 0; + + memset(dev->broadcast, 0xFF, ETH_ALEN); + + /* New-style flags. */ + dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; + return 0; + +fail_free: + kfree(stats); +fail: + return -ENOMEM; +} + +static void venet_setup(struct net_device *dev) +{ + dev->init = venet_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will do our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX; +} + +#ifdef CONFIG_PROC_FS +static int veinfo_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve; + struct ip_entry_struct *entry; + + ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); + + seq_printf(m, "%10u %5u %5u", ve->veid, + ve->class_id, atomic_read(&ve->pcounter)); + read_lock(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) { + char addr[40]; + + if (entry->active_env == NULL) + continue; + + veaddr_print(addr, sizeof(addr), &entry->addr); + seq_printf(m, " %39s", addr); + } +unlock: + read_unlock(&veip_hash_lock); + seq_putc(m, '\n'); + return 0; +} + +static void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct *curve; + struct list_head *entry; + loff_t l; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + l = *pos; + list_for_each(entry, &ve_list_head) { + if (l == 0) + return entry; + l--; + } + return NULL; +} + +static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *entry; + + entry = (struct list_head *)v; + if (!ve_is_super(get_exec_env())) + return NULL; + (*pos)++; + return entry->next == &ve_list_head ?
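Given the format strings in veinfo_seq_show(), each /proc/vz/veinfo record is a fixed-width triple of VE id, class id, and process counter, followed by one 39-column field per mapped address. A sketch that reproduces the layout with made-up values:

#include <stdio.h>

/* Reproduces the /proc/vz/veinfo record layout emitted by
 * veinfo_seq_show(); all values here are illustrative. */
int main(void)
{
	printf("%10u %5u %5u", 101u, 2u, 14u);	/* veid, class_id, pcounter */
	printf(" %39s", "10.0.0.77");		/* one field per VE address */
	putchar('\n');
	return 0;
}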
NULL : entry->next; +} + +static void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} + + +static struct seq_operations veinfo_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = veinfo_seq_show, +}; + +static int veinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veinfo_seq_op); +} + +static struct file_operations proc_veinfo_operations = { + .open = veinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *veip_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + int i; + + l = *pos; + write_lock_irq(&veip_hash_lock); + if (l == 0) + return ip_entry_hash_table; + for (i = 0; i < VEIP_HASH_SZ; i++) { + list_for_each(p, ip_entry_hash_table + i) { + if (--l == 0) + return p; + } + } + return NULL; +} + +static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + while (1) { + p = p->next; + if (p < ip_entry_hash_table || + p >= ip_entry_hash_table + VEIP_HASH_SZ) { + (*pos)++; + return p; + } + if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) + return NULL; + } + return NULL; +} + +static void veip_seq_stop(struct seq_file *m, void *v) +{ + write_unlock_irq(&veip_hash_lock); +} + +static struct seq_operations veip_seq_op = { + .start = veip_seq_start, + .next = veip_seq_next, + .stop = veip_seq_stop, + .show = veip_seq_show, +}; + +static int veip_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veip_seq_op); +} + +static struct file_operations proc_veip_operations = { + .open = veip_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr, + int addrlen) +{ + int err; + struct ve_struct *ve; + struct ve_addr_struct addr; + + err = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + err = sockaddr_to_veaddr(uaddr, addrlen, &addr); + if (err < 0) + goto out; + + switch (op) + { + case VE_IP_ADD: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veip_entry_add(ve, &addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_IP_DEL: + err = veip_entry_del(veid, &addr); + break; + default: + err = -EINVAL; + } + +out: + return err; +} + +int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VENETCTL_VE_IP_MAP: { + struct vzctl_ve_ip_map s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); + break; + } + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch(cmd) { + case VENETCTL_COMPAT_VE_IP_MAP: { + struct compat_vzctl_ve_ip_map cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr), + cs.addrlen); + break; + } + default: + err = venet_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo venetcalls = { + .type = VENETCTLTYPE, + .ioctl = venet_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_venet_ioctl, +#endif + .owner = THIS_MODULE, +}; + +int venet_dev_start(struct ve_struct *env) +{ + struct net_device *dev_venet; + int err; + + dev_venet = alloc_netdev(0, 
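real_ve_ip_map() is reached from user space through the VENETCTL_VE_IP_MAP ioctl dispatched by venet_ioctl(). A hypothetical caller might look as follows; the /dev/vzctl node and the header providing struct vzctl_ve_ip_map and VE_IP_ADD are assumptions based on the usual OpenVZ tooling, not something this patch shows:

#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vzcalluser.h>	/* assumption: OpenVZ header with
				 * VENETCTL_VE_IP_MAP, vzctl_ve_ip_map, VE_IP_ADD */

/* Hypothetical sketch: grant 10.0.0.77 to VE 101 via the ioctl path
 * handled by real_ve_ip_map() above. Needs CAP_SETVEID. */
int main(void)
{
	struct sockaddr_in sin;
	struct vzctl_ve_ip_map m;
	int fd = open("/dev/vzctl", O_RDWR);	/* assumed control node */

	if (fd < 0) { perror("open"); return 1; }
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	inet_pton(AF_INET, "10.0.0.77", &sin.sin_addr);
	m.veid = 101;				/* illustrative VE id */
	m.op = VE_IP_ADD;
	m.addr = (struct sockaddr *)&sin;
	m.addrlen = sizeof(sin);
	if (ioctl(fd, VENETCTL_VE_IP_MAP, &m) < 0)
		perror("VENETCTL_VE_IP_MAP");
	close(fd);
	return 0;
}

As the checks above show, the call fails with -EPERM without CAP_SETVEID and with -ESRCH when the target VE cannot be found.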
"venet%d", venet_setup); + if (!dev_venet) + return -ENOMEM; + err = dev_alloc_name(dev_venet, dev_venet->name); + if (err<0) + goto err; + if ((err = register_netdev(dev_venet)) != 0) + goto err; + env->_venet_dev = dev_venet; + return 0; +err: + free_netdev(dev_venet); + printk(KERN_ERR "VENET initialization error err=%d\n", err); + return err; +} + +static int venet_start(void *data) +{ + struct ve_struct *env; + int err; + + env = (struct ve_struct *)data; + if (env->veip) + return -EEXIST; + + err = veip_start(env); + if (err != 0) + return err; + + err = venet_dev_start(env); + if (err) + goto err_free; + return 0; + +err_free: + veip_stop(env); + return err; +} + +static void venet_stop(void *data) +{ + struct ve_struct *env; + struct net_device *dev; + + env = (struct ve_struct *)data; + veip_stop(env); + + dev = env->_venet_dev; + if (dev == NULL) + return; + + unregister_netdev(dev); + env->_venet_dev = NULL; + free_netdev(dev); +} + +static struct ve_hook venet_ve_hook = { + .init = venet_start, + .fini = venet_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int venet_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; +#endif + int i, err; + + if (get_ve0()->_venet_dev != NULL) + return -EEXIST; + + for (i = 0; i < VEIP_HASH_SZ; i++) + INIT_LIST_HEAD(ip_entry_hash_table + i); + + err = venet_start(get_ve0()); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + de = create_proc_glob_entry_mod("vz/veinfo", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_veinfo_operations; + else + printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); + + de = create_proc_entry_mod("vz/veip", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_veip_operations; + else + printk(KERN_WARNING "venet: can't make veip proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); + vzioctl_register(&venetcalls); + return 0; +} + +__exit void venet_exit(void) +{ + struct net_device *dev_venet; + + vzioctl_unregister(&venetcalls); + ve_hook_unregister(&venet_ve_hook); +#ifdef CONFIG_PROC_FS + remove_proc_entry("vz/veip", NULL); + remove_proc_entry("vz/veinfo", NULL); +#endif + + dev_venet = get_ve0()->_venet_dev; + if (dev_venet != NULL) { + get_ve0()->_venet_dev = NULL; + unregister_netdev(dev_venet); + free_netdev(dev_venet); + } + veip_stop(get_ve0()); + + veip_cleanup(); +} + +module_init(venet_init); +module_exit(venet_exit); diff -uprN linux-2.6.18/drivers/net/veth.c linux-2.6.18.ovz/drivers/net/veth.c --- linux-2.6.18/drivers/net/veth.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/net/veth.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,691 @@ +/* + * veth.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Virtual ethernet device used to change VE ownership on packets + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. 
*/ +#include /* For ARPHRD_ETHER */ +#include +#include +#include + +#include +#include +#include +#include + +struct veth_struct +{ + struct net_device_stats stats; + struct net_device *pair; + struct list_head hwaddr_list; + struct net_device_stats *real_stats; + int allow_mac_change; +}; + +struct list_head veth_hwaddr_list; +rwlock_t ve_hwaddr_lock = RW_LOCK_UNLOCKED; +DECLARE_MUTEX(hwaddr_sem); + +#define veth_from_netdev(dev) \ + ((struct veth_struct *)(netdev_priv(dev))) +#define veth_to_netdev(veth) \ + ((struct net_device*)((char*)veth - \ + (unsigned long)netdev_priv(NULL))) + +static inline struct net_device_stats * +veth_stats(struct net_device *dev, int cpuid) +{ + return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid); +} + +struct net_device * veth_dev_start(char *dev_addr, char *name); + +struct veth_struct *hwaddr_entry_lookup(char *name) +{ + struct veth_struct *entry; + struct list_head *tmp; + + list_for_each(tmp, &veth_hwaddr_list) { + entry = list_entry(tmp, struct veth_struct, hwaddr_list); + BUG_ON(entry->pair == NULL); + if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) + return entry; + } + return NULL; +} + +int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name, + char *dev_addr_ve, char *name_ve) +{ + struct net_device *dev_ve; + struct net_device *dev_ve0; + struct ve_struct *old_env; + char dev_name[IFNAMSIZ]; + int err; + + down(&hwaddr_sem); + + if (name[0] == '\0') + snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); + else { + memcpy(dev_name, name, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve0 = veth_dev_start(dev_addr, dev_name); + if (IS_ERR(dev_ve0)) { + err = PTR_ERR(dev_ve0); + goto err; + } + + old_env = set_exec_env(ve); + if (name_ve[0] == '\0') + sprintf(dev_name, "eth%%d"); + else { + memcpy(dev_name, name_ve, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve = veth_dev_start(dev_addr_ve, dev_name); + if (IS_ERR(dev_ve)) { + err = PTR_ERR(dev_ve); + goto err_ve; + } + set_exec_env(old_env); + veth_from_netdev(dev_ve)->pair = dev_ve0; + veth_from_netdev(dev_ve0)->pair = dev_ve; + + write_lock(&ve_hwaddr_lock); + list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + up(&hwaddr_sem); + return 0; + +err_ve: + set_exec_env(old_env); + unregister_netdev(dev_ve0); +err: + up(&hwaddr_sem); + return err; +} + +void veth_pair_del(struct ve_struct *env, struct veth_struct *entry) +{ + struct net_device *dev; + struct ve_struct *old_env; + + write_lock(&ve_hwaddr_lock); + list_del(&entry->hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + dev = entry->pair; + BUG_ON(entry->pair == NULL); + + veth_from_netdev(dev)->pair = NULL; + entry->pair = NULL; + rtnl_lock(); + old_env = set_exec_env(dev->owner_env); + dev_close(dev); + + /* + * Now device from VE0 does not send or receive anything, + * i.e. dev->hard_start_xmit won't be called. 
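The name handling in veth_entry_add() is easy to misread: the template "vz%d.%%d" passes through snprintf() twice, so the first %d is consumed immediately with the VE id, while the escaped %%d survives as a literal "%d" for dev_alloc_name() to fill with the first free index. A small sketch; the final index 0 is an assumption about what dev_alloc_name() would pick on an otherwise empty system:

#include <stdio.h>

#define IFNAMSIZ 16

int main(void)
{
	char tmpl[IFNAMSIZ], name[IFNAMSIZ];
	int veid = 101;	/* illustrative VE id */

	snprintf(tmpl, sizeof(tmpl), "vz%d.%%d", veid);	/* as veth_entry_add() */
	snprintf(name, sizeof(name), "vz%d.%d", veid, 0); /* as dev_alloc_name() */
	printf("template \"%s\" -> host device \"%s\" (VE side default: \"eth0\")\n",
	       tmpl, name);
	return 0;
}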
+ */ + set_exec_env(env); + unregister_netdevice(veth_to_netdev(entry)); + set_exec_env(dev->owner_env); + unregister_netdevice(dev); + set_exec_env(old_env); + rtnl_unlock(); +} + +int veth_entry_del(struct ve_struct *ve, char *name) +{ + struct veth_struct *found; + int err; + + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out; + if (veth_to_netdev(found)->owner_env != ve) + goto out; + + err = 0; + veth_pair_del(ve, found); + +out: + up(&hwaddr_sem); + return err; +} + +int veth_allow_change_mac(envid_t veid, char *name, int allow) +{ + struct ve_struct *ve; + struct veth_struct *found; + int err; + + err = -ESRCH; + ve = get_ve_by_id(veid); + if (!ve) + return err; + + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_ve; + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out_sem; + if (veth_to_netdev(found)->owner_env != ve) + goto out_sem; + + err = 0; + found->allow_mac_change = allow; + +out_sem: + up(&hwaddr_sem); +out_ve: + up_read(&ve->op_sem); + put_ve(ve); + return err; +} + +/* + * Device functions + */ + +static int veth_open(struct net_device *dev) +{ + return 0; +} + +static int veth_close(struct net_device *master) +{ + return 0; +} + +static void veth_destructor(struct net_device *dev) +{ + free_percpu(veth_from_netdev(dev)->real_stats); + free_netdev(dev); +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct net_device_stats *stats; + + stats = &veth_from_netdev(dev)->stats; + memset(stats, 0, sizeof(struct net_device_stats)); + for (i=0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = veth_stats(dev, i); + stats->rx_bytes += dev_stats->rx_bytes; + stats->tx_bytes += dev_stats->tx_bytes; + stats->rx_packets += dev_stats->rx_packets; + stats->tx_packets += dev_stats->tx_packets; + } + + return stats; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). 
+ */ +static int veth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + struct veth_struct *entry; + int length; + + stats = veth_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + skb_orphan(skb); + + entry = veth_from_netdev(dev); + rcv = entry->pair; + if (!rcv) + /* VE going down */ + goto outf; + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + goto outf; + } + + if (unlikely(rcv->owner_env->disable_net)) + goto outf; + /* Filtering */ + if (ve_is_super(dev->owner_env) && + !veth_from_netdev(rcv)->allow_mac_change) { + /* from VE0 to VEX */ + if (ve_is_super(rcv->owner_env)) + goto out; + if (is_multicast_ether_addr( + ((struct ethhdr *)skb->data)->h_dest)) + goto out; + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, + rcv->dev_addr)) + goto outf; + } else if (!ve_is_super(dev->owner_env) && + !entry->allow_mac_change) { + /* from VE to VE0 */ + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source, + dev->dev_addr)) + goto outf; + } + +out: + skb->owner_env = rcv->owner_env; + + skb->dev = rcv; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, rcv); + + dst_release(skb->dst); + skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + rcv_stats = veth_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + } + + return 0; + +outf: + kfree_skb(skb); + stats->tx_dropped++; + return 0; +} + +static int veth_set_mac(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!ve_is_super(dev->owner_env) && + !veth_from_netdev(dev)->allow_mac_change) + return -EPERM; + if (netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + + return 0; +} + +int veth_init_dev(struct net_device *dev) +{ + dev->hard_start_xmit = veth_xmit; + dev->get_stats = get_stats; + dev->open = veth_open; + dev->stop = veth_close; + dev->destructor = veth_destructor; + + ether_setup(dev); + dev->set_mac_address = veth_set_mac; + + /* remove the change_mtu handler set by ether_setup() */ + dev->change_mtu = NULL; + + dev->tx_queue_len = 0; + + veth_from_netdev(dev)->real_stats = + alloc_percpu(struct net_device_stats); + if (veth_from_netdev(dev)->real_stats == NULL) + return -ENOMEM; + + return 0; +} + +static void veth_setup(struct net_device *dev) +{ + dev->init = veth_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will do our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX; +} + +#ifdef CONFIG_PROC_FS +#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5] +static int vehwaddr_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct veth_struct *entry; + + p = (struct list_head *)v; + if (p == &veth_hwaddr_list) { + seq_puts(m, "Version: 1.0\n"); + return 0; + } + entry = list_entry(p, struct veth_struct, hwaddr_list); + seq_printf(m, ADDR_FMT " %16s ", + ADDR_ARG(entry->pair->dev_addr), entry->pair->name); + seq_printf(m, ADDR_FMT " %16s %10u %5s\n", +
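The filtering block in veth_xmit() relies on compare_ether_addr() returning zero only when the two addresses are equal, so a non-zero result means mismatch and the frame is dropped via outf unless MAC changes are allowed for the pair. A condensed sketch of that check with illustrative addresses; memcmp() stands in for the kernel helper, which has the same zero-iff-equal contract:

#include <stdio.h>
#include <string.h>

/* Sketch of the kernel's compare_ether_addr() semantics: 0 iff equal. */
static int compare_ether_addr(const unsigned char *a, const unsigned char *b)
{
	return memcmp(a, b, 6) != 0;
}

int main(void)
{
	/* Illustrative: frame destination vs. MAC of the receiving pair. */
	unsigned char dest[6]    = { 0x00, 0x18, 0x51, 0x00, 0x00, 0x01 };
	unsigned char rcv_mac[6] = { 0x00, 0x18, 0x51, 0x00, 0x00, 0x02 };

	if (compare_ether_addr(dest, rcv_mac))
		printf("mismatch: VE0 -> VE frame would be dropped (outf)\n");
	return 0;
}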
ADDR_ARG(veth_to_netdev(entry)->dev_addr), + veth_to_netdev(entry)->name, + VEID(veth_to_netdev(entry)->owner_env), + entry->allow_mac_change ? "allow" : "deny"); + return 0; +} + +static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + + l = *pos; + read_lock(&ve_hwaddr_lock); + if (l == 0) + return &veth_hwaddr_list; + list_for_each(p, &veth_hwaddr_list) { + if (--l == 0) + return p; + } + return NULL; +} + +static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + (*pos)++; + return p->next == &veth_hwaddr_list ? NULL : p->next; +} + +static void vehwaddr_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_hwaddr_lock); +} + +static struct seq_operations vehwaddr_seq_op = { + .start = vehwaddr_seq_start, + .next = vehwaddr_seq_next, + .stop = vehwaddr_seq_stop, + .show = vehwaddr_seq_show +}; + +static int vehwaddr_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vehwaddr_seq_op); +} + +static struct file_operations proc_vehwaddr_operations = { + .open = vehwaddr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; +#endif + +int real_ve_hwaddr(envid_t veid, int op, + unsigned char *dev_addr, int addrlen, char *name, + unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve) +{ + int err; + struct ve_struct *ve; + char ve_addr[ETH_ALEN]; + + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto out; + + err = -EINVAL; + switch (op) + { + case VE_ETH_ADD: + if (addrlen != ETH_ALEN) + goto out; + if (addrlen_ve != ETH_ALEN && addrlen_ve != 0) + goto out; + /* If ve addr is not set then we use dev_addr[3] & 0x80 for it */ + if (addrlen_ve == 0 && (dev_addr[3] & 0x80)) + goto out; + if (addrlen_ve == 0) { + memcpy(ve_addr, dev_addr, ETH_ALEN); + ve_addr[3] |= 0x80; + } else { + memcpy(ve_addr, dev_addr_ve, ETH_ALEN); + } + + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_add(ve, dev_addr, name, + ve_addr, name_ve); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_ETH_DEL: + if (name[0] == '\0') + goto out; + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_del(ve, name); + up_read(&ve->op_sem); + put_ve(ve); + break; + case VE_ETH_ALLOW_MAC_CHANGE: + case VE_ETH_DENY_MAC_CHANGE: + err = veth_allow_change_mac(veid, name, + op == VE_ETH_ALLOW_MAC_CHANGE); + break; + } + +out: + return err; +} + +int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VETHCTL_VE_HWADDR: { + struct vzctl_ve_hwaddr s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_hwaddr(s.veid, s.op, + s.dev_addr, s.addrlen, s.dev_name, + s.dev_addr_ve, s.addrlen_ve, s.dev_name_ve); + } + break; + } + return err; +} + +static struct vzioctlinfo vethcalls = { + .type = VETHCTLTYPE, + .ioctl = veth_ioctl, + .compat_ioctl = veth_ioctl, + .owner = THIS_MODULE, +}; + +struct net_device * veth_dev_start(char *dev_addr, char *name) +{ + struct net_device *dev; + int err; + + if (!is_valid_ether_addr(dev_addr)) + return ERR_PTR(-EADDRNOTAVAIL); + + dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); + if (!dev) + return ERR_PTR(-ENOMEM); + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err; + } + if ((err 
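The VE_ETH_ADD branch above derives the VE-side MAC when none is supplied: the host MAC is copied and bit 0x80 is set in byte 3, and a host MAC that already carries that bit is rejected up front, so the two ends of the pair can never collide. The same rule in a standalone sketch; the sample MAC is illustrative:

#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

int main(void)
{
	unsigned char dev_addr[ETH_ALEN] = { 0x00, 0x18, 0x51, 0x0a, 0x0b, 0x0c };
	unsigned char ve_addr[ETH_ALEN];

	if (dev_addr[3] & 0x80) {	/* real_ve_hwaddr() returns -EINVAL here */
		fprintf(stderr, "host MAC unusable for derivation\n");
		return 1;
	}
	memcpy(ve_addr, dev_addr, ETH_ALEN);
	ve_addr[3] |= 0x80;		/* the VE-side bit */
	printf("VE-side MAC: %02x:%02x:%02x:%02x:%02x:%02x\n",
	       ve_addr[0], ve_addr[1], ve_addr[2],
	       ve_addr[3], ve_addr[4], ve_addr[5]);
	return 0;
}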
= register_netdev(dev)) != 0) + goto err; + + memcpy(dev->dev_addr, dev_addr, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + return dev; +err: + free_netdev(dev); + printk(KERN_ERR "%s initialization error err=%d\n", name, err); + return ERR_PTR(err); +} + +static int veth_start(void *data) +{ + return 0; +} + +static void veth_stop(void *data) +{ + struct ve_struct *env; + struct veth_struct *entry, *tmp; + + env = (struct ve_struct *)data; + down(&hwaddr_sem); + list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list) + if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env)) + veth_pair_del(env, entry); + up(&hwaddr_sem); +} + +static struct ve_hook veth_ve_hook = { + .init = veth_start, + .fini = veth_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int veth_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; +#endif + + INIT_LIST_HEAD(&veth_hwaddr_list); + +#ifdef CONFIG_PROC_FS + de = create_proc_entry_mod("vz/veth", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_vehwaddr_operations; + else + printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &veth_ve_hook); + vzioctl_register(&vethcalls); + KSYMRESOLVE(veth_open); + KSYMMODRESOLVE(vzethdev); + return 0; +} + +__exit void veth_exit(void) +{ + struct veth_struct *entry; + struct list_head *tmp, *n; + struct ve_struct *ve; + + KSYMMODUNRESOLVE(vzethdev); + KSYMUNRESOLVE(veth_open); + vzioctl_unregister(&vethcalls); + ve_hook_unregister(&veth_ve_hook); +#ifdef CONFIG_PROC_FS + remove_proc_entry("vz/veth", NULL); +#endif + + down(&hwaddr_sem); + list_for_each_safe(tmp, n, &veth_hwaddr_list) { + entry = list_entry(tmp, struct veth_struct, hwaddr_list); + ve = get_ve(veth_to_netdev(entry)->owner_env); + + veth_pair_del(ve, entry); + + put_ve(ve); + } + up(&hwaddr_sem); +} + +module_init(veth_init); +module_exit(veth_exit); + +MODULE_AUTHOR("Andrey Mirkin "); +MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); +MODULE_LICENSE("GPL v2"); + diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx.h linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx.h --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx.h 2007-06-13 06:55:07.000000000 -0400 @@ -21,7 +21,7 @@ #define PFX KBUILD_MODNAME ": " #define BCM43xx_SWITCH_CORE_MAX_RETRIES 50 -#define BCM43xx_IRQWAIT_MAX_RETRIES 50 +#define BCM43xx_IRQWAIT_MAX_RETRIES 100 #define BCM43xx_IO_SIZE 8192 @@ -33,14 +33,18 @@ #define BCM43xx_PCICFG_ICR 0x94 /* MMIO offsets */ -#define BCM43xx_MMIO_DMA1_REASON 0x20 -#define BCM43xx_MMIO_DMA1_IRQ_MASK 0x24 -#define BCM43xx_MMIO_DMA2_REASON 0x28 -#define BCM43xx_MMIO_DMA2_IRQ_MASK 0x2C -#define BCM43xx_MMIO_DMA3_REASON 0x30 -#define BCM43xx_MMIO_DMA3_IRQ_MASK 0x34 -#define BCM43xx_MMIO_DMA4_REASON 0x38 -#define BCM43xx_MMIO_DMA4_IRQ_MASK 0x3C +#define BCM43xx_MMIO_DMA0_REASON 0x20 +#define BCM43xx_MMIO_DMA0_IRQ_MASK 0x24 +#define BCM43xx_MMIO_DMA1_REASON 0x28 +#define BCM43xx_MMIO_DMA1_IRQ_MASK 0x2C +#define BCM43xx_MMIO_DMA2_REASON 0x30 +#define BCM43xx_MMIO_DMA2_IRQ_MASK 0x34 +#define BCM43xx_MMIO_DMA3_REASON 0x38 +#define BCM43xx_MMIO_DMA3_IRQ_MASK 0x3C +#define BCM43xx_MMIO_DMA4_REASON 0x40 +#define BCM43xx_MMIO_DMA4_IRQ_MASK 0x44 +#define BCM43xx_MMIO_DMA5_REASON 0x48 +#define BCM43xx_MMIO_DMA5_IRQ_MASK 0x4C #define BCM43xx_MMIO_STATUS_BITFIELD 0x120 #define BCM43xx_MMIO_STATUS2_BITFIELD 0x124 #define 
BCM43xx_MMIO_GEN_IRQ_REASON 0x128 @@ -56,14 +60,27 @@ #define BCM43xx_MMIO_XMITSTAT_1 0x174 #define BCM43xx_MMIO_REV3PLUS_TSF_LOW 0x180 /* core rev >= 3 only */ #define BCM43xx_MMIO_REV3PLUS_TSF_HIGH 0x184 /* core rev >= 3 only */ -#define BCM43xx_MMIO_DMA1_BASE 0x200 -#define BCM43xx_MMIO_DMA2_BASE 0x220 -#define BCM43xx_MMIO_DMA3_BASE 0x240 -#define BCM43xx_MMIO_DMA4_BASE 0x260 + +/* 32-bit DMA */ +#define BCM43xx_MMIO_DMA32_BASE0 0x200 +#define BCM43xx_MMIO_DMA32_BASE1 0x220 +#define BCM43xx_MMIO_DMA32_BASE2 0x240 +#define BCM43xx_MMIO_DMA32_BASE3 0x260 +#define BCM43xx_MMIO_DMA32_BASE4 0x280 +#define BCM43xx_MMIO_DMA32_BASE5 0x2A0 +/* 64-bit DMA */ +#define BCM43xx_MMIO_DMA64_BASE0 0x200 +#define BCM43xx_MMIO_DMA64_BASE1 0x240 +#define BCM43xx_MMIO_DMA64_BASE2 0x280 +#define BCM43xx_MMIO_DMA64_BASE3 0x2C0 +#define BCM43xx_MMIO_DMA64_BASE4 0x300 +#define BCM43xx_MMIO_DMA64_BASE5 0x340 +/* PIO */ #define BCM43xx_MMIO_PIO1_BASE 0x300 #define BCM43xx_MMIO_PIO2_BASE 0x310 #define BCM43xx_MMIO_PIO3_BASE 0x320 #define BCM43xx_MMIO_PIO4_BASE 0x330 + #define BCM43xx_MMIO_PHY_VER 0x3E0 #define BCM43xx_MMIO_PHY_RADIO 0x3E2 #define BCM43xx_MMIO_ANTENNA 0x3E8 @@ -233,8 +250,14 @@ #define BCM43xx_SBTMSTATELOW_FORCE_GATE_CLOCK 0x20000 /* sbtmstatehigh state flags */ -#define BCM43xx_SBTMSTATEHIGH_SERROR 0x1 -#define BCM43xx_SBTMSTATEHIGH_BUSY 0x4 +#define BCM43xx_SBTMSTATEHIGH_SERROR 0x00000001 +#define BCM43xx_SBTMSTATEHIGH_BUSY 0x00000004 +#define BCM43xx_SBTMSTATEHIGH_TIMEOUT 0x00000020 +#define BCM43xx_SBTMSTATEHIGH_COREFLAGS 0x1FFF0000 +#define BCM43xx_SBTMSTATEHIGH_DMA64BIT 0x10000000 +#define BCM43xx_SBTMSTATEHIGH_GATEDCLK 0x20000000 +#define BCM43xx_SBTMSTATEHIGH_BISTFAILED 0x40000000 +#define BCM43xx_SBTMSTATEHIGH_BISTCOMPLETE 0x80000000 /* sbimstate flags */ #define BCM43xx_SBIMSTATE_IB_ERROR 0x20000 @@ -283,6 +306,13 @@ #define BCM43xx_SBF_TIME_UPDATE 0x10000000 #define BCM43xx_SBF_80000000 0x80000000 /*FIXME: fix name*/ +/* Microcode */ +#define BCM43xx_UCODE_REVISION 0x0000 +#define BCM43xx_UCODE_PATCHLEVEL 0x0002 +#define BCM43xx_UCODE_DATE 0x0004 +#define BCM43xx_UCODE_TIME 0x0006 +#define BCM43xx_UCODE_STATUS 0x0040 + /* MicrocodeFlagsBitfield (addr + lo-word values?)*/ #define BCM43xx_UCODEFLAGS_OFFSET 0x005E @@ -504,6 +534,12 @@ struct bcm43xx_phyinfo { * This lock is only used by bcm43xx_phy_{un}lock() */ spinlock_t lock; + + /* Firmware. */ + const struct firmware *ucode; + const struct firmware *pcm; + const struct firmware *initvals0; + const struct firmware *initvals1; }; @@ -568,8 +604,11 @@ struct bcm43xx_dma { struct bcm43xx_dmaring *tx_ring1; struct bcm43xx_dmaring *tx_ring2; struct bcm43xx_dmaring *tx_ring3; + struct bcm43xx_dmaring *tx_ring4; + struct bcm43xx_dmaring *tx_ring5; + struct bcm43xx_dmaring *rx_ring0; - struct bcm43xx_dmaring *rx_ring1; /* only available on core.rev < 5 */ + struct bcm43xx_dmaring *rx_ring3; /* only available on core.rev < 5 */ }; /* Data structures for PIO transmission, per 80211 core. */ @@ -593,12 +632,14 @@ struct bcm43xx_coreinfo { u8 available:1, enabled:1, initialized:1; - /** core_id ID number */ - u16 id; /** core_rev revision number */ u8 rev; /** Index number for _switch_core() */ u8 index; + /** core_id ID number */ + u16 id; + /** Core-specific data. */ + void *priv; }; /* Additional information for each 80211 core. */ @@ -647,7 +688,23 @@ enum { BCM43xx_STAT_RESTARTING, /* controller_restart() called. 
*/ }; #define bcm43xx_status(bcm) atomic_read(&(bcm)->init_status) -#define bcm43xx_set_status(bcm, stat) atomic_set(&(bcm)->init_status, (stat)) +#define bcm43xx_set_status(bcm, stat) do { \ + atomic_set(&(bcm)->init_status, (stat)); \ + smp_wmb(); \ + } while (0) + +/* *** THEORY OF LOCKING *** + * + * We have two different locks in the bcm43xx driver. + * => bcm->mutex: General sleeping mutex. Protects struct bcm43xx_private + * and the device registers. This mutex does _not_ protect + * against concurrency from the IRQ handler. + * => bcm->irq_lock: IRQ spinlock. Protects against IRQ handler concurrency. + * + * Please note that, if you only take the irq_lock, you are not protected + * against concurrency from the periodic work handlers. + * Most times you want to take _both_ locks. + */ struct bcm43xx_private { struct ieee80211_device *ieee; @@ -659,7 +716,6 @@ struct bcm43xx_private { void __iomem *mmio_addr; - /* Locking, see "theory of locking" text below. */ spinlock_t irq_lock; struct mutex mutex; @@ -691,6 +747,7 @@ struct bcm43xx_private { struct bcm43xx_sprominfo sprom; #define BCM43xx_NR_LEDS 4 struct bcm43xx_led leds[BCM43xx_NR_LEDS]; + spinlock_t leds_lock; /* The currently active core. */ struct bcm43xx_coreinfo *current_core; @@ -708,10 +765,6 @@ struct bcm43xx_private { struct bcm43xx_coreinfo core_80211[ BCM43xx_MAX_80211_CORES ]; /* Additional information, specific to the 80211 cores. */ struct bcm43xx_coreinfo_80211 core_80211_ext[ BCM43xx_MAX_80211_CORES ]; - /* Index of the current 80211 core. If current_core is not - * an 80211 core, this is -1. - */ - int current_80211_core_idx; /* Number of available 80211 cores. */ int nr_80211_available; @@ -719,11 +772,13 @@ struct bcm43xx_private { /* Reason code of the last interrupt. */ u32 irq_reason; - u32 dma_reason[4]; + u32 dma_reason[6]; /* saved irq enable/disable state bitfield. */ u32 irq_savedstate; /* Link Quality calculation context. */ struct bcm43xx_noise_calculation noisecalc; + /* if > 0 MAC is suspended. if == 0 MAC is enabled. */ + int mac_suspended; /* Threshold values. */ //TODO: The RTS thr has to be _used_. Currently, it is only set via WX. @@ -746,12 +801,6 @@ struct bcm43xx_private { struct bcm43xx_key key[54]; u8 default_key_idx; - /* Firmware. */ - const struct firmware *ucode; - const struct firmware *pcm; - const struct firmware *initvals0; - const struct firmware *initvals1; - /* Random Number Generator. */ struct hwrng rng; char rng_name[20 + 1]; @@ -763,55 +812,6 @@ struct bcm43xx_private { }; -/* *** THEORY OF LOCKING *** - * - * We have two different locks in the bcm43xx driver. - * => bcm->mutex: General sleeping mutex. Protects struct bcm43xx_private - * and the device registers. - * => bcm->irq_lock: IRQ spinlock. Protects against IRQ handler concurrency. - * - * We have three types of helper function pairs to utilize these locks. - * (Always use the helper functions.) - * 1) bcm43xx_{un}lock_noirq(): - * Takes bcm->mutex. Does _not_ protect against IRQ concurrency, - * so it is almost always unsafe, if device IRQs are enabled. - * So only use this, if device IRQs are masked. - * Locking may sleep. - * You can sleep within the critical section. - * 2) bcm43xx_{un}lock_irqonly(): - * Takes bcm->irq_lock. Does _not_ protect against - * bcm43xx_lock_noirq() critical sections. - * Does only protect against the IRQ handler path and other - * irqonly() critical sections. - * Locking does not sleep. - * You must not sleep within the critical section. 
- * 3) bcm43xx_{un}lock_irqsafe(): - * This is the cummulative lock and takes both, mutex and irq_lock. - * Protects against noirq() and irqonly() critical sections (and - * the IRQ handler path). - * Locking may sleep. - * You must not sleep within the critical section. - */ - -/* Lock type 1 */ -#define bcm43xx_lock_noirq(bcm) mutex_lock(&(bcm)->mutex) -#define bcm43xx_unlock_noirq(bcm) mutex_unlock(&(bcm)->mutex) -/* Lock type 2 */ -#define bcm43xx_lock_irqonly(bcm, flags) \ - spin_lock_irqsave(&(bcm)->irq_lock, flags) -#define bcm43xx_unlock_irqonly(bcm, flags) \ - spin_unlock_irqrestore(&(bcm)->irq_lock, flags) -/* Lock type 3 */ -#define bcm43xx_lock_irqsafe(bcm, flags) do { \ - bcm43xx_lock_noirq(bcm); \ - bcm43xx_lock_irqonly(bcm, flags); \ - } while (0) -#define bcm43xx_unlock_irqsafe(bcm, flags) do { \ - bcm43xx_unlock_irqonly(bcm, flags); \ - bcm43xx_unlock_noirq(bcm); \ - } while (0) - - static inline struct bcm43xx_private * bcm43xx_priv(struct net_device *dev) { @@ -863,34 +863,33 @@ int bcm43xx_using_pio(struct bcm43xx_pri * any of these functions. */ static inline +struct bcm43xx_coreinfo_80211 * +bcm43xx_current_80211_priv(struct bcm43xx_private *bcm) +{ + assert(bcm->current_core->id == BCM43xx_COREID_80211); + return bcm->current_core->priv; +} +static inline struct bcm43xx_pio * bcm43xx_current_pio(struct bcm43xx_private *bcm) { assert(bcm43xx_using_pio(bcm)); - assert(bcm->current_80211_core_idx >= 0); - assert(bcm->current_80211_core_idx < BCM43xx_MAX_80211_CORES); - return &(bcm->core_80211_ext[bcm->current_80211_core_idx].pio); + return &(bcm43xx_current_80211_priv(bcm)->pio); } static inline struct bcm43xx_dma * bcm43xx_current_dma(struct bcm43xx_private *bcm) { assert(!bcm43xx_using_pio(bcm)); - assert(bcm->current_80211_core_idx >= 0); - assert(bcm->current_80211_core_idx < BCM43xx_MAX_80211_CORES); - return &(bcm->core_80211_ext[bcm->current_80211_core_idx].dma); + return &(bcm43xx_current_80211_priv(bcm)->dma); } static inline struct bcm43xx_phyinfo * bcm43xx_current_phy(struct bcm43xx_private *bcm) { - assert(bcm->current_80211_core_idx >= 0); - assert(bcm->current_80211_core_idx < BCM43xx_MAX_80211_CORES); - return &(bcm->core_80211_ext[bcm->current_80211_core_idx].phy); + return &(bcm43xx_current_80211_priv(bcm)->phy); } static inline struct bcm43xx_radioinfo * bcm43xx_current_radio(struct bcm43xx_private *bcm) { - assert(bcm->current_80211_core_idx >= 0); - assert(bcm->current_80211_core_idx < BCM43xx_MAX_80211_CORES); - return &(bcm->core_80211_ext[bcm->current_80211_core_idx].radio); + return &(bcm43xx_current_80211_priv(bcm)->radio); } diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -77,7 +77,8 @@ static ssize_t devinfo_read_file(struct down(&big_buffer_sem); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) { fappend("Board not initialized.\n"); goto out; @@ -121,7 +122,8 @@ static ssize_t devinfo_read_file(struct fappend("\n"); out: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); res = simple_read_from_buffer(userbuf, count, ppos, buf, pos); up(&big_buffer_sem); return res; @@ -159,7 
+161,8 @@ static ssize_t spromdump_read_file(struc unsigned long flags; down(&big_buffer_sem); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) { fappend("Board not initialized.\n"); goto out; @@ -169,7 +172,8 @@ static ssize_t spromdump_read_file(struc fappend("boardflags: 0x%04x\n", bcm->sprom.boardflags); out: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); res = simple_read_from_buffer(userbuf, count, ppos, buf, pos); up(&big_buffer_sem); return res; @@ -188,7 +192,8 @@ static ssize_t tsf_read_file(struct file u64 tsf; down(&big_buffer_sem); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) { fappend("Board not initialized.\n"); goto out; @@ -199,7 +204,8 @@ static ssize_t tsf_read_file(struct file (unsigned int)(tsf & 0xFFFFFFFFULL)); out: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); res = simple_read_from_buffer(userbuf, count, ppos, buf, pos); up(&big_buffer_sem); return res; @@ -221,7 +227,8 @@ static ssize_t tsf_write_file(struct fil res = -EFAULT; goto out_up; } - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) { printk(KERN_INFO PFX "debugfs: Board not initialized.\n"); res = -EFAULT; @@ -237,7 +244,8 @@ static ssize_t tsf_write_file(struct fil res = buf_size; out_unlock: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); out_up: up(&big_buffer_sem); return res; @@ -258,7 +266,8 @@ static ssize_t txstat_read_file(struct f int i, cnt, j = 0; down(&big_buffer_sem); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); fappend("Last %d logged xmitstatus blobs (Latest first):\n\n", BCM43xx_NR_LOGGED_XMITSTATUS); @@ -294,14 +303,51 @@ static ssize_t txstat_read_file(struct f i = BCM43xx_NR_LOGGED_XMITSTATUS - 1; } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); res = simple_read_from_buffer(userbuf, count, ppos, buf, pos); - bcm43xx_lock_irqsafe(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); if (*ppos == pos) { /* Done. Drop the copied data. 
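Every converted debugfs handler in these hunks now takes bcm->mutex first and bcm->irq_lock second, releasing in reverse order, which is exactly the discipline the reworked "theory of locking" comment describes. A user-space analogue of that ordering; pthread mutexes stand in for mutex_lock()/spin_lock_irqsave(), and the guarded state is illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_mutex = PTHREAD_MUTEX_INITIALIZER; /* bcm->mutex */
static pthread_mutex_t irq_lock  = PTHREAD_MUTEX_INITIALIZER; /* bcm->irq_lock */
static int board_initialized = 1;

/* Same shape as the converted devinfo/tsf/spromdump handlers: the
 * sleeping lock is always taken outside the IRQ-path lock. */
static void debugfs_style_read(void)
{
	pthread_mutex_lock(&big_mutex);	/* may sleep; never inside irq_lock */
	pthread_mutex_lock(&irq_lock);	/* excludes the IRQ handler path */
	if (board_initialized)
		printf("device state read consistently\n");
	pthread_mutex_unlock(&irq_lock);
	pthread_mutex_unlock(&big_mutex);
}

int main(void)
{
	debugfs_style_read();
	return 0;
}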
*/ e->xmitstatus_printing = 0; } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); + up(&big_buffer_sem); + return res; +} + +static ssize_t restart_write_file(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct bcm43xx_private *bcm = file->private_data; + char *buf = really_big_buffer; + ssize_t buf_size; + ssize_t res; + unsigned long flags; + + buf_size = min(count, sizeof (really_big_buffer) - 1); + down(&big_buffer_sem); + if (copy_from_user(buf, user_buf, buf_size)) { + res = -EFAULT; + goto out_up; + } + mutex_lock(&(bcm)->mutex); + spin_lock_irqsave(&(bcm)->irq_lock, flags); + if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) { + printk(KERN_INFO PFX "debugfs: Board not initialized.\n"); + res = -EFAULT; + goto out_unlock; + } + if (count > 0 && buf[0] == '1') { + bcm43xx_controller_restart(bcm, "manually restarted"); + res = count; + } else + res = -EINVAL; + +out_unlock: + spin_unlock_irqrestore(&(bcm)->irq_lock, flags); + mutex_unlock(&(bcm)->mutex); +out_up: up(&big_buffer_sem); return res; } @@ -339,6 +385,11 @@ static struct file_operations txstat_fop .open = open_file_generic, }; +static struct file_operations restart_fops = { + .write = restart_write_file, + .open = open_file_generic, +}; + void bcm43xx_debugfs_add_device(struct bcm43xx_private *bcm) { @@ -390,6 +441,10 @@ void bcm43xx_debugfs_add_device(struct b bcm, &txstat_fops); if (!e->dentry_txstat) printk(KERN_ERR PFX "debugfs: creating \"tx_status\" for \"%s\" failed!\n", devdir); + e->dentry_restart = debugfs_create_file("restart", 0222, e->subdir, + bcm, &restart_fops); + if (!e->dentry_restart) + printk(KERN_ERR PFX "debugfs: creating \"restart\" for \"%s\" failed!\n", devdir); } void bcm43xx_debugfs_remove_device(struct bcm43xx_private *bcm) @@ -405,6 +460,7 @@ void bcm43xx_debugfs_remove_device(struc debugfs_remove(e->dentry_devinfo); debugfs_remove(e->dentry_tsf); debugfs_remove(e->dentry_txstat); + debugfs_remove(e->dentry_restart); debugfs_remove(e->subdir); kfree(e->xmitstatus_buffer); kfree(e->xmitstatus_print_buffer); diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.h linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.h --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.h 2007-06-13 06:55:07.000000000 -0400 @@ -20,6 +20,7 @@ struct bcm43xx_dfsentry { struct dentry *dentry_spromdump; struct dentry *dentry_tsf; struct dentry *dentry_txstat; + struct dentry *dentry_restart; struct bcm43xx_private *bcm; diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_dma.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_dma.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_dma.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_dma.c 2007-06-13 06:55:07.000000000 -0400 @@ -4,7 +4,7 @@ DMA ringbuffer and descriptor allocation/management - Copyright (c) 2005 Michael Buesch + Copyright (c) 2005, 2006 Michael Buesch Some code in this file is derived from the b44.c driver Copyright (C) 2002 David S. 
Miller @@ -109,6 +109,35 @@ void return_slot(struct bcm43xx_dmaring } } +u16 bcm43xx_dmacontroller_base(int dma64bit, int controller_idx) +{ + static const u16 map64[] = { + BCM43xx_MMIO_DMA64_BASE0, + BCM43xx_MMIO_DMA64_BASE1, + BCM43xx_MMIO_DMA64_BASE2, + BCM43xx_MMIO_DMA64_BASE3, + BCM43xx_MMIO_DMA64_BASE4, + BCM43xx_MMIO_DMA64_BASE5, + }; + static const u16 map32[] = { + BCM43xx_MMIO_DMA32_BASE0, + BCM43xx_MMIO_DMA32_BASE1, + BCM43xx_MMIO_DMA32_BASE2, + BCM43xx_MMIO_DMA32_BASE3, + BCM43xx_MMIO_DMA32_BASE4, + BCM43xx_MMIO_DMA32_BASE5, + }; + + if (dma64bit) { + assert(controller_idx >= 0 && + controller_idx < ARRAY_SIZE(map64)); + return map64[controller_idx]; + } + assert(controller_idx >= 0 && + controller_idx < ARRAY_SIZE(map32)); + return map32[controller_idx]; +} + static inline dma_addr_t map_descbuffer(struct bcm43xx_dmaring *ring, unsigned char *buf, @@ -172,7 +201,6 @@ void sync_descbuffer_for_device(struct b /* Unmap and free a descriptor buffer. */ static inline void free_descriptor_buffer(struct bcm43xx_dmaring *ring, - struct bcm43xx_dmadesc *desc, struct bcm43xx_dmadesc_meta *meta, int irq_context) { @@ -188,23 +216,13 @@ static int alloc_ringmemory(struct bcm43 { struct device *dev = &(ring->bcm->pci_dev->dev); - ring->vbase = dma_alloc_coherent(dev, BCM43xx_DMA_RINGMEMSIZE, - &(ring->dmabase), GFP_KERNEL); - if (!ring->vbase) { + ring->descbase = dma_alloc_coherent(dev, BCM43xx_DMA_RINGMEMSIZE, + &(ring->dmabase), GFP_KERNEL); + if (!ring->descbase) { printk(KERN_ERR PFX "DMA ringmemory allocation failed\n"); return -ENOMEM; } - if (ring->dmabase + BCM43xx_DMA_RINGMEMSIZE > BCM43xx_DMA_BUSADDRMAX) { - printk(KERN_ERR PFX ">>>FATAL ERROR<<< DMA RINGMEMORY >1G " - "(0x%llx, len: %lu)\n", - (unsigned long long)ring->dmabase, - BCM43xx_DMA_RINGMEMSIZE); - dma_free_coherent(dev, BCM43xx_DMA_RINGMEMSIZE, - ring->vbase, ring->dmabase); - return -ENOMEM; - } - assert(!(ring->dmabase & 0x000003FF)); - memset(ring->vbase, 0, BCM43xx_DMA_RINGMEMSIZE); + memset(ring->descbase, 0, BCM43xx_DMA_RINGMEMSIZE); return 0; } @@ -214,26 +232,34 @@ static void free_ringmemory(struct bcm43 struct device *dev = &(ring->bcm->pci_dev->dev); dma_free_coherent(dev, BCM43xx_DMA_RINGMEMSIZE, - ring->vbase, ring->dmabase); + ring->descbase, ring->dmabase); } /* Reset the RX DMA channel */ int bcm43xx_dmacontroller_rx_reset(struct bcm43xx_private *bcm, - u16 mmio_base) + u16 mmio_base, int dma64) { int i; u32 value; + u16 offset; - bcm43xx_write32(bcm, - mmio_base + BCM43xx_DMA_RX_CONTROL, - 0x00000000); + offset = dma64 ? BCM43xx_DMA64_RXCTL : BCM43xx_DMA32_RXCTL; + bcm43xx_write32(bcm, mmio_base + offset, 0); for (i = 0; i < 1000; i++) { - value = bcm43xx_read32(bcm, - mmio_base + BCM43xx_DMA_RX_STATUS); - value &= BCM43xx_DMA_RXSTAT_STAT_MASK; - if (value == BCM43xx_DMA_RXSTAT_STAT_DISABLED) { - i = -1; - break; + offset = dma64 ? 
BCM43xx_DMA64_RXSTATUS : BCM43xx_DMA32_RXSTATUS; + value = bcm43xx_read32(bcm, mmio_base + offset); + if (dma64) { + value &= BCM43xx_DMA64_RXSTAT; + if (value == BCM43xx_DMA64_RXSTAT_DISABLED) { + i = -1; + break; + } + } else { + value &= BCM43xx_DMA32_RXSTATE; + if (value == BCM43xx_DMA32_RXSTAT_DISABLED) { + i = -1; + break; + } } udelay(10); } @@ -247,31 +273,47 @@ int bcm43xx_dmacontroller_rx_reset(struc /* Reset the TX DMA channel */ int bcm43xx_dmacontroller_tx_reset(struct bcm43xx_private *bcm, - u16 mmio_base) + u16 mmio_base, int dma64) { int i; u32 value; + u16 offset; for (i = 0; i < 1000; i++) { - value = bcm43xx_read32(bcm, - mmio_base + BCM43xx_DMA_TX_STATUS); - value &= BCM43xx_DMA_TXSTAT_STAT_MASK; - if (value == BCM43xx_DMA_TXSTAT_STAT_DISABLED || - value == BCM43xx_DMA_TXSTAT_STAT_IDLEWAIT || - value == BCM43xx_DMA_TXSTAT_STAT_STOPPED) - break; + offset = dma64 ? BCM43xx_DMA64_TXSTATUS : BCM43xx_DMA32_TXSTATUS; + value = bcm43xx_read32(bcm, mmio_base + offset); + if (dma64) { + value &= BCM43xx_DMA64_TXSTAT; + if (value == BCM43xx_DMA64_TXSTAT_DISABLED || + value == BCM43xx_DMA64_TXSTAT_IDLEWAIT || + value == BCM43xx_DMA64_TXSTAT_STOPPED) + break; + } else { + value &= BCM43xx_DMA32_TXSTATE; + if (value == BCM43xx_DMA32_TXSTAT_DISABLED || + value == BCM43xx_DMA32_TXSTAT_IDLEWAIT || + value == BCM43xx_DMA32_TXSTAT_STOPPED) + break; + } udelay(10); } - bcm43xx_write32(bcm, - mmio_base + BCM43xx_DMA_TX_CONTROL, - 0x00000000); + offset = dma64 ? BCM43xx_DMA64_TXCTL : BCM43xx_DMA32_TXCTL; + bcm43xx_write32(bcm, mmio_base + offset, 0); for (i = 0; i < 1000; i++) { - value = bcm43xx_read32(bcm, - mmio_base + BCM43xx_DMA_TX_STATUS); - value &= BCM43xx_DMA_TXSTAT_STAT_MASK; - if (value == BCM43xx_DMA_TXSTAT_STAT_DISABLED) { - i = -1; - break; + offset = dma64 ?
BCM43xx_DMA64_TXSTATUS : BCM43xx_DMA32_TXSTATUS; + value = bcm43xx_read32(bcm, mmio_base + offset); + if (dma64) { + value &= BCM43xx_DMA64_TXSTAT; + if (value == BCM43xx_DMA64_TXSTAT_DISABLED) { + i = -1; + break; + } + } else { + value &= BCM43xx_DMA32_TXSTATE; + if (value == BCM43xx_DMA32_TXSTAT_DISABLED) { + i = -1; + break; + } } udelay(10); } @@ -285,47 +327,98 @@ int bcm43xx_dmacontroller_tx_reset(struc return 0; } +static void fill_descriptor(struct bcm43xx_dmaring *ring, + struct bcm43xx_dmadesc_generic *desc, + dma_addr_t dmaaddr, + u16 bufsize, + int start, int end, int irq) +{ + int slot; + + slot = bcm43xx_dma_desc2idx(ring, desc); + assert(slot >= 0 && slot < ring->nr_slots); + + if (ring->dma64) { + u32 ctl0 = 0, ctl1 = 0; + u32 addrlo, addrhi; + u32 addrext; + + addrlo = (u32)(dmaaddr & 0xFFFFFFFF); + addrhi = (((u64)dmaaddr >> 32) & ~BCM43xx_DMA64_ROUTING); + addrext = (((u64)dmaaddr >> 32) >> BCM43xx_DMA64_ROUTING_SHIFT); + addrhi |= ring->routing; + if (slot == ring->nr_slots - 1) + ctl0 |= BCM43xx_DMA64_DCTL0_DTABLEEND; + if (start) + ctl0 |= BCM43xx_DMA64_DCTL0_FRAMESTART; + if (end) + ctl0 |= BCM43xx_DMA64_DCTL0_FRAMEEND; + if (irq) + ctl0 |= BCM43xx_DMA64_DCTL0_IRQ; + ctl1 |= (bufsize - ring->frameoffset) + & BCM43xx_DMA64_DCTL1_BYTECNT; + ctl1 |= (addrext << BCM43xx_DMA64_DCTL1_ADDREXT_SHIFT) + & BCM43xx_DMA64_DCTL1_ADDREXT_MASK; + + desc->dma64.control0 = cpu_to_le32(ctl0); + desc->dma64.control1 = cpu_to_le32(ctl1); + desc->dma64.address_low = cpu_to_le32(addrlo); + desc->dma64.address_high = cpu_to_le32(addrhi); + } else { + u32 ctl; + u32 addr; + u32 addrext; + + addr = (u32)(dmaaddr & ~BCM43xx_DMA32_ROUTING); + addrext = (u32)(dmaaddr & BCM43xx_DMA32_ROUTING) + >> BCM43xx_DMA32_ROUTING_SHIFT; + addr |= ring->routing; + ctl = (bufsize - ring->frameoffset) + & BCM43xx_DMA32_DCTL_BYTECNT; + if (slot == ring->nr_slots - 1) + ctl |= BCM43xx_DMA32_DCTL_DTABLEEND; + if (start) + ctl |= BCM43xx_DMA32_DCTL_FRAMESTART; + if (end) + ctl |= BCM43xx_DMA32_DCTL_FRAMEEND; + if (irq) + ctl |= BCM43xx_DMA32_DCTL_IRQ; + ctl |= (addrext << BCM43xx_DMA32_DCTL_ADDREXT_SHIFT) + & BCM43xx_DMA32_DCTL_ADDREXT_MASK; + + desc->dma32.control = cpu_to_le32(ctl); + desc->dma32.address = cpu_to_le32(addr); + } +} + static int setup_rx_descbuffer(struct bcm43xx_dmaring *ring, - struct bcm43xx_dmadesc *desc, + struct bcm43xx_dmadesc_generic *desc, struct bcm43xx_dmadesc_meta *meta, gfp_t gfp_flags) { struct bcm43xx_rxhdr *rxhdr; + struct bcm43xx_hwxmitstatus *xmitstat; dma_addr_t dmaaddr; - u32 desc_addr; - u32 desc_ctl; - const int slot = (int)(desc - ring->vbase); struct sk_buff *skb; - assert(slot >= 0 && slot < ring->nr_slots); assert(!ring->tx); skb = __dev_alloc_skb(ring->rx_buffersize, gfp_flags); if (unlikely(!skb)) return -ENOMEM; dmaaddr = map_descbuffer(ring, skb->data, ring->rx_buffersize, 0); - if (unlikely(dmaaddr + ring->rx_buffersize > BCM43xx_DMA_BUSADDRMAX)) { - unmap_descbuffer(ring, dmaaddr, ring->rx_buffersize, 0); - dev_kfree_skb_any(skb); - printk(KERN_ERR PFX ">>>FATAL ERROR<<< DMA RX SKB >1G " - "(0x%llx, len: %u)\n", - (unsigned long long)dmaaddr, ring->rx_buffersize); - return -ENOMEM; - } meta->skb = skb; meta->dmaaddr = dmaaddr; skb->dev = ring->bcm->net_dev; - desc_addr = (u32)(dmaaddr + ring->memoffset); - desc_ctl = (BCM43xx_DMADTOR_BYTECNT_MASK & - (u32)(ring->rx_buffersize - ring->frameoffset)); - if (slot == ring->nr_slots - 1) - desc_ctl |= BCM43xx_DMADTOR_DTABLEEND; - set_desc_addr(desc, desc_addr); - set_desc_ctl(desc, desc_ctl); + + fill_descriptor(ring, 
desc, dmaaddr, + ring->rx_buffersize, 0, 0, 0); rxhdr = (struct bcm43xx_rxhdr *)(skb->data); rxhdr->frame_length = 0; rxhdr->flags1 = 0; + xmitstat = (struct bcm43xx_hwxmitstatus *)(skb->data); + xmitstat->cookie = 0; return 0; } @@ -336,17 +429,17 @@ static int setup_rx_descbuffer(struct bc static int alloc_initial_descbuffers(struct bcm43xx_dmaring *ring) { int i, err = -ENOMEM; - struct bcm43xx_dmadesc *desc; + struct bcm43xx_dmadesc_generic *desc; struct bcm43xx_dmadesc_meta *meta; for (i = 0; i < ring->nr_slots; i++) { - desc = ring->vbase + i; - meta = ring->meta + i; + desc = bcm43xx_dma_idx2desc(ring, i, &meta); err = setup_rx_descbuffer(ring, desc, meta, GFP_KERNEL); if (err) goto err_unwind; } + mb(); ring->used_slots = ring->nr_slots; err = 0; out: @@ -354,8 +447,7 @@ out: err_unwind: for (i--; i >= 0; i--) { - desc = ring->vbase + i; - meta = ring->meta + i; + desc = bcm43xx_dma_idx2desc(ring, i, &meta); unmap_descbuffer(ring, meta->dmaaddr, ring->rx_buffersize, 0); dev_kfree_skb(meta->skb); @@ -371,27 +463,67 @@ static int dmacontroller_setup(struct bc { int err = 0; u32 value; + u32 addrext; if (ring->tx) { - /* Set Transmit Control register to "transmit enable" */ - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_CONTROL, - BCM43xx_DMA_TXCTRL_ENABLE); - /* Set Transmit Descriptor ring address. */ - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_DESC_RING, - ring->dmabase + ring->memoffset); + if (ring->dma64) { + u64 ringbase = (u64)(ring->dmabase); + + addrext = ((ringbase >> 32) >> BCM43xx_DMA64_ROUTING_SHIFT); + value = BCM43xx_DMA64_TXENABLE; + value |= (addrext << BCM43xx_DMA64_TXADDREXT_SHIFT) + & BCM43xx_DMA64_TXADDREXT_MASK; + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXCTL, value); + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXRINGLO, + (ringbase & 0xFFFFFFFF)); + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXRINGHI, + ((ringbase >> 32) & ~BCM43xx_DMA64_ROUTING) + | ring->routing); + } else { + u32 ringbase = (u32)(ring->dmabase); + + addrext = (ringbase >> BCM43xx_DMA32_ROUTING_SHIFT); + value = BCM43xx_DMA32_TXENABLE; + value |= (addrext << BCM43xx_DMA32_TXADDREXT_SHIFT) + & BCM43xx_DMA32_TXADDREXT_MASK; + bcm43xx_dma_write(ring, BCM43xx_DMA32_TXCTL, value); + bcm43xx_dma_write(ring, BCM43xx_DMA32_TXRING, + (ringbase & ~BCM43xx_DMA32_ROUTING) + | ring->routing); + } } else { err = alloc_initial_descbuffers(ring); if (err) goto out; - /* Set Receive Control "receive enable" and frame offset */ - value = (ring->frameoffset << BCM43xx_DMA_RXCTRL_FRAMEOFF_SHIFT); - value |= BCM43xx_DMA_RXCTRL_ENABLE; - bcm43xx_dma_write(ring, BCM43xx_DMA_RX_CONTROL, value); - /* Set Receive Descriptor ring address. */ - bcm43xx_dma_write(ring, BCM43xx_DMA_RX_DESC_RING, - ring->dmabase + ring->memoffset); - /* Init the descriptor pointer. 
*/ - bcm43xx_dma_write(ring, BCM43xx_DMA_RX_DESC_INDEX, 200); + if (ring->dma64) { + u64 ringbase = (u64)(ring->dmabase); + + addrext = ((ringbase >> 32) >> BCM43xx_DMA64_ROUTING_SHIFT); + value = (ring->frameoffset << BCM43xx_DMA64_RXFROFF_SHIFT); + value |= BCM43xx_DMA64_RXENABLE; + value |= (addrext << BCM43xx_DMA64_RXADDREXT_SHIFT) + & BCM43xx_DMA64_RXADDREXT_MASK; + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXCTL, value); + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXRINGLO, + (ringbase & 0xFFFFFFFF)); + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXRINGHI, + ((ringbase >> 32) & ~BCM43xx_DMA64_ROUTING) + | ring->routing); + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXINDEX, 200); + } else { + u32 ringbase = (u32)(ring->dmabase); + + addrext = (ringbase >> BCM43xx_DMA32_ROUTING_SHIFT); + value = (ring->frameoffset << BCM43xx_DMA32_RXFROFF_SHIFT); + value |= BCM43xx_DMA32_RXENABLE; + value |= (addrext << BCM43xx_DMA32_RXADDREXT_SHIFT) + & BCM43xx_DMA32_RXADDREXT_MASK; + bcm43xx_dma_write(ring, BCM43xx_DMA32_RXCTL, value); + bcm43xx_dma_write(ring, BCM43xx_DMA32_RXRING, + (ringbase & ~BCM43xx_DMA32_ROUTING) + | ring->routing); + bcm43xx_dma_write(ring, BCM43xx_DMA32_RXINDEX, 200); + } } out: @@ -402,27 +534,32 @@ out: static void dmacontroller_cleanup(struct bcm43xx_dmaring *ring) { if (ring->tx) { - bcm43xx_dmacontroller_tx_reset(ring->bcm, ring->mmio_base); - /* Zero out Transmit Descriptor ring address. */ - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_DESC_RING, 0); + bcm43xx_dmacontroller_tx_reset(ring->bcm, ring->mmio_base, ring->dma64); + if (ring->dma64) { + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXRINGLO, 0); + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXRINGHI, 0); + } else + bcm43xx_dma_write(ring, BCM43xx_DMA32_TXRING, 0); } else { - bcm43xx_dmacontroller_rx_reset(ring->bcm, ring->mmio_base); - /* Zero out Receive Descriptor ring address. */ - bcm43xx_dma_write(ring, BCM43xx_DMA_RX_DESC_RING, 0); + bcm43xx_dmacontroller_rx_reset(ring->bcm, ring->mmio_base, ring->dma64); + if (ring->dma64) { + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXRINGLO, 0); + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXRINGHI, 0); + } else + bcm43xx_dma_write(ring, BCM43xx_DMA32_RXRING, 0); } } static void free_all_descbuffers(struct bcm43xx_dmaring *ring) { - struct bcm43xx_dmadesc *desc; + struct bcm43xx_dmadesc_generic *desc; struct bcm43xx_dmadesc_meta *meta; int i; if (!ring->used_slots) return; for (i = 0; i < ring->nr_slots; i++) { - desc = ring->vbase + i; - meta = ring->meta + i; + desc = bcm43xx_dma_idx2desc(ring, i, &meta); if (!meta->skb) { assert(ring->tx); @@ -430,62 +567,67 @@ static void free_all_descbuffers(struct } if (ring->tx) { unmap_descbuffer(ring, meta->dmaaddr, - meta->skb->len, 1); + meta->skb->len, 1); } else { unmap_descbuffer(ring, meta->dmaaddr, - ring->rx_buffersize, 0); + ring->rx_buffersize, 0); } - free_descriptor_buffer(ring, desc, meta, 0); + free_descriptor_buffer(ring, meta, 0); } } /* Main initialization function. 
*/ static struct bcm43xx_dmaring * bcm43xx_setup_dmaring(struct bcm43xx_private *bcm, - u16 dma_controller_base, - int nr_descriptor_slots, - int tx) + int controller_index, + int for_tx, + int dma64) { struct bcm43xx_dmaring *ring; int err; + int nr_slots; ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) goto out; - ring->meta = kzalloc(sizeof(*ring->meta) * nr_descriptor_slots, + nr_slots = BCM43xx_RXRING_SLOTS; + if (for_tx) + nr_slots = BCM43xx_TXRING_SLOTS; + + ring->meta = kcalloc(nr_slots, sizeof(struct bcm43xx_dmadesc_meta), GFP_KERNEL); if (!ring->meta) goto err_kfree_ring; - ring->memoffset = BCM43xx_DMA_DMABUSADDROFFSET; + ring->routing = BCM43xx_DMA32_CLIENTTRANS; + if (dma64) + ring->routing = BCM43xx_DMA64_CLIENTTRANS; #ifdef CONFIG_BCM947XX if (bcm->pci_dev->bus->number == 0) - ring->memoffset = 0; + ring->routing = dma64 ? BCM43xx_DMA64_NOTRANS : BCM43xx_DMA32_NOTRANS; #endif ring->bcm = bcm; - ring->nr_slots = nr_descriptor_slots; + ring->nr_slots = nr_slots; ring->suspend_mark = ring->nr_slots * BCM43xx_TXSUSPEND_PERCENT / 100; ring->resume_mark = ring->nr_slots * BCM43xx_TXRESUME_PERCENT / 100; assert(ring->suspend_mark < ring->resume_mark); - ring->mmio_base = dma_controller_base; - if (tx) { + ring->mmio_base = bcm43xx_dmacontroller_base(dma64, controller_index); + ring->index = controller_index; + ring->dma64 = !!dma64; + if (for_tx) { ring->tx = 1; ring->current_slot = -1; } else { - switch (dma_controller_base) { - case BCM43xx_MMIO_DMA1_BASE: - ring->rx_buffersize = BCM43xx_DMA1_RXBUFFERSIZE; - ring->frameoffset = BCM43xx_DMA1_RX_FRAMEOFFSET; - break; - case BCM43xx_MMIO_DMA4_BASE: - ring->rx_buffersize = BCM43xx_DMA4_RXBUFFERSIZE; - ring->frameoffset = BCM43xx_DMA4_RX_FRAMEOFFSET; - break; - default: + if (ring->index == 0) { + ring->rx_buffersize = BCM43xx_DMA0_RX_BUFFERSIZE; + ring->frameoffset = BCM43xx_DMA0_RX_FRAMEOFFSET; + } else if (ring->index == 3) { + ring->rx_buffersize = BCM43xx_DMA3_RX_BUFFERSIZE; + ring->frameoffset = BCM43xx_DMA3_RX_FRAMEOFFSET; + } else assert(0); - } } err = alloc_ringmemory(ring); @@ -514,7 +656,8 @@ static void bcm43xx_destroy_dmaring(stru if (!ring) return; - dprintk(KERN_INFO PFX "DMA 0x%04x (%s) max used slots: %d/%d\n", + dprintk(KERN_INFO PFX "DMA-%s 0x%04X (%s) max used slots: %d/%d\n", + (ring->dma64) ? "64" : "32", ring->mmio_base, (ring->tx) ? "TX" : "RX", ring->max_used_slots, ring->nr_slots); @@ -537,10 +680,15 @@ void bcm43xx_dma_free(struct bcm43xx_pri return; dma = bcm43xx_current_dma(bcm); - bcm43xx_destroy_dmaring(dma->rx_ring1); - dma->rx_ring1 = NULL; + bcm43xx_destroy_dmaring(dma->rx_ring3); + dma->rx_ring3 = NULL; bcm43xx_destroy_dmaring(dma->rx_ring0); dma->rx_ring0 = NULL; + + bcm43xx_destroy_dmaring(dma->tx_ring5); + dma->tx_ring5 = NULL; + bcm43xx_destroy_dmaring(dma->tx_ring4); + dma->tx_ring4 = NULL; bcm43xx_destroy_dmaring(dma->tx_ring3); dma->tx_ring3 = NULL; bcm43xx_destroy_dmaring(dma->tx_ring2); @@ -556,48 +704,59 @@ int bcm43xx_dma_init(struct bcm43xx_priv struct bcm43xx_dma *dma = bcm43xx_current_dma(bcm); struct bcm43xx_dmaring *ring; int err = -ENOMEM; + int dma64 = 0; + u32 sbtmstatehi; + + sbtmstatehi = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATEHIGH); + if (sbtmstatehi & BCM43xx_SBTMSTATEHIGH_DMA64BIT) + dma64 = 1; /* setup TX DMA channels. 
*/ - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA1_BASE, - BCM43xx_TXRING_SLOTS, 1); + ring = bcm43xx_setup_dmaring(bcm, 0, 1, dma64); if (!ring) goto out; dma->tx_ring0 = ring; - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA2_BASE, - BCM43xx_TXRING_SLOTS, 1); + ring = bcm43xx_setup_dmaring(bcm, 1, 1, dma64); if (!ring) goto err_destroy_tx0; dma->tx_ring1 = ring; - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA3_BASE, - BCM43xx_TXRING_SLOTS, 1); + ring = bcm43xx_setup_dmaring(bcm, 2, 1, dma64); if (!ring) goto err_destroy_tx1; dma->tx_ring2 = ring; - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA4_BASE, - BCM43xx_TXRING_SLOTS, 1); + ring = bcm43xx_setup_dmaring(bcm, 3, 1, dma64); if (!ring) goto err_destroy_tx2; dma->tx_ring3 = ring; - /* setup RX DMA channels. */ - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA1_BASE, - BCM43xx_RXRING_SLOTS, 0); + ring = bcm43xx_setup_dmaring(bcm, 4, 1, dma64); if (!ring) goto err_destroy_tx3; + dma->tx_ring4 = ring; + + ring = bcm43xx_setup_dmaring(bcm, 5, 1, dma64); + if (!ring) + goto err_destroy_tx4; + dma->tx_ring5 = ring; + + /* setup RX DMA channels. */ + ring = bcm43xx_setup_dmaring(bcm, 0, 0, dma64); + if (!ring) + goto err_destroy_tx5; dma->rx_ring0 = ring; if (bcm->current_core->rev < 5) { - ring = bcm43xx_setup_dmaring(bcm, BCM43xx_MMIO_DMA4_BASE, - BCM43xx_RXRING_SLOTS, 0); + ring = bcm43xx_setup_dmaring(bcm, 3, 0, dma64); if (!ring) goto err_destroy_rx0; - dma->rx_ring1 = ring; + dma->rx_ring3 = ring; } - dprintk(KERN_INFO PFX "DMA initialized\n"); + dprintk(KERN_INFO PFX "%s DMA initialized\n", + dma64 ? "64-bit" : "32-bit"); err = 0; out: return err; @@ -605,6 +764,12 @@ out: err_destroy_rx0: bcm43xx_destroy_dmaring(dma->rx_ring0); dma->rx_ring0 = NULL; +err_destroy_tx5: + bcm43xx_destroy_dmaring(dma->tx_ring5); + dma->tx_ring5 = NULL; +err_destroy_tx4: + bcm43xx_destroy_dmaring(dma->tx_ring4); + dma->tx_ring4 = NULL; err_destroy_tx3: bcm43xx_destroy_dmaring(dma->tx_ring3); dma->tx_ring3 = NULL; @@ -624,7 +789,7 @@ err_destroy_tx0: static u16 generate_cookie(struct bcm43xx_dmaring *ring, int slot) { - u16 cookie = 0xF000; + u16 cookie = 0x1000; /* Use the upper 4 bits of the cookie as * DMA controller ID and store the slot number @@ -632,21 +797,25 @@ static u16 generate_cookie(struct bcm43x * Note that the cookie must never be 0, as this * is a special value used in RX path. */ - switch (ring->mmio_base) { - default: - assert(0); - case BCM43xx_MMIO_DMA1_BASE: + switch (ring->index) { + case 0: cookie = 0xA000; break; - case BCM43xx_MMIO_DMA2_BASE: + case 1: cookie = 0xB000; break; - case BCM43xx_MMIO_DMA3_BASE: + case 2: cookie = 0xC000; break; - case BCM43xx_MMIO_DMA4_BASE: + case 3: cookie = 0xD000; break; + case 4: + cookie = 0xE000; + break; + case 5: + cookie = 0xF000; + break; } assert(((u16)slot & 0xF000) == 0x0000); cookie |= (u16)slot; @@ -675,6 +844,12 @@ struct bcm43xx_dmaring * parse_cookie(st case 0xD000: ring = dma->tx_ring3; break; + case 0xE000: + ring = dma->tx_ring4; + break; + case 0xF000: + ring = dma->tx_ring5; + break; default: assert(0); } @@ -687,6 +862,9 @@ struct bcm43xx_dmaring * parse_cookie(st static void dmacontroller_poke_tx(struct bcm43xx_dmaring *ring, int slot) { + u16 offset; + int descsize; + /* Everything is ready to start. Buffers are DMA mapped and * associated with slots. * "slot" is the last slot of the new frame we want to transmit. 
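The generate_cookie()/parse_cookie() hunks above pack the DMA controller index into the cookie's top nibble (0xA000 through 0xF000 for controllers 0 through 5) and the slot number into the low 12 bits. A minimal sketch of the round-trip, using only constants visible in these hunks (the sketch_* helper names are hypothetical, not part of the driver):

static u16 sketch_make_cookie(int controller_idx, int slot)
{
	/* 0xA000 for controller 0 ... 0xF000 for controller 5. */
	u16 cookie = (u16)((0xA + controller_idx) << 12);

	/* The slot number must fit into the low 12 bits. */
	assert(((u16)slot & 0xF000) == 0x0000);
	return cookie | (u16)slot;
}

static void sketch_split_cookie(u16 cookie, int *controller_idx, int *slot)
{
	*controller_idx = (int)(cookie >> 12) - 0xA;
	*slot = (int)(cookie & 0x0FFF);
}

Starting the controller IDs at 0xA keeps a cookie value of 0 impossible, preserving the rule noted in generate_cookie() that 0 is a special value reserved for the RX path.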
@@ -694,25 +872,26 @@ static void dmacontroller_poke_tx(struct */ wmb(); slot = next_slot(ring, slot); - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_DESC_INDEX, - (u32)(slot * sizeof(struct bcm43xx_dmadesc))); + offset = (ring->dma64) ? BCM43xx_DMA64_TXINDEX : BCM43xx_DMA32_TXINDEX; + descsize = (ring->dma64) ? sizeof(struct bcm43xx_dmadesc64) + : sizeof(struct bcm43xx_dmadesc32); + bcm43xx_dma_write(ring, offset, + (u32)(slot * descsize)); } -static int dma_tx_fragment(struct bcm43xx_dmaring *ring, - struct sk_buff *skb, - u8 cur_frag) +static void dma_tx_fragment(struct bcm43xx_dmaring *ring, + struct sk_buff *skb, + u8 cur_frag) { int slot; - struct bcm43xx_dmadesc *desc; + struct bcm43xx_dmadesc_generic *desc; struct bcm43xx_dmadesc_meta *meta; - u32 desc_ctl; - u32 desc_addr; + dma_addr_t dmaaddr; assert(skb_shinfo(skb)->nr_frags == 0); slot = request_slot(ring); - desc = ring->vbase + slot; - meta = ring->meta + slot; + desc = bcm43xx_dma_idx2desc(ring, slot, &meta); /* Add a device specific TX header. */ assert(skb_headroom(skb) >= sizeof(struct bcm43xx_txhdr)); @@ -729,29 +908,14 @@ static int dma_tx_fragment(struct bcm43x generate_cookie(ring, slot)); meta->skb = skb; - meta->dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1); - if (unlikely(meta->dmaaddr + skb->len > BCM43xx_DMA_BUSADDRMAX)) { - return_slot(ring, slot); - printk(KERN_ERR PFX ">>>FATAL ERROR<<< DMA TX SKB >1G " - "(0x%llx, len: %u)\n", - (unsigned long long)meta->dmaaddr, skb->len); - return -ENOMEM; - } + dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1); + meta->dmaaddr = dmaaddr; - desc_addr = (u32)(meta->dmaaddr + ring->memoffset); - desc_ctl = BCM43xx_DMADTOR_FRAMESTART | BCM43xx_DMADTOR_FRAMEEND; - desc_ctl |= BCM43xx_DMADTOR_COMPIRQ; - desc_ctl |= (BCM43xx_DMADTOR_BYTECNT_MASK & - (u32)(meta->skb->len - ring->frameoffset)); - if (slot == ring->nr_slots - 1) - desc_ctl |= BCM43xx_DMADTOR_DTABLEEND; + fill_descriptor(ring, desc, dmaaddr, + skb->len, 1, 1, 1); - set_desc_ctl(desc, desc_ctl); - set_desc_addr(desc, desc_addr); /* Now transfer the whole frame. */ dmacontroller_poke_tx(ring, slot); - - return 0; } int bcm43xx_dma_tx(struct bcm43xx_private *bcm, @@ -781,7 +945,6 @@ int bcm43xx_dma_tx(struct bcm43xx_privat /* Take skb from ieee80211_txb_free */ txb->fragments[i] = NULL; dma_tx_fragment(ring, skb, i); - //TODO: handle failure of dma_tx_fragment } ieee80211_txb_free(txb); @@ -792,23 +955,28 @@ void bcm43xx_dma_handle_xmitstatus(struc struct bcm43xx_xmitstatus *status) { struct bcm43xx_dmaring *ring; - struct bcm43xx_dmadesc *desc; + struct bcm43xx_dmadesc_generic *desc; struct bcm43xx_dmadesc_meta *meta; int is_last_fragment; int slot; + u32 tmp; ring = parse_cookie(bcm, status->cookie, &slot); assert(ring); assert(ring->tx); - assert(get_desc_ctl(ring->vbase + slot) & BCM43xx_DMADTOR_FRAMESTART); while (1) { assert(slot >= 0 && slot < ring->nr_slots); - desc = ring->vbase + slot; - meta = ring->meta + slot; + desc = bcm43xx_dma_idx2desc(ring, slot, &meta); - is_last_fragment = !!(get_desc_ctl(desc) & BCM43xx_DMADTOR_FRAMEEND); + if (ring->dma64) { + tmp = le32_to_cpu(desc->dma64.control0); + is_last_fragment = !!(tmp & BCM43xx_DMA64_DCTL0_FRAMEEND); + } else { + tmp = le32_to_cpu(desc->dma32.control); + is_last_fragment = !!(tmp & BCM43xx_DMA32_DCTL_FRAMEEND); + } unmap_descbuffer(ring, meta->dmaaddr, meta->skb->len, 1); - free_descriptor_buffer(ring, desc, meta, 1); + free_descriptor_buffer(ring, meta, 1); /* Everything belonging to the slot is unmapped * and freed, so we can return it. 
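dmacontroller_poke_tx() above multiplies the slot number by the descriptor size before writing it, because the hardware TX/RX index registers take byte offsets into ring memory rather than slot counts, and the 32-bit and 64-bit engines use different descriptor sizes. A sketch of the conversion (the helper name is hypothetical):

static u32 sketch_slot_to_hw_offset(struct bcm43xx_dmaring *ring, int slot)
{
	size_t descsize = ring->dma64 ? sizeof(struct bcm43xx_dmadesc64)
				      : sizeof(struct bcm43xx_dmadesc32);

	/* The hardware index registers expect byte offsets into the ring. */
	return (u32)(slot * descsize);
}

bcm43xx_dma_rx() below performs the inverse, dividing the status register's descriptor pointer by the same descriptor size to recover the current slot.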
*/ @@ -824,7 +992,7 @@ void bcm43xx_dma_handle_xmitstatus(struc static void dma_rx(struct bcm43xx_dmaring *ring, int *slot) { - struct bcm43xx_dmadesc *desc; + struct bcm43xx_dmadesc_generic *desc; struct bcm43xx_dmadesc_meta *meta; struct bcm43xx_rxhdr *rxhdr; struct sk_buff *skb; @@ -832,13 +1000,12 @@ static void dma_rx(struct bcm43xx_dmarin int err; dma_addr_t dmaaddr; - desc = ring->vbase + *slot; - meta = ring->meta + *slot; + desc = bcm43xx_dma_idx2desc(ring, *slot, &meta); sync_descbuffer_for_cpu(ring, meta->dmaaddr, ring->rx_buffersize); skb = meta->skb; - if (ring->mmio_base == BCM43xx_MMIO_DMA4_BASE) { + if (ring->index == 3) { /* We received an xmit status. */ struct bcm43xx_hwxmitstatus *hw = (struct bcm43xx_hwxmitstatus *)skb->data; struct bcm43xx_xmitstatus stat; @@ -894,8 +1061,7 @@ static void dma_rx(struct bcm43xx_dmarin s32 tmp = len; while (1) { - desc = ring->vbase + *slot; - meta = ring->meta + *slot; + desc = bcm43xx_dma_idx2desc(ring, *slot, &meta); /* recycle the descriptor buffer. */ sync_descbuffer_for_device(ring, meta->dmaaddr, ring->rx_buffersize); @@ -906,8 +1072,8 @@ static void dma_rx(struct bcm43xx_dmarin break; } printkl(KERN_ERR PFX "DMA RX buffer too small " - "(len: %u, buffer: %u, nr-dropped: %d)\n", - len, ring->rx_buffersize, cnt); + "(len: %u, buffer: %u, nr-dropped: %d)\n", + len, ring->rx_buffersize, cnt); goto drop; } len -= IEEE80211_FCS_LEN; @@ -945,9 +1111,15 @@ void bcm43xx_dma_rx(struct bcm43xx_dmari #endif assert(!ring->tx); - status = bcm43xx_dma_read(ring, BCM43xx_DMA_RX_STATUS); - descptr = (status & BCM43xx_DMA_RXSTAT_DPTR_MASK); - current_slot = descptr / sizeof(struct bcm43xx_dmadesc); + if (ring->dma64) { + status = bcm43xx_dma_read(ring, BCM43xx_DMA64_RXSTATUS); + descptr = (status & BCM43xx_DMA64_RXSTATDPTR); + current_slot = descptr / sizeof(struct bcm43xx_dmadesc64); + } else { + status = bcm43xx_dma_read(ring, BCM43xx_DMA32_RXSTATUS); + descptr = (status & BCM43xx_DMA32_RXDPTR); + current_slot = descptr / sizeof(struct bcm43xx_dmadesc32); + } assert(current_slot >= 0 && current_slot < ring->nr_slots); slot = ring->current_slot; @@ -958,8 +1130,13 @@ void bcm43xx_dma_rx(struct bcm43xx_dmari ring->max_used_slots = used_slots; #endif } - bcm43xx_dma_write(ring, BCM43xx_DMA_RX_DESC_INDEX, - (u32)(slot * sizeof(struct bcm43xx_dmadesc))); + if (ring->dma64) { + bcm43xx_dma_write(ring, BCM43xx_DMA64_RXINDEX, + (u32)(slot * sizeof(struct bcm43xx_dmadesc64))); + } else { + bcm43xx_dma_write(ring, BCM43xx_DMA32_RXINDEX, + (u32)(slot * sizeof(struct bcm43xx_dmadesc32))); + } ring->current_slot = slot; } @@ -967,16 +1144,28 @@ void bcm43xx_dma_tx_suspend(struct bcm43 { assert(ring->tx); bcm43xx_power_saving_ctl_bits(ring->bcm, -1, 1); - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_CONTROL, - bcm43xx_dma_read(ring, BCM43xx_DMA_TX_CONTROL) - | BCM43xx_DMA_TXCTRL_SUSPEND); + if (ring->dma64) { + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXCTL, + bcm43xx_dma_read(ring, BCM43xx_DMA64_TXCTL) + | BCM43xx_DMA64_TXSUSPEND); + } else { + bcm43xx_dma_write(ring, BCM43xx_DMA32_TXCTL, + bcm43xx_dma_read(ring, BCM43xx_DMA32_TXCTL) + | BCM43xx_DMA32_TXSUSPEND); + } } void bcm43xx_dma_tx_resume(struct bcm43xx_dmaring *ring) { assert(ring->tx); - bcm43xx_dma_write(ring, BCM43xx_DMA_TX_CONTROL, - bcm43xx_dma_read(ring, BCM43xx_DMA_TX_CONTROL) - & ~BCM43xx_DMA_TXCTRL_SUSPEND); + if (ring->dma64) { + bcm43xx_dma_write(ring, BCM43xx_DMA64_TXCTL, + bcm43xx_dma_read(ring, BCM43xx_DMA64_TXCTL) + & ~BCM43xx_DMA64_TXSUSPEND); + } else { + bcm43xx_dma_write(ring, 
BCM43xx_DMA32_TXCTL, + bcm43xx_dma_read(ring, BCM43xx_DMA32_TXCTL) + & ~BCM43xx_DMA32_TXSUSPEND); + } bcm43xx_power_saving_ctl_bits(ring->bcm, -1, -1); } diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_dma.h linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_dma.h --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_dma.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_dma.h 2007-06-13 06:55:07.000000000 -0400 @@ -14,63 +14,179 @@ #define BCM43xx_DMAIRQ_NONFATALMASK (1 << 13) #define BCM43xx_DMAIRQ_RX_DONE (1 << 16) -/* DMA controller register offsets. (relative to BCM43xx_DMA#_BASE) */ -#define BCM43xx_DMA_TX_CONTROL 0x00 -#define BCM43xx_DMA_TX_DESC_RING 0x04 -#define BCM43xx_DMA_TX_DESC_INDEX 0x08 -#define BCM43xx_DMA_TX_STATUS 0x0c -#define BCM43xx_DMA_RX_CONTROL 0x10 -#define BCM43xx_DMA_RX_DESC_RING 0x14 -#define BCM43xx_DMA_RX_DESC_INDEX 0x18 -#define BCM43xx_DMA_RX_STATUS 0x1c - -/* DMA controller channel control word values. */ -#define BCM43xx_DMA_TXCTRL_ENABLE (1 << 0) -#define BCM43xx_DMA_TXCTRL_SUSPEND (1 << 1) -#define BCM43xx_DMA_TXCTRL_LOOPBACK (1 << 2) -#define BCM43xx_DMA_TXCTRL_FLUSH (1 << 4) -#define BCM43xx_DMA_RXCTRL_ENABLE (1 << 0) -#define BCM43xx_DMA_RXCTRL_FRAMEOFF_MASK 0x000000fe -#define BCM43xx_DMA_RXCTRL_FRAMEOFF_SHIFT 1 -#define BCM43xx_DMA_RXCTRL_PIO (1 << 8) -/* DMA controller channel status word values. */ -#define BCM43xx_DMA_TXSTAT_DPTR_MASK 0x00000fff -#define BCM43xx_DMA_TXSTAT_STAT_MASK 0x0000f000 -#define BCM43xx_DMA_TXSTAT_STAT_DISABLED 0x00000000 -#define BCM43xx_DMA_TXSTAT_STAT_ACTIVE 0x00001000 -#define BCM43xx_DMA_TXSTAT_STAT_IDLEWAIT 0x00002000 -#define BCM43xx_DMA_TXSTAT_STAT_STOPPED 0x00003000 -#define BCM43xx_DMA_TXSTAT_STAT_SUSP 0x00004000 -#define BCM43xx_DMA_TXSTAT_ERROR_MASK 0x000f0000 -#define BCM43xx_DMA_TXSTAT_FLUSHED (1 << 20) -#define BCM43xx_DMA_RXSTAT_DPTR_MASK 0x00000fff -#define BCM43xx_DMA_RXSTAT_STAT_MASK 0x0000f000 -#define BCM43xx_DMA_RXSTAT_STAT_DISABLED 0x00000000 -#define BCM43xx_DMA_RXSTAT_STAT_ACTIVE 0x00001000 -#define BCM43xx_DMA_RXSTAT_STAT_IDLEWAIT 0x00002000 -#define BCM43xx_DMA_RXSTAT_STAT_RESERVED 0x00003000 -#define BCM43xx_DMA_RXSTAT_STAT_ERRORS 0x00004000 -#define BCM43xx_DMA_RXSTAT_ERROR_MASK 0x000f0000 - -/* DMA descriptor control field values. */ -#define BCM43xx_DMADTOR_BYTECNT_MASK 0x00001fff -#define BCM43xx_DMADTOR_DTABLEEND (1 << 28) /* End of descriptor table */ -#define BCM43xx_DMADTOR_COMPIRQ (1 << 29) /* IRQ on completion request */ -#define BCM43xx_DMADTOR_FRAMEEND (1 << 30) -#define BCM43xx_DMADTOR_FRAMESTART (1 << 31) + +/*** 32-bit DMA Engine. ***/ + +/* 32-bit DMA controller registers. 
*/ +#define BCM43xx_DMA32_TXCTL 0x00 +#define BCM43xx_DMA32_TXENABLE 0x00000001 +#define BCM43xx_DMA32_TXSUSPEND 0x00000002 +#define BCM43xx_DMA32_TXLOOPBACK 0x00000004 +#define BCM43xx_DMA32_TXFLUSH 0x00000010 +#define BCM43xx_DMA32_TXADDREXT_MASK 0x00030000 +#define BCM43xx_DMA32_TXADDREXT_SHIFT 16 +#define BCM43xx_DMA32_TXRING 0x04 +#define BCM43xx_DMA32_TXINDEX 0x08 +#define BCM43xx_DMA32_TXSTATUS 0x0C +#define BCM43xx_DMA32_TXDPTR 0x00000FFF +#define BCM43xx_DMA32_TXSTATE 0x0000F000 +#define BCM43xx_DMA32_TXSTAT_DISABLED 0x00000000 +#define BCM43xx_DMA32_TXSTAT_ACTIVE 0x00001000 +#define BCM43xx_DMA32_TXSTAT_IDLEWAIT 0x00002000 +#define BCM43xx_DMA32_TXSTAT_STOPPED 0x00003000 +#define BCM43xx_DMA32_TXSTAT_SUSP 0x00004000 +#define BCM43xx_DMA32_TXERROR 0x000F0000 +#define BCM43xx_DMA32_TXERR_NOERR 0x00000000 +#define BCM43xx_DMA32_TXERR_PROT 0x00010000 +#define BCM43xx_DMA32_TXERR_UNDERRUN 0x00020000 +#define BCM43xx_DMA32_TXERR_BUFREAD 0x00030000 +#define BCM43xx_DMA32_TXERR_DESCREAD 0x00040000 +#define BCM43xx_DMA32_TXACTIVE 0xFFF00000 +#define BCM43xx_DMA32_RXCTL 0x10 +#define BCM43xx_DMA32_RXENABLE 0x00000001 +#define BCM43xx_DMA32_RXFROFF_MASK 0x000000FE +#define BCM43xx_DMA32_RXFROFF_SHIFT 1 +#define BCM43xx_DMA32_RXDIRECTFIFO 0x00000100 +#define BCM43xx_DMA32_RXADDREXT_MASK 0x00030000 +#define BCM43xx_DMA32_RXADDREXT_SHIFT 16 +#define BCM43xx_DMA32_RXRING 0x14 +#define BCM43xx_DMA32_RXINDEX 0x18 +#define BCM43xx_DMA32_RXSTATUS 0x1C +#define BCM43xx_DMA32_RXDPTR 0x00000FFF +#define BCM43xx_DMA32_RXSTATE 0x0000F000 +#define BCM43xx_DMA32_RXSTAT_DISABLED 0x00000000 +#define BCM43xx_DMA32_RXSTAT_ACTIVE 0x00001000 +#define BCM43xx_DMA32_RXSTAT_IDLEWAIT 0x00002000 +#define BCM43xx_DMA32_RXSTAT_STOPPED 0x00003000 +#define BCM43xx_DMA32_RXERROR 0x000F0000 +#define BCM43xx_DMA32_RXERR_NOERR 0x00000000 +#define BCM43xx_DMA32_RXERR_PROT 0x00010000 +#define BCM43xx_DMA32_RXERR_OVERFLOW 0x00020000 +#define BCM43xx_DMA32_RXERR_BUFWRITE 0x00030000 +#define BCM43xx_DMA32_RXERR_DESCREAD 0x00040000 +#define BCM43xx_DMA32_RXACTIVE 0xFFF00000 + +/* 32-bit DMA descriptor. */ +struct bcm43xx_dmadesc32 { + __le32 control; + __le32 address; +} __attribute__((__packed__)); +#define BCM43xx_DMA32_DCTL_BYTECNT 0x00001FFF +#define BCM43xx_DMA32_DCTL_ADDREXT_MASK 0x00030000 +#define BCM43xx_DMA32_DCTL_ADDREXT_SHIFT 16 +#define BCM43xx_DMA32_DCTL_DTABLEEND 0x10000000 +#define BCM43xx_DMA32_DCTL_IRQ 0x20000000 +#define BCM43xx_DMA32_DCTL_FRAMEEND 0x40000000 +#define BCM43xx_DMA32_DCTL_FRAMESTART 0x80000000 + +/* Address field Routing value. */ +#define BCM43xx_DMA32_ROUTING 0xC0000000 +#define BCM43xx_DMA32_ROUTING_SHIFT 30 +#define BCM43xx_DMA32_NOTRANS 0x00000000 +#define BCM43xx_DMA32_CLIENTTRANS 0x40000000 + + + +/*** 64-bit DMA Engine. ***/ + +/* 64-bit DMA controller registers. 
*/ +#define BCM43xx_DMA64_TXCTL 0x00 +#define BCM43xx_DMA64_TXENABLE 0x00000001 +#define BCM43xx_DMA64_TXSUSPEND 0x00000002 +#define BCM43xx_DMA64_TXLOOPBACK 0x00000004 +#define BCM43xx_DMA64_TXFLUSH 0x00000010 +#define BCM43xx_DMA64_TXADDREXT_MASK 0x00030000 +#define BCM43xx_DMA64_TXADDREXT_SHIFT 16 +#define BCM43xx_DMA64_TXINDEX 0x04 +#define BCM43xx_DMA64_TXRINGLO 0x08 +#define BCM43xx_DMA64_TXRINGHI 0x0C +#define BCM43xx_DMA64_TXSTATUS 0x10 +#define BCM43xx_DMA64_TXSTATDPTR 0x00001FFF +#define BCM43xx_DMA64_TXSTAT 0xF0000000 +#define BCM43xx_DMA64_TXSTAT_DISABLED 0x00000000 +#define BCM43xx_DMA64_TXSTAT_ACTIVE 0x10000000 +#define BCM43xx_DMA64_TXSTAT_IDLEWAIT 0x20000000 +#define BCM43xx_DMA64_TXSTAT_STOPPED 0x30000000 +#define BCM43xx_DMA64_TXSTAT_SUSP 0x40000000 +#define BCM43xx_DMA64_TXERROR 0x14 +#define BCM43xx_DMA64_TXERRDPTR 0x0001FFFF +#define BCM43xx_DMA64_TXERR 0xF0000000 +#define BCM43xx_DMA64_TXERR_NOERR 0x00000000 +#define BCM43xx_DMA64_TXERR_PROT 0x10000000 +#define BCM43xx_DMA64_TXERR_UNDERRUN 0x20000000 +#define BCM43xx_DMA64_TXERR_TRANSFER 0x30000000 +#define BCM43xx_DMA64_TXERR_DESCREAD 0x40000000 +#define BCM43xx_DMA64_TXERR_CORE 0x50000000 +#define BCM43xx_DMA64_RXCTL 0x20 +#define BCM43xx_DMA64_RXENABLE 0x00000001 +#define BCM43xx_DMA64_RXFROFF_MASK 0x000000FE +#define BCM43xx_DMA64_RXFROFF_SHIFT 1 +#define BCM43xx_DMA64_RXDIRECTFIFO 0x00000100 +#define BCM43xx_DMA64_RXADDREXT_MASK 0x00030000 +#define BCM43xx_DMA64_RXADDREXT_SHIFT 16 +#define BCM43xx_DMA64_RXINDEX 0x24 +#define BCM43xx_DMA64_RXRINGLO 0x28 +#define BCM43xx_DMA64_RXRINGHI 0x2C +#define BCM43xx_DMA64_RXSTATUS 0x30 +#define BCM43xx_DMA64_RXSTATDPTR 0x00001FFF +#define BCM43xx_DMA64_RXSTAT 0xF0000000 +#define BCM43xx_DMA64_RXSTAT_DISABLED 0x00000000 +#define BCM43xx_DMA64_RXSTAT_ACTIVE 0x10000000 +#define BCM43xx_DMA64_RXSTAT_IDLEWAIT 0x20000000 +#define BCM43xx_DMA64_RXSTAT_STOPPED 0x30000000 +#define BCM43xx_DMA64_RXSTAT_SUSP 0x40000000 +#define BCM43xx_DMA64_RXERROR 0x34 +#define BCM43xx_DMA64_RXERRDPTR 0x0001FFFF +#define BCM43xx_DMA64_RXERR 0xF0000000 +#define BCM43xx_DMA64_RXERR_NOERR 0x00000000 +#define BCM43xx_DMA64_RXERR_PROT 0x10000000 +#define BCM43xx_DMA64_RXERR_UNDERRUN 0x20000000 +#define BCM43xx_DMA64_RXERR_TRANSFER 0x30000000 +#define BCM43xx_DMA64_RXERR_DESCREAD 0x40000000 +#define BCM43xx_DMA64_RXERR_CORE 0x50000000 + +/* 64-bit DMA descriptor. */ +struct bcm43xx_dmadesc64 { + __le32 control0; + __le32 control1; + __le32 address_low; + __le32 address_high; +} __attribute__((__packed__)); +#define BCM43xx_DMA64_DCTL0_DTABLEEND 0x10000000 +#define BCM43xx_DMA64_DCTL0_IRQ 0x20000000 +#define BCM43xx_DMA64_DCTL0_FRAMEEND 0x40000000 +#define BCM43xx_DMA64_DCTL0_FRAMESTART 0x80000000 +#define BCM43xx_DMA64_DCTL1_BYTECNT 0x00001FFF +#define BCM43xx_DMA64_DCTL1_ADDREXT_MASK 0x00030000 +#define BCM43xx_DMA64_DCTL1_ADDREXT_SHIFT 16 + +/* Address field Routing value. 
*/ +#define BCM43xx_DMA64_ROUTING 0xC0000000 +#define BCM43xx_DMA64_ROUTING_SHIFT 30 +#define BCM43xx_DMA64_NOTRANS 0x00000000 +#define BCM43xx_DMA64_CLIENTTRANS 0x80000000 + + + +struct bcm43xx_dmadesc_generic { + union { + struct bcm43xx_dmadesc32 dma32; + struct bcm43xx_dmadesc64 dma64; + } __attribute__((__packed__)); +} __attribute__((__packed__)); + /* Misc DMA constants */ #define BCM43xx_DMA_RINGMEMSIZE PAGE_SIZE -#define BCM43xx_DMA_BUSADDRMAX 0x3FFFFFFF -#define BCM43xx_DMA_DMABUSADDROFFSET (1 << 30) -#define BCM43xx_DMA1_RX_FRAMEOFFSET 30 -#define BCM43xx_DMA4_RX_FRAMEOFFSET 0 +#define BCM43xx_DMA0_RX_FRAMEOFFSET 30 +#define BCM43xx_DMA3_RX_FRAMEOFFSET 0 + /* DMA engine tuning knobs */ #define BCM43xx_TXRING_SLOTS 512 #define BCM43xx_RXRING_SLOTS 64 -#define BCM43xx_DMA1_RXBUFFERSIZE (2304 + 100) -#define BCM43xx_DMA4_RXBUFFERSIZE 16 +#define BCM43xx_DMA0_RX_BUFFERSIZE (2304 + 100) +#define BCM43xx_DMA3_RX_BUFFERSIZE 16 /* Suspend the tx queue, if less than this percent slots are free. */ #define BCM43xx_TXSUSPEND_PERCENT 20 /* Resume the tx queue, if more than this percent slots are free. */ @@ -86,17 +202,6 @@ struct bcm43xx_private; struct bcm43xx_xmitstatus; -struct bcm43xx_dmadesc { - __le32 _control; - __le32 _address; -} __attribute__((__packed__)); - -/* Macros to access the bcm43xx_dmadesc struct */ -#define get_desc_ctl(desc) le32_to_cpu((desc)->_control) -#define set_desc_ctl(desc, ctl) do { (desc)->_control = cpu_to_le32(ctl); } while (0) -#define get_desc_addr(desc) le32_to_cpu((desc)->_address) -#define set_desc_addr(desc, addr) do { (desc)->_address = cpu_to_le32(addr); } while (0) - struct bcm43xx_dmadesc_meta { /* The kernel DMA-able buffer. */ struct sk_buff *skb; @@ -105,15 +210,14 @@ struct bcm43xx_dmadesc_meta { }; struct bcm43xx_dmaring { - struct bcm43xx_private *bcm; /* Kernel virtual base address of the ring memory. */ - struct bcm43xx_dmadesc *vbase; - /* DMA memory offset */ - dma_addr_t memoffset; - /* (Unadjusted) DMA base bus-address of the ring memory. */ - dma_addr_t dmabase; + void *descbase; /* Meta data about all descriptors. */ struct bcm43xx_dmadesc_meta *meta; + /* DMA Routing value. */ + u32 routing; + /* (Unadjusted) DMA base bus-address of the ring memory. */ + dma_addr_t dmabase; /* Number of descriptor slots in the ring. */ int nr_slots; /* Number of used descriptor slots. */ @@ -127,12 +231,17 @@ struct bcm43xx_dmaring { u32 frameoffset; /* Descriptor buffer size. */ u16 rx_buffersize; - /* The MMIO base register of the DMA controller, this - * ring is posted to. - */ + /* The MMIO base register of the DMA controller. */ u16 mmio_base; - u8 tx:1, /* TRUE, if this is a TX ring. */ - suspended:1; /* TRUE, if transfers are suspended on this ring. */ + /* DMA controller index number (0-5). */ + int index; + /* Boolean. Is this a TX ring? */ + u8 tx; + /* Boolean. 64bit DMA if true, 32bit DMA otherwise. */ + u8 dma64; + /* Boolean. Are transfers suspended on this ring? */ + u8 suspended; + struct bcm43xx_private *bcm; #ifdef CONFIG_BCM43XX_DEBUG /* Maximum number of used slots. 
*/ int max_used_slots; @@ -141,6 +250,34 @@ struct bcm43xx_dmaring { static inline +int bcm43xx_dma_desc2idx(struct bcm43xx_dmaring *ring, + struct bcm43xx_dmadesc_generic *desc) +{ + if (ring->dma64) { + struct bcm43xx_dmadesc64 *dd64 = ring->descbase; + return (int)(&(desc->dma64) - dd64); + } else { + struct bcm43xx_dmadesc32 *dd32 = ring->descbase; + return (int)(&(desc->dma32) - dd32); + } +} + +static inline +struct bcm43xx_dmadesc_generic * bcm43xx_dma_idx2desc(struct bcm43xx_dmaring *ring, + int slot, + struct bcm43xx_dmadesc_meta **meta) +{ + *meta = &(ring->meta[slot]); + if (ring->dma64) { + struct bcm43xx_dmadesc64 *dd64 = ring->descbase; + return (struct bcm43xx_dmadesc_generic *)(&(dd64[slot])); + } else { + struct bcm43xx_dmadesc32 *dd32 = ring->descbase; + return (struct bcm43xx_dmadesc_generic *)(&(dd32[slot])); + } +} + +static inline u32 bcm43xx_dma_read(struct bcm43xx_dmaring *ring, u16 offset) { @@ -159,9 +296,13 @@ int bcm43xx_dma_init(struct bcm43xx_priv void bcm43xx_dma_free(struct bcm43xx_private *bcm); int bcm43xx_dmacontroller_rx_reset(struct bcm43xx_private *bcm, - u16 dmacontroller_mmio_base); + u16 dmacontroller_mmio_base, + int dma64); int bcm43xx_dmacontroller_tx_reset(struct bcm43xx_private *bcm, - u16 dmacontroller_mmio_base); + u16 dmacontroller_mmio_base, + int dma64); + +u16 bcm43xx_dmacontroller_base(int dma64bit, int dmacontroller_idx); void bcm43xx_dma_tx_suspend(struct bcm43xx_dmaring *ring); void bcm43xx_dma_tx_resume(struct bcm43xx_dmaring *ring); @@ -173,7 +314,6 @@ int bcm43xx_dma_tx(struct bcm43xx_privat struct ieee80211_txb *txb); void bcm43xx_dma_rx(struct bcm43xx_dmaring *ring); - #else /* CONFIG_BCM43XX_DMA */ @@ -188,13 +328,15 @@ void bcm43xx_dma_free(struct bcm43xx_pri } static inline int bcm43xx_dmacontroller_rx_reset(struct bcm43xx_private *bcm, - u16 dmacontroller_mmio_base) + u16 dmacontroller_mmio_base, + int dma64) { return 0; } static inline int bcm43xx_dmacontroller_tx_reset(struct bcm43xx_private *bcm, - u16 dmacontroller_mmio_base) + u16 dmacontroller_mmio_base, + int dma64) { return 0; } diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_leds.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_leds.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_leds.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_leds.c 2007-06-13 06:55:07.000000000 -0400 @@ -51,12 +51,12 @@ static void bcm43xx_led_blink(unsigned l struct bcm43xx_private *bcm = led->bcm; unsigned long flags; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->leds_lock, flags); if (led->blink_interval) { bcm43xx_led_changestate(led); mod_timer(&led->blink_timer, jiffies + led->blink_interval); } - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->leds_lock, flags); } static void bcm43xx_led_blink_start(struct bcm43xx_led *led, @@ -177,7 +177,9 @@ void bcm43xx_leds_update(struct bcm43xx_ int i, turn_on; unsigned long interval = 0; u16 ledctl; + unsigned long flags; + spin_lock_irqsave(&bcm->leds_lock, flags); ledctl = bcm43xx_read16(bcm, BCM43xx_MMIO_GPIO_CONTROL); for (i = 0; i < BCM43xx_NR_LEDS; i++) { led = &(bcm->leds[i]); @@ -266,6 +268,7 @@ void bcm43xx_leds_update(struct bcm43xx_ ledctl &= ~(1 << i); } bcm43xx_write16(bcm, BCM43xx_MMIO_GPIO_CONTROL, ledctl); + spin_unlock_irqrestore(&bcm->leds_lock, flags); } void bcm43xx_leds_switch_all(struct bcm43xx_private *bcm, int on) @@ -274,7 +277,9 @@ void bcm43xx_leds_switch_all(struct bcm4 u16 ledctl; int i; int bit_on; + unsigned 
long flags; + spin_lock_irqsave(&bcm->leds_lock, flags); ledctl = bcm43xx_read16(bcm, BCM43xx_MMIO_GPIO_CONTROL); for (i = 0; i < BCM43xx_NR_LEDS; i++) { led = &(bcm->leds[i]); @@ -290,4 +295,5 @@ void bcm43xx_leds_switch_all(struct bcm4 ledctl &= ~(1 << i); } bcm43xx_write16(bcm, BCM43xx_MMIO_GPIO_CONTROL, ledctl); + spin_unlock_irqrestore(&bcm->leds_lock, flags); } diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_main.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_main.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_main.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_main.c 2007-06-13 06:55:07.000000000 -0400 @@ -509,23 +509,20 @@ static void bcm43xx_synchronize_irq(stru } /* Make sure we don't receive more data from the device. */ -static int bcm43xx_disable_interrupts_sync(struct bcm43xx_private *bcm, u32 *oldstate) +static int bcm43xx_disable_interrupts_sync(struct bcm43xx_private *bcm) { unsigned long flags; - u32 old; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); if (unlikely(bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED)) { - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); return -EBUSY; } - old = bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); - bcm43xx_unlock_irqonly(bcm, flags); + bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); + bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_MASK); /* flush */ + spin_unlock_irqrestore(&bcm->irq_lock, flags); bcm43xx_synchronize_irq(bcm); - if (oldstate) - *oldstate = old; - return 0; } @@ -537,7 +534,6 @@ static int bcm43xx_read_radioinfo(struct u16 manufact; u16 version; u8 revision; - s8 i; if (bcm->chip_id == 0x4317) { if (bcm->chip_rev == 0x00) @@ -580,20 +576,11 @@ static int bcm43xx_read_radioinfo(struct radio->version = version; radio->revision = revision; - /* Set default attenuation values. */ - radio->baseband_atten = bcm43xx_default_baseband_attenuation(bcm); - radio->radio_atten = bcm43xx_default_radio_attenuation(bcm); - radio->txctl1 = bcm43xx_default_txctl1(bcm); - radio->txctl2 = 0xFFFF; if (phy->type == BCM43xx_PHYTYPE_A) radio->txpower_desired = bcm->sprom.maxpower_aphy; else radio->txpower_desired = bcm->sprom.maxpower_bgphy; - /* Initialize the in-memory nrssi Lookup Table. */ - for (i = 0; i < 64; i++) - radio->nrssi_lt[i] = i; - return 0; err_unsupported_radio: @@ -1250,10 +1237,6 @@ int bcm43xx_switch_core(struct bcm43xx_p goto out; bcm->current_core = new_core; - bcm->current_80211_core_idx = -1; - if (new_core->id == BCM43xx_COREID_80211) - bcm->current_80211_core_idx = (int)(new_core - &(bcm->core_80211[0])); - out: return err; } @@ -1389,6 +1372,7 @@ void bcm43xx_wireless_core_reset(struct if ((bcm43xx_core_enabled(bcm)) && !bcm43xx_using_pio(bcm)) { //FIXME: Do we _really_ want #ifndef CONFIG_BCM947XX here? +#if 0 #ifndef CONFIG_BCM947XX /* reset all used DMA controllers. */ bcm43xx_dmacontroller_tx_reset(bcm, BCM43xx_MMIO_DMA1_BASE); @@ -1399,6 +1383,7 @@ void bcm43xx_wireless_core_reset(struct if (bcm->current_core->rev < 5) bcm43xx_dmacontroller_rx_reset(bcm, BCM43xx_MMIO_DMA4_BASE); #endif +#endif } if (bcm43xx_status(bcm) == BCM43xx_STAT_SHUTTINGDOWN) { bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, @@ -1423,43 +1408,23 @@ static void bcm43xx_wireless_core_disabl bcm43xx_core_disable(bcm, 0); } -/* Mark the current 80211 core inactive. - * "active_80211_core" is the other 80211 core, which is used. 
- */ -static int bcm43xx_wireless_core_mark_inactive(struct bcm43xx_private *bcm, - struct bcm43xx_coreinfo *active_80211_core) +/* Mark the current 80211 core inactive. */ +static void bcm43xx_wireless_core_mark_inactive(struct bcm43xx_private *bcm) { u32 sbtmstatelow; - struct bcm43xx_coreinfo *old_core; - int err = 0; bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); bcm43xx_radio_turn_off(bcm); sbtmstatelow = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATELOW); - sbtmstatelow &= ~0x200a0000; - sbtmstatelow |= 0xa0000; + sbtmstatelow &= 0xDFF5FFFF; + sbtmstatelow |= 0x000A0000; bcm43xx_write32(bcm, BCM43xx_CIR_SBTMSTATELOW, sbtmstatelow); udelay(1); sbtmstatelow = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATELOW); - sbtmstatelow &= ~0xa0000; - sbtmstatelow |= 0x80000; + sbtmstatelow &= 0xFFF5FFFF; + sbtmstatelow |= 0x00080000; bcm43xx_write32(bcm, BCM43xx_CIR_SBTMSTATELOW, sbtmstatelow); udelay(1); - - if (bcm43xx_current_phy(bcm)->type == BCM43xx_PHYTYPE_G) { - old_core = bcm->current_core; - err = bcm43xx_switch_core(bcm, active_80211_core); - if (err) - goto out; - sbtmstatelow = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATELOW); - sbtmstatelow &= ~0x20000000; - sbtmstatelow |= 0x20000000; - bcm43xx_write32(bcm, BCM43xx_CIR_SBTMSTATELOW, sbtmstatelow); - err = bcm43xx_switch_core(bcm, old_core); - } - -out: - return err; } static void handle_irq_transmit_status(struct bcm43xx_private *bcm) @@ -1484,12 +1449,10 @@ static void handle_irq_transmit_status(s bcm43xx_debugfs_log_txstat(bcm, &stat); - if (stat.flags & BCM43xx_TXSTAT_FLAG_IGNORE) + if (stat.flags & BCM43xx_TXSTAT_FLAG_AMPDU) + continue; + if (stat.flags & BCM43xx_TXSTAT_FLAG_INTER) continue; - if (!(stat.flags & BCM43xx_TXSTAT_FLAG_ACK)) { - //TODO: packet was not acked (was lost) - } - //TODO: There are more (unknown) flags to test. see bcm43xx_main.h if (bcm43xx_using_pio(bcm)) bcm43xx_pio_handle_xmitstatus(bcm, &stat); @@ -1498,6 +1461,23 @@ static void handle_irq_transmit_status(s } } +static void drain_txstatus_queue(struct bcm43xx_private *bcm) +{ + u32 dummy; + + if (bcm->current_core->rev < 5) + return; + /* Read all entries from the microcode TXstatus FIFO + * and throw them away. + */ + while (1) { + dummy = bcm43xx_read32(bcm, BCM43xx_MMIO_XMITSTAT_0); + if (!dummy) + break; + dummy = bcm43xx_read32(bcm, BCM43xx_MMIO_XMITSTAT_1); + } +} + static void bcm43xx_generate_noise_sample(struct bcm43xx_private *bcm) { bcm43xx_shm_write16(bcm, BCM43xx_SHM_SHARED, 0x408, 0x7F7F); @@ -1581,17 +1561,7 @@ static void handle_irq_noise(struct bcm4 else average -= 48; -/* FIXME: This is wrong, but people want fancy stats. well... 
*/ -bcm->stats.noise = average; - if (average > -65) - bcm->stats.link_quality = 0; - else if (average > -75) - bcm->stats.link_quality = 1; - else if (average > -85) - bcm->stats.link_quality = 2; - else - bcm->stats.link_quality = 3; -// dprintk(KERN_INFO PFX "Link Quality: %u (avg was %d)\n", bcm->stats.link_quality, average); + bcm->stats.noise = average; drop_calculation: bcm->noisecalc.calculation_running = 0; return; @@ -1709,8 +1679,9 @@ static void handle_irq_beacon(struct bcm static void bcm43xx_interrupt_tasklet(struct bcm43xx_private *bcm) { u32 reason; - u32 dma_reason[4]; - int activity = 0; + u32 dma_reason[6]; + u32 merged_dma_reason = 0; + int i, activity = 0; unsigned long flags; #ifdef CONFIG_BCM43XX_DEBUG @@ -1720,12 +1691,12 @@ static void bcm43xx_interrupt_tasklet(st # define bcmirq_handled(irq) do { /* nothing */ } while (0) #endif /* CONFIG_BCM43XX_DEBUG*/ - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); reason = bcm->irq_reason; - dma_reason[0] = bcm->dma_reason[0]; - dma_reason[1] = bcm->dma_reason[1]; - dma_reason[2] = bcm->dma_reason[2]; - dma_reason[3] = bcm->dma_reason[3]; + for (i = 5; i >= 0; i--) { + dma_reason[i] = bcm->dma_reason[i]; + merged_dma_reason |= dma_reason[i]; + } if (unlikely(reason & BCM43xx_IRQ_XMIT_ERROR)) { /* TX error. We get this when Template RAM is written in wrong endianness @@ -1736,27 +1707,25 @@ static void bcm43xx_interrupt_tasklet(st printkl(KERN_ERR PFX "FATAL ERROR: BCM43xx_IRQ_XMIT_ERROR\n"); bcmirq_handled(BCM43xx_IRQ_XMIT_ERROR); } - if (unlikely((dma_reason[0] & BCM43xx_DMAIRQ_FATALMASK) | - (dma_reason[1] & BCM43xx_DMAIRQ_FATALMASK) | - (dma_reason[2] & BCM43xx_DMAIRQ_FATALMASK) | - (dma_reason[3] & BCM43xx_DMAIRQ_FATALMASK))) { + if (unlikely(merged_dma_reason & BCM43xx_DMAIRQ_FATALMASK)) { printkl(KERN_ERR PFX "FATAL ERROR: Fatal DMA error: " - "0x%08X, 0x%08X, 0x%08X, 0x%08X\n", + "0x%08X, 0x%08X, 0x%08X, " + "0x%08X, 0x%08X, 0x%08X\n", dma_reason[0], dma_reason[1], - dma_reason[2], dma_reason[3]); + dma_reason[2], dma_reason[3], + dma_reason[4], dma_reason[5]); bcm43xx_controller_restart(bcm, "DMA error"); mmiowb(); - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); return; } - if (unlikely((dma_reason[0] & BCM43xx_DMAIRQ_NONFATALMASK) | - (dma_reason[1] & BCM43xx_DMAIRQ_NONFATALMASK) | - (dma_reason[2] & BCM43xx_DMAIRQ_NONFATALMASK) | - (dma_reason[3] & BCM43xx_DMAIRQ_NONFATALMASK))) { + if (unlikely(merged_dma_reason & BCM43xx_DMAIRQ_NONFATALMASK)) { printkl(KERN_ERR PFX "DMA error: " - "0x%08X, 0x%08X, 0x%08X, 0x%08X\n", + "0x%08X, 0x%08X, 0x%08X, " + "0x%08X, 0x%08X, 0x%08X\n", dma_reason[0], dma_reason[1], - dma_reason[2], dma_reason[3]); + dma_reason[2], dma_reason[3], + dma_reason[4], dma_reason[5]); } if (reason & BCM43xx_IRQ_PS) { @@ -1791,8 +1760,6 @@ static void bcm43xx_interrupt_tasklet(st } /* Check the DMA reason registers for received data. */ - assert(!(dma_reason[1] & BCM43xx_DMAIRQ_RX_DONE)); - assert(!(dma_reason[2] & BCM43xx_DMAIRQ_RX_DONE)); if (dma_reason[0] & BCM43xx_DMAIRQ_RX_DONE) { if (bcm43xx_using_pio(bcm)) bcm43xx_pio_rx(bcm43xx_current_pio(bcm)->queue0); @@ -1800,13 +1767,17 @@ static void bcm43xx_interrupt_tasklet(st bcm43xx_dma_rx(bcm43xx_current_dma(bcm)->rx_ring0); /* We intentionally don't set "activity" to 1, here.
*/ } + assert(!(dma_reason[1] & BCM43xx_DMAIRQ_RX_DONE)); + assert(!(dma_reason[2] & BCM43xx_DMAIRQ_RX_DONE)); if (dma_reason[3] & BCM43xx_DMAIRQ_RX_DONE) { if (bcm43xx_using_pio(bcm)) bcm43xx_pio_rx(bcm43xx_current_pio(bcm)->queue3); else - bcm43xx_dma_rx(bcm43xx_current_dma(bcm)->rx_ring1); + bcm43xx_dma_rx(bcm43xx_current_dma(bcm)->rx_ring3); activity = 1; } + assert(!(dma_reason[4] & BCM43xx_DMAIRQ_RX_DONE)); + assert(!(dma_reason[5] & BCM43xx_DMAIRQ_RX_DONE)); bcmirq_handled(BCM43xx_IRQ_RX); if (reason & BCM43xx_IRQ_XMIT_STATUS) { @@ -1834,7 +1805,7 @@ static void bcm43xx_interrupt_tasklet(st bcm43xx_leds_update(bcm, activity); bcm43xx_interrupt_enable(bcm, bcm->irq_savedstate); mmiowb(); - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); } static void pio_irq_workaround(struct bcm43xx_private *bcm, @@ -1863,14 +1834,18 @@ static void bcm43xx_interrupt_ack(struct bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, reason); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA1_REASON, + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA0_REASON, bcm->dma_reason[0]); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA2_REASON, + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA1_REASON, bcm->dma_reason[1]); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA3_REASON, + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA2_REASON, bcm->dma_reason[2]); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA4_REASON, + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA3_REASON, bcm->dma_reason[3]); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA4_REASON, + bcm->dma_reason[4]); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA5_REASON, + bcm->dma_reason[5]); } /* Interrupt handler top-half */ @@ -1885,14 +1860,8 @@ static irqreturn_t bcm43xx_interrupt_han spin_lock(&bcm->irq_lock); - /* Only accept IRQs, if we are initialized properly. - * This avoids an RX race while initializing. - * We should probably not enable IRQs before we are initialized - * completely, but some careful work is needed to fix this. I think it - * is best to stay with this cheap workaround for now... . - */ - if (unlikely(bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED)) - goto out; + assert(bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED); + assert(bcm->current_core->id == BCM43xx_COREID_80211); reason = bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); if (reason == 0xffffffff) { @@ -1904,14 +1873,18 @@ static irqreturn_t bcm43xx_interrupt_han if (!reason) goto out; - bcm->dma_reason[0] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA1_REASON) - & 0x0001dc00; - bcm->dma_reason[1] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA2_REASON) - & 0x0000dc00; - bcm->dma_reason[2] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA3_REASON) - & 0x0000dc00; - bcm->dma_reason[3] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA4_REASON) - & 0x0001dc00; + bcm->dma_reason[0] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA0_REASON) + & 0x0001DC00; + bcm->dma_reason[1] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA1_REASON) + & 0x0000DC00; + bcm->dma_reason[2] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA2_REASON) + & 0x0000DC00; + bcm->dma_reason[3] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA3_REASON) + & 0x0001DC00; + bcm->dma_reason[4] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA4_REASON) + & 0x0000DC00; + bcm->dma_reason[5] = bcm43xx_read32(bcm, BCM43xx_MMIO_DMA5_REASON) + & 0x0000DC00; bcm43xx_interrupt_ack(bcm, reason); @@ -1930,16 +1903,18 @@ out: static void bcm43xx_release_firmware(struct bcm43xx_private *bcm, int force) { + struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); + if (bcm->firmware_norelease && !force) return; /* Suspending or controller reset. 
*/ - release_firmware(bcm->ucode); - bcm->ucode = NULL; - release_firmware(bcm->pcm); - bcm->pcm = NULL; - release_firmware(bcm->initvals0); - bcm->initvals0 = NULL; - release_firmware(bcm->initvals1); - bcm->initvals1 = NULL; + release_firmware(phy->ucode); + phy->ucode = NULL; + release_firmware(phy->pcm); + phy->pcm = NULL; + release_firmware(phy->initvals0); + phy->initvals0 = NULL; + release_firmware(phy->initvals1); + phy->initvals1 = NULL; } static int bcm43xx_request_firmware(struct bcm43xx_private *bcm) @@ -1950,11 +1925,11 @@ static int bcm43xx_request_firmware(stru int nr; char buf[22 + sizeof(modparam_fwpostfix) - 1] = { 0 }; - if (!bcm->ucode) { + if (!phy->ucode) { snprintf(buf, ARRAY_SIZE(buf), "bcm43xx_microcode%d%s.fw", (rev >= 5 ? 5 : rev), modparam_fwpostfix); - err = request_firmware(&bcm->ucode, buf, &bcm->pci_dev->dev); + err = request_firmware(&phy->ucode, buf, &bcm->pci_dev->dev); if (err) { printk(KERN_ERR PFX "Error: Microcode \"%s\" not available or load failed.\n", @@ -1963,12 +1938,12 @@ static int bcm43xx_request_firmware(stru } } - if (!bcm->pcm) { + if (!phy->pcm) { snprintf(buf, ARRAY_SIZE(buf), "bcm43xx_pcm%d%s.fw", (rev < 5 ? 4 : 5), modparam_fwpostfix); - err = request_firmware(&bcm->pcm, buf, &bcm->pci_dev->dev); + err = request_firmware(&phy->pcm, buf, &bcm->pci_dev->dev); if (err) { printk(KERN_ERR PFX "Error: PCM \"%s\" not available or load failed.\n", @@ -1977,7 +1952,7 @@ static int bcm43xx_request_firmware(stru } } - if (!bcm->initvals0) { + if (!phy->initvals0) { if (rev == 2 || rev == 4) { switch (phy->type) { case BCM43xx_PHYTYPE_A: @@ -2008,20 +1983,20 @@ static int bcm43xx_request_firmware(stru snprintf(buf, ARRAY_SIZE(buf), "bcm43xx_initval%02d%s.fw", nr, modparam_fwpostfix); - err = request_firmware(&bcm->initvals0, buf, &bcm->pci_dev->dev); + err = request_firmware(&phy->initvals0, buf, &bcm->pci_dev->dev); if (err) { printk(KERN_ERR PFX "Error: InitVals \"%s\" not available or load failed.\n", buf); goto error; } - if (bcm->initvals0->size % sizeof(struct bcm43xx_initval)) { + if (phy->initvals0->size % sizeof(struct bcm43xx_initval)) { printk(KERN_ERR PFX "InitVals fileformat error.\n"); goto error; } } - if (!bcm->initvals1) { + if (!phy->initvals1) { if (rev >= 5) { u32 sbtmstatehigh; @@ -2043,14 +2018,14 @@ static int bcm43xx_request_firmware(stru snprintf(buf, ARRAY_SIZE(buf), "bcm43xx_initval%02d%s.fw", nr, modparam_fwpostfix); - err = request_firmware(&bcm->initvals1, buf, &bcm->pci_dev->dev); + err = request_firmware(&phy->initvals1, buf, &bcm->pci_dev->dev); if (err) { printk(KERN_ERR PFX "Error: InitVals \"%s\" not available or load failed.\n", buf); goto error; } - if (bcm->initvals1->size % sizeof(struct bcm43xx_initval)) { + if (phy->initvals1->size % sizeof(struct bcm43xx_initval)) { printk(KERN_ERR PFX "InitVals fileformat error.\n"); goto error; } @@ -2070,12 +2045,13 @@ err_noinitval: static void bcm43xx_upload_microcode(struct bcm43xx_private *bcm) { + struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); const u32 *data; unsigned int i, len; /* Upload Microcode. */ - data = (u32 *)(bcm->ucode->data); - len = bcm->ucode->size / sizeof(u32); + data = (u32 *)(phy->ucode->data); + len = phy->ucode->size / sizeof(u32); bcm43xx_shm_control_word(bcm, BCM43xx_SHM_UCODE, 0x0000); for (i = 0; i < len; i++) { bcm43xx_write32(bcm, BCM43xx_MMIO_SHM_DATA, @@ -2084,8 +2060,8 @@ static void bcm43xx_upload_microcode(str } /* Upload PCM data. 
*/ - data = (u32 *)(bcm->pcm->data); - len = bcm->pcm->size / sizeof(u32); + data = (u32 *)(phy->pcm->data); + len = phy->pcm->size / sizeof(u32); bcm43xx_shm_control_word(bcm, BCM43xx_SHM_PCM, 0x01ea); bcm43xx_write32(bcm, BCM43xx_MMIO_SHM_DATA, 0x00004000); bcm43xx_shm_control_word(bcm, BCM43xx_SHM_PCM, 0x01eb); @@ -2131,15 +2107,16 @@ err_format: static int bcm43xx_upload_initvals(struct bcm43xx_private *bcm) { + struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); int err; - err = bcm43xx_write_initvals(bcm, (struct bcm43xx_initval *)bcm->initvals0->data, - bcm->initvals0->size / sizeof(struct bcm43xx_initval)); + err = bcm43xx_write_initvals(bcm, (struct bcm43xx_initval *)phy->initvals0->data, + phy->initvals0->size / sizeof(struct bcm43xx_initval)); if (err) goto out; - if (bcm->initvals1) { - err = bcm43xx_write_initvals(bcm, (struct bcm43xx_initval *)bcm->initvals1->data, - bcm->initvals1->size / sizeof(struct bcm43xx_initval)); + if (phy->initvals1) { + err = bcm43xx_write_initvals(bcm, (struct bcm43xx_initval *)phy->initvals1->data, + phy->initvals1->size / sizeof(struct bcm43xx_initval)); if (err) goto out; } @@ -2156,9 +2133,7 @@ static struct pci_device_id bcm43xx_47xx static int bcm43xx_initialize_irq(struct bcm43xx_private *bcm) { - int res; - unsigned int i; - u32 data; + int err; bcm->irq = bcm->pci_dev->irq; #ifdef CONFIG_BCM947XX @@ -2175,32 +2150,12 @@ static int bcm43xx_initialize_irq(struct } } #endif - res = request_irq(bcm->irq, bcm43xx_interrupt_handler, + err = request_irq(bcm->irq, bcm43xx_interrupt_handler, IRQF_SHARED, KBUILD_MODNAME, bcm); - if (res) { + if (err) printk(KERN_ERR PFX "Cannot register IRQ%d\n", bcm->irq); - return -ENODEV; - } - bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, 0xffffffff); - bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, 0x00020402); - i = 0; - while (1) { - data = bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); - if (data == BCM43xx_IRQ_READY) - break; - i++; - if (i >= BCM43xx_IRQWAIT_MAX_RETRIES) { - printk(KERN_ERR PFX "Card IRQ register not responding. " - "Giving up.\n"); - free_irq(bcm->irq, bcm); - return -ENODEV; - } - udelay(10); - } - // dummy read - bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); - return 0; + return err; } /* Switch to the core used to write the GPIO register. 
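The mac_enable/mac_suspend hunks that follow turn the pair into a nestable refcount on bcm->mac_suspended: only the 0<->1 transitions touch the hardware, so overlapping suspend/enable pairs from different code paths stay balanced. A usage sketch under that reading (the function itself is illustrative only, not part of the patch):

static void sketch_nested_mac_suspend(struct bcm43xx_private *bcm)
{
	bcm43xx_mac_suspend(bcm);	/* 0 -> 1: MAC actually disabled */
	bcm43xx_mac_suspend(bcm);	/* 1 -> 2: no register access */
	bcm43xx_mac_enable(bcm);	/* 2 -> 1: no register access */
	bcm43xx_mac_enable(bcm);	/* 1 -> 0: MAC re-enabled */
}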
@@ -2298,13 +2253,17 @@ static int bcm43xx_gpio_cleanup(struct b /* http://bcm-specs.sipsolutions.net/EnableMac */ void bcm43xx_mac_enable(struct bcm43xx_private *bcm) { - bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, - bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD) - | BCM43xx_SBF_MAC_ENABLED); - bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, BCM43xx_IRQ_READY); - bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD); /* dummy read */ - bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ - bcm43xx_power_saving_ctl_bits(bcm, -1, -1); + bcm->mac_suspended--; + assert(bcm->mac_suspended >= 0); + if (bcm->mac_suspended == 0) { + bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, + bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD) + | BCM43xx_SBF_MAC_ENABLED); + bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, BCM43xx_IRQ_READY); + bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD); /* dummy read */ + bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ + bcm43xx_power_saving_ctl_bits(bcm, -1, -1); + } } /* http://bcm-specs.sipsolutions.net/SuspendMAC */ @@ -2313,18 +2272,23 @@ void bcm43xx_mac_suspend(struct bcm43xx_ int i; u32 tmp; - bcm43xx_power_saving_ctl_bits(bcm, -1, 1); - bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, - bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD) - & ~BCM43xx_SBF_MAC_ENABLED); - bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ - for (i = 100000; i; i--) { - tmp = bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); - if (tmp & BCM43xx_IRQ_READY) - return; - udelay(10); + assert(bcm->mac_suspended >= 0); + if (bcm->mac_suspended == 0) { + bcm43xx_power_saving_ctl_bits(bcm, -1, 1); + bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, + bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD) + & ~BCM43xx_SBF_MAC_ENABLED); + bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ + for (i = 10000; i; i--) { + tmp = bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); + if (tmp & BCM43xx_IRQ_READY) + goto out; + udelay(1); + } + printkl(KERN_ERR PFX "MAC suspend failed\n"); } - printkl(KERN_ERR PFX "MAC suspend failed\n"); +out: + bcm->mac_suspended++; } void bcm43xx_set_iwmode(struct bcm43xx_private *bcm, @@ -2394,7 +2358,6 @@ static void bcm43xx_chip_cleanup(struct if (!modparam_noleds) bcm43xx_leds_exit(bcm); bcm43xx_gpio_cleanup(bcm); - free_irq(bcm->irq, bcm); bcm43xx_release_firmware(bcm, 0); } @@ -2406,7 +2369,7 @@ static int bcm43xx_chip_init(struct bcm4 struct bcm43xx_radioinfo *radio = bcm43xx_current_radio(bcm); struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); int err; - int tmp; + int i, tmp; u32 value32; u16 value16; @@ -2419,13 +2382,53 @@ static int bcm43xx_chip_init(struct bcm4 goto out; bcm43xx_upload_microcode(bcm); - err = bcm43xx_initialize_irq(bcm); - if (err) + bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, 0xFFFFFFFF); + bcm43xx_write32(bcm, BCM43xx_MMIO_STATUS_BITFIELD, 0x00020402); + i = 0; + while (1) { + value32 = bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); + if (value32 == BCM43xx_IRQ_READY) + break; + i++; + if (i >= BCM43xx_IRQWAIT_MAX_RETRIES) { + printk(KERN_ERR PFX "IRQ_READY timeout\n"); + err = -ENODEV; + goto err_release_fw; + } + udelay(10); + } + bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ + + value16 = bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_REVISION); + + dprintk(KERN_INFO PFX "Microcode rev 0x%x, pl 0x%x " + "(20%.2i-%.2i-%.2i %.2i:%.2i:%.2i)\n", value16, + bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + 
BCM43xx_UCODE_PATCHLEVEL), + (bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_DATE) >> 12) & 0xf, + (bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_DATE) >> 8) & 0xf, + bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_DATE) & 0xff, + (bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_TIME) >> 11) & 0x1f, + (bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_TIME) >> 5) & 0x3f, + bcm43xx_shm_read16(bcm, BCM43xx_SHM_SHARED, + BCM43xx_UCODE_TIME) & 0x1f); + + if ( value16 > 0x128 ) { + dprintk(KERN_ERR PFX + "Firmware: no support for microcode rev > 0x128\n"); + err = -1; goto err_release_fw; + } err = bcm43xx_gpio_init(bcm); if (err) - goto err_free_irq; + goto err_release_fw; err = bcm43xx_upload_initvals(bcm); if (err) @@ -2489,10 +2492,12 @@ static int bcm43xx_chip_init(struct bcm4 bcm43xx_write32(bcm, 0x018C, 0x02000000); } bcm43xx_write32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON, 0x00004000); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA1_IRQ_MASK, 0x0001DC00); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA0_IRQ_MASK, 0x0001DC00); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA1_IRQ_MASK, 0x0000DC00); bcm43xx_write32(bcm, BCM43xx_MMIO_DMA2_IRQ_MASK, 0x0000DC00); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA3_IRQ_MASK, 0x0000DC00); - bcm43xx_write32(bcm, BCM43xx_MMIO_DMA4_IRQ_MASK, 0x0001DC00); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA3_IRQ_MASK, 0x0001DC00); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA4_IRQ_MASK, 0x0000DC00); + bcm43xx_write32(bcm, BCM43xx_MMIO_DMA5_IRQ_MASK, 0x0000DC00); value32 = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATELOW); value32 |= 0x00100000; @@ -2509,8 +2514,6 @@ err_radio_off: bcm43xx_radio_turn_off(bcm); err_gpio_cleanup: bcm43xx_gpio_cleanup(bcm); -err_free_irq: - free_irq(bcm->irq, bcm); err_release_fw: bcm43xx_release_firmware(bcm, 1); goto out; @@ -2550,11 +2553,9 @@ static void bcm43xx_init_struct_phyinfo( { /* Initialize a "phyinfo" structure. The structure is already * zeroed out. + * This is called on insmod time to initialize members. */ - phy->antenna_diversity = 0xFFFF; phy->savedpctlreg = 0xFFFF; - phy->minlowsig[0] = 0xFFFF; - phy->minlowsig[1] = 0xFFFF; spin_lock_init(&phy->lock); } @@ -2562,14 +2563,11 @@ static void bcm43xx_init_struct_radioinf { /* Initialize a "radioinfo" structure. The structure is already * zeroed out. + * This is called on insmod time to initialize members. 
*/ radio->interfmode = BCM43xx_RADIO_INTERFMODE_NONE; radio->channel = 0xFF; radio->initial_channel = 0xFF; - radio->lofcal = 0xFFFF; - radio->initval = 0xFFFF; - radio->nrssi[0] = -1000; - radio->nrssi[1] = -1000; } static int bcm43xx_probe_cores(struct bcm43xx_private *bcm) @@ -2587,7 +2585,6 @@ static int bcm43xx_probe_cores(struct bc * BCM43xx_MAX_80211_CORES); memset(&bcm->core_80211_ext, 0, sizeof(struct bcm43xx_coreinfo_80211) * BCM43xx_MAX_80211_CORES); - bcm->current_80211_core_idx = -1; bcm->nr_80211_available = 0; bcm->current_core = NULL; bcm->active_80211_core = NULL; @@ -2757,6 +2754,7 @@ static int bcm43xx_probe_cores(struct bc goto out; } bcm->nr_80211_available++; + core->priv = ext_80211; bcm43xx_init_struct_phyinfo(&ext_80211->phy); bcm43xx_init_struct_radioinfo(&ext_80211->radio); break; @@ -2857,7 +2855,8 @@ static void bcm43xx_wireless_core_cleanu } /* http://bcm-specs.sipsolutions.net/80211Init */ -static int bcm43xx_wireless_core_init(struct bcm43xx_private *bcm) +static int bcm43xx_wireless_core_init(struct bcm43xx_private *bcm, + int active_wlcore) { struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); struct bcm43xx_radioinfo *radio = bcm43xx_current_radio(bcm); @@ -2939,19 +2938,26 @@ static int bcm43xx_wireless_core_init(st if (bcm->current_core->rev >= 5) bcm43xx_write16(bcm, 0x043C, 0x000C); - if (bcm43xx_using_pio(bcm)) - err = bcm43xx_pio_init(bcm); - else - err = bcm43xx_dma_init(bcm); - if (err) - goto err_chip_cleanup; + if (active_wlcore) { + if (bcm43xx_using_pio(bcm)) + err = bcm43xx_pio_init(bcm); + else + err = bcm43xx_dma_init(bcm); + if (err) + goto err_chip_cleanup; + } bcm43xx_write16(bcm, 0x0612, 0x0050); bcm43xx_shm_write16(bcm, BCM43xx_SHM_SHARED, 0x0416, 0x0050); bcm43xx_shm_write16(bcm, BCM43xx_SHM_SHARED, 0x0414, 0x01F4); - bcm43xx_mac_enable(bcm); - bcm43xx_interrupt_enable(bcm, bcm->irq_savedstate); + if (active_wlcore) { + if (radio->initial_channel != 0xFF) + bcm43xx_radio_selectchannel(bcm, radio->initial_channel, 0); + } + /* Don't enable MAC/IRQ here, as it will race with the IRQ handler. + * We enable it later. + */ bcm->current_core->initialized = 1; out: return err; @@ -3066,11 +3072,6 @@ out: return err; } -static void bcm43xx_softmac_init(struct bcm43xx_private *bcm) -{ - ieee80211softmac_start(bcm->net_dev); -} - static void bcm43xx_periodic_every120sec(struct bcm43xx_private *bcm) { struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); @@ -3178,51 +3179,51 @@ static void bcm43xx_periodic_work_handle int badness; badness = estimate_periodic_work_badness(bcm->periodic_state); + mutex_lock(&bcm->mutex); + + /* We must fake a started transmission here, as we are going to + * disable TX. If we wouldn't fake a TX, it would be possible to + * trigger the netdev watchdog, if the last real TX is already + * some time on the past (slightly less than 5secs) + */ + bcm->net_dev->trans_start = jiffies; + netif_tx_disable(bcm->net_dev); + + spin_lock_irqsave(&bcm->irq_lock, flags); if (badness > BADNESS_LIMIT) { /* Periodic work will take a long time, so we want it to * be preemtible. */ - bcm43xx_lock_irqonly(bcm, flags); - netif_stop_queue(bcm->net_dev); + bcm43xx_mac_suspend(bcm); if (bcm43xx_using_pio(bcm)) bcm43xx_pio_freeze_txqueues(bcm); savedirqs = bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); - bcm43xx_unlock_irqonly(bcm, flags); - bcm43xx_lock_noirq(bcm); + spin_unlock_irqrestore(&bcm->irq_lock, flags); bcm43xx_synchronize_irq(bcm); - } else { - /* Periodic work should take short time, so we want low - * locking overhead. 
- */ - bcm43xx_lock_irqsafe(bcm, flags); } do_periodic_work(bcm); if (badness > BADNESS_LIMIT) { - bcm43xx_lock_irqonly(bcm, flags); - if (likely(bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED)) { - tasklet_enable(&bcm->isr_tasklet); - bcm43xx_interrupt_enable(bcm, savedirqs); - if (bcm43xx_using_pio(bcm)) - bcm43xx_pio_thaw_txqueues(bcm); - } - netif_wake_queue(bcm->net_dev); - mmiowb(); - bcm43xx_unlock_irqonly(bcm, flags); - bcm43xx_unlock_noirq(bcm); - } else { - mmiowb(); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); + tasklet_enable(&bcm->isr_tasklet); + bcm43xx_interrupt_enable(bcm, savedirqs); + if (bcm43xx_using_pio(bcm)) + bcm43xx_pio_thaw_txqueues(bcm); + bcm43xx_mac_enable(bcm); } + mmiowb(); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + netif_wake_queue(bcm->net_dev); + mutex_unlock(&bcm->mutex); } -static void bcm43xx_periodic_tasks_delete(struct bcm43xx_private *bcm) +void bcm43xx_periodic_tasks_delete(struct bcm43xx_private *bcm) { cancel_rearming_delayed_work(&bcm->periodic_work); } -static void bcm43xx_periodic_tasks_setup(struct bcm43xx_private *bcm) +void bcm43xx_periodic_tasks_setup(struct bcm43xx_private *bcm) { struct work_struct *work = &(bcm->periodic_work); @@ -3243,9 +3244,9 @@ static int bcm43xx_rng_read(struct hwrng struct bcm43xx_private *bcm = (struct bcm43xx_private *)rng->priv; unsigned long flags; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&(bcm)->irq_lock, flags); *data = bcm43xx_read16(bcm, BCM43xx_MMIO_RNG); - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&(bcm)->irq_lock, flags); return (sizeof(u16)); } @@ -3271,139 +3272,330 @@ static int bcm43xx_rng_init(struct bcm43 return err; } -/* This is the opposite of bcm43xx_init_board() */ -static void bcm43xx_free_board(struct bcm43xx_private *bcm) +static int bcm43xx_shutdown_all_wireless_cores(struct bcm43xx_private *bcm) { + int ret = 0; int i, err; + struct bcm43xx_coreinfo *core; + + bcm43xx_set_status(bcm, BCM43xx_STAT_SHUTTINGDOWN); + for (i = 0; i < bcm->nr_80211_available; i++) { + core = &(bcm->core_80211[i]); + assert(core->available); + if (!core->initialized) + continue; + err = bcm43xx_switch_core(bcm, core); + if (err) { + dprintk(KERN_ERR PFX "shutdown_all_wireless_cores " + "switch_core failed (%d)\n", err); + ret = err; + continue; + } + bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); + bcm43xx_read32(bcm, BCM43xx_MMIO_GEN_IRQ_REASON); /* dummy read */ + bcm43xx_wireless_core_cleanup(bcm); + if (core == bcm->active_80211_core) + bcm->active_80211_core = NULL; + } + free_irq(bcm->irq, bcm); + bcm43xx_set_status(bcm, BCM43xx_STAT_UNINIT); + + return ret; +} - bcm43xx_lock_noirq(bcm); +/* This is the opposite of bcm43xx_init_board() */ +static void bcm43xx_free_board(struct bcm43xx_private *bcm) +{ + bcm43xx_rng_exit(bcm); bcm43xx_sysfs_unregister(bcm); bcm43xx_periodic_tasks_delete(bcm); - bcm43xx_set_status(bcm, BCM43xx_STAT_SHUTTINGDOWN); + mutex_lock(&(bcm)->mutex); + bcm43xx_shutdown_all_wireless_cores(bcm); + bcm43xx_pctl_set_crystal(bcm, 0); + mutex_unlock(&(bcm)->mutex); +} - bcm43xx_rng_exit(bcm); +static void prepare_phydata_for_init(struct bcm43xx_phyinfo *phy) +{ + phy->antenna_diversity = 0xFFFF; + memset(phy->minlowsig, 0xFF, sizeof(phy->minlowsig)); + memset(phy->minlowsigpos, 0, sizeof(phy->minlowsigpos)); + + /* Flags */ + phy->calibrated = 0; + phy->is_locked = 0; + + if (phy->_lo_pairs) { + memset(phy->_lo_pairs, 0, + sizeof(struct bcm43xx_lopair) * BCM43xx_LO_COUNT); + } + memset(phy->loopback_gain, 0, 
sizeof(phy->loopback_gain)); +} + +static void prepare_radiodata_for_init(struct bcm43xx_private *bcm, + struct bcm43xx_radioinfo *radio) +{ + int i; + + /* Set default attenuation values. */ + radio->baseband_atten = bcm43xx_default_baseband_attenuation(bcm); + radio->radio_atten = bcm43xx_default_radio_attenuation(bcm); + radio->txctl1 = bcm43xx_default_txctl1(bcm); + radio->txctl2 = 0xFFFF; + radio->txpwr_offset = 0; + + /* NRSSI */ + radio->nrssislope = 0; + for (i = 0; i < ARRAY_SIZE(radio->nrssi); i++) + radio->nrssi[i] = -1000; + for (i = 0; i < ARRAY_SIZE(radio->nrssi_lt); i++) + radio->nrssi_lt[i] = i; + + radio->lofcal = 0xFFFF; + radio->initval = 0xFFFF; + + radio->aci_enable = 0; + radio->aci_wlan_automatic = 0; + radio->aci_hw_rssi = 0; +} + +static void prepare_priv_for_init(struct bcm43xx_private *bcm) +{ + int i; + struct bcm43xx_coreinfo *core; + struct bcm43xx_coreinfo_80211 *wlext; + + assert(!bcm->active_80211_core); + + bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZING); + + /* Flags */ + bcm->was_initialized = 0; + bcm->reg124_set_0x4 = 0; + + /* Stats */ + memset(&bcm->stats, 0, sizeof(bcm->stats)); + + /* Wireless core data */ for (i = 0; i < BCM43xx_MAX_80211_CORES; i++) { - if (!bcm->core_80211[i].available) - continue; - if (!bcm->core_80211[i].initialized) + core = &(bcm->core_80211[i]); + wlext = core->priv; + + if (!core->available) continue; + assert(wlext == &(bcm->core_80211_ext[i])); - err = bcm43xx_switch_core(bcm, &bcm->core_80211[i]); - assert(err == 0); - bcm43xx_wireless_core_cleanup(bcm); + prepare_phydata_for_init(&wlext->phy); + prepare_radiodata_for_init(bcm, &wlext->radio); } - bcm43xx_pctl_set_crystal(bcm, 0); + /* IRQ related flags */ + bcm->irq_reason = 0; + memset(bcm->dma_reason, 0, sizeof(bcm->dma_reason)); + bcm->irq_savedstate = BCM43xx_IRQ_INITIAL; - bcm43xx_set_status(bcm, BCM43xx_STAT_UNINIT); - bcm43xx_unlock_noirq(bcm); + bcm->mac_suspended = 1; + + /* Noise calculation context */ + memset(&bcm->noisecalc, 0, sizeof(bcm->noisecalc)); + + /* Periodic work context */ + bcm->periodic_state = 0; } -static int bcm43xx_init_board(struct bcm43xx_private *bcm) +static int wireless_core_up(struct bcm43xx_private *bcm, + int active_wlcore) +{ + int err; + + if (!bcm43xx_core_enabled(bcm)) + bcm43xx_wireless_core_reset(bcm, 1); + if (!active_wlcore) + bcm43xx_wireless_core_mark_inactive(bcm); + err = bcm43xx_wireless_core_init(bcm, active_wlcore); + if (err) + goto out; + if (!active_wlcore) + bcm43xx_radio_turn_off(bcm); +out: + return err; +} + +/* Select and enable the "to be used" wireless core. + * Locking: bcm->mutex must be aquired before calling this. + * bcm->irq_lock must not be aquired. + */ +int bcm43xx_select_wireless_core(struct bcm43xx_private *bcm, + int phytype) { int i, err; - int connect_phy; + struct bcm43xx_coreinfo *active_core = NULL; + struct bcm43xx_coreinfo_80211 *active_wlext = NULL; + struct bcm43xx_coreinfo *core; + struct bcm43xx_coreinfo_80211 *wlext; + int adjust_active_sbtmstatelow = 0; might_sleep(); - bcm43xx_lock_noirq(bcm); - bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZING); + if (phytype < 0) { + /* If no phytype is requested, select the first core. */ + assert(bcm->core_80211[0].available); + wlext = bcm->core_80211[0].priv; + phytype = wlext->phy.type; + } + /* Find the requested core. 
*/ + for (i = 0; i < bcm->nr_80211_available; i++) { + core = &(bcm->core_80211[i]); + wlext = core->priv; + if (wlext->phy.type == phytype) { + active_core = core; + active_wlext = wlext; + break; + } + } + if (!active_core) + return -ESRCH; /* No such PHYTYPE on this board. */ + + if (bcm->active_80211_core) { + /* We already selected a wl core in the past. + * So first clean up everything. + */ + dprintk(KERN_INFO PFX "select_wireless_core: cleanup\n"); + ieee80211softmac_stop(bcm->net_dev); + bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZED); + err = bcm43xx_disable_interrupts_sync(bcm); + assert(!err); + tasklet_enable(&bcm->isr_tasklet); + err = bcm43xx_shutdown_all_wireless_cores(bcm); + if (err) + goto error; + /* Ok, everything down, continue to re-initialize. */ + bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZING); + } + + /* Reset all data structures. */ + prepare_priv_for_init(bcm); - err = bcm43xx_pctl_set_crystal(bcm, 1); - if (err) - goto out; - err = bcm43xx_pctl_init(bcm); - if (err) - goto err_crystal_off; err = bcm43xx_pctl_set_clock(bcm, BCM43xx_PCTL_CLK_FAST); if (err) - goto err_crystal_off; + goto error; - tasklet_enable(&bcm->isr_tasklet); + /* Mark all unused cores "inactive". */ for (i = 0; i < bcm->nr_80211_available; i++) { - err = bcm43xx_switch_core(bcm, &bcm->core_80211[i]); - assert(err != -ENODEV); - if (err) - goto err_80211_unwind; + core = &(bcm->core_80211[i]); + wlext = core->priv; - /* Enable the selected wireless core. - * Connect PHY only on the first core. - */ - if (!bcm43xx_core_enabled(bcm)) { - if (bcm->nr_80211_available == 1) { - connect_phy = bcm43xx_current_phy(bcm)->connected; - } else { - if (i == 0) - connect_phy = 1; - else - connect_phy = 0; - } - bcm43xx_wireless_core_reset(bcm, connect_phy); + if (core == active_core) + continue; + err = bcm43xx_switch_core(bcm, core); + if (err) { + dprintk(KERN_ERR PFX "Could not switch to inactive " + "802.11 core (%d)\n", err); + goto error; } + err = wireless_core_up(bcm, 0); + if (err) { + dprintk(KERN_ERR PFX "core_up for inactive 802.11 core " + "failed (%d)\n", err); + goto error; + } + adjust_active_sbtmstatelow = 1; + } - if (i != 0) - bcm43xx_wireless_core_mark_inactive(bcm, &bcm->core_80211[0]); - - err = bcm43xx_wireless_core_init(bcm); - if (err) - goto err_80211_unwind; + /* Now initialize the active 802.11 core. 
*/ + err = bcm43xx_switch_core(bcm, active_core); + if (err) { + dprintk(KERN_ERR PFX "Could not switch to active " + "802.11 core (%d)\n", err); + goto error; + } + if (adjust_active_sbtmstatelow && + active_wlext->phy.type == BCM43xx_PHYTYPE_G) { + u32 sbtmstatelow; - if (i != 0) { - bcm43xx_mac_suspend(bcm); - bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); - bcm43xx_radio_turn_off(bcm); - } + sbtmstatelow = bcm43xx_read32(bcm, BCM43xx_CIR_SBTMSTATELOW); + sbtmstatelow |= 0x20000000; + bcm43xx_write32(bcm, BCM43xx_CIR_SBTMSTATELOW, sbtmstatelow); } - bcm->active_80211_core = &bcm->core_80211[0]; - if (bcm->nr_80211_available >= 2) { - bcm43xx_switch_core(bcm, &bcm->core_80211[0]); - bcm43xx_mac_enable(bcm); + err = wireless_core_up(bcm, 1); + if (err) { + dprintk(KERN_ERR PFX "core_up for active 802.11 core " + "failed (%d)\n", err); + goto error; } - err = bcm43xx_rng_init(bcm); + err = bcm43xx_pctl_set_clock(bcm, BCM43xx_PCTL_CLK_DYNAMIC); if (err) - goto err_80211_unwind; + goto error; + bcm->active_80211_core = active_core; + bcm43xx_macfilter_clear(bcm, BCM43xx_MACFILTER_ASSOC); bcm43xx_macfilter_set(bcm, BCM43xx_MACFILTER_SELF, (u8 *)(bcm->net_dev->dev_addr)); - dprintk(KERN_INFO PFX "80211 cores initialized\n"); bcm43xx_security_init(bcm); - bcm43xx_softmac_init(bcm); + drain_txstatus_queue(bcm); + ieee80211softmac_start(bcm->net_dev); - bcm43xx_pctl_set_clock(bcm, BCM43xx_PCTL_CLK_DYNAMIC); + /* Let's go! Be careful after enabling the IRQs. + * Don't switch cores, for example. + */ + bcm43xx_mac_enable(bcm); + bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZED); + err = bcm43xx_initialize_irq(bcm); + if (err) + goto error; + bcm43xx_interrupt_enable(bcm, bcm->irq_savedstate); - if (bcm43xx_current_radio(bcm)->initial_channel != 0xFF) { - bcm43xx_mac_suspend(bcm); - bcm43xx_radio_selectchannel(bcm, bcm43xx_current_radio(bcm)->initial_channel, 0); - bcm43xx_mac_enable(bcm); - } + dprintk(KERN_INFO PFX "Selected 802.11 core (phytype %d)\n", + active_wlext->phy.type); - /* Initialization of the board is done. Flag it as such. */ - bcm43xx_set_status(bcm, BCM43xx_STAT_INITIALIZED); + return 0; + +error: + bcm43xx_set_status(bcm, BCM43xx_STAT_UNINIT); + bcm43xx_pctl_set_clock(bcm, BCM43xx_PCTL_CLK_SLOW); + return err; +} + +static int bcm43xx_init_board(struct bcm43xx_private *bcm) +{ + int err; + mutex_lock(&(bcm)->mutex); + + tasklet_enable(&bcm->isr_tasklet); + err = bcm43xx_pctl_set_crystal(bcm, 1); + if (err) + goto err_tasklet; + err = bcm43xx_pctl_init(bcm); + if (err) + goto err_crystal_off; + err = bcm43xx_select_wireless_core(bcm, -1); + if (err) + goto err_crystal_off; + err = bcm43xx_sysfs_register(bcm); + if (err) + goto err_wlshutdown; + err = bcm43xx_rng_init(bcm); + if (err) + goto err_sysfs_unreg; bcm43xx_periodic_tasks_setup(bcm); - bcm43xx_sysfs_register(bcm); - //FIXME: check for bcm43xx_sysfs_register failure. This function is a bit messy regarding unwinding, though... /*FIXME: This should be handled by softmac instead. 
*/ schedule_work(&bcm->softmac->associnfo.work); - assert(err == 0); out: - bcm43xx_unlock_noirq(bcm); + mutex_unlock(&(bcm)->mutex); return err; -err_80211_unwind: - tasklet_disable(&bcm->isr_tasklet); - /* unwind all 80211 initialization */ - for (i = 0; i < bcm->nr_80211_available; i++) { - if (!bcm->core_80211[i].initialized) - continue; - bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); - bcm43xx_wireless_core_cleanup(bcm); - } +err_sysfs_unreg: + bcm43xx_sysfs_unregister(bcm); +err_wlshutdown: + bcm43xx_shutdown_all_wireless_cores(bcm); err_crystal_off: bcm43xx_pctl_set_crystal(bcm, 0); +err_tasklet: + tasklet_disable(&bcm->isr_tasklet); goto out; } @@ -3647,7 +3839,8 @@ static void bcm43xx_ieee80211_set_chan(s struct bcm43xx_radioinfo *radio; unsigned long flags; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) { bcm43xx_mac_suspend(bcm); bcm43xx_radio_selectchannel(bcm, channel, 0); @@ -3656,7 +3849,8 @@ static void bcm43xx_ieee80211_set_chan(s radio = bcm43xx_current_radio(bcm); radio->initial_channel = channel; } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); } /* set_security() callback in struct ieee80211_device */ @@ -3670,7 +3864,8 @@ static void bcm43xx_ieee80211_set_securi dprintk(KERN_INFO PFX "set security called"); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); for (keyidx = 0; keyidxflags & (1<irq_lock, flags); + mutex_unlock(&bcm->mutex); } /* hard_start_xmit() callback in struct ieee80211_device */ @@ -3751,12 +3947,14 @@ static int bcm43xx_ieee80211_hard_start_ int err = -ENODEV; unsigned long flags; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); if (likely(bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED)) err = bcm43xx_tx(bcm, txb); - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); - return err; + if (unlikely(err)) + return NETDEV_TX_BUSY; + return NETDEV_TX_OK; } static struct net_device_stats * bcm43xx_net_get_stats(struct net_device *net_dev) @@ -3769,9 +3967,9 @@ static void bcm43xx_net_tx_timeout(struc struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); unsigned long flags; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); bcm43xx_controller_restart(bcm, "TX timeout"); - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -3781,7 +3979,8 @@ static void bcm43xx_net_poll_controller( unsigned long flags; local_irq_save(flags); - bcm43xx_interrupt_handler(bcm->irq, bcm, NULL); + if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) + bcm43xx_interrupt_handler(bcm->irq, bcm, NULL); local_irq_restore(flags); } #endif /* CONFIG_NET_POLL_CONTROLLER */ @@ -3799,9 +3998,10 @@ static int bcm43xx_net_stop(struct net_d int err; ieee80211softmac_stop(net_dev); - err = bcm43xx_disable_interrupts_sync(bcm, NULL); + err = bcm43xx_disable_interrupts_sync(bcm); assert(!err); bcm43xx_free_board(bcm); + flush_scheduled_work(); return 0; } @@ -3818,10 +4018,12 @@ static int bcm43xx_init_private(struct b bcm->softmac->set_channel = bcm43xx_ieee80211_set_chan; bcm->irq_savedstate = BCM43xx_IRQ_INITIAL; + bcm->mac_suspended = 1; bcm->pci_dev = pci_dev; bcm->net_dev = net_dev; bcm->bad_frames_preempt = modparam_bad_frames_preempt; spin_lock_init(&bcm->irq_lock); + 
spin_lock_init(&bcm->leds_lock); mutex_init(&bcm->mutex); tasklet_init(&bcm->isr_tasklet, (void (*)(unsigned long))bcm43xx_interrupt_tasklet, @@ -3940,7 +4142,6 @@ static void __devexit bcm43xx_remove_one bcm43xx_debugfs_remove_device(bcm); unregister_netdev(net_dev); bcm43xx_detach_board(bcm); - assert(bcm->ucode == NULL); free_ieee80211softmac(net_dev); } @@ -3950,47 +4151,31 @@ static void __devexit bcm43xx_remove_one static void bcm43xx_chip_reset(void *_bcm) { struct bcm43xx_private *bcm = _bcm; - struct net_device *net_dev = bcm->net_dev; - struct pci_dev *pci_dev = bcm->pci_dev; - int err; - int was_initialized = (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED); - - netif_stop_queue(bcm->net_dev); - tasklet_disable(&bcm->isr_tasklet); + struct bcm43xx_phyinfo *phy; + int err = -ENODEV; - bcm->firmware_norelease = 1; - if (was_initialized) - bcm43xx_free_board(bcm); - bcm->firmware_norelease = 0; - bcm43xx_detach_board(bcm); - err = bcm43xx_init_private(bcm, net_dev, pci_dev); - if (err) - goto failure; - err = bcm43xx_attach_board(bcm); - if (err) - goto failure; - if (was_initialized) { - err = bcm43xx_init_board(bcm); - if (err) - goto failure; + mutex_lock(&(bcm)->mutex); + if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) { + bcm43xx_periodic_tasks_delete(bcm); + phy = bcm43xx_current_phy(bcm); + err = bcm43xx_select_wireless_core(bcm, phy->type); + if (!err) + bcm43xx_periodic_tasks_setup(bcm); } - netif_wake_queue(bcm->net_dev); - printk(KERN_INFO PFX "Controller restarted\n"); + mutex_unlock(&(bcm)->mutex); - return; -failure: - printk(KERN_ERR PFX "Controller restart failed\n"); + printk(KERN_ERR PFX "Controller restart%s\n", + (err == 0) ? "ed" : " failed"); } /* Hard-reset the chip. * This can be called from interrupt or process context. - * Make sure to _not_ re-enable device interrupts after this has been called. -*/ + * bcm->irq_lock must be locked. 
+ */ void bcm43xx_controller_restart(struct bcm43xx_private *bcm, const char *reason) { - bcm43xx_set_status(bcm, BCM43xx_STAT_RESTARTING); - bcm43xx_interrupt_disable(bcm, BCM43xx_IRQ_ALL); - bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD); /* dummy read */ + if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) + return; printk(KERN_ERR PFX "Controller RESET (%s) ...\n", reason); INIT_WORK(&bcm->restart_work, bcm43xx_chip_reset, bcm); schedule_work(&bcm->restart_work); @@ -4002,21 +4187,16 @@ static int bcm43xx_suspend(struct pci_de { struct net_device *net_dev = pci_get_drvdata(pdev); struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; - int try_to_shutdown = 0, err; + int err; dprintk(KERN_INFO PFX "Suspending...\n"); - bcm43xx_lock_irqsafe(bcm, flags); - bcm->was_initialized = (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED); - if (bcm->was_initialized) - try_to_shutdown = 1; - bcm43xx_unlock_irqsafe(bcm, flags); - netif_device_detach(net_dev); - if (try_to_shutdown) { + bcm->was_initialized = 0; + if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) { + bcm->was_initialized = 1; ieee80211softmac_stop(net_dev); - err = bcm43xx_disable_interrupts_sync(bcm, &bcm->irq_savedstate); + err = bcm43xx_disable_interrupts_sync(bcm); if (unlikely(err)) { dprintk(KERN_ERR PFX "Suspend failed.\n"); return -EAGAIN; @@ -4049,17 +4229,14 @@ static int bcm43xx_resume(struct pci_dev pci_restore_state(pdev); bcm43xx_chipset_attach(bcm); - if (bcm->was_initialized) { - bcm->irq_savedstate = BCM43xx_IRQ_INITIAL; + if (bcm->was_initialized) err = bcm43xx_init_board(bcm); - } if (err) { printk(KERN_ERR PFX "Resume failed!\n"); return err; } - netif_device_attach(net_dev); - + dprintk(KERN_INFO PFX "Device resumed.\n"); return 0; diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_main.h linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_main.h --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_main.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_main.h 2007-06-13 06:55:07.000000000 -0400 @@ -133,11 +133,17 @@ void bcm43xx_dummy_transmission(struct b int bcm43xx_switch_core(struct bcm43xx_private *bcm, struct bcm43xx_coreinfo *new_core); +int bcm43xx_select_wireless_core(struct bcm43xx_private *bcm, + int phytype); + void bcm43xx_wireless_core_reset(struct bcm43xx_private *bcm, int connect_phy); void bcm43xx_mac_suspend(struct bcm43xx_private *bcm); void bcm43xx_mac_enable(struct bcm43xx_private *bcm); +void bcm43xx_periodic_tasks_delete(struct bcm43xx_private *bcm); +void bcm43xx_periodic_tasks_setup(struct bcm43xx_private *bcm); + void bcm43xx_controller_restart(struct bcm43xx_private *bcm, const char *reason); int bcm43xx_sprom_read(struct bcm43xx_private *bcm, u16 *sprom); diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_phy.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_phy.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_phy.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_phy.c 2007-06-13 06:55:07.000000000 -0400 @@ -81,6 +81,16 @@ static const s8 bcm43xx_tssi2dbm_g_table static void bcm43xx_phy_initg(struct bcm43xx_private *bcm); +static inline +void bcm43xx_voluntary_preempt(void) +{ + assert(!in_atomic() && !in_irq() && + !in_interrupt() && !irqs_disabled()); +#ifndef CONFIG_PREEMPT + cond_resched(); +#endif /* CONFIG_PREEMPT */ +} + void bcm43xx_raw_phy_lock(struct bcm43xx_private *bcm) { struct bcm43xx_phyinfo *phy = 
bcm43xx_current_phy(bcm); @@ -133,22 +143,14 @@ void bcm43xx_phy_write(struct bcm43xx_pr void bcm43xx_phy_calibrate(struct bcm43xx_private *bcm) { struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); - unsigned long flags; bcm43xx_read32(bcm, BCM43xx_MMIO_STATUS_BITFIELD); /* Dummy read. */ if (phy->calibrated) return; if (phy->type == BCM43xx_PHYTYPE_G && phy->rev == 1) { - /* We do not want to be preempted while calibrating - * the hardware. - */ - local_irq_save(flags); - bcm43xx_wireless_core_reset(bcm, 0); bcm43xx_phy_initg(bcm); bcm43xx_wireless_core_reset(bcm, 1); - - local_irq_restore(flags); } phy->calibrated = 1; } @@ -359,7 +361,7 @@ static void bcm43xx_phy_setupg(struct bc if (phy->rev <= 2) for (i = 0; i < BCM43xx_ILT_NOISESCALEG_SIZE; i++) bcm43xx_ilt_write(bcm, 0x1400 + i, bcm43xx_ilt_noisescaleg1[i]); - else if ((phy->rev == 7) && (bcm43xx_phy_read(bcm, 0x0449) & 0x0200)) + else if ((phy->rev >= 7) && (bcm43xx_phy_read(bcm, 0x0449) & 0x0200)) for (i = 0; i < BCM43xx_ILT_NOISESCALEG_SIZE; i++) bcm43xx_ilt_write(bcm, 0x1400 + i, bcm43xx_ilt_noisescaleg3[i]); else @@ -369,7 +371,7 @@ static void bcm43xx_phy_setupg(struct bc if (phy->rev == 2) for (i = 0; i < BCM43xx_ILT_SIGMASQR_SIZE; i++) bcm43xx_ilt_write(bcm, 0x5000 + i, bcm43xx_ilt_sigmasqr1[i]); - else if ((phy->rev > 2) && (phy->rev <= 7)) + else if ((phy->rev > 2) && (phy->rev <= 8)) for (i = 0; i < BCM43xx_ILT_SIGMASQR_SIZE; i++) bcm43xx_ilt_write(bcm, 0x5000 + i, bcm43xx_ilt_sigmasqr2[i]); @@ -1195,7 +1197,7 @@ static void bcm43xx_phy_initg(struct bcm if (phy->rev == 1) bcm43xx_phy_initb5(bcm); - else if (phy->rev >= 2 && phy->rev <= 7) + else bcm43xx_phy_initb6(bcm); if (phy->rev >= 2 || phy->connected) bcm43xx_phy_inita(bcm); @@ -1239,23 +1241,22 @@ static void bcm43xx_phy_initg(struct bcm bcm43xx_phy_lo_g_measure(bcm); } else { if (radio->version == 0x2050 && radio->revision == 8) { - //FIXME + bcm43xx_radio_write16(bcm, 0x0052, + (radio->txctl1 << 4) | radio->txctl2); } else { bcm43xx_radio_write16(bcm, 0x0052, (bcm43xx_radio_read16(bcm, 0x0052) & 0xFFF0) | radio->txctl1); } if (phy->rev >= 6) { - /* bcm43xx_phy_write(bcm, 0x0036, (bcm43xx_phy_read(bcm, 0x0036) - & 0xF000) | (FIXME << 12)); - */ + & 0xF000) | (radio->txctl2 << 12)); } if (bcm->sprom.boardflags & BCM43xx_BFL_PACTRL) bcm43xx_phy_write(bcm, 0x002E, 0x8075); else - bcm43xx_phy_write(bcm, 0x003E, 0x807F); + bcm43xx_phy_write(bcm, 0x002E, 0x807F); if (phy->rev < 2) bcm43xx_phy_write(bcm, 0x002F, 0x0101); else @@ -1299,7 +1300,9 @@ static u16 bcm43xx_phy_lo_b_r15_loop(str { int i; u16 ret = 0; + unsigned long flags; + local_irq_save(flags); for (i = 0; i < 10; i++){ bcm43xx_phy_write(bcm, 0x0015, 0xAFA0); udelay(1); @@ -1309,6 +1312,8 @@ static u16 bcm43xx_phy_lo_b_r15_loop(str udelay(40); ret += bcm43xx_phy_read(bcm, 0x002C); } + local_irq_restore(flags); + bcm43xx_voluntary_preempt(); return ret; } @@ -1435,6 +1440,7 @@ u16 bcm43xx_phy_lo_g_deviation_subval(st } ret = bcm43xx_phy_read(bcm, 0x002D); local_irq_restore(flags); + bcm43xx_voluntary_preempt(); return ret; } @@ -1760,6 +1766,7 @@ void bcm43xx_phy_lo_g_measure(struct bcm bcm43xx_radio_write16(bcm, 0x43, i); bcm43xx_radio_write16(bcm, 0x52, radio->txctl2); udelay(10); + bcm43xx_voluntary_preempt(); bcm43xx_phy_set_baseband_attenuation(bcm, j * 2); @@ -1803,6 +1810,7 @@ void bcm43xx_phy_lo_g_measure(struct bcm radio->txctl2 | (3/*txctl1*/ << 4));//FIXME: shouldn't txctl1 be zero here and 3 in the loop above? 
udelay(10); + bcm43xx_voluntary_preempt(); bcm43xx_phy_set_baseband_attenuation(bcm, j * 2); @@ -1824,6 +1832,7 @@ void bcm43xx_phy_lo_g_measure(struct bcm bcm43xx_phy_write(bcm, 0x0812, (r27 << 8) | 0xA2); udelay(2); bcm43xx_phy_write(bcm, 0x0812, (r27 << 8) | 0xA3); + bcm43xx_voluntary_preempt(); } else bcm43xx_phy_write(bcm, 0x0015, r27 | 0xEFA0); bcm43xx_phy_lo_adjust(bcm, is_initializing); @@ -2188,12 +2197,6 @@ int bcm43xx_phy_init(struct bcm43xx_priv { struct bcm43xx_phyinfo *phy = bcm43xx_current_phy(bcm); int err = -ENODEV; - unsigned long flags; - - /* We do not want to be preempted while calibrating - * the hardware. - */ - local_irq_save(flags); switch (phy->type) { case BCM43xx_PHYTYPE_A: @@ -2227,7 +2230,6 @@ int bcm43xx_phy_init(struct bcm43xx_priv err = 0; break; } - local_irq_restore(flags); if (err) printk(KERN_WARNING PFX "Unknown PHYTYPE found!\n"); diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_pio.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_pio.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_pio.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_pio.c 2007-06-13 06:55:07.000000000 -0400 @@ -262,7 +262,7 @@ static void tx_tasklet(unsigned long d) int err; u16 txctl; - bcm43xx_lock_irqonly(bcm, flags); + spin_lock_irqsave(&bcm->irq_lock, flags); if (queue->tx_frozen) goto out_unlock; @@ -300,7 +300,7 @@ static void tx_tasklet(unsigned long d) continue; } out_unlock: - bcm43xx_unlock_irqonly(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); } static void setup_txqueues(struct bcm43xx_pioqueue *queue) diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_sysfs.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_sysfs.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_sysfs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_sysfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -120,12 +120,14 @@ static ssize_t bcm43xx_attr_sprom_show(s GFP_KERNEL); if (!sprom) return -ENOMEM; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); err = bcm43xx_sprom_read(bcm, sprom); if (!err) err = sprom2hex(sprom, buf, PAGE_SIZE); mmiowb(); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); kfree(sprom); return err; @@ -150,10 +152,14 @@ static ssize_t bcm43xx_attr_sprom_store( err = hex2sprom(sprom, buf, count); if (err) goto out_kfree; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); + spin_lock(&bcm->leds_lock); err = bcm43xx_sprom_write(bcm, sprom); mmiowb(); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock(&bcm->leds_lock); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); out_kfree: kfree(sprom); @@ -170,13 +176,12 @@ static ssize_t bcm43xx_attr_interfmode_s char *buf) { struct bcm43xx_private *bcm = dev_to_bcm(dev); - int err; ssize_t count = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; - bcm43xx_lock_noirq(bcm); + mutex_lock(&bcm->mutex); switch (bcm43xx_current_radio(bcm)->interfmode) { case BCM43xx_RADIO_INTERFMODE_NONE: @@ -191,11 +196,10 @@ static ssize_t bcm43xx_attr_interfmode_s default: assert(0); } - err = 0; - bcm43xx_unlock_noirq(bcm); + mutex_unlock(&bcm->mutex); - return err ? 
err : count; + return count; } @@ -229,7 +233,8 @@ static ssize_t bcm43xx_attr_interfmode_s return -EINVAL; } - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); err = bcm43xx_radio_set_interference_mitigation(bcm, mode); if (err) { @@ -237,7 +242,8 @@ static ssize_t bcm43xx_attr_interfmode_s "supported by device\n"); } mmiowb(); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err ? err : count; } @@ -251,23 +257,21 @@ static ssize_t bcm43xx_attr_preamble_sho char *buf) { struct bcm43xx_private *bcm = dev_to_bcm(dev); - int err; ssize_t count; if (!capable(CAP_NET_ADMIN)) return -EPERM; - bcm43xx_lock_noirq(bcm); + mutex_lock(&bcm->mutex); if (bcm->short_preamble) count = snprintf(buf, PAGE_SIZE, "1 (Short Preamble enabled)\n"); else count = snprintf(buf, PAGE_SIZE, "0 (Short Preamble disabled)\n"); - err = 0; - bcm43xx_unlock_noirq(bcm); + mutex_unlock(&bcm->mutex); - return err ? err : count; + return count; } static ssize_t bcm43xx_attr_preamble_store(struct device *dev, @@ -276,7 +280,6 @@ static ssize_t bcm43xx_attr_preamble_sto { struct bcm43xx_private *bcm = dev_to_bcm(dev); unsigned long flags; - int err; int value; if (!capable(CAP_NET_ADMIN)) @@ -285,14 +288,15 @@ static ssize_t bcm43xx_attr_preamble_sto value = get_boolean(buf, count); if (value < 0) return value; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); bcm->short_preamble = !!value; - err = 0; - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); - return err ? err : count; + return count; } static DEVICE_ATTR(shortpreamble, 0644, diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_wx.c linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_wx.c --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_wx.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_wx.c 2007-06-13 06:55:07.000000000 -0400 @@ -56,12 +56,11 @@ static int bcm43xx_wx_get_name(struct ne { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); int i; - unsigned long flags; struct bcm43xx_phyinfo *phy; char suffix[7] = { 0 }; int have_a = 0, have_b = 0, have_g = 0; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); for (i = 0; i < bcm->nr_80211_available; i++) { phy = &(bcm->core_80211_ext[i].phy); switch (phy->type) { @@ -77,7 +76,7 @@ static int bcm43xx_wx_get_name(struct ne assert(0); } } - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); i = 0; if (have_a) { @@ -111,7 +110,9 @@ static int bcm43xx_wx_set_channelfreq(st int freq; int err = -EINVAL; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); + if ((data->freq.m >= 0) && (data->freq.m <= 1000)) { channel = data->freq.m; freq = bcm43xx_channel_to_freq(bcm, channel); @@ -131,7 +132,8 @@ static int bcm43xx_wx_set_channelfreq(st err = 0; } out_unlock: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -143,11 +145,10 @@ static int bcm43xx_wx_get_channelfreq(st { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); struct bcm43xx_radioinfo *radio; - unsigned long flags; int err = -ENODEV; u16 channel; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); radio = bcm43xx_current_radio(bcm); channel = radio->channel; if (channel == 0xFF) 
{ @@ -162,7 +163,7 @@ static int bcm43xx_wx_get_channelfreq(st err = 0; out_unlock: - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -180,13 +181,15 @@ static int bcm43xx_wx_set_mode(struct ne if (mode == IW_MODE_AUTO) mode = BCM43xx_INITIAL_IWMODE; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) { if (bcm->ieee->iw_mode != mode) bcm43xx_set_iwmode(bcm, mode); } else bcm->ieee->iw_mode = mode; - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -197,11 +200,10 @@ static int bcm43xx_wx_get_mode(struct ne char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); data->mode = bcm->ieee->iw_mode; - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -214,7 +216,6 @@ static int bcm43xx_wx_get_rangeparams(st struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); struct iw_range *range = (struct iw_range *)extra; const struct ieee80211_geo *geo; - unsigned long flags; int i, j; struct bcm43xx_phyinfo *phy; @@ -254,7 +255,7 @@ static int bcm43xx_wx_get_rangeparams(st IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); phy = bcm43xx_current_phy(bcm); range->num_bitrates = 0; @@ -301,7 +302,7 @@ static int bcm43xx_wx_get_rangeparams(st } range->num_frequency = j; - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -314,11 +315,11 @@ static int bcm43xx_wx_set_nick(struct ne struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); size_t len; - bcm43xx_lock_noirq(bcm); + mutex_lock(&bcm->mutex); len = min((size_t)data->data.length, (size_t)IW_ESSID_MAX_SIZE); memcpy(bcm->nick, extra, len); bcm->nick[len] = '\0'; - bcm43xx_unlock_noirq(bcm); + mutex_unlock(&bcm->mutex); return 0; } @@ -331,12 +332,12 @@ static int bcm43xx_wx_get_nick(struct ne struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); size_t len; - bcm43xx_lock_noirq(bcm); + mutex_lock(&bcm->mutex); len = strlen(bcm->nick) + 1; memcpy(extra, bcm->nick, len); data->data.length = (__u16)len; data->data.flags = 1; - bcm43xx_unlock_noirq(bcm); + mutex_unlock(&bcm->mutex); return 0; } @@ -350,7 +351,8 @@ static int bcm43xx_wx_set_rts(struct net unsigned long flags; int err = -EINVAL; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (data->rts.disabled) { bcm->rts_threshold = BCM43xx_MAX_RTS_THRESHOLD; err = 0; @@ -361,7 +363,8 @@ static int bcm43xx_wx_set_rts(struct net err = 0; } } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -372,13 +375,12 @@ static int bcm43xx_wx_get_rts(struct net char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); data->rts.value = bcm->rts_threshold; data->rts.fixed = 0; data->rts.disabled = (bcm->rts_threshold == BCM43xx_MAX_RTS_THRESHOLD); - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -392,7 +394,8 @@ static int bcm43xx_wx_set_frag(struct ne unsigned long flags; int err = -EINVAL; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (data->frag.disabled) { 
bcm->ieee->fts = MAX_FRAG_THRESHOLD; err = 0; @@ -403,7 +406,8 @@ static int bcm43xx_wx_set_frag(struct ne err = 0; } } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -414,13 +418,12 @@ static int bcm43xx_wx_get_frag(struct ne char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); data->frag.value = bcm->ieee->fts; data->frag.fixed = 0; data->frag.disabled = (bcm->ieee->fts == MAX_FRAG_THRESHOLD); - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -442,7 +445,8 @@ static int bcm43xx_wx_set_xmitpower(stru return -EOPNOTSUPP; } - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) goto out_unlock; radio = bcm43xx_current_radio(bcm); @@ -466,7 +470,8 @@ static int bcm43xx_wx_set_xmitpower(stru err = 0; out_unlock: - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -478,10 +483,9 @@ static int bcm43xx_wx_get_xmitpower(stru { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); struct bcm43xx_radioinfo *radio; - unsigned long flags; int err = -ENODEV; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); if (bcm43xx_status(bcm) != BCM43xx_STAT_INITIALIZED) goto out_unlock; radio = bcm43xx_current_radio(bcm); @@ -493,7 +497,7 @@ static int bcm43xx_wx_get_xmitpower(stru err = 0; out_unlock: - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -580,7 +584,8 @@ static int bcm43xx_wx_set_interfmode(str return -EINVAL; } - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) { err = bcm43xx_radio_set_interference_mitigation(bcm, mode); if (err) { @@ -595,7 +600,8 @@ static int bcm43xx_wx_set_interfmode(str } else bcm43xx_current_radio(bcm)->interfmode = mode; } - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return err; } @@ -606,12 +612,11 @@ static int bcm43xx_wx_get_interfmode(str char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; int mode; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); mode = bcm43xx_current_radio(bcm)->interfmode; - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); switch (mode) { case BCM43xx_RADIO_INTERFMODE_NONE: @@ -641,9 +646,11 @@ static int bcm43xx_wx_set_shortpreamble( int on; on = *((int *)extra); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); bcm->short_preamble = !!on; - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -654,12 +661,11 @@ static int bcm43xx_wx_get_shortpreamble( char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; int on; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); on = bcm->short_preamble; - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); if (on) strncpy(extra, "1 (Short Preamble enabled)", MAX_WX_STRING); @@ -681,11 +687,13 @@ static int bcm43xx_wx_set_swencryption(s on = *((int *)extra); - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + 
spin_lock_irqsave(&bcm->irq_lock, flags); bcm->ieee->host_encrypt = !!on; bcm->ieee->host_decrypt = !!on; bcm->ieee->host_build_iv = !on; - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); return 0; } @@ -696,12 +704,11 @@ static int bcm43xx_wx_get_swencryption(s char *extra) { struct bcm43xx_private *bcm = bcm43xx_priv(net_dev); - unsigned long flags; int on; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); on = bcm->ieee->host_encrypt; - bcm43xx_unlock_irqsafe(bcm, flags); + mutex_unlock(&bcm->mutex); if (on) strncpy(extra, "1 (SW encryption enabled) ", MAX_WX_STRING); @@ -764,11 +771,13 @@ static int bcm43xx_wx_sprom_read(struct if (!sprom) goto out; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); err = -ENODEV; if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) err = bcm43xx_sprom_read(bcm, sprom); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); if (!err) data->data.length = sprom2hex(sprom, extra); kfree(sprom); @@ -809,11 +818,15 @@ static int bcm43xx_wx_sprom_write(struct if (err) goto out_kfree; - bcm43xx_lock_irqsafe(bcm, flags); + mutex_lock(&bcm->mutex); + spin_lock_irqsave(&bcm->irq_lock, flags); + spin_lock(&bcm->leds_lock); err = -ENODEV; if (bcm43xx_status(bcm) == BCM43xx_STAT_INITIALIZED) err = bcm43xx_sprom_write(bcm, sprom); - bcm43xx_unlock_irqsafe(bcm, flags); + spin_unlock(&bcm->leds_lock); + spin_unlock_irqrestore(&bcm->irq_lock, flags); + mutex_unlock(&bcm->mutex); out_kfree: kfree(sprom); out: diff -uprN linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_xmit.h linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_xmit.h --- linux-2.6.18/drivers/net/wireless/bcm43xx/bcm43xx_xmit.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/bcm43xx/bcm43xx_xmit.h 2007-06-13 06:55:07.000000000 -0400 @@ -137,14 +137,8 @@ struct bcm43xx_xmitstatus { u16 unknown; //FIXME }; -#define BCM43xx_TXSTAT_FLAG_ACK 0x01 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 0x02 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 0x04 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 0x08 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 0x10 -#define BCM43xx_TXSTAT_FLAG_IGNORE 0x20 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 0x40 -//TODO #define BCM43xx_TXSTAT_FLAG_??? 
0x80 +#define BCM43xx_TXSTAT_FLAG_AMPDU 0x10 +#define BCM43xx_TXSTAT_FLAG_INTER 0x20 u8 bcm43xx_plcp_get_ratecode_cck(const u8 bitrate); u8 bcm43xx_plcp_get_ratecode_ofdm(const u8 bitrate); diff -uprN linux-2.6.18/drivers/net/wireless/zd1211rw/zd_chip.c linux-2.6.18.ovz/drivers/net/wireless/zd1211rw/zd_chip.c --- linux-2.6.18/drivers/net/wireless/zd1211rw/zd_chip.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/net/wireless/zd1211rw/zd_chip.c 2007-06-13 06:55:07.000000000 -0400 @@ -717,7 +717,7 @@ static int zd1211b_hw_reset_phy(struct z { CR21, 0x0e }, { CR22, 0x23 }, { CR23, 0x90 }, { CR24, 0x14 }, { CR25, 0x40 }, { CR26, 0x10 }, { CR27, 0x10 }, { CR28, 0x7f }, { CR29, 0x80 }, - { CR30, 0x49 }, /* jointly decoder, no ASIC */ + { CR30, 0x4b }, /* ASIC/FWT, no jointly decoder */ { CR31, 0x60 }, { CR32, 0x43 }, { CR33, 0x08 }, { CR34, 0x06 }, { CR35, 0x0a }, { CR36, 0x00 }, { CR37, 0x00 }, { CR38, 0x38 }, { CR39, 0x0c }, diff -uprN linux-2.6.18/drivers/parisc/led.c linux-2.6.18.ovz/drivers/parisc/led.c --- linux-2.6.18/drivers/parisc/led.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/parisc/led.c 2007-06-13 06:55:07.000000000 -0400 @@ -684,7 +684,7 @@ int __init led_init(void) int ret; snprintf(lcd_text_default, sizeof(lcd_text_default), - "Linux %s", system_utsname.release); + "Linux %s", init_utsname()->release); /* Work around the buggy PDC of KittyHawk-machines */ switch (CPU_HVERSION) { diff -uprN linux-2.6.18/drivers/pci/pci-sysfs.c linux-2.6.18.ovz/drivers/pci/pci-sysfs.c --- linux-2.6.18/drivers/pci/pci-sysfs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/pci/pci-sysfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -571,6 +571,9 @@ int pci_create_sysfs_dev_files (struct p */ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { + if (!sysfs_initialized) + return; + if (pdev->cfg_size < 4096) sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); else diff -uprN linux-2.6.18/drivers/pci/probe.c linux-2.6.18.ovz/drivers/pci/probe.c --- linux-2.6.18/drivers/pci/probe.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/pci/probe.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses); EXPORT_SYMBOL(pci_root_buses); LIST_HEAD(pci_devices); +EXPORT_SYMBOL(pci_devices); #ifdef HAVE_PCI_LEGACY /** diff -uprN linux-2.6.18/drivers/pci/quirks.c linux-2.6.18.ovz/drivers/pci/quirks.c --- linux-2.6.18/drivers/pci/quirks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/pci/quirks.c 2007-06-13 06:55:07.000000000 -0400 @@ -685,33 +685,6 @@ static void __devinit quirk_vt82c598_id( } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C597_0, quirk_vt82c598_id ); -#ifdef CONFIG_ACPI_SLEEP - -/* - * Some VIA systems boot with the abnormal status flag set. This can cause - * the BIOS to re-POST the system on resume rather than passing control - * back to the OS. 
Clear the flag on boot - */ -static void __devinit quirk_via_abnormal_poweroff(struct pci_dev *dev) -{ - u32 reg; - - acpi_hw_register_read(ACPI_MTX_DO_NOT_LOCK, ACPI_REGISTER_PM1_STATUS, - ®); - - if (reg & 0x800) { - printk("Clearing abnormal poweroff flag\n"); - acpi_hw_register_write(ACPI_MTX_DO_NOT_LOCK, - ACPI_REGISTER_PM1_STATUS, - (u16)0x800); - } -} - -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, quirk_via_abnormal_poweroff); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, quirk_via_abnormal_poweroff); - -#endif - /* * CardBus controllers have a legacy base address that enables them * to respond as i82365 pcmcia controllers. We don't want them to diff -uprN linux-2.6.18/drivers/pcmcia/ds.c linux-2.6.18.ovz/drivers/pcmcia/ds.c --- linux-2.6.18/drivers/pcmcia/ds.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/pcmcia/ds.c 2007-06-13 06:55:07.000000000 -0400 @@ -1264,6 +1264,11 @@ static void pcmcia_bus_remove_socket(str socket->pcmcia_state.dead = 1; pccard_register_pcmcia(socket, NULL); + /* unregister any unbound devices */ + mutex_lock(&socket->skt_mutex); + pcmcia_card_remove(socket, NULL); + mutex_unlock(&socket->skt_mutex); + pcmcia_put_socket(socket); return; diff -uprN linux-2.6.18/drivers/rtc/rtc-max6902.c linux-2.6.18.ovz/drivers/rtc/rtc-max6902.c --- linux-2.6.18/drivers/rtc/rtc-max6902.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/rtc/rtc-max6902.c 2007-06-13 06:55:07.000000000 -0400 @@ -137,7 +137,7 @@ static int max6902_get_datetime(struct d dt->tm_min = BCD2BIN(chip->buf[2]); dt->tm_hour = BCD2BIN(chip->buf[3]); dt->tm_mday = BCD2BIN(chip->buf[4]); - dt->tm_mon = BCD2BIN(chip->buf[5] - 1); + dt->tm_mon = BCD2BIN(chip->buf[5]) - 1; dt->tm_wday = BCD2BIN(chip->buf[6]); dt->tm_year = BCD2BIN(chip->buf[7]); diff -uprN linux-2.6.18/drivers/rtc/rtc-pcf8563.c linux-2.6.18.ovz/drivers/rtc/rtc-pcf8563.c --- linux-2.6.18/drivers/rtc/rtc-pcf8563.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/rtc/rtc-pcf8563.c 2007-06-13 06:55:07.000000000 -0400 @@ -95,7 +95,7 @@ static int pcf8563_get_datetime(struct i tm->tm_wday = buf[PCF8563_REG_DW] & 0x07; tm->tm_mon = BCD2BIN(buf[PCF8563_REG_MO] & 0x1F) - 1; /* rtc mn 1-12 */ tm->tm_year = BCD2BIN(buf[PCF8563_REG_YR]) - + (buf[PCF8563_REG_MO] & PCF8563_MO_C ? 100 : 0); + + (buf[PCF8563_REG_MO] & PCF8563_MO_C ? 
0 : 100); dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, " "mday=%d, mon=%d, year=%d, wday=%d\n", @@ -135,7 +135,7 @@ static int pcf8563_set_datetime(struct i /* year and century */ buf[PCF8563_REG_YR] = BIN2BCD(tm->tm_year % 100); - if (tm->tm_year / 100) + if (tm->tm_year < 100) buf[PCF8563_REG_MO] |= PCF8563_MO_C; buf[PCF8563_REG_DW] = tm->tm_wday & 0x07; diff -uprN linux-2.6.18/drivers/sbus/char/bbc_envctrl.c linux-2.6.18.ovz/drivers/sbus/char/bbc_envctrl.c --- linux-2.6.18/drivers/sbus/char/bbc_envctrl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/sbus/char/bbc_envctrl.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,7 @@ static int errno; #include #include #include +#include #include "bbc_i2c.h" #include "max1617.h" diff -uprN linux-2.6.18/drivers/sbus/char/envctrl.c linux-2.6.18.ovz/drivers/sbus/char/envctrl.c --- linux-2.6.18/drivers/sbus/char/envctrl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/sbus/char/envctrl.c 2007-06-13 06:55:07.000000000 -0400 @@ -37,6 +37,7 @@ static int errno; #include #include #include +#include #define ENVCTRL_MINOR 162 diff -uprN linux-2.6.18/drivers/scsi/Kconfig linux-2.6.18.ovz/drivers/scsi/Kconfig --- linux-2.6.18/drivers/scsi/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -209,7 +209,7 @@ config SCSI_LOGGING there should be no noticeable performance impact as long as you have logging turned off. -menu "SCSI Transport Attributes" +menu "SCSI Transport" depends on SCSI config SCSI_SPI_ATTRS @@ -242,6 +242,8 @@ config SCSI_SAS_ATTRS If you wish to export transport-specific information about each attached SAS device to sysfs, say Y. +source "drivers/scsi/libsas/Kconfig" + endmenu menu "SCSI low-level drivers" @@ -431,6 +433,7 @@ config SCSI_AIC7XXX_OLD module will be called aic7xxx_old. source "drivers/scsi/aic7xxx/Kconfig.aic79xx" +source "drivers/scsi/aic94xx/Kconfig" # All the I2O code and drivers do not seem to be 64bit safe. 
config SCSI_DPT_I2O diff -uprN linux-2.6.18/drivers/scsi/Makefile linux-2.6.18.ovz/drivers/scsi/Makefile --- linux-2.6.18/drivers/scsi/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/Makefile 2007-06-13 06:55:08.000000000 -0400 @@ -32,6 +32,7 @@ obj-$(CONFIG_SCSI_SPI_ATTRS) += scsi_tra obj-$(CONFIG_SCSI_FC_ATTRS) += scsi_transport_fc.o obj-$(CONFIG_SCSI_ISCSI_ATTRS) += scsi_transport_iscsi.o obj-$(CONFIG_SCSI_SAS_ATTRS) += scsi_transport_sas.o +obj-$(CONFIG_SCSI_SAS_LIBSAS) += libsas/ obj-$(CONFIG_ISCSI_TCP) += libiscsi.o iscsi_tcp.o obj-$(CONFIG_INFINIBAND_ISER) += libiscsi.o @@ -67,6 +68,7 @@ obj-$(CONFIG_SCSI_AIC7XXX) += aic7xxx/ obj-$(CONFIG_SCSI_AIC79XX) += aic7xxx/ obj-$(CONFIG_SCSI_AACRAID) += aacraid/ obj-$(CONFIG_SCSI_AIC7XXX_OLD) += aic7xxx_old.o +obj-$(CONFIG_SCSI_AIC94XX) += aic94xx/ obj-$(CONFIG_SCSI_IPS) += ips.o obj-$(CONFIG_SCSI_FD_MCS) += fd_mcs.o obj-$(CONFIG_SCSI_FUTURE_DOMAIN)+= fdomain.o diff -uprN linux-2.6.18/drivers/scsi/aic7xxx/aic7xxx_osm.c linux-2.6.18.ovz/drivers/scsi/aic7xxx/aic7xxx_osm.c --- linux-2.6.18/drivers/scsi/aic7xxx/aic7xxx_osm.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/aic7xxx/aic7xxx_osm.c 2007-06-13 06:55:07.000000000 -0400 @@ -2539,15 +2539,28 @@ static void ahc_linux_set_iu(struct scsi static void ahc_linux_get_signalling(struct Scsi_Host *shost) { struct ahc_softc *ahc = *(struct ahc_softc **)shost->hostdata; - u8 mode = ahc_inb(ahc, SBLKCTL); + unsigned long flags; + u8 mode; - if (mode & ENAB40) - spi_signalling(shost) = SPI_SIGNAL_LVD; - else if (mode & ENAB20) + if (!(ahc->features & AHC_ULTRA2)) { + /* non-LVD chipset, may not have SBLKCTL reg */ spi_signalling(shost) = ahc->features & AHC_HVD ? SPI_SIGNAL_HVD : SPI_SIGNAL_SE; + return; + } + + ahc_lock(ahc, &flags); + ahc_pause(ahc); + mode = ahc_inb(ahc, SBLKCTL); + ahc_unpause(ahc); + ahc_unlock(ahc, &flags); + + if (mode & ENAB40) + spi_signalling(shost) = SPI_SIGNAL_LVD; + else if (mode & ENAB20) + spi_signalling(shost) = SPI_SIGNAL_SE; else spi_signalling(shost) = SPI_SIGNAL_UNKNOWN; } diff -uprN linux-2.6.18/drivers/scsi/aic94xx/Kconfig linux-2.6.18.ovz/drivers/scsi/aic94xx/Kconfig --- linux-2.6.18/drivers/scsi/aic94xx/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,41 @@ +# +# Kernel configuration file for aic94xx SAS/SATA driver. +# +# Copyright (c) 2005 Adaptec, Inc. All rights reserved. +# Copyright (c) 2005 Luben Tuikov +# +# This file is licensed under GPLv2. +# +# This file is part of the aic94xx driver. +# +# The aic94xx driver is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; version 2 of the +# License. +# +# The aic94xx driver is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Aic94xx Driver; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# + +config SCSI_AIC94XX + tristate "Adaptec AIC94xx SAS/SATA support" + depends on PCI + select SCSI_SAS_LIBSAS + help + This driver supports Adaptec's SAS/SATA 3Gb/s 64 bit PCI-X + AIC94xx chip based host adapters. 
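(The aic7xxx_osm.c hunk above deserves a note: the fix brackets the SBLKCTL read with ahc_pause()/ahc_unpause() under ahc_lock(), and exits early for non-ULTRA2 parts that may not implement the register at all. A schematic userspace rendering of that "quiesce under lock, sample, restore in reverse order" pattern, with every name here made up for illustration:

	#include <pthread.h>

	struct chip {
		pthread_mutex_t lock;	/* stands in for the host adapter lock */
		unsigned char sblkctl;	/* register that is unstable while running */
	};

	static void chip_pause(struct chip *c)   { /* halt the sequencer */ }
	static void chip_unpause(struct chip *c) { /* let it run again */ }

	/* sample a register that is only valid while the chip is quiesced */
	static unsigned char read_mode_register(struct chip *c)
	{
		unsigned char mode;

		pthread_mutex_lock(&c->lock);
		chip_pause(c);
		mode = c->sblkctl;
		chip_unpause(c);
		pthread_mutex_unlock(&c->lock);
		return mode;
	}
)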
+ +config AIC94XX_DEBUG + bool "Compile in debug mode" + default y + depends on SCSI_AIC94XX + help + Compiles the aic94xx driver in debug mode. In debug mode, + the driver prints some messages to the console. diff -uprN linux-2.6.18/drivers/scsi/aic94xx/Makefile linux-2.6.18.ovz/drivers/scsi/aic94xx/Makefile --- linux-2.6.18/drivers/scsi/aic94xx/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/Makefile 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,39 @@ +# +# Makefile for Adaptec aic94xx SAS/SATA driver. +# +# Copyright (C) 2005 Adaptec, Inc. All rights reserved. +# Copyright (C) 2005 Luben Tuikov +# +# This file is licensed under GPLv2. +# +# This file is part of the aic94xx driver. +# +# The aic94xx driver is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; version 2 of the +# License. +# +# The aic94xx driver is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with the aic94xx driver; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +ifeq ($(CONFIG_AIC94XX_DEBUG),y) + EXTRA_CFLAGS += -DASD_DEBUG -DASD_ENTER_EXIT +endif + +obj-$(CONFIG_SCSI_AIC94XX) += aic94xx.o +aic94xx-y += aic94xx_init.o \ + aic94xx_hwi.o \ + aic94xx_reg.o \ + aic94xx_sds.o \ + aic94xx_seq.o \ + aic94xx_dump.o \ + aic94xx_scb.o \ + aic94xx_dev.o \ + aic94xx_tmf.o \ + aic94xx_task.o diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx.h --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,114 @@ +/* + * Aic94xx SAS/SATA driver header file. + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * $Id: 0001-2.6.18-openvz-combined-028.035.patch,v 1.1 2007-07-23 23:01:52 niro Exp $ + */ + +#ifndef _AIC94XX_H_ +#define _AIC94XX_H_ + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <scsi/libsas.h> + +#define ASD_DRIVER_NAME "aic94xx" +#define ASD_DRIVER_DESCRIPTION "Adaptec aic94xx SAS/SATA driver" + +#define asd_printk(fmt, ...) 
printk(KERN_NOTICE ASD_DRIVER_NAME ": " fmt, ## __VA_ARGS__) + +#ifdef ASD_ENTER_EXIT +#define ENTER printk(KERN_NOTICE "%s: ENTER %s\n", ASD_DRIVER_NAME, \ + __FUNCTION__) +#define EXIT printk(KERN_NOTICE "%s: --EXIT %s\n", ASD_DRIVER_NAME, \ + __FUNCTION__) +#else +#define ENTER +#define EXIT +#endif + +#ifdef ASD_DEBUG +#define ASD_DPRINTK asd_printk +#else +#define ASD_DPRINTK(fmt, ...) +#endif + +/* 2*ITNL timeout + 1 second */ +#define AIC94XX_SCB_TIMEOUT (5*HZ) + +extern kmem_cache_t *asd_dma_token_cache; +extern kmem_cache_t *asd_ascb_cache; +extern char sas_addr_str[2*SAS_ADDR_SIZE + 1]; + +static inline void asd_stringify_sas_addr(char *p, const u8 *sas_addr) +{ + int i; + for (i = 0; i < SAS_ADDR_SIZE; i++, p += 2) + snprintf(p, 3, "%02X", sas_addr[i]); + *p = '\0'; +} + +static inline void asd_destringify_sas_addr(u8 *sas_addr, const char *p) +{ + int i; + for (i = 0; i < SAS_ADDR_SIZE; i++) { + u8 h, l; + if (!*p) + break; + h = isdigit(*p) ? *p-'0' : *p-'A'+10; + p++; + l = isdigit(*p) ? *p-'0' : *p-'A'+10; + p++; + sas_addr[i] = (h<<4) | l; + } +} + +struct asd_ha_struct; +struct asd_ascb; + +int asd_read_ocm(struct asd_ha_struct *asd_ha); +int asd_read_flash(struct asd_ha_struct *asd_ha); + +int asd_dev_found(struct domain_device *dev); +void asd_dev_gone(struct domain_device *dev); + +void asd_invalidate_edb(struct asd_ascb *ascb, int edb_id); + +int asd_execute_task(struct sas_task *, int num, unsigned long gfp_flags); + +/* ---------- TMFs ---------- */ +int asd_abort_task(struct sas_task *); +int asd_abort_task_set(struct domain_device *, u8 *lun); +int asd_clear_aca(struct domain_device *, u8 *lun); +int asd_clear_task_set(struct domain_device *, u8 *lun); +int asd_lu_reset(struct domain_device *, u8 *lun); +int asd_query_task(struct sas_task *); + +/* ---------- Adapter and Port management ---------- */ +int asd_clear_nexus_port(struct asd_sas_port *port); +int asd_clear_nexus_ha(struct sas_ha_struct *sas_ha); + +/* ---------- Phy Management ---------- */ +int asd_control_phy(struct asd_sas_phy *phy, enum phy_func func); + +#endif diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dev.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dev.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dev.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,353 @@ +/* + * Aic94xx SAS/SATA DDB management + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * $Id: 0001-2.6.18-openvz-combined-028.035.patch,v 1.1 2007-07-23 23:01:52 niro Exp $ + */ + +#include "aic94xx.h" +#include "aic94xx_hwi.h" +#include "aic94xx_reg.h" +#include "aic94xx_sas.h" + +#define FIND_FREE_DDB(_ha) find_first_zero_bit((_ha)->hw_prof.ddb_bitmap, \ + (_ha)->hw_prof.max_ddbs) +#define SET_DDB(_ddb, _ha) set_bit(_ddb, (_ha)->hw_prof.ddb_bitmap) +#define CLEAR_DDB(_ddb, _ha) clear_bit(_ddb, (_ha)->hw_prof.ddb_bitmap) + +static inline int asd_get_ddb(struct asd_ha_struct *asd_ha) +{ + unsigned long flags; + int ddb, i; + + spin_lock_irqsave(&asd_ha->hw_prof.ddb_lock, flags); + ddb = FIND_FREE_DDB(asd_ha); + if (ddb >= asd_ha->hw_prof.max_ddbs) { + ddb = -ENOMEM; + spin_unlock_irqrestore(&asd_ha->hw_prof.ddb_lock, flags); + goto out; + } + SET_DDB(ddb, asd_ha); + spin_unlock_irqrestore(&asd_ha->hw_prof.ddb_lock, flags); + + for (i = 0; i < sizeof(struct asd_ddb_ssp_smp_target_port); i+= 4) + asd_ddbsite_write_dword(asd_ha, ddb, i, 0); +out: + return ddb; +} + +#define INIT_CONN_TAG offsetof(struct asd_ddb_ssp_smp_target_port, init_conn_tag) +#define DEST_SAS_ADDR offsetof(struct asd_ddb_ssp_smp_target_port, dest_sas_addr) +#define SEND_QUEUE_HEAD offsetof(struct asd_ddb_ssp_smp_target_port, send_queue_head) +#define DDB_TYPE offsetof(struct asd_ddb_ssp_smp_target_port, ddb_type) +#define CONN_MASK offsetof(struct asd_ddb_ssp_smp_target_port, conn_mask) +#define DDB_TARG_FLAGS offsetof(struct asd_ddb_ssp_smp_target_port, flags) +#define DDB_TARG_FLAGS2 offsetof(struct asd_ddb_stp_sata_target_port, flags2) +#define EXEC_QUEUE_TAIL offsetof(struct asd_ddb_ssp_smp_target_port, exec_queue_tail) +#define SEND_QUEUE_TAIL offsetof(struct asd_ddb_ssp_smp_target_port, send_queue_tail) +#define SISTER_DDB offsetof(struct asd_ddb_ssp_smp_target_port, sister_ddb) +#define MAX_CCONN offsetof(struct asd_ddb_ssp_smp_target_port, max_concurrent_conn) +#define NUM_CTX offsetof(struct asd_ddb_ssp_smp_target_port, num_contexts) +#define ATA_CMD_SCBPTR offsetof(struct asd_ddb_stp_sata_target_port, ata_cmd_scbptr) +#define SATA_TAG_ALLOC_MASK offsetof(struct asd_ddb_stp_sata_target_port, sata_tag_alloc_mask) +#define NUM_SATA_TAGS offsetof(struct asd_ddb_stp_sata_target_port, num_sata_tags) +#define SATA_STATUS offsetof(struct asd_ddb_stp_sata_target_port, sata_status) +#define NCQ_DATA_SCB_PTR offsetof(struct asd_ddb_stp_sata_target_port, ncq_data_scb_ptr) +#define ITNL_TIMEOUT offsetof(struct asd_ddb_ssp_smp_target_port, itnl_timeout) + +static inline void asd_free_ddb(struct asd_ha_struct *asd_ha, int ddb) +{ + unsigned long flags; + + if (!ddb || ddb >= 0xFFFF) + return; + asd_ddbsite_write_byte(asd_ha, ddb, DDB_TYPE, DDB_TYPE_UNUSED); + spin_lock_irqsave(&asd_ha->hw_prof.ddb_lock, flags); + CLEAR_DDB(ddb, asd_ha); + spin_unlock_irqrestore(&asd_ha->hw_prof.ddb_lock, flags); +} + +static inline void asd_set_ddb_type(struct domain_device *dev) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + int ddb = (int) (unsigned long) dev->lldd_dev; + + if (dev->dev_type == SATA_PM_PORT) + asd_ddbsite_write_byte(asd_ha,ddb, DDB_TYPE, DDB_TYPE_PM_PORT); + else if (dev->tproto) + asd_ddbsite_write_byte(asd_ha,ddb, DDB_TYPE, DDB_TYPE_TARGET); + else + asd_ddbsite_write_byte(asd_ha,ddb,DDB_TYPE,DDB_TYPE_INITIATOR); +} + +static int asd_init_sata_tag_ddb(struct 
domain_device *dev) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + int ddb, i; + + ddb = asd_get_ddb(asd_ha); + if (ddb < 0) + return ddb; + + for (i = 0; i < sizeof(struct asd_ddb_sata_tag); i += 2) + asd_ddbsite_write_word(asd_ha, ddb, i, 0xFFFF); + + asd_ddbsite_write_word(asd_ha, (int) (unsigned long) dev->lldd_dev, + SISTER_DDB, ddb); + return 0; +} + +static inline int asd_init_sata(struct domain_device *dev) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + int ddb = (int) (unsigned long) dev->lldd_dev; + u32 qdepth = 0; + int res = 0; + + asd_ddbsite_write_word(asd_ha, ddb, ATA_CMD_SCBPTR, 0xFFFF); + if ((dev->dev_type == SATA_DEV || dev->dev_type == SATA_PM_PORT) && + dev->sata_dev.identify_device && + dev->sata_dev.identify_device[10] != 0) { + u16 w75 = le16_to_cpu(dev->sata_dev.identify_device[75]); + u16 w76 = le16_to_cpu(dev->sata_dev.identify_device[76]); + + if (w76 & 0x100) /* NCQ? */ + qdepth = (w75 & 0x1F) + 1; + asd_ddbsite_write_dword(asd_ha, ddb, SATA_TAG_ALLOC_MASK, + (1ULL<<qdepth)-1); + asd_ddbsite_write_byte(asd_ha, ddb, NUM_SATA_TAGS, qdepth); + } + if (dev->dev_type == SATA_DEV || dev->dev_type == SATA_PM || + dev->dev_type == SATA_PM_PORT) { + struct dev_to_host_fis *fis = (struct dev_to_host_fis *) + dev->frame_rcvd; + asd_ddbsite_write_byte(asd_ha, ddb, SATA_STATUS, fis->status); + } + asd_ddbsite_write_word(asd_ha, ddb, NCQ_DATA_SCB_PTR, 0xFFFF); + if (qdepth > 0) + res = asd_init_sata_tag_ddb(dev); + return res; +} + +static int asd_init_target_ddb(struct domain_device *dev) +{ + int ddb, i; + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + u8 flags = 0; + + ddb = asd_get_ddb(asd_ha); + if (ddb < 0) + return ddb; + + dev->lldd_dev = (void *) (unsigned long) ddb; + + asd_ddbsite_write_byte(asd_ha, ddb, 0, DDB_TP_CONN_TYPE); + asd_ddbsite_write_byte(asd_ha, ddb, 1, 0); + asd_ddbsite_write_word(asd_ha, ddb, INIT_CONN_TAG, 0xFFFF); + for (i = 0; i < SAS_ADDR_SIZE; i++) + asd_ddbsite_write_byte(asd_ha, ddb, DEST_SAS_ADDR+i, + dev->sas_addr[i]); + asd_ddbsite_write_word(asd_ha, ddb, SEND_QUEUE_HEAD, 0xFFFF); + asd_set_ddb_type(dev); + asd_ddbsite_write_byte(asd_ha, ddb, CONN_MASK, dev->port->phy_mask); + if (dev->port->oob_mode != SATA_OOB_MODE) { + flags |= OPEN_REQUIRED; + if ((dev->dev_type == SATA_DEV) || + (dev->tproto & SAS_PROTO_STP)) { + struct smp_resp *rps_resp = &dev->sata_dev.rps_resp; + if (rps_resp->frame_type == SMP_RESPONSE && + rps_resp->function == SMP_REPORT_PHY_SATA && + rps_resp->result == SMP_RESP_FUNC_ACC) { + if (rps_resp->rps.affil_valid) + flags |= STP_AFFIL_POL; + if (rps_resp->rps.affil_supp) + flags |= SUPPORTS_AFFIL; + } + } else { + flags |= CONCURRENT_CONN_SUPP; + if (!dev->parent && + (dev->dev_type == EDGE_DEV || + dev->dev_type == FANOUT_DEV)) + asd_ddbsite_write_byte(asd_ha, ddb, MAX_CCONN, + 4); + else + asd_ddbsite_write_byte(asd_ha, ddb, MAX_CCONN, + dev->pathways); + asd_ddbsite_write_byte(asd_ha, ddb, NUM_CTX, 1); + } + } + if (dev->dev_type == SATA_PM) + flags |= SATA_MULTIPORT; + asd_ddbsite_write_byte(asd_ha, ddb, DDB_TARG_FLAGS, flags); + + flags = 0; + if (dev->tproto & SAS_PROTO_STP) + flags |= STP_CL_POL_NO_TX; + asd_ddbsite_write_byte(asd_ha, ddb, DDB_TARG_FLAGS2, flags); + + asd_ddbsite_write_word(asd_ha, ddb, EXEC_QUEUE_TAIL, 0xFFFF); + asd_ddbsite_write_word(asd_ha, ddb, SEND_QUEUE_TAIL, 0xFFFF); + asd_ddbsite_write_word(asd_ha, ddb, SISTER_DDB, 0xFFFF); + + if (dev->dev_type == SATA_DEV || (dev->tproto & SAS_PROTO_STP)) { + i = asd_init_sata(dev); + if (i < 0) { + asd_free_ddb(asd_ha, ddb); + return i; + } + } + + if (dev->dev_type == SAS_END_DEV) { + struct
sas_end_device *rdev = rphy_to_end_device(dev->rphy); + if (rdev->I_T_nexus_loss_timeout > 0) + asd_ddbsite_write_word(asd_ha, ddb, ITNL_TIMEOUT, + min(rdev->I_T_nexus_loss_timeout, + (u16)ITNL_TIMEOUT_CONST)); + else + asd_ddbsite_write_word(asd_ha, ddb, ITNL_TIMEOUT, + (u16)ITNL_TIMEOUT_CONST); + } + return 0; +} + +static int asd_init_sata_pm_table_ddb(struct domain_device *dev) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + int ddb, i; + + ddb = asd_get_ddb(asd_ha); + if (ddb < 0) + return ddb; + + for (i = 0; i < 32; i += 2) + asd_ddbsite_write_word(asd_ha, ddb, i, 0xFFFF); + + asd_ddbsite_write_word(asd_ha, (int) (unsigned long) dev->lldd_dev, + SISTER_DDB, ddb); + + return 0; +} + +#define PM_PORT_FLAGS offsetof(struct asd_ddb_sata_pm_port, pm_port_flags) +#define PARENT_DDB offsetof(struct asd_ddb_sata_pm_port, parent_ddb) + +/** + * asd_init_sata_pm_port_ddb -- SATA Port Multiplier Port + * dev: pointer to domain device + * + * For SATA Port Multiplier Ports we need to allocate one SATA Port + * Multiplier Port DDB and depending on whether the target on it + * supports SATA II NCQ, one SATA Tag DDB. + */ +static int asd_init_sata_pm_port_ddb(struct domain_device *dev) +{ + int ddb, i, parent_ddb, pmtable_ddb; + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + u8 flags; + + ddb = asd_get_ddb(asd_ha); + if (ddb < 0) + return ddb; + + asd_set_ddb_type(dev); + flags = (dev->sata_dev.port_no << 4) | PM_PORT_SET; + asd_ddbsite_write_byte(asd_ha, ddb, PM_PORT_FLAGS, flags); + asd_ddbsite_write_word(asd_ha, ddb, SISTER_DDB, 0xFFFF); + asd_ddbsite_write_word(asd_ha, ddb, ATA_CMD_SCBPTR, 0xFFFF); + asd_init_sata(dev); + + parent_ddb = (int) (unsigned long) dev->parent->lldd_dev; + asd_ddbsite_write_word(asd_ha, ddb, PARENT_DDB, parent_ddb); + pmtable_ddb = asd_ddbsite_read_word(asd_ha, parent_ddb, SISTER_DDB); + asd_ddbsite_write_word(asd_ha, pmtable_ddb, dev->sata_dev.port_no,ddb); + + if (asd_ddbsite_read_byte(asd_ha, ddb, NUM_SATA_TAGS) > 0) { + i = asd_init_sata_tag_ddb(dev); + if (i < 0) { + asd_free_ddb(asd_ha, ddb); + return i; + } + } + return 0; +} + +static int asd_init_initiator_ddb(struct domain_device *dev) +{ + return -ENODEV; +} + +/** + * asd_init_sata_pm_ddb -- SATA Port Multiplier + * dev: pointer to domain device + * + * For STP and direct-attached SATA Port Multipliers we need + * one target port DDB entry and one SATA PM table DDB entry. 
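+ * The two sites end up cross-linked: asd_init_target_ddb() stores the
+ * port DDB index in dev->lldd_dev, and asd_init_sata_pm_table_ddb()
+ * then records its table DDB in that site's SISTER_DDB word, which is
+ * also how asd_dev_gone() later finds and frees both sites.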
+ */ +static int asd_init_sata_pm_ddb(struct domain_device *dev) +{ + int res = 0; + + res = asd_init_target_ddb(dev); + if (res) + goto out; + res = asd_init_sata_pm_table_ddb(dev); + if (res) + asd_free_ddb(dev->port->ha->lldd_ha, + (int) (unsigned long) dev->lldd_dev); +out: + return res; +} + +int asd_dev_found(struct domain_device *dev) +{ + int res = 0; + + switch (dev->dev_type) { + case SATA_PM: + res = asd_init_sata_pm_ddb(dev); + break; + case SATA_PM_PORT: + res = asd_init_sata_pm_port_ddb(dev); + break; + default: + if (dev->tproto) + res = asd_init_target_ddb(dev); + else + res = asd_init_initiator_ddb(dev); + } + return res; +} + +void asd_dev_gone(struct domain_device *dev) +{ + int ddb, sister_ddb; + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + + ddb = (int) (unsigned long) dev->lldd_dev; + sister_ddb = asd_ddbsite_read_word(asd_ha, ddb, SISTER_DDB); + + if (sister_ddb != 0xFFFF) + asd_free_ddb(asd_ha, sister_ddb); + asd_free_ddb(asd_ha, ddb); + dev->lldd_dev = NULL; +} diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dump.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dump.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dump.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,959 @@ +/* + * Aic94xx SAS/SATA driver dump interface. + * + * Copyright (C) 2004 Adaptec, Inc. All rights reserved. + * Copyright (C) 2004 David Chaw + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * 2005/07/14/LT Complete overhaul of this file. Update pages, register + * locations, names, etc. Make use of macros. Print more information. + * Print all cseq and lseq mip and mdp. 
+ * + */ + +#include "linux/pci.h" +#include "aic94xx.h" +#include "aic94xx_reg.h" +#include "aic94xx_reg_def.h" +#include "aic94xx_sas.h" + +#include "aic94xx_dump.h" + +#ifdef ASD_DEBUG + +#define MD(x) (1 << (x)) +#define MODE_COMMON (1 << 31) +#define MODE_0_7 (0xFF) + +static const struct lseq_cio_regs { + char *name; + u32 offs; + u8 width; + u32 mode; +} LSEQmCIOREGS[] = { + {"LmMnSCBPTR", 0x20, 16, MD(0)|MD(1)|MD(2)|MD(3)|MD(4) }, + {"LmMnDDBPTR", 0x22, 16, MD(0)|MD(1)|MD(2)|MD(3)|MD(4) }, + {"LmREQMBX", 0x30, 32, MODE_COMMON }, + {"LmRSPMBX", 0x34, 32, MODE_COMMON }, + {"LmMnINT", 0x38, 32, MODE_0_7 }, + {"LmMnINTEN", 0x3C, 32, MODE_0_7 }, + {"LmXMTPRIMD", 0x40, 32, MODE_COMMON }, + {"LmXMTPRIMCS", 0x44, 8, MODE_COMMON }, + {"LmCONSTAT", 0x45, 8, MODE_COMMON }, + {"LmMnDMAERRS", 0x46, 8, MD(0)|MD(1) }, + {"LmMnSGDMAERRS", 0x47, 8, MD(0)|MD(1) }, + {"LmMnEXPHDRP", 0x48, 8, MD(0) }, + {"LmMnSASAALIGN", 0x48, 8, MD(1) }, + {"LmMnMSKHDRP", 0x49, 8, MD(0) }, + {"LmMnSTPALIGN", 0x49, 8, MD(1) }, + {"LmMnRCVHDRP", 0x4A, 8, MD(0) }, + {"LmMnXMTHDRP", 0x4A, 8, MD(1) }, + {"LmALIGNMODE", 0x4B, 8, MD(1) }, + {"LmMnEXPRCVCNT", 0x4C, 32, MD(0) }, + {"LmMnXMTCNT", 0x4C, 32, MD(1) }, + {"LmMnCURRTAG", 0x54, 16, MD(0) }, + {"LmMnPREVTAG", 0x56, 16, MD(0) }, + {"LmMnACKOFS", 0x58, 8, MD(1) }, + {"LmMnXFRLVL", 0x59, 8, MD(0)|MD(1) }, + {"LmMnSGDMACTL", 0x5A, 8, MD(0)|MD(1) }, + {"LmMnSGDMASTAT", 0x5B, 8, MD(0)|MD(1) }, + {"LmMnDDMACTL", 0x5C, 8, MD(0)|MD(1) }, + {"LmMnDDMASTAT", 0x5D, 8, MD(0)|MD(1) }, + {"LmMnDDMAMODE", 0x5E, 16, MD(0)|MD(1) }, + {"LmMnPIPECTL", 0x61, 8, MD(0)|MD(1) }, + {"LmMnACTSCB", 0x62, 16, MD(0)|MD(1) }, + {"LmMnSGBHADR", 0x64, 8, MD(0)|MD(1) }, + {"LmMnSGBADR", 0x65, 8, MD(0)|MD(1) }, + {"LmMnSGDCNT", 0x66, 8, MD(0)|MD(1) }, + {"LmMnSGDMADR", 0x68, 32, MD(0)|MD(1) }, + {"LmMnSGDMADR", 0x6C, 32, MD(0)|MD(1) }, + {"LmMnXFRCNT", 0x70, 32, MD(0)|MD(1) }, + {"LmMnXMTCRC", 0x74, 32, MD(1) }, + {"LmCURRTAG", 0x74, 16, MD(0) }, + {"LmPREVTAG", 0x76, 16, MD(0) }, + {"LmMnDPSEL", 0x7B, 8, MD(0)|MD(1) }, + {"LmDPTHSTAT", 0x7C, 8, MODE_COMMON }, + {"LmMnHOLDLVL", 0x7D, 8, MD(0) }, + {"LmMnSATAFS", 0x7E, 8, MD(1) }, + {"LmMnCMPLTSTAT", 0x7F, 8, MD(0)|MD(1) }, + {"LmPRMSTAT0", 0x80, 32, MODE_COMMON }, + {"LmPRMSTAT1", 0x84, 32, MODE_COMMON }, + {"LmGPRMINT", 0x88, 8, MODE_COMMON }, + {"LmMnCURRSCB", 0x8A, 16, MD(0) }, + {"LmPRMICODE", 0x8C, 32, MODE_COMMON }, + {"LmMnRCVCNT", 0x90, 16, MD(0) }, + {"LmMnBUFSTAT", 0x92, 16, MD(0) }, + {"LmMnXMTHDRSIZE",0x92, 8, MD(1) }, + {"LmMnXMTSIZE", 0x93, 8, MD(1) }, + {"LmMnTGTXFRCNT", 0x94, 32, MD(0) }, + {"LmMnEXPROFS", 0x98, 32, MD(0) }, + {"LmMnXMTROFS", 0x98, 32, MD(1) }, + {"LmMnRCVROFS", 0x9C, 32, MD(0) }, + {"LmCONCTL", 0xA0, 16, MODE_COMMON }, + {"LmBITLTIMER", 0xA2, 16, MODE_COMMON }, + {"LmWWNLOW", 0xA8, 32, MODE_COMMON }, + {"LmWWNHIGH", 0xAC, 32, MODE_COMMON }, + {"LmMnFRMERR", 0xB0, 32, MD(0) }, + {"LmMnFRMERREN", 0xB4, 32, MD(0) }, + {"LmAWTIMER", 0xB8, 16, MODE_COMMON }, + {"LmAWTCTL", 0xBA, 8, MODE_COMMON }, + {"LmMnHDRCMPS", 0xC0, 32, MD(0) }, + {"LmMnXMTSTAT", 0xC4, 8, MD(1) }, + {"LmHWTSTATEN", 0xC5, 8, MODE_COMMON }, + {"LmMnRRDYRC", 0xC6, 8, MD(0) }, + {"LmMnRRDYTC", 0xC6, 8, MD(1) }, + {"LmHWTSTAT", 0xC7, 8, MODE_COMMON }, + {"LmMnDATABUFADR",0xC8, 16, MD(0)|MD(1) }, + {"LmDWSSTATUS", 0xCB, 8, MODE_COMMON }, + {"LmMnACTSTAT", 0xCE, 16, MD(0)|MD(1) }, + {"LmMnREQSCB", 0xD2, 16, MD(0)|MD(1) }, + {"LmXXXPRIM", 0xD4, 32, MODE_COMMON }, + {"LmRCVASTAT", 0xD9, 8, MODE_COMMON }, + {"LmINTDIS1", 0xDA, 8, MODE_COMMON }, + 
{"LmPSTORESEL", 0xDB, 8, MODE_COMMON }, + {"LmPSTORE", 0xDC, 32, MODE_COMMON }, + {"LmPRIMSTAT0EN", 0xE0, 32, MODE_COMMON }, + {"LmPRIMSTAT1EN", 0xE4, 32, MODE_COMMON }, + {"LmDONETCTL", 0xF2, 16, MODE_COMMON }, + {NULL, 0, 0, 0 } +}; +/* +static struct lseq_cio_regs LSEQmOOBREGS[] = { + {"OOB_BFLTR" ,0x100, 8, MD(5)}, + {"OOB_INIT_MIN" ,0x102,16, MD(5)}, + {"OOB_INIT_MAX" ,0x104,16, MD(5)}, + {"OOB_INIT_NEG" ,0x106,16, MD(5)}, + {"OOB_SAS_MIN" ,0x108,16, MD(5)}, + {"OOB_SAS_MAX" ,0x10A,16, MD(5)}, + {"OOB_SAS_NEG" ,0x10C,16, MD(5)}, + {"OOB_WAKE_MIN" ,0x10E,16, MD(5)}, + {"OOB_WAKE_MAX" ,0x110,16, MD(5)}, + {"OOB_WAKE_NEG" ,0x112,16, MD(5)}, + {"OOB_IDLE_MAX" ,0x114,16, MD(5)}, + {"OOB_BURST_MAX" ,0x116,16, MD(5)}, + {"OOB_XMIT_BURST" ,0x118, 8, MD(5)}, + {"OOB_SEND_PAIRS" ,0x119, 8, MD(5)}, + {"OOB_INIT_IDLE" ,0x11A, 8, MD(5)}, + {"OOB_INIT_NEGO" ,0x11C, 8, MD(5)}, + {"OOB_SAS_IDLE" ,0x11E, 8, MD(5)}, + {"OOB_SAS_NEGO" ,0x120, 8, MD(5)}, + {"OOB_WAKE_IDLE" ,0x122, 8, MD(5)}, + {"OOB_WAKE_NEGO" ,0x124, 8, MD(5)}, + {"OOB_DATA_KBITS" ,0x126, 8, MD(5)}, + {"OOB_BURST_DATA" ,0x128,32, MD(5)}, + {"OOB_ALIGN_0_DATA" ,0x12C,32, MD(5)}, + {"OOB_ALIGN_1_DATA" ,0x130,32, MD(5)}, + {"OOB_SYNC_DATA" ,0x134,32, MD(5)}, + {"OOB_D10_2_DATA" ,0x138,32, MD(5)}, + {"OOB_PHY_RST_CNT" ,0x13C,32, MD(5)}, + {"OOB_SIG_GEN" ,0x140, 8, MD(5)}, + {"OOB_XMIT" ,0x141, 8, MD(5)}, + {"FUNCTION_MAKS" ,0x142, 8, MD(5)}, + {"OOB_MODE" ,0x143, 8, MD(5)}, + {"CURRENT_STATUS" ,0x144, 8, MD(5)}, + {"SPEED_MASK" ,0x145, 8, MD(5)}, + {"PRIM_COUNT" ,0x146, 8, MD(5)}, + {"OOB_SIGNALS" ,0x148, 8, MD(5)}, + {"OOB_DATA_DET" ,0x149, 8, MD(5)}, + {"OOB_TIME_OUT" ,0x14C, 8, MD(5)}, + {"OOB_TIMER_ENABLE" ,0x14D, 8, MD(5)}, + {"OOB_STATUS" ,0x14E, 8, MD(5)}, + {"HOT_PLUG_DELAY" ,0x150, 8, MD(5)}, + {"RCD_DELAY" ,0x151, 8, MD(5)}, + {"COMSAS_TIMER" ,0x152, 8, MD(5)}, + {"SNTT_DELAY" ,0x153, 8, MD(5)}, + {"SPD_CHNG_DELAY" ,0x154, 8, MD(5)}, + {"SNLT_DELAY" ,0x155, 8, MD(5)}, + {"SNWT_DELAY" ,0x156, 8, MD(5)}, + {"ALIGN_DELAY" ,0x157, 8, MD(5)}, + {"INT_ENABLE_0" ,0x158, 8, MD(5)}, + {"INT_ENABLE_1" ,0x159, 8, MD(5)}, + {"INT_ENABLE_2" ,0x15A, 8, MD(5)}, + {"INT_ENABLE_3" ,0x15B, 8, MD(5)}, + {"OOB_TEST_REG" ,0x15C, 8, MD(5)}, + {"PHY_CONTROL_0" ,0x160, 8, MD(5)}, + {"PHY_CONTROL_1" ,0x161, 8, MD(5)}, + {"PHY_CONTROL_2" ,0x162, 8, MD(5)}, + {"PHY_CONTROL_3" ,0x163, 8, MD(5)}, + {"PHY_OOB_CAL_TX" ,0x164, 8, MD(5)}, + {"PHY_OOB_CAL_RX" ,0x165, 8, MD(5)}, + {"OOB_PHY_CAL_TX" ,0x166, 8, MD(5)}, + {"OOB_PHY_CAL_RX" ,0x167, 8, MD(5)}, + {"PHY_CONTROL_4" ,0x168, 8, MD(5)}, + {"PHY_TEST" ,0x169, 8, MD(5)}, + {"PHY_PWR_CTL" ,0x16A, 8, MD(5)}, + {"PHY_PWR_DELAY" ,0x16B, 8, MD(5)}, + {"OOB_SM_CON" ,0x16C, 8, MD(5)}, + {"ADDR_TRAP_1" ,0x16D, 8, MD(5)}, + {"ADDR_NEXT_1" ,0x16E, 8, MD(5)}, + {"NEXT_ST_1" ,0x16F, 8, MD(5)}, + {"OOB_SM_STATE" ,0x170, 8, MD(5)}, + {"ADDR_TRAP_2" ,0x171, 8, MD(5)}, + {"ADDR_NEXT_2" ,0x172, 8, MD(5)}, + {"NEXT_ST_2" ,0x173, 8, MD(5)}, + {NULL, 0, 0, 0 } +}; +*/ +#define STR_8BIT " %30s[0x%04x]:0x%02x\n" +#define STR_16BIT " %30s[0x%04x]:0x%04x\n" +#define STR_32BIT " %30s[0x%04x]:0x%08x\n" +#define STR_64BIT " %30s[0x%04x]:0x%llx\n" + +#define PRINT_REG_8bit(_ha, _n, _r) asd_printk(STR_8BIT, #_n, _n, \ + asd_read_reg_byte(_ha, _r)) +#define PRINT_REG_16bit(_ha, _n, _r) asd_printk(STR_16BIT, #_n, _n, \ + asd_read_reg_word(_ha, _r)) +#define PRINT_REG_32bit(_ha, _n, _r) asd_printk(STR_32BIT, #_n, _n, \ + asd_read_reg_dword(_ha, _r)) + +#define PRINT_CREG_8bit(_ha, _n) asd_printk(STR_8BIT, #_n, _n, \ + asd_read_reg_byte(_ha, 
C##_n)) +#define PRINT_CREG_16bit(_ha, _n) asd_printk(STR_16BIT, #_n, _n, \ + asd_read_reg_word(_ha, C##_n)) +#define PRINT_CREG_32bit(_ha, _n) asd_printk(STR_32BIT, #_n, _n, \ + asd_read_reg_dword(_ha, C##_n)) + +#define MSTR_8BIT " Mode:%02d %30s[0x%04x]:0x%02x\n" +#define MSTR_16BIT " Mode:%02d %30s[0x%04x]:0x%04x\n" +#define MSTR_32BIT " Mode:%02d %30s[0x%04x]:0x%08x\n" + +#define PRINT_MREG_8bit(_ha, _m, _n, _r) asd_printk(MSTR_8BIT, _m, #_n, _n, \ + asd_read_reg_byte(_ha, _r)) +#define PRINT_MREG_16bit(_ha, _m, _n, _r) asd_printk(MSTR_16BIT, _m, #_n, _n, \ + asd_read_reg_word(_ha, _r)) +#define PRINT_MREG_32bit(_ha, _m, _n, _r) asd_printk(MSTR_32BIT, _m, #_n, _n, \ + asd_read_reg_dword(_ha, _r)) + +/* can also be used for MD when the register is mode aware already */ +#define PRINT_MIS_byte(_ha, _n) asd_printk(STR_8BIT, #_n,CSEQ_##_n-CMAPPEDSCR,\ + asd_read_reg_byte(_ha, CSEQ_##_n)) +#define PRINT_MIS_word(_ha, _n) asd_printk(STR_16BIT,#_n,CSEQ_##_n-CMAPPEDSCR,\ + asd_read_reg_word(_ha, CSEQ_##_n)) +#define PRINT_MIS_dword(_ha, _n) \ + asd_printk(STR_32BIT,#_n,CSEQ_##_n-CMAPPEDSCR,\ + asd_read_reg_dword(_ha, CSEQ_##_n)) +#define PRINT_MIS_qword(_ha, _n) \ + asd_printk(STR_64BIT, #_n,CSEQ_##_n-CMAPPEDSCR, \ + (unsigned long long)(((u64)asd_read_reg_dword(_ha, CSEQ_##_n)) \ + | (((u64)asd_read_reg_dword(_ha, (CSEQ_##_n)+4))<<32))) + +#define CMDP_REG(_n, _m) (_m*(CSEQ_PAGE_SIZE*2)+CSEQ_##_n) +#define PRINT_CMDP_word(_ha, _n) \ +asd_printk("%20s 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", \ + #_n, \ + asd_read_reg_word(_ha, CMDP_REG(_n, 0)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 1)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 2)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 3)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 4)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 5)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 6)), \ + asd_read_reg_word(_ha, CMDP_REG(_n, 7))) + +#define PRINT_CMDP_byte(_ha, _n) \ +asd_printk("%20s 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", \ + #_n, \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 0)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 1)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 2)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 3)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 4)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 5)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 6)), \ + asd_read_reg_byte(_ha, CMDP_REG(_n, 7))) + +static void asd_dump_cseq_state(struct asd_ha_struct *asd_ha) +{ + int mode; + + asd_printk("CSEQ STATE\n"); + + asd_printk("ARP2 REGISTERS\n"); + + PRINT_CREG_32bit(asd_ha, ARP2CTL); + PRINT_CREG_32bit(asd_ha, ARP2INT); + PRINT_CREG_32bit(asd_ha, ARP2INTEN); + PRINT_CREG_8bit(asd_ha, MODEPTR); + PRINT_CREG_8bit(asd_ha, ALTMODE); + PRINT_CREG_8bit(asd_ha, FLAG); + PRINT_CREG_8bit(asd_ha, ARP2INTCTL); + PRINT_CREG_16bit(asd_ha, STACK); + PRINT_CREG_16bit(asd_ha, PRGMCNT); + PRINT_CREG_16bit(asd_ha, ACCUM); + PRINT_CREG_16bit(asd_ha, SINDEX); + PRINT_CREG_16bit(asd_ha, DINDEX); + PRINT_CREG_8bit(asd_ha, SINDIR); + PRINT_CREG_8bit(asd_ha, DINDIR); + PRINT_CREG_8bit(asd_ha, JUMLDIR); + PRINT_CREG_8bit(asd_ha, ARP2HALTCODE); + PRINT_CREG_16bit(asd_ha, CURRADDR); + PRINT_CREG_16bit(asd_ha, LASTADDR); + PRINT_CREG_16bit(asd_ha, NXTLADDR); + + asd_printk("IOP REGISTERS\n"); + + PRINT_REG_32bit(asd_ha, BISTCTL1, CBISTCTL); + PRINT_CREG_32bit(asd_ha, MAPPEDSCR); + + asd_printk("CIO REGISTERS\n"); + + for (mode = 0; mode < 9; mode++) + PRINT_MREG_16bit(asd_ha, mode, MnSCBPTR, CMnSCBPTR(mode)); + PRINT_MREG_16bit(asd_ha, 15, MnSCBPTR, CMnSCBPTR(15)); + + for 
(mode = 0; mode < 9; mode++) + PRINT_MREG_16bit(asd_ha, mode, MnDDBPTR, CMnDDBPTR(mode)); + PRINT_MREG_16bit(asd_ha, 15, MnDDBPTR, CMnDDBPTR(15)); + + for (mode = 0; mode < 8; mode++) + PRINT_MREG_32bit(asd_ha, mode, MnREQMBX, CMnREQMBX(mode)); + for (mode = 0; mode < 8; mode++) + PRINT_MREG_32bit(asd_ha, mode, MnRSPMBX, CMnRSPMBX(mode)); + for (mode = 0; mode < 8; mode++) + PRINT_MREG_32bit(asd_ha, mode, MnINT, CMnINT(mode)); + for (mode = 0; mode < 8; mode++) + PRINT_MREG_32bit(asd_ha, mode, MnINTEN, CMnINTEN(mode)); + + PRINT_CREG_8bit(asd_ha, SCRATCHPAGE); + for (mode = 0; mode < 8; mode++) + PRINT_MREG_8bit(asd_ha, mode, MnSCRATCHPAGE, + CMnSCRATCHPAGE(mode)); + + PRINT_REG_32bit(asd_ha, CLINKCON, CLINKCON); + PRINT_REG_8bit(asd_ha, CCONMSK, CCONMSK); + PRINT_REG_8bit(asd_ha, CCONEXIST, CCONEXIST); + PRINT_REG_16bit(asd_ha, CCONMODE, CCONMODE); + PRINT_REG_32bit(asd_ha, CTIMERCALC, CTIMERCALC); + PRINT_REG_8bit(asd_ha, CINTDIS, CINTDIS); + + asd_printk("SCRATCH MEMORY\n"); + + asd_printk("MIP 4 >>>>>\n"); + PRINT_MIS_word(asd_ha, Q_EXE_HEAD); + PRINT_MIS_word(asd_ha, Q_EXE_TAIL); + PRINT_MIS_word(asd_ha, Q_DONE_HEAD); + PRINT_MIS_word(asd_ha, Q_DONE_TAIL); + PRINT_MIS_word(asd_ha, Q_SEND_HEAD); + PRINT_MIS_word(asd_ha, Q_SEND_TAIL); + PRINT_MIS_word(asd_ha, Q_DMA2CHIM_HEAD); + PRINT_MIS_word(asd_ha, Q_DMA2CHIM_TAIL); + PRINT_MIS_word(asd_ha, Q_COPY_HEAD); + PRINT_MIS_word(asd_ha, Q_COPY_TAIL); + PRINT_MIS_word(asd_ha, REG0); + PRINT_MIS_word(asd_ha, REG1); + PRINT_MIS_dword(asd_ha, REG2); + PRINT_MIS_byte(asd_ha, LINK_CTL_Q_MAP); + PRINT_MIS_byte(asd_ha, MAX_CSEQ_MODE); + PRINT_MIS_byte(asd_ha, FREE_LIST_HACK_COUNT); + + asd_printk("MIP 5 >>>>\n"); + PRINT_MIS_qword(asd_ha, EST_NEXUS_REQ_QUEUE); + PRINT_MIS_qword(asd_ha, EST_NEXUS_REQ_COUNT); + PRINT_MIS_word(asd_ha, Q_EST_NEXUS_HEAD); + PRINT_MIS_word(asd_ha, Q_EST_NEXUS_TAIL); + PRINT_MIS_word(asd_ha, NEED_EST_NEXUS_SCB); + PRINT_MIS_byte(asd_ha, EST_NEXUS_REQ_HEAD); + PRINT_MIS_byte(asd_ha, EST_NEXUS_REQ_TAIL); + PRINT_MIS_byte(asd_ha, EST_NEXUS_SCB_OFFSET); + + asd_printk("MIP 6 >>>>\n"); + PRINT_MIS_word(asd_ha, INT_ROUT_RET_ADDR0); + PRINT_MIS_word(asd_ha, INT_ROUT_RET_ADDR1); + PRINT_MIS_word(asd_ha, INT_ROUT_SCBPTR); + PRINT_MIS_byte(asd_ha, INT_ROUT_MODE); + PRINT_MIS_byte(asd_ha, ISR_SCRATCH_FLAGS); + PRINT_MIS_word(asd_ha, ISR_SAVE_SINDEX); + PRINT_MIS_word(asd_ha, ISR_SAVE_DINDEX); + PRINT_MIS_word(asd_ha, Q_MONIRTT_HEAD); + PRINT_MIS_word(asd_ha, Q_MONIRTT_TAIL); + PRINT_MIS_byte(asd_ha, FREE_SCB_MASK); + PRINT_MIS_word(asd_ha, BUILTIN_FREE_SCB_HEAD); + PRINT_MIS_word(asd_ha, BUILTIN_FREE_SCB_TAIL); + PRINT_MIS_word(asd_ha, EXTENDED_FREE_SCB_HEAD); + PRINT_MIS_word(asd_ha, EXTENDED_FREE_SCB_TAIL); + + asd_printk("MIP 7 >>>>\n"); + PRINT_MIS_qword(asd_ha, EMPTY_REQ_QUEUE); + PRINT_MIS_qword(asd_ha, EMPTY_REQ_COUNT); + PRINT_MIS_word(asd_ha, Q_EMPTY_HEAD); + PRINT_MIS_word(asd_ha, Q_EMPTY_TAIL); + PRINT_MIS_word(asd_ha, NEED_EMPTY_SCB); + PRINT_MIS_byte(asd_ha, EMPTY_REQ_HEAD); + PRINT_MIS_byte(asd_ha, EMPTY_REQ_TAIL); + PRINT_MIS_byte(asd_ha, EMPTY_SCB_OFFSET); + PRINT_MIS_word(asd_ha, PRIMITIVE_DATA); + PRINT_MIS_dword(asd_ha, TIMEOUT_CONST); + + asd_printk("MDP 0 >>>>\n"); + asd_printk("%-20s %6s %6s %6s %6s %6s %6s %6s %6s\n", + "Mode: ", "0", "1", "2", "3", "4", "5", "6", "7"); + PRINT_CMDP_word(asd_ha, LRM_SAVE_SINDEX); + PRINT_CMDP_word(asd_ha, LRM_SAVE_SCBPTR); + PRINT_CMDP_word(asd_ha, Q_LINK_HEAD); + PRINT_CMDP_word(asd_ha, Q_LINK_TAIL); + PRINT_CMDP_byte(asd_ha, LRM_SAVE_SCRPAGE); + + asd_printk("MDP 0 Mode 8 
>>>>\n"); + PRINT_MIS_word(asd_ha, RET_ADDR); + PRINT_MIS_word(asd_ha, RET_SCBPTR); + PRINT_MIS_word(asd_ha, SAVE_SCBPTR); + PRINT_MIS_word(asd_ha, EMPTY_TRANS_CTX); + PRINT_MIS_word(asd_ha, RESP_LEN); + PRINT_MIS_word(asd_ha, TMF_SCBPTR); + PRINT_MIS_word(asd_ha, GLOBAL_PREV_SCB); + PRINT_MIS_word(asd_ha, GLOBAL_HEAD); + PRINT_MIS_word(asd_ha, CLEAR_LU_HEAD); + PRINT_MIS_byte(asd_ha, TMF_OPCODE); + PRINT_MIS_byte(asd_ha, SCRATCH_FLAGS); + PRINT_MIS_word(asd_ha, HSB_SITE); + PRINT_MIS_word(asd_ha, FIRST_INV_SCB_SITE); + PRINT_MIS_word(asd_ha, FIRST_INV_DDB_SITE); + + asd_printk("MDP 1 Mode 8 >>>>\n"); + PRINT_MIS_qword(asd_ha, LUN_TO_CLEAR); + PRINT_MIS_qword(asd_ha, LUN_TO_CHECK); + + asd_printk("MDP 2 Mode 8 >>>>\n"); + PRINT_MIS_qword(asd_ha, HQ_NEW_POINTER); + PRINT_MIS_qword(asd_ha, HQ_DONE_BASE); + PRINT_MIS_dword(asd_ha, HQ_DONE_POINTER); + PRINT_MIS_byte(asd_ha, HQ_DONE_PASS); +} + +#define PRINT_LREG_8bit(_h, _lseq, _n) \ + asd_printk(STR_8BIT, #_n, _n, asd_read_reg_byte(_h, Lm##_n(_lseq))) +#define PRINT_LREG_16bit(_h, _lseq, _n) \ + asd_printk(STR_16BIT, #_n, _n, asd_read_reg_word(_h, Lm##_n(_lseq))) +#define PRINT_LREG_32bit(_h, _lseq, _n) \ + asd_printk(STR_32BIT, #_n, _n, asd_read_reg_dword(_h, Lm##_n(_lseq))) + +#define PRINT_LMIP_byte(_h, _lseq, _n) \ + asd_printk(STR_8BIT, #_n, LmSEQ_##_n(_lseq)-LmSCRATCH(_lseq), \ + asd_read_reg_byte(_h, LmSEQ_##_n(_lseq))) +#define PRINT_LMIP_word(_h, _lseq, _n) \ + asd_printk(STR_16BIT, #_n, LmSEQ_##_n(_lseq)-LmSCRATCH(_lseq), \ + asd_read_reg_word(_h, LmSEQ_##_n(_lseq))) +#define PRINT_LMIP_dword(_h, _lseq, _n) \ + asd_printk(STR_32BIT, #_n, LmSEQ_##_n(_lseq)-LmSCRATCH(_lseq), \ + asd_read_reg_dword(_h, LmSEQ_##_n(_lseq))) +#define PRINT_LMIP_qword(_h, _lseq, _n) \ + asd_printk(STR_64BIT, #_n, LmSEQ_##_n(_lseq)-LmSCRATCH(_lseq), \ + (unsigned long long)(((unsigned long long) \ + asd_read_reg_dword(_h, LmSEQ_##_n(_lseq))) \ + | (((unsigned long long) \ + asd_read_reg_dword(_h, LmSEQ_##_n(_lseq)+4))<<32))) + +static void asd_print_lseq_cio_reg(struct asd_ha_struct *asd_ha, + u32 lseq_cio_addr, int i) +{ + switch (LSEQmCIOREGS[i].width) { + case 8: + asd_printk("%20s[0x%x]: 0x%02x\n", LSEQmCIOREGS[i].name, + LSEQmCIOREGS[i].offs, + asd_read_reg_byte(asd_ha, lseq_cio_addr + + LSEQmCIOREGS[i].offs)); + + break; + case 16: + asd_printk("%20s[0x%x]: 0x%04x\n", LSEQmCIOREGS[i].name, + LSEQmCIOREGS[i].offs, + asd_read_reg_word(asd_ha, lseq_cio_addr + + LSEQmCIOREGS[i].offs)); + + break; + case 32: + asd_printk("%20s[0x%x]: 0x%08x\n", LSEQmCIOREGS[i].name, + LSEQmCIOREGS[i].offs, + asd_read_reg_dword(asd_ha, lseq_cio_addr + + LSEQmCIOREGS[i].offs)); + break; + } +} + +static void asd_dump_lseq_state(struct asd_ha_struct *asd_ha, int lseq) +{ + u32 moffs; + int mode; + + asd_printk("LSEQ %d STATE\n", lseq); + + asd_printk("LSEQ%d: ARP2 REGISTERS\n", lseq); + PRINT_LREG_32bit(asd_ha, lseq, ARP2CTL); + PRINT_LREG_32bit(asd_ha, lseq, ARP2INT); + PRINT_LREG_32bit(asd_ha, lseq, ARP2INTEN); + PRINT_LREG_8bit(asd_ha, lseq, MODEPTR); + PRINT_LREG_8bit(asd_ha, lseq, ALTMODE); + PRINT_LREG_8bit(asd_ha, lseq, FLAG); + PRINT_LREG_8bit(asd_ha, lseq, ARP2INTCTL); + PRINT_LREG_16bit(asd_ha, lseq, STACK); + PRINT_LREG_16bit(asd_ha, lseq, PRGMCNT); + PRINT_LREG_16bit(asd_ha, lseq, ACCUM); + PRINT_LREG_16bit(asd_ha, lseq, SINDEX); + PRINT_LREG_16bit(asd_ha, lseq, DINDEX); + PRINT_LREG_8bit(asd_ha, lseq, SINDIR); + PRINT_LREG_8bit(asd_ha, lseq, DINDIR); + PRINT_LREG_8bit(asd_ha, lseq, JUMLDIR); + PRINT_LREG_8bit(asd_ha, lseq, ARP2HALTCODE); + 
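+	/* Each PRINT_LREG_*() / PRINT_LMIP_*() call expands to a single
+	 * asd_printk(): the stringized register name (#_n), its offset (for
+	 * the LMIP variants, relative to the LSEQ scratch base) and its
+	 * current value, so every dump section reads as a name/offset/value
+	 * table. */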
PRINT_LREG_16bit(asd_ha, lseq, CURRADDR); + PRINT_LREG_16bit(asd_ha, lseq, LASTADDR); + PRINT_LREG_16bit(asd_ha, lseq, NXTLADDR); + + asd_printk("LSEQ%d: IOP REGISTERS\n", lseq); + + PRINT_LREG_32bit(asd_ha, lseq, MODECTL); + PRINT_LREG_32bit(asd_ha, lseq, DBGMODE); + PRINT_LREG_32bit(asd_ha, lseq, CONTROL); + PRINT_REG_32bit(asd_ha, BISTCTL0, LmBISTCTL0(lseq)); + PRINT_REG_32bit(asd_ha, BISTCTL1, LmBISTCTL1(lseq)); + + asd_printk("LSEQ%d: CIO REGISTERS\n", lseq); + asd_printk("Mode common:\n"); + + for (mode = 0; mode < 8; mode++) { + u32 lseq_cio_addr = LmSEQ_PHY_BASE(mode, lseq); + int i; + + for (i = 0; LSEQmCIOREGS[i].name; i++) + if (LSEQmCIOREGS[i].mode == MODE_COMMON) + asd_print_lseq_cio_reg(asd_ha,lseq_cio_addr,i); + } + + asd_printk("Mode unique:\n"); + for (mode = 0; mode < 8; mode++) { + u32 lseq_cio_addr = LmSEQ_PHY_BASE(mode, lseq); + int i; + + asd_printk("Mode %d\n", mode); + for (i = 0; LSEQmCIOREGS[i].name; i++) { + if (!(LSEQmCIOREGS[i].mode & (1 << mode))) + continue; + asd_print_lseq_cio_reg(asd_ha, lseq_cio_addr, i); + } + } + + asd_printk("SCRATCH MEMORY\n"); + + asd_printk("LSEQ%d MIP 0 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, Q_TGTXFR_HEAD); + PRINT_LMIP_word(asd_ha, lseq, Q_TGTXFR_TAIL); + PRINT_LMIP_byte(asd_ha, lseq, LINK_NUMBER); + PRINT_LMIP_byte(asd_ha, lseq, SCRATCH_FLAGS); + PRINT_LMIP_qword(asd_ha, lseq, CONNECTION_STATE); + PRINT_LMIP_word(asd_ha, lseq, CONCTL); + PRINT_LMIP_byte(asd_ha, lseq, CONSTAT); + PRINT_LMIP_byte(asd_ha, lseq, CONNECTION_MODES); + PRINT_LMIP_word(asd_ha, lseq, REG1_ISR); + PRINT_LMIP_word(asd_ha, lseq, REG2_ISR); + PRINT_LMIP_word(asd_ha, lseq, REG3_ISR); + PRINT_LMIP_qword(asd_ha, lseq,REG0_ISR); + + asd_printk("LSEQ%d MIP 1 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, EST_NEXUS_SCBPTR0); + PRINT_LMIP_word(asd_ha, lseq, EST_NEXUS_SCBPTR1); + PRINT_LMIP_word(asd_ha, lseq, EST_NEXUS_SCBPTR2); + PRINT_LMIP_word(asd_ha, lseq, EST_NEXUS_SCBPTR3); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_OPCODE0); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_OPCODE1); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_OPCODE2); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_OPCODE3); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_HEAD); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_SCB_TAIL); + PRINT_LMIP_byte(asd_ha, lseq, EST_NEXUS_BUF_AVAIL); + PRINT_LMIP_dword(asd_ha, lseq, TIMEOUT_CONST); + PRINT_LMIP_word(asd_ha, lseq, ISR_SAVE_SINDEX); + PRINT_LMIP_word(asd_ha, lseq, ISR_SAVE_DINDEX); + + asd_printk("LSEQ%d MIP 2 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, EMPTY_SCB_PTR0); + PRINT_LMIP_word(asd_ha, lseq, EMPTY_SCB_PTR1); + PRINT_LMIP_word(asd_ha, lseq, EMPTY_SCB_PTR2); + PRINT_LMIP_word(asd_ha, lseq, EMPTY_SCB_PTR3); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_OPCD0); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_OPCD1); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_OPCD2); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_OPCD3); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_HEAD); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_SCB_TAIL); + PRINT_LMIP_byte(asd_ha, lseq, EMPTY_BUFS_AVAIL); + + asd_printk("LSEQ%d MIP 3 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, DEV_PRES_TMR_TOUT_CONST); + PRINT_LMIP_dword(asd_ha, lseq, SATA_INTERLOCK_TIMEOUT); + PRINT_LMIP_dword(asd_ha, lseq, SRST_ASSERT_TIMEOUT); + PRINT_LMIP_dword(asd_ha, lseq, RCV_FIS_TIMEOUT); + PRINT_LMIP_dword(asd_ha, lseq, ONE_MILLISEC_TIMEOUT); + PRINT_LMIP_dword(asd_ha, lseq, TEN_MS_COMINIT_TIMEOUT); + PRINT_LMIP_dword(asd_ha, lseq, SMP_RCV_TIMEOUT); + + for (mode = 0; mode < 3; mode++) { + 
asd_printk("LSEQ%d MDP 0 MODE %d >>>>\n", lseq, mode); + moffs = mode * LSEQ_MODE_SCRATCH_SIZE; + + asd_printk(STR_16BIT, "RET_ADDR", 0, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR(lseq) + + moffs)); + asd_printk(STR_16BIT, "REG0_MODE", 2, + asd_read_reg_word(asd_ha, LmSEQ_REG0_MODE(lseq) + + moffs)); + asd_printk(STR_16BIT, "MODE_FLAGS", 4, + asd_read_reg_word(asd_ha, LmSEQ_MODE_FLAGS(lseq) + + moffs)); + asd_printk(STR_16BIT, "RET_ADDR2", 0x6, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR2(lseq) + + moffs)); + asd_printk(STR_16BIT, "RET_ADDR1", 0x8, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR1(lseq) + + moffs)); + asd_printk(STR_8BIT, "OPCODE_TO_CSEQ", 0xB, + asd_read_reg_byte(asd_ha, LmSEQ_OPCODE_TO_CSEQ(lseq) + + moffs)); + asd_printk(STR_16BIT, "DATA_TO_CSEQ", 0xC, + asd_read_reg_word(asd_ha, LmSEQ_DATA_TO_CSEQ(lseq) + + moffs)); + } + + asd_printk("LSEQ%d MDP 0 MODE 5 >>>>\n", lseq); + moffs = LSEQ_MODE5_PAGE0_OFFSET; + asd_printk(STR_16BIT, "RET_ADDR", 0, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR(lseq) + moffs)); + asd_printk(STR_16BIT, "REG0_MODE", 2, + asd_read_reg_word(asd_ha, LmSEQ_REG0_MODE(lseq) + moffs)); + asd_printk(STR_16BIT, "MODE_FLAGS", 4, + asd_read_reg_word(asd_ha, LmSEQ_MODE_FLAGS(lseq) + moffs)); + asd_printk(STR_16BIT, "RET_ADDR2", 0x6, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR2(lseq) + moffs)); + asd_printk(STR_16BIT, "RET_ADDR1", 0x8, + asd_read_reg_word(asd_ha, LmSEQ_RET_ADDR1(lseq) + moffs)); + asd_printk(STR_8BIT, "OPCODE_TO_CSEQ", 0xB, + asd_read_reg_byte(asd_ha, LmSEQ_OPCODE_TO_CSEQ(lseq) + moffs)); + asd_printk(STR_16BIT, "DATA_TO_CSEQ", 0xC, + asd_read_reg_word(asd_ha, LmSEQ_DATA_TO_CSEQ(lseq) + moffs)); + + asd_printk("LSEQ%d MDP 0 MODE 0 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, FIRST_INV_DDB_SITE); + PRINT_LMIP_word(asd_ha, lseq, EMPTY_TRANS_CTX); + PRINT_LMIP_word(asd_ha, lseq, RESP_LEN); + PRINT_LMIP_word(asd_ha, lseq, FIRST_INV_SCB_SITE); + PRINT_LMIP_dword(asd_ha, lseq, INTEN_SAVE); + PRINT_LMIP_byte(asd_ha, lseq, LINK_RST_FRM_LEN); + PRINT_LMIP_byte(asd_ha, lseq, LINK_RST_PROTOCOL); + PRINT_LMIP_byte(asd_ha, lseq, RESP_STATUS); + PRINT_LMIP_byte(asd_ha, lseq, LAST_LOADED_SGE); + PRINT_LMIP_byte(asd_ha, lseq, SAVE_SCBPTR); + + asd_printk("LSEQ%d MDP 0 MODE 1 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, Q_XMIT_HEAD); + PRINT_LMIP_word(asd_ha, lseq, M1_EMPTY_TRANS_CTX); + PRINT_LMIP_word(asd_ha, lseq, INI_CONN_TAG); + PRINT_LMIP_byte(asd_ha, lseq, FAILED_OPEN_STATUS); + PRINT_LMIP_byte(asd_ha, lseq, XMIT_REQUEST_TYPE); + PRINT_LMIP_byte(asd_ha, lseq, M1_RESP_STATUS); + PRINT_LMIP_byte(asd_ha, lseq, M1_LAST_LOADED_SGE); + PRINT_LMIP_word(asd_ha, lseq, M1_SAVE_SCBPTR); + + asd_printk("LSEQ%d MDP 0 MODE 2 >>>>\n", lseq); + PRINT_LMIP_word(asd_ha, lseq, PORT_COUNTER); + PRINT_LMIP_word(asd_ha, lseq, PM_TABLE_PTR); + PRINT_LMIP_word(asd_ha, lseq, SATA_INTERLOCK_TMR_SAVE); + PRINT_LMIP_word(asd_ha, lseq, IP_BITL); + PRINT_LMIP_word(asd_ha, lseq, COPY_SMP_CONN_TAG); + PRINT_LMIP_byte(asd_ha, lseq, P0M2_OFFS1AH); + + asd_printk("LSEQ%d MDP 0 MODE 4/5 >>>>\n", lseq); + PRINT_LMIP_byte(asd_ha, lseq, SAVED_OOB_STATUS); + PRINT_LMIP_byte(asd_ha, lseq, SAVED_OOB_MODE); + PRINT_LMIP_word(asd_ha, lseq, Q_LINK_HEAD); + PRINT_LMIP_byte(asd_ha, lseq, LINK_RST_ERR); + PRINT_LMIP_byte(asd_ha, lseq, SAVED_OOB_SIGNALS); + PRINT_LMIP_byte(asd_ha, lseq, SAS_RESET_MODE); + PRINT_LMIP_byte(asd_ha, lseq, LINK_RESET_RETRY_COUNT); + PRINT_LMIP_byte(asd_ha, lseq, NUM_LINK_RESET_RETRIES); + PRINT_LMIP_word(asd_ha, lseq, OOB_INT_ENABLES); + PRINT_LMIP_word(asd_ha, lseq, 
NOTIFY_TIMER_TIMEOUT); + PRINT_LMIP_word(asd_ha, lseq, NOTIFY_TIMER_DOWN_COUNT); + + asd_printk("LSEQ%d MDP 1 MODE 0 >>>>\n", lseq); + PRINT_LMIP_qword(asd_ha, lseq, SG_LIST_PTR_ADDR0); + PRINT_LMIP_qword(asd_ha, lseq, SG_LIST_PTR_ADDR1); + + asd_printk("LSEQ%d MDP 1 MODE 1 >>>>\n", lseq); + PRINT_LMIP_qword(asd_ha, lseq, M1_SG_LIST_PTR_ADDR0); + PRINT_LMIP_qword(asd_ha, lseq, M1_SG_LIST_PTR_ADDR1); + + asd_printk("LSEQ%d MDP 1 MODE 2 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, INVALID_DWORD_COUNT); + PRINT_LMIP_dword(asd_ha, lseq, DISPARITY_ERROR_COUNT); + PRINT_LMIP_dword(asd_ha, lseq, LOSS_OF_SYNC_COUNT); + + asd_printk("LSEQ%d MDP 1 MODE 4/5 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, FRAME_TYPE_MASK); + PRINT_LMIP_dword(asd_ha, lseq, HASHED_SRC_ADDR_MASK_PRINT); + PRINT_LMIP_byte(asd_ha, lseq, NUM_FILL_BYTES_MASK); + PRINT_LMIP_word(asd_ha, lseq, TAG_MASK); + PRINT_LMIP_word(asd_ha, lseq, TARGET_PORT_XFER_TAG); + PRINT_LMIP_dword(asd_ha, lseq, DATA_OFFSET); + + asd_printk("LSEQ%d MDP 2 MODE 0 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, SMP_RCV_TIMER_TERM_TS); + PRINT_LMIP_byte(asd_ha, lseq, DEVICE_BITS); + PRINT_LMIP_word(asd_ha, lseq, SDB_DDB); + PRINT_LMIP_word(asd_ha, lseq, SDB_NUM_TAGS); + PRINT_LMIP_word(asd_ha, lseq, SDB_CURR_TAG); + + asd_printk("LSEQ%d MDP 2 MODE 1 >>>>\n", lseq); + PRINT_LMIP_qword(asd_ha, lseq, TX_ID_ADDR_FRAME); + PRINT_LMIP_dword(asd_ha, lseq, OPEN_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, SRST_AS_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, LAST_LOADED_SG_EL); + + asd_printk("LSEQ%d MDP 2 MODE 2 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, CLOSE_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, BREAK_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, DWS_RESET_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, SATA_INTERLOCK_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, MCTL_TIMER_TERM_TS); + + asd_printk("LSEQ%d MDP 2 MODE 4/5 >>>>\n", lseq); + PRINT_LMIP_dword(asd_ha, lseq, COMINIT_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, RCV_ID_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, RCV_FIS_TIMER_TERM_TS); + PRINT_LMIP_dword(asd_ha, lseq, DEV_PRES_TIMER_TERM_TS); +} + +/** + * asd_dump_ddb_site -- dump a CSEQ DDB site + * @asd_ha: pointer to host adapter structure + * @site_no: site number of interest + */ +void asd_dump_target_ddb(struct asd_ha_struct *asd_ha, u16 site_no) +{ + if (site_no >= asd_ha->hw_prof.max_ddbs) + return; + +#define DDB_FIELDB(__name) \ + asd_ddbsite_read_byte(asd_ha, site_no, \ + offsetof(struct asd_ddb_ssp_smp_target_port, __name)) +#define DDB2_FIELDB(__name) \ + asd_ddbsite_read_byte(asd_ha, site_no, \ + offsetof(struct asd_ddb_stp_sata_target_port, __name)) +#define DDB_FIELDW(__name) \ + asd_ddbsite_read_word(asd_ha, site_no, \ + offsetof(struct asd_ddb_ssp_smp_target_port, __name)) + +#define DDB_FIELDD(__name) \ + asd_ddbsite_read_dword(asd_ha, site_no, \ + offsetof(struct asd_ddb_ssp_smp_target_port, __name)) + + asd_printk("DDB: 0x%02x\n", site_no); + asd_printk("conn_type: 0x%02x\n", DDB_FIELDB(conn_type)); + asd_printk("conn_rate: 0x%02x\n", DDB_FIELDB(conn_rate)); + asd_printk("init_conn_tag: 0x%04x\n", be16_to_cpu(DDB_FIELDW(init_conn_tag))); + asd_printk("send_queue_head: 0x%04x\n", be16_to_cpu(DDB_FIELDW(send_queue_head))); + asd_printk("sq_suspended: 0x%02x\n", DDB_FIELDB(sq_suspended)); + asd_printk("DDB Type: 0x%02x\n", DDB_FIELDB(ddb_type)); + asd_printk("AWT Default: 0x%04x\n", DDB_FIELDW(awt_def)); + asd_printk("compat_features: 0x%02x\n", DDB_FIELDB(compat_features)); + 
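+	/* The DDB_FIELDB/W/D helpers overlay struct
+	 * asd_ddb_ssp_smp_target_port on the hardware DDB site: offsetof()
+	 * turns each struct member into the byte offset handed to the
+	 * asd_ddbsite_read_*() accessors, so the struct definition is the
+	 * single source of truth for the site layout. */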
asd_printk("Pathway Blocked Count: 0x%02x\n", + DDB_FIELDB(pathway_blocked_count)); + asd_printk("arb_wait_time: 0x%04x\n", DDB_FIELDW(arb_wait_time)); + asd_printk("more_compat_features: 0x%08x\n", + DDB_FIELDD(more_compat_features)); + asd_printk("Conn Mask: 0x%02x\n", DDB_FIELDB(conn_mask)); + asd_printk("flags: 0x%02x\n", DDB_FIELDB(flags)); + asd_printk("flags2: 0x%02x\n", DDB2_FIELDB(flags2)); + asd_printk("ExecQ Tail: 0x%04x\n",DDB_FIELDW(exec_queue_tail)); + asd_printk("SendQ Tail: 0x%04x\n",DDB_FIELDW(send_queue_tail)); + asd_printk("Active Task Count: 0x%04x\n", + DDB_FIELDW(active_task_count)); + asd_printk("ITNL Reason: 0x%02x\n", DDB_FIELDB(itnl_reason)); + asd_printk("ITNL Timeout Const: 0x%04x\n", DDB_FIELDW(itnl_timeout)); + asd_printk("ITNL timestamp: 0x%08x\n", DDB_FIELDD(itnl_timestamp)); +} + +void asd_dump_ddb_0(struct asd_ha_struct *asd_ha) +{ +#define DDB0_FIELDB(__name) \ + asd_ddbsite_read_byte(asd_ha, 0, \ + offsetof(struct asd_ddb_seq_shared, __name)) +#define DDB0_FIELDW(__name) \ + asd_ddbsite_read_word(asd_ha, 0, \ + offsetof(struct asd_ddb_seq_shared, __name)) + +#define DDB0_FIELDD(__name) \ + asd_ddbsite_read_dword(asd_ha,0 , \ + offsetof(struct asd_ddb_seq_shared, __name)) + +#define DDB0_FIELDA(__name, _o) \ + asd_ddbsite_read_byte(asd_ha, 0, \ + offsetof(struct asd_ddb_seq_shared, __name)+_o) + + + asd_printk("DDB: 0\n"); + asd_printk("q_free_ddb_head:%04x\n", DDB0_FIELDW(q_free_ddb_head)); + asd_printk("q_free_ddb_tail:%04x\n", DDB0_FIELDW(q_free_ddb_tail)); + asd_printk("q_free_ddb_cnt:%04x\n", DDB0_FIELDW(q_free_ddb_cnt)); + asd_printk("q_used_ddb_head:%04x\n", DDB0_FIELDW(q_used_ddb_head)); + asd_printk("q_used_ddb_tail:%04x\n", DDB0_FIELDW(q_used_ddb_tail)); + asd_printk("shared_mem_lock:%04x\n", DDB0_FIELDW(shared_mem_lock)); + asd_printk("smp_conn_tag:%04x\n", DDB0_FIELDW(smp_conn_tag)); + asd_printk("est_nexus_buf_cnt:%04x\n", DDB0_FIELDW(est_nexus_buf_cnt)); + asd_printk("est_nexus_buf_thresh:%04x\n", + DDB0_FIELDW(est_nexus_buf_thresh)); + asd_printk("conn_not_active:%02x\n", DDB0_FIELDB(conn_not_active)); + asd_printk("phy_is_up:%02x\n", DDB0_FIELDB(phy_is_up)); + asd_printk("port_map_by_links:%02x %02x %02x %02x " + "%02x %02x %02x %02x\n", + DDB0_FIELDA(port_map_by_links, 0), + DDB0_FIELDA(port_map_by_links, 1), + DDB0_FIELDA(port_map_by_links, 2), + DDB0_FIELDA(port_map_by_links, 3), + DDB0_FIELDA(port_map_by_links, 4), + DDB0_FIELDA(port_map_by_links, 5), + DDB0_FIELDA(port_map_by_links, 6), + DDB0_FIELDA(port_map_by_links, 7)); +} + +static void asd_dump_scb_site(struct asd_ha_struct *asd_ha, u16 site_no) +{ + +#define SCB_FIELDB(__name) \ + asd_scbsite_read_byte(asd_ha, site_no, sizeof(struct scb_header) \ + + offsetof(struct initiate_ssp_task, __name)) +#define SCB_FIELDW(__name) \ + asd_scbsite_read_word(asd_ha, site_no, sizeof(struct scb_header) \ + + offsetof(struct initiate_ssp_task, __name)) +#define SCB_FIELDD(__name) \ + asd_scbsite_read_dword(asd_ha, site_no, sizeof(struct scb_header) \ + + offsetof(struct initiate_ssp_task, __name)) + + asd_printk("Total Xfer Len: 0x%08x.\n", SCB_FIELDD(total_xfer_len)); + asd_printk("Frame Type: 0x%02x.\n", SCB_FIELDB(ssp_frame.frame_type)); + asd_printk("Tag: 0x%04x.\n", SCB_FIELDW(ssp_frame.tag)); + asd_printk("Target Port Xfer Tag: 0x%04x.\n", + SCB_FIELDW(ssp_frame.tptt)); + asd_printk("Data Offset: 0x%08x.\n", SCB_FIELDW(ssp_frame.data_offs)); + asd_printk("Retry Count: 0x%02x.\n", SCB_FIELDB(retry_count)); +} + +/** + * asd_dump_scb_sites -- dump currently used CSEQ SCB sites + * 
@asd_ha: pointer to host adapter struct + */ +void asd_dump_scb_sites(struct asd_ha_struct *asd_ha) +{ + u16 site_no; + + for (site_no = 0; site_no < asd_ha->hw_prof.max_scbs; site_no++) { + u8 opcode; + + if (!SCB_SITE_VALID(site_no)) + continue; + + /* We are only interested in SCB sites currently used. + */ + opcode = asd_scbsite_read_byte(asd_ha, site_no, + offsetof(struct scb_header, + opcode)); + if (opcode == 0xFF) + continue; + + asd_printk("\nSCB: 0x%x\n", site_no); + asd_dump_scb_site(asd_ha, site_no); + } +} + +/** + * ads_dump_seq_state -- dump CSEQ and LSEQ states + * @asd_ha: pointer to host adapter structure + * @lseq_mask: mask of LSEQs of interest + */ +void asd_dump_seq_state(struct asd_ha_struct *asd_ha, u8 lseq_mask) +{ + int lseq; + + asd_dump_cseq_state(asd_ha); + + if (lseq_mask != 0) + for_each_sequencer(lseq_mask, lseq_mask, lseq) + asd_dump_lseq_state(asd_ha, lseq); +} + +void asd_dump_frame_rcvd(struct asd_phy *phy, + struct done_list_struct *dl) +{ + unsigned long flags; + int i; + + switch ((dl->status_block[1] & 0x70) >> 3) { + case SAS_PROTO_STP: + ASD_DPRINTK("STP proto device-to-host FIS:\n"); + break; + default: + case SAS_PROTO_SSP: + ASD_DPRINTK("SAS proto IDENTIFY:\n"); + break; + } + spin_lock_irqsave(&phy->sas_phy.frame_rcvd_lock, flags); + for (i = 0; i < phy->sas_phy.frame_rcvd_size; i+=4) + ASD_DPRINTK("%02x: %02x %02x %02x %02x\n", + i, + phy->frame_rcvd[i], + phy->frame_rcvd[i+1], + phy->frame_rcvd[i+2], + phy->frame_rcvd[i+3]); + spin_unlock_irqrestore(&phy->sas_phy.frame_rcvd_lock, flags); +} + +static inline void asd_dump_scb(struct asd_ascb *ascb, int ind) +{ + asd_printk("scb%d: vaddr: 0x%p, dma_handle: 0x%llx, next: 0x%llx, " + "index:%d, opcode:0x%02x\n", + ind, ascb->dma_scb.vaddr, + (unsigned long long)ascb->dma_scb.dma_handle, + (unsigned long long) + le64_to_cpu(ascb->scb->header.next_scb), + le16_to_cpu(ascb->scb->header.index), + ascb->scb->header.opcode); +} + +void asd_dump_scb_list(struct asd_ascb *ascb, int num) +{ + int i = 0; + + asd_printk("dumping %d scbs:\n", num); + + asd_dump_scb(ascb, i++); + --num; + + if (num > 0 && !list_empty(&ascb->list)) { + struct list_head *el; + + list_for_each(el, &ascb->list) { + struct asd_ascb *s = list_entry(el, struct asd_ascb, + list); + asd_dump_scb(s, i++); + if (--num <= 0) + break; + } + } +} + +#endif /* ASD_DEBUG */ diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dump.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dump.h --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_dump.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_dump.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,52 @@ +/* + * Aic94xx SAS/SATA driver dump header file. + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef _AIC94XX_DUMP_H_
+#define _AIC94XX_DUMP_H_
+
+#ifdef ASD_DEBUG
+
+void asd_dump_ddb_0(struct asd_ha_struct *asd_ha);
+void asd_dump_target_ddb(struct asd_ha_struct *asd_ha, u16 site_no);
+void asd_dump_scb_sites(struct asd_ha_struct *asd_ha);
+void asd_dump_seq_state(struct asd_ha_struct *asd_ha, u8 lseq_mask);
+void asd_dump_frame_rcvd(struct asd_phy *phy,
+ struct done_list_struct *dl);
+void asd_dump_scb_list(struct asd_ascb *ascb, int num);
+#else /* ASD_DEBUG */
+
+static inline void asd_dump_ddb_0(struct asd_ha_struct *asd_ha) { }
+static inline void asd_dump_target_ddb(struct asd_ha_struct *asd_ha,
+ u16 site_no) { }
+static inline void asd_dump_scb_sites(struct asd_ha_struct *asd_ha) { }
+static inline void asd_dump_seq_state(struct asd_ha_struct *asd_ha,
+ u8 lseq_mask) { }
+static inline void asd_dump_frame_rcvd(struct asd_phy *phy,
+ struct done_list_struct *dl) { }
+static inline void asd_dump_scb_list(struct asd_ascb *ascb, int num) { }
+#endif /* ASD_DEBUG */
+
+#endif /* _AIC94XX_DUMP_H_ */
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_hwi.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_hwi.c
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_hwi.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_hwi.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,1376 @@
+/*
+ * Aic94xx SAS/SATA driver hardware interface.
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+
+#include "aic94xx.h"
+#include "aic94xx_reg.h"
+#include "aic94xx_hwi.h"
+#include "aic94xx_seq.h"
+#include "aic94xx_dump.h"
+
+u32 MBAR0_SWB_SIZE;
+
+/* ---------- Initialization ---------- */
+
+static void asd_get_user_sas_addr(struct asd_ha_struct *asd_ha)
+{
+ extern char sas_addr_str[];
+ /* If the user has specified a WWN it overrides other settings
+ */
+ if (sas_addr_str[0] != '\0')
+ asd_destringify_sas_addr(asd_ha->hw_prof.sas_addr,
+ sas_addr_str);
+ else if (asd_ha->hw_prof.sas_addr[0] != 0)
+ asd_stringify_sas_addr(sas_addr_str, asd_ha->hw_prof.sas_addr);
+}
+
+static void asd_propagate_sas_addr(struct asd_ha_struct *asd_ha)
+{
+ int i;
+
+ for (i = 0; i < ASD_MAX_PHYS; i++) {
+ if (asd_ha->hw_prof.phy_desc[i].sas_addr[0] == 0)
+ continue;
+ /* Give this phy the adapter-wide SAS address; phys
+ * with no address set are skipped above.
+ */ + ASD_DPRINTK("setting phy%d addr to %llx\n", i, + SAS_ADDR(asd_ha->hw_prof.sas_addr)); + memcpy(asd_ha->hw_prof.phy_desc[i].sas_addr, + asd_ha->hw_prof.sas_addr, SAS_ADDR_SIZE); + } +} + +/* ---------- PHY initialization ---------- */ + +static void asd_init_phy_identify(struct asd_phy *phy) +{ + phy->identify_frame = phy->id_frm_tok->vaddr; + + memset(phy->identify_frame, 0, sizeof(*phy->identify_frame)); + + phy->identify_frame->dev_type = SAS_END_DEV; + if (phy->sas_phy.role & PHY_ROLE_INITIATOR) + phy->identify_frame->initiator_bits = phy->sas_phy.iproto; + if (phy->sas_phy.role & PHY_ROLE_TARGET) + phy->identify_frame->target_bits = phy->sas_phy.tproto; + memcpy(phy->identify_frame->sas_addr, phy->phy_desc->sas_addr, + SAS_ADDR_SIZE); + phy->identify_frame->phy_id = phy->sas_phy.id; +} + +static int asd_init_phy(struct asd_phy *phy) +{ + struct asd_ha_struct *asd_ha = phy->sas_phy.ha->lldd_ha; + struct asd_sas_phy *sas_phy = &phy->sas_phy; + + sas_phy->enabled = 1; + sas_phy->class = SAS; + sas_phy->iproto = SAS_PROTO_ALL; + sas_phy->tproto = 0; + sas_phy->type = PHY_TYPE_PHYSICAL; + sas_phy->role = PHY_ROLE_INITIATOR; + sas_phy->oob_mode = OOB_NOT_CONNECTED; + sas_phy->linkrate = PHY_LINKRATE_NONE; + + phy->id_frm_tok = asd_alloc_coherent(asd_ha, + sizeof(*phy->identify_frame), + GFP_KERNEL); + if (!phy->id_frm_tok) { + asd_printk("no mem for IDENTIFY for phy%d\n", sas_phy->id); + return -ENOMEM; + } else + asd_init_phy_identify(phy); + + memset(phy->frame_rcvd, 0, sizeof(phy->frame_rcvd)); + + return 0; +} + +static int asd_init_phys(struct asd_ha_struct *asd_ha) +{ + u8 i; + u8 phy_mask = asd_ha->hw_prof.enabled_phys; + + for (i = 0; i < ASD_MAX_PHYS; i++) { + struct asd_phy *phy = &asd_ha->phys[i]; + + phy->phy_desc = &asd_ha->hw_prof.phy_desc[i]; + + phy->sas_phy.enabled = 0; + phy->sas_phy.id = i; + phy->sas_phy.sas_addr = &phy->phy_desc->sas_addr[0]; + phy->sas_phy.frame_rcvd = &phy->frame_rcvd[0]; + phy->sas_phy.ha = &asd_ha->sas_ha; + phy->sas_phy.lldd_phy = phy; + } + + /* Now enable and initialize only the enabled phys. */ + for_each_phy(phy_mask, phy_mask, i) { + int err = asd_init_phy(&asd_ha->phys[i]); + if (err) + return err; + } + + return 0; +} + +/* ---------- Sliding windows ---------- */ + +static int asd_init_sw(struct asd_ha_struct *asd_ha) +{ + struct pci_dev *pcidev = asd_ha->pcidev; + int err; + u32 v; + + /* Unlock MBARs */ + err = pci_read_config_dword(pcidev, PCI_CONF_MBAR_KEY, &v); + if (err) { + asd_printk("couldn't access conf. space of %s\n", + pci_name(pcidev)); + goto Err; + } + if (v) + err = pci_write_config_dword(pcidev, PCI_CONF_MBAR_KEY, v); + if (err) { + asd_printk("couldn't write to MBAR_KEY of %s\n", + pci_name(pcidev)); + goto Err; + } + + /* Set sliding windows A, B and C to point to proper internal + * memory regions. 
+ */ + pci_write_config_dword(pcidev, PCI_CONF_MBAR0_SWA, REG_BASE_ADDR); + pci_write_config_dword(pcidev, PCI_CONF_MBAR0_SWB, + REG_BASE_ADDR_CSEQCIO); + pci_write_config_dword(pcidev, PCI_CONF_MBAR0_SWC, REG_BASE_ADDR_EXSI); + asd_ha->io_handle[0].swa_base = REG_BASE_ADDR; + asd_ha->io_handle[0].swb_base = REG_BASE_ADDR_CSEQCIO; + asd_ha->io_handle[0].swc_base = REG_BASE_ADDR_EXSI; + MBAR0_SWB_SIZE = asd_ha->io_handle[0].len - 0x80; + if (!asd_ha->iospace) { + /* MBAR1 will point to OCM (On Chip Memory) */ + pci_write_config_dword(pcidev, PCI_CONF_MBAR1, OCM_BASE_ADDR); + asd_ha->io_handle[1].swa_base = OCM_BASE_ADDR; + } + spin_lock_init(&asd_ha->iolock); +Err: + return err; +} + +/* ---------- SCB initialization ---------- */ + +/** + * asd_init_scbs - manually allocate the first SCB. + * @asd_ha: pointer to host adapter structure + * + * This allocates the very first SCB which would be sent to the + * sequencer for execution. Its bus address is written to + * CSEQ_Q_NEW_POINTER, mode page 2, mode 8. Since the bus address of + * the _next_ scb to be DMA-ed to the host adapter is read from the last + * SCB DMA-ed to the host adapter, we have to always stay one step + * ahead of the sequencer and keep one SCB already allocated. + */ +static int asd_init_scbs(struct asd_ha_struct *asd_ha) +{ + struct asd_seq_data *seq = &asd_ha->seq; + int bitmap_bytes; + + /* allocate the index array and bitmap */ + asd_ha->seq.tc_index_bitmap_bits = asd_ha->hw_prof.max_scbs; + asd_ha->seq.tc_index_array = kzalloc(asd_ha->seq.tc_index_bitmap_bits* + sizeof(void *), GFP_KERNEL); + if (!asd_ha->seq.tc_index_array) + return -ENOMEM; + + bitmap_bytes = (asd_ha->seq.tc_index_bitmap_bits+7)/8; + bitmap_bytes = BITS_TO_LONGS(bitmap_bytes*8)*sizeof(unsigned long); + asd_ha->seq.tc_index_bitmap = kzalloc(bitmap_bytes, GFP_KERNEL); + if (!asd_ha->seq.tc_index_bitmap) + return -ENOMEM; + + spin_lock_init(&seq->tc_index_lock); + + seq->next_scb.size = sizeof(struct scb); + seq->next_scb.vaddr = dma_pool_alloc(asd_ha->scb_pool, GFP_KERNEL, + &seq->next_scb.dma_handle); + if (!seq->next_scb.vaddr) { + kfree(asd_ha->seq.tc_index_bitmap); + kfree(asd_ha->seq.tc_index_array); + asd_ha->seq.tc_index_bitmap = NULL; + asd_ha->seq.tc_index_array = NULL; + return -ENOMEM; + } + + seq->pending = 0; + spin_lock_init(&seq->pend_q_lock); + INIT_LIST_HEAD(&seq->pend_q); + + return 0; +} + +static inline void asd_get_max_scb_ddb(struct asd_ha_struct *asd_ha) +{ + asd_ha->hw_prof.max_scbs = asd_get_cmdctx_size(asd_ha)/ASD_SCB_SIZE; + asd_ha->hw_prof.max_ddbs = asd_get_devctx_size(asd_ha)/ASD_DDB_SIZE; + ASD_DPRINTK("max_scbs:%d, max_ddbs:%d\n", + asd_ha->hw_prof.max_scbs, + asd_ha->hw_prof.max_ddbs); +} + +/* ---------- Done List initialization ---------- */ + +static void asd_dl_tasklet_handler(unsigned long); + +static int asd_init_dl(struct asd_ha_struct *asd_ha) +{ + asd_ha->seq.actual_dl + = asd_alloc_coherent(asd_ha, + ASD_DL_SIZE * sizeof(struct done_list_struct), + GFP_KERNEL); + if (!asd_ha->seq.actual_dl) + return -ENOMEM; + asd_ha->seq.dl = asd_ha->seq.actual_dl->vaddr; + asd_ha->seq.dl_toggle = ASD_DEF_DL_TOGGLE; + asd_ha->seq.dl_next = 0; + tasklet_init(&asd_ha->seq.dl_tasklet, asd_dl_tasklet_handler, + (unsigned long) asd_ha); + + return 0; +} + +/* ---------- EDB and ESCB init ---------- */ + +static int asd_alloc_edbs(struct asd_ha_struct *asd_ha, unsigned int gfp_flags) +{ + struct asd_seq_data *seq = &asd_ha->seq; + int i; + + seq->edb_arr = kmalloc(seq->num_edbs*sizeof(*seq->edb_arr), gfp_flags); + if 
(!seq->edb_arr)
+ return -ENOMEM;
+
+ for (i = 0; i < seq->num_edbs; i++) {
+ seq->edb_arr[i] = asd_alloc_coherent(asd_ha, ASD_EDB_SIZE,
+ gfp_flags);
+ if (!seq->edb_arr[i])
+ goto Err_unroll;
+ memset(seq->edb_arr[i]->vaddr, 0, ASD_EDB_SIZE);
+ }
+
+ ASD_DPRINTK("num_edbs:%d\n", seq->num_edbs);
+
+ return 0;
+
+Err_unroll:
+ for (i--; i >= 0; i--)
+ asd_free_coherent(asd_ha, seq->edb_arr[i]);
+ kfree(seq->edb_arr);
+ seq->edb_arr = NULL;
+
+ return -ENOMEM;
+}
+
+static int asd_alloc_escbs(struct asd_ha_struct *asd_ha,
+ unsigned int gfp_flags)
+{
+ struct asd_seq_data *seq = &asd_ha->seq;
+ struct asd_ascb *escb;
+ int i, escbs;
+
+ seq->escb_arr = kmalloc(seq->num_escbs*sizeof(*seq->escb_arr),
+ gfp_flags);
+ if (!seq->escb_arr)
+ return -ENOMEM;
+
+ escbs = seq->num_escbs;
+ escb = asd_ascb_alloc_list(asd_ha, &escbs, gfp_flags);
+ if (!escb) {
+ asd_printk("couldn't allocate list of escbs\n");
+ goto Err;
+ }
+ seq->num_escbs -= escbs; /* subtract what was not allocated */
+ ASD_DPRINTK("num_escbs:%d\n", seq->num_escbs);
+
+ for (i = 0; i < seq->num_escbs; i++, escb = list_entry(escb->list.next,
+ struct asd_ascb,
+ list)) {
+ seq->escb_arr[i] = escb;
+ escb->scb->header.opcode = EMPTY_SCB;
+ }
+
+ return 0;
+Err:
+ kfree(seq->escb_arr);
+ seq->escb_arr = NULL;
+ return -ENOMEM;
+
+}
+
+static void asd_assign_edbs2escbs(struct asd_ha_struct *asd_ha)
+{
+ struct asd_seq_data *seq = &asd_ha->seq;
+ int i, k, z = 0;
+
+ for (i = 0; i < seq->num_escbs; i++) {
+ struct asd_ascb *ascb = seq->escb_arr[i];
+ struct empty_scb *escb = &ascb->scb->escb;
+
+ ascb->edb_index = z;
+
+ escb->num_valid = ASD_EDBS_PER_SCB;
+
+ for (k = 0; k < ASD_EDBS_PER_SCB; k++) {
+ struct sg_el *eb = &escb->eb[k];
+ struct asd_dma_tok *edb = seq->edb_arr[z++];
+
+ memset(eb, 0, sizeof(*eb));
+ eb->bus_addr = cpu_to_le64(((u64) edb->dma_handle));
+ eb->size = cpu_to_le32(((u32) edb->size));
+ }
+ }
+}
+
+/**
+ * asd_init_escbs -- allocate and initialize empty scbs
+ * @asd_ha: pointer to host adapter structure
+ *
+ * An empty SCB carries ASD_EDBS_PER_SCB (7) empty data buffers.
+ * They transport sense data, etc.
+ */
+static int asd_init_escbs(struct asd_ha_struct *asd_ha)
+{
+ struct asd_seq_data *seq = &asd_ha->seq;
+ int err = 0;
+
+ /* Allocate two empty data buffers (edb) per sequencer. */
+ int edbs = 2*(1+asd_ha->hw_prof.num_phys);
+
+ seq->num_escbs = (edbs+ASD_EDBS_PER_SCB-1)/ASD_EDBS_PER_SCB;
+ seq->num_edbs = seq->num_escbs * ASD_EDBS_PER_SCB;
+
+ err = asd_alloc_edbs(asd_ha, GFP_KERNEL);
+ if (err) {
+ asd_printk("couldn't allocate edbs\n");
+ return err;
+ }
+
+ err = asd_alloc_escbs(asd_ha, GFP_KERNEL);
+ if (err) {
+ asd_printk("couldn't allocate escbs\n");
+ return err;
+ }
+
+ asd_assign_edbs2escbs(asd_ha);
+ /* In order to ensure that normal SCBs do not overfill sequencer
+ * memory and leave no space for escbs (halting condition),
+ * we increment pending here by the number of escbs. However,
+ * escbs are never pending.
+ */
+ seq->pending = seq->num_escbs;
+ seq->can_queue = 1 + (asd_ha->hw_prof.max_scbs - seq->pending)/2;
+
+ return 0;
+}
+
+/* ---------- HW initialization ---------- */
+
+/**
+ * asd_chip_hardrst -- hard reset the chip
+ * @asd_ha: pointer to host adapter structure
+ *
+ * This takes 16 cycles and is synchronous to CFCLK, which runs
+ * at 200 MHz, so this should take at most 80 nanoseconds.
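+ *
+ * As a quick check on that figure: at 200 MHz one CFCLK cycle lasts
+ * 1/(200 MHz) = 5 ns, so 16 cycles come to 16 * 5 ns = 80 ns. The
+ * polling loop below is far more generous, allowing up to 100
+ * udelay(1) iterations for HARDRSTDET to appear.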
+ */
+int asd_chip_hardrst(struct asd_ha_struct *asd_ha)
+{
+ int i;
+ int count = 100;
+ u32 reg;
+
+ for (i = 0; i < 4; i++) {
+ asd_write_reg_dword(asd_ha, COMBIST, HARDRST);
+ }
+
+ do {
+ udelay(1);
+ reg = asd_read_reg_dword(asd_ha, CHIMINT);
+ if (reg & HARDRSTDET) {
+ asd_write_reg_dword(asd_ha, CHIMINT,
+ HARDRSTDET|PORRSTDET);
+ return 0;
+ }
+ } while (--count > 0);
+
+ return -ENODEV;
+}
+
+/**
+ * asd_init_chip -- initialize the chip
+ * @asd_ha: pointer to host adapter structure
+ *
+ * Hard resets the chip, disables HA interrupts, downloads the sequencer
+ * microcode and starts the sequencers. The caller has to explicitly
+ * enable HA interrupts with asd_enable_ints(asd_ha).
+ */
+static int asd_init_chip(struct asd_ha_struct *asd_ha)
+{
+ int err;
+
+ err = asd_chip_hardrst(asd_ha);
+ if (err) {
+ asd_printk("couldn't hard reset %s\n",
+ pci_name(asd_ha->pcidev));
+ goto out;
+ }
+
+ asd_disable_ints(asd_ha);
+
+ err = asd_init_seqs(asd_ha);
+ if (err) {
+ asd_printk("couldn't init seqs for %s\n",
+ pci_name(asd_ha->pcidev));
+ goto out;
+ }
+
+ err = asd_start_seqs(asd_ha);
+ if (err) {
+ asd_printk("couldn't start seqs for %s\n",
+ pci_name(asd_ha->pcidev));
+ goto out;
+ }
+out:
+ return err;
+}
+
+#define MAX_DEVS ((OCM_MAX_SIZE) / (ASD_DDB_SIZE))
+
+static int max_devs = 0;
+module_param_named(max_devs, max_devs, int, S_IRUGO);
+MODULE_PARM_DESC(max_devs, "\n"
+ "\tMaximum number of SAS devices to support (not LUs).\n"
+ "\tDefault: 2176, Maximum: 65663.\n");
+
+static int max_cmnds = 0;
+module_param_named(max_cmnds, max_cmnds, int, S_IRUGO);
+MODULE_PARM_DESC(max_cmnds, "\n"
+ "\tMaximum number of commands queuable.\n"
+ "\tDefault: 512, Maximum: 66047.\n");
+
+static void asd_extend_devctx_ocm(struct asd_ha_struct *asd_ha)
+{
+ unsigned long dma_addr = OCM_BASE_ADDR;
+ u32 d;
+
+ dma_addr -= asd_ha->hw_prof.max_ddbs * ASD_DDB_SIZE;
+ asd_write_reg_addr(asd_ha, DEVCTXBASE, (dma_addr_t) dma_addr);
+ d = asd_read_reg_dword(asd_ha, CTXDOMAIN);
+ d |= 4;
+ asd_write_reg_dword(asd_ha, CTXDOMAIN, d);
+ asd_ha->hw_prof.max_ddbs += MAX_DEVS;
+}
+
+static int asd_extend_devctx(struct asd_ha_struct *asd_ha)
+{
+ dma_addr_t dma_handle;
+ unsigned long dma_addr;
+ u32 d;
+ int size;
+
+ asd_extend_devctx_ocm(asd_ha);
+
+ asd_ha->hw_prof.ddb_ext = NULL;
+ if (max_devs <= asd_ha->hw_prof.max_ddbs || max_devs > 0xFFFF) {
+ max_devs = asd_ha->hw_prof.max_ddbs;
+ return 0;
+ }
+
+ size = (max_devs - asd_ha->hw_prof.max_ddbs + 1) * ASD_DDB_SIZE;
+
+ asd_ha->hw_prof.ddb_ext = asd_alloc_coherent(asd_ha, size, GFP_KERNEL);
+ if (!asd_ha->hw_prof.ddb_ext) {
+ asd_printk("couldn't allocate memory for %d devices\n",
+ max_devs);
+ max_devs = asd_ha->hw_prof.max_ddbs;
+ return -ENOMEM;
+ }
+ dma_handle = asd_ha->hw_prof.ddb_ext->dma_handle;
+ dma_addr = ALIGN((unsigned long) dma_handle, ASD_DDB_SIZE);
+ dma_addr -= asd_ha->hw_prof.max_ddbs * ASD_DDB_SIZE;
+ dma_handle = (dma_addr_t) dma_addr;
+ asd_write_reg_addr(asd_ha, DEVCTXBASE, dma_handle);
+ d = asd_read_reg_dword(asd_ha, CTXDOMAIN);
+ d &= ~4;
+ asd_write_reg_dword(asd_ha, CTXDOMAIN, d);
+
+ asd_ha->hw_prof.max_ddbs = max_devs;
+
+ return 0;
+}
+
+static int asd_extend_cmdctx(struct asd_ha_struct *asd_ha)
+{
+ dma_addr_t dma_handle;
+ unsigned long dma_addr;
+ u32 d;
+ int size;
+
+ asd_ha->hw_prof.scb_ext = NULL;
+ if (max_cmnds <= asd_ha->hw_prof.max_scbs || max_cmnds > 0xFFFF) {
+ max_cmnds = asd_ha->hw_prof.max_scbs;
+ return 0;
+ }
+
+ size = (max_cmnds - asd_ha->hw_prof.max_scbs + 1) * ASD_SCB_SIZE;
+
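+ /* A worked instance of the sizing above, assuming for illustration
+ * that ASD_SCB_SIZE is 128 bytes: growing from 512 internal SCBs to
+ * max_cmnds = 1024 allocates (1024 - 512 + 1) * 128 bytes. The one
+ * extra SCB absorbs the slack introduced when ALIGN() rounds the DMA
+ * handle up to an SCB-size boundary; the aligned address is then
+ * biased down by max_scbs * ASD_SCB_SIZE so that SCB index 512 is
+ * the first to land inside the extension buffer.
+ */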
+ asd_ha->hw_prof.scb_ext = asd_alloc_coherent(asd_ha, size, GFP_KERNEL);
+ if (!asd_ha->hw_prof.scb_ext) {
+ asd_printk("couldn't allocate memory for %d commands\n",
+ max_cmnds);
+ max_cmnds = asd_ha->hw_prof.max_scbs;
+ return -ENOMEM;
+ }
+ dma_handle = asd_ha->hw_prof.scb_ext->dma_handle;
+ dma_addr = ALIGN((unsigned long) dma_handle, ASD_SCB_SIZE);
+ dma_addr -= asd_ha->hw_prof.max_scbs * ASD_SCB_SIZE;
+ dma_handle = (dma_addr_t) dma_addr;
+ asd_write_reg_addr(asd_ha, CMDCTXBASE, dma_handle);
+ d = asd_read_reg_dword(asd_ha, CTXDOMAIN);
+ d &= ~1;
+ asd_write_reg_dword(asd_ha, CTXDOMAIN, d);
+
+ asd_ha->hw_prof.max_scbs = max_cmnds;
+
+ return 0;
+}
+
+/**
+ * asd_init_ctxmem -- initialize context memory
+ * @asd_ha: pointer to host adapter structure
+ *
+ * This function sets the maximum number of SCBs and
+ * DDBs which can be used by the sequencer. This is normally
+ * 512 and 128 respectively. If support for more SCBs or more DDBs
+ * is required then CMDCTXBASE, DEVCTXBASE and CTXDOMAIN are
+ * initialized here to extend context memory to point to host memory,
+ * thus allowing unlimited support for SCBs and DDBs -- only limited
+ * by host memory.
+ */
+static int asd_init_ctxmem(struct asd_ha_struct *asd_ha)
+{
+ int bitmap_bytes;
+
+ asd_get_max_scb_ddb(asd_ha);
+ asd_extend_devctx(asd_ha);
+ asd_extend_cmdctx(asd_ha);
+
+ /* The kernel wants bitmaps to be unsigned long sized. */
+ bitmap_bytes = (asd_ha->hw_prof.max_ddbs+7)/8;
+ bitmap_bytes = BITS_TO_LONGS(bitmap_bytes*8)*sizeof(unsigned long);
+ asd_ha->hw_prof.ddb_bitmap = kzalloc(bitmap_bytes, GFP_KERNEL);
+ if (!asd_ha->hw_prof.ddb_bitmap)
+ return -ENOMEM;
+ spin_lock_init(&asd_ha->hw_prof.ddb_lock);
+
+ return 0;
+}
+
+int asd_init_hw(struct asd_ha_struct *asd_ha)
+{
+ int err;
+ u32 v;
+
+ err = asd_init_sw(asd_ha);
+ if (err)
+ return err;
+
+ err = pci_read_config_dword(asd_ha->pcidev, PCIC_HSTPCIX_CNTRL, &v);
+ if (err) {
+ asd_printk("couldn't read PCIC_HSTPCIX_CNTRL of %s\n",
+ pci_name(asd_ha->pcidev));
+ return err;
+ }
+ err = pci_write_config_dword(asd_ha->pcidev, PCIC_HSTPCIX_CNTRL,
+ v | SC_TMR_DIS);
+ if (err) {
+ asd_printk("couldn't disable split completion timer of %s\n",
+ pci_name(asd_ha->pcidev));
+ return err;
+ }
+
+ err = asd_read_ocm(asd_ha);
+ if (err) {
+ asd_printk("couldn't read ocm(%d)\n", err);
+ /* While suspicious, it is not an error that we
+ * couldn't read the OCM. */
+ }
+
+ err = asd_read_flash(asd_ha);
+ if (err) {
+ asd_printk("couldn't read flash(%d)\n", err);
+ /* While suspicious, it is not an error that we
+ * couldn't read FLASH memory.
+ */ + } + + asd_init_ctxmem(asd_ha); + + asd_get_user_sas_addr(asd_ha); + if (!asd_ha->hw_prof.sas_addr[0]) { + asd_printk("No SAS Address provided for %s\n", + pci_name(asd_ha->pcidev)); + err = -ENODEV; + goto Out; + } + + asd_propagate_sas_addr(asd_ha); + + err = asd_init_phys(asd_ha); + if (err) { + asd_printk("couldn't initialize phys for %s\n", + pci_name(asd_ha->pcidev)); + goto Out; + } + + err = asd_init_scbs(asd_ha); + if (err) { + asd_printk("couldn't initialize scbs for %s\n", + pci_name(asd_ha->pcidev)); + goto Out; + } + + err = asd_init_dl(asd_ha); + if (err) { + asd_printk("couldn't initialize the done list:%d\n", + err); + goto Out; + } + + err = asd_init_escbs(asd_ha); + if (err) { + asd_printk("couldn't initialize escbs\n"); + goto Out; + } + + err = asd_init_chip(asd_ha); + if (err) { + asd_printk("couldn't init the chip\n"); + goto Out; + } +Out: + return err; +} + +/* ---------- Chip reset ---------- */ + +/** + * asd_chip_reset -- reset the host adapter, etc + * @asd_ha: pointer to host adapter structure of interest + * + * Called from the ISR. Hard reset the chip. Let everything + * timeout. This should be no different than hot-unplugging the + * host adapter. Once everything times out we'll init the chip with + * a call to asd_init_chip() and enable interrupts with asd_enable_ints(). + * XXX finish. + */ +static void asd_chip_reset(struct asd_ha_struct *asd_ha) +{ + struct sas_ha_struct *sas_ha = &asd_ha->sas_ha; + + ASD_DPRINTK("chip reset for %s\n", pci_name(asd_ha->pcidev)); + asd_chip_hardrst(asd_ha); + sas_ha->notify_ha_event(sas_ha, HAE_RESET); +} + +/* ---------- Done List Routines ---------- */ + +static void asd_dl_tasklet_handler(unsigned long data) +{ + struct asd_ha_struct *asd_ha = (struct asd_ha_struct *) data; + struct asd_seq_data *seq = &asd_ha->seq; + unsigned long flags; + + while (1) { + struct done_list_struct *dl = &seq->dl[seq->dl_next]; + struct asd_ascb *ascb; + + if ((dl->toggle & DL_TOGGLE_MASK) != seq->dl_toggle) + break; + + /* find the aSCB */ + spin_lock_irqsave(&seq->tc_index_lock, flags); + ascb = asd_tc_index_find(seq, (int)le16_to_cpu(dl->index)); + spin_unlock_irqrestore(&seq->tc_index_lock, flags); + if (unlikely(!ascb)) { + ASD_DPRINTK("BUG:sequencer:dl:no ascb?!\n"); + goto next_1; + } else if (ascb->scb->header.opcode == EMPTY_SCB) { + goto out; + } else if (!ascb->uldd_timer && !del_timer(&ascb->timer)) { + goto next_1; + } + spin_lock_irqsave(&seq->pend_q_lock, flags); + list_del_init(&ascb->list); + seq->pending--; + spin_unlock_irqrestore(&seq->pend_q_lock, flags); + out: + ascb->tasklet_complete(ascb, dl); + + next_1: + seq->dl_next = (seq->dl_next + 1) & (ASD_DL_SIZE-1); + if (!seq->dl_next) + seq->dl_toggle ^= DL_TOGGLE_MASK; + } +} + +/* ---------- Interrupt Service Routines ---------- */ + +/** + * asd_process_donelist_isr -- schedule processing of done list entries + * @asd_ha: pointer to host adapter structure + */ +static inline void asd_process_donelist_isr(struct asd_ha_struct *asd_ha) +{ + tasklet_schedule(&asd_ha->seq.dl_tasklet); +} + +/** + * asd_com_sas_isr -- process device communication interrupt (COMINT) + * @asd_ha: pointer to host adapter structure + */ +static inline void asd_com_sas_isr(struct asd_ha_struct *asd_ha) +{ + u32 comstat = asd_read_reg_dword(asd_ha, COMSTAT); + + /* clear COMSTAT int */ + asd_write_reg_dword(asd_ha, COMSTAT, 0xFFFFFFFF); + + if (comstat & CSBUFPERR) { + asd_printk("%s: command/status buffer dma parity error\n", + pci_name(asd_ha->pcidev)); + } else if (comstat & CSERR) { 
+ int i; + u32 dmaerr = asd_read_reg_dword(asd_ha, DMAERR); + dmaerr &= 0xFF; + asd_printk("%s: command/status dma error, DMAERR: 0x%02x, " + "CSDMAADR: 0x%04x, CSDMAADR+4: 0x%04x\n", + pci_name(asd_ha->pcidev), + dmaerr, + asd_read_reg_dword(asd_ha, CSDMAADR), + asd_read_reg_dword(asd_ha, CSDMAADR+4)); + asd_printk("CSBUFFER:\n"); + for (i = 0; i < 8; i++) { + asd_printk("%08x %08x %08x %08x\n", + asd_read_reg_dword(asd_ha, CSBUFFER), + asd_read_reg_dword(asd_ha, CSBUFFER+4), + asd_read_reg_dword(asd_ha, CSBUFFER+8), + asd_read_reg_dword(asd_ha, CSBUFFER+12)); + } + asd_dump_seq_state(asd_ha, 0); + } else if (comstat & OVLYERR) { + u32 dmaerr = asd_read_reg_dword(asd_ha, DMAERR); + dmaerr = (dmaerr >> 8) & 0xFF; + asd_printk("%s: overlay dma error:0x%x\n", + pci_name(asd_ha->pcidev), + dmaerr); + } + asd_chip_reset(asd_ha); +} + +static inline void asd_arp2_err(struct asd_ha_struct *asd_ha, u32 dchstatus) +{ + static const char *halt_code[256] = { + "UNEXPECTED_INTERRUPT0", + "UNEXPECTED_INTERRUPT1", + "UNEXPECTED_INTERRUPT2", + "UNEXPECTED_INTERRUPT3", + "UNEXPECTED_INTERRUPT4", + "UNEXPECTED_INTERRUPT5", + "UNEXPECTED_INTERRUPT6", + "UNEXPECTED_INTERRUPT7", + "UNEXPECTED_INTERRUPT8", + "UNEXPECTED_INTERRUPT9", + "UNEXPECTED_INTERRUPT10", + [11 ... 19] = "unknown[11,19]", + "NO_FREE_SCB_AVAILABLE", + "INVALID_SCB_OPCODE", + "INVALID_MBX_OPCODE", + "INVALID_ATA_STATE", + "ATA_QUEUE_FULL", + "ATA_TAG_TABLE_FAULT", + "ATA_TAG_MASK_FAULT", + "BAD_LINK_QUEUE_STATE", + "DMA2CHIM_QUEUE_ERROR", + "EMPTY_SCB_LIST_FULL", + "unknown[30]", + "IN_USE_SCB_ON_FREE_LIST", + "BAD_OPEN_WAIT_STATE", + "INVALID_STP_AFFILIATION", + "unknown[34]", + "EXEC_QUEUE_ERROR", + "TOO_MANY_EMPTIES_NEEDED", + "EMPTY_REQ_QUEUE_ERROR", + "Q_MONIRTT_MGMT_ERROR", + "TARGET_MODE_FLOW_ERROR", + "DEVICE_QUEUE_NOT_FOUND", + "START_IRTT_TIMER_ERROR", + "ABORT_TASK_ILLEGAL_REQ", + [43 ... 
255] = "unknown[43,255]" + }; + + if (dchstatus & CSEQINT) { + u32 arp2int = asd_read_reg_dword(asd_ha, CARP2INT); + + if (arp2int & (ARP2WAITTO|ARP2ILLOPC|ARP2PERR|ARP2CIOPERR)) { + asd_printk("%s: CSEQ arp2int:0x%x\n", + pci_name(asd_ha->pcidev), + arp2int); + } else if (arp2int & ARP2HALTC) + asd_printk("%s: CSEQ halted: %s\n", + pci_name(asd_ha->pcidev), + halt_code[(arp2int>>16)&0xFF]); + else + asd_printk("%s: CARP2INT:0x%x\n", + pci_name(asd_ha->pcidev), + arp2int); + } + if (dchstatus & LSEQINT_MASK) { + int lseq; + u8 lseq_mask = dchstatus & LSEQINT_MASK; + + for_each_sequencer(lseq_mask, lseq_mask, lseq) { + u32 arp2int = asd_read_reg_dword(asd_ha, + LmARP2INT(lseq)); + if (arp2int & (ARP2WAITTO | ARP2ILLOPC | ARP2PERR + | ARP2CIOPERR)) { + asd_printk("%s: LSEQ%d arp2int:0x%x\n", + pci_name(asd_ha->pcidev), + lseq, arp2int); + /* XXX we should only do lseq reset */ + } else if (arp2int & ARP2HALTC) + asd_printk("%s: LSEQ%d halted: %s\n", + pci_name(asd_ha->pcidev), + lseq,halt_code[(arp2int>>16)&0xFF]); + else + asd_printk("%s: LSEQ%d ARP2INT:0x%x\n", + pci_name(asd_ha->pcidev), lseq, + arp2int); + } + } + asd_chip_reset(asd_ha); +} + +/** + * asd_dch_sas_isr -- process device channel interrupt (DEVINT) + * @asd_ha: pointer to host adapter structure + */ +static inline void asd_dch_sas_isr(struct asd_ha_struct *asd_ha) +{ + u32 dchstatus = asd_read_reg_dword(asd_ha, DCHSTATUS); + + if (dchstatus & CFIFTOERR) { + asd_printk("%s: CFIFTOERR\n", pci_name(asd_ha->pcidev)); + asd_chip_reset(asd_ha); + } else + asd_arp2_err(asd_ha, dchstatus); +} + +/** + * ads_rbi_exsi_isr -- process external system interface interrupt (INITERR) + * @asd_ha: pointer to host adapter structure + */ +static inline void asd_rbi_exsi_isr(struct asd_ha_struct *asd_ha) +{ + u32 stat0r = asd_read_reg_dword(asd_ha, ASISTAT0R); + + if (!(stat0r & ASIERR)) { + asd_printk("hmm, EXSI interrupted but no error?\n"); + return; + } + + if (stat0r & ASIFMTERR) { + asd_printk("ASI SEEPROM format error for %s\n", + pci_name(asd_ha->pcidev)); + } else if (stat0r & ASISEECHKERR) { + u32 stat1r = asd_read_reg_dword(asd_ha, ASISTAT1R); + asd_printk("ASI SEEPROM checksum 0x%x error for %s\n", + stat1r & CHECKSUM_MASK, + pci_name(asd_ha->pcidev)); + } else { + u32 statr = asd_read_reg_dword(asd_ha, ASIERRSTATR); + + if (!(statr & CPI2ASIMSTERR_MASK)) { + ASD_DPRINTK("hmm, ASIERR?\n"); + return; + } else { + u32 addr = asd_read_reg_dword(asd_ha, ASIERRADDR); + u32 data = asd_read_reg_dword(asd_ha, ASIERRDATAR); + + asd_printk("%s: CPI2 xfer err: addr: 0x%x, wdata: 0x%x, " + "count: 0x%x, byteen: 0x%x, targerr: 0x%x " + "master id: 0x%x, master err: 0x%x\n", + pci_name(asd_ha->pcidev), + addr, data, + (statr & CPI2ASIBYTECNT_MASK) >> 16, + (statr & CPI2ASIBYTEEN_MASK) >> 12, + (statr & CPI2ASITARGERR_MASK) >> 8, + (statr & CPI2ASITARGMID_MASK) >> 4, + (statr & CPI2ASIMSTERR_MASK)); + } + } + asd_chip_reset(asd_ha); +} + +/** + * asd_hst_pcix_isr -- process host interface interrupts + * @asd_ha: pointer to host adapter structure + * + * Asserted on PCIX errors: target abort, etc. 
+ */ +static inline void asd_hst_pcix_isr(struct asd_ha_struct *asd_ha) +{ + u16 status; + u32 pcix_status; + u32 ecc_status; + + pci_read_config_word(asd_ha->pcidev, PCI_STATUS, &status); + pci_read_config_dword(asd_ha->pcidev, PCIX_STATUS, &pcix_status); + pci_read_config_dword(asd_ha->pcidev, ECC_CTRL_STAT, &ecc_status); + + if (status & PCI_STATUS_DETECTED_PARITY) + asd_printk("parity error for %s\n", pci_name(asd_ha->pcidev)); + else if (status & PCI_STATUS_REC_MASTER_ABORT) + asd_printk("master abort for %s\n", pci_name(asd_ha->pcidev)); + else if (status & PCI_STATUS_REC_TARGET_ABORT) + asd_printk("target abort for %s\n", pci_name(asd_ha->pcidev)); + else if (status & PCI_STATUS_PARITY) + asd_printk("data parity for %s\n", pci_name(asd_ha->pcidev)); + else if (pcix_status & RCV_SCE) { + asd_printk("received split completion error for %s\n", + pci_name(asd_ha->pcidev)); + pci_write_config_dword(asd_ha->pcidev,PCIX_STATUS,pcix_status); + /* XXX: Abort task? */ + return; + } else if (pcix_status & UNEXP_SC) { + asd_printk("unexpected split completion for %s\n", + pci_name(asd_ha->pcidev)); + pci_write_config_dword(asd_ha->pcidev,PCIX_STATUS,pcix_status); + /* ignore */ + return; + } else if (pcix_status & SC_DISCARD) + asd_printk("split completion discarded for %s\n", + pci_name(asd_ha->pcidev)); + else if (ecc_status & UNCOR_ECCERR) + asd_printk("uncorrectable ECC error for %s\n", + pci_name(asd_ha->pcidev)); + asd_chip_reset(asd_ha); +} + +/** + * asd_hw_isr -- host adapter interrupt service routine + * @irq: ignored + * @dev_id: pointer to host adapter structure + * @regs: ignored + * + * The ISR processes done list entries and level 3 error handling. + */ +irqreturn_t asd_hw_isr(int irq, void *dev_id, struct pt_regs *regs) +{ + struct asd_ha_struct *asd_ha = dev_id; + u32 chimint = asd_read_reg_dword(asd_ha, CHIMINT); + + if (!chimint) + return IRQ_NONE; + + asd_write_reg_dword(asd_ha, CHIMINT, chimint); + (void) asd_read_reg_dword(asd_ha, CHIMINT); + + if (chimint & DLAVAIL) + asd_process_donelist_isr(asd_ha); + if (chimint & COMINT) + asd_com_sas_isr(asd_ha); + if (chimint & DEVINT) + asd_dch_sas_isr(asd_ha); + if (chimint & INITERR) + asd_rbi_exsi_isr(asd_ha); + if (chimint & HOSTERR) + asd_hst_pcix_isr(asd_ha); + + return IRQ_HANDLED; +} + +/* ---------- SCB handling ---------- */ + +static inline struct asd_ascb *asd_ascb_alloc(struct asd_ha_struct *asd_ha, + unsigned int gfp_flags) +{ + extern kmem_cache_t *asd_ascb_cache; + struct asd_seq_data *seq = &asd_ha->seq; + struct asd_ascb *ascb; + unsigned long flags; + + ascb = kmem_cache_alloc(asd_ascb_cache, gfp_flags); + + if (ascb) { + memset(ascb, 0, sizeof(*ascb)); + ascb->dma_scb.size = sizeof(struct scb); + ascb->dma_scb.vaddr = dma_pool_alloc(asd_ha->scb_pool, + gfp_flags, + &ascb->dma_scb.dma_handle); + if (!ascb->dma_scb.vaddr) { + kmem_cache_free(asd_ascb_cache, ascb); + return NULL; + } + memset(ascb->dma_scb.vaddr, 0, sizeof(struct scb)); + asd_init_ascb(asd_ha, ascb); + + spin_lock_irqsave(&seq->tc_index_lock, flags); + ascb->tc_index = asd_tc_index_get(seq, ascb); + spin_unlock_irqrestore(&seq->tc_index_lock, flags); + if (ascb->tc_index == -1) + goto undo; + + ascb->scb->header.index = cpu_to_le16((u16)ascb->tc_index); + } + + return ascb; +undo: + dma_pool_free(asd_ha->scb_pool, ascb->dma_scb.vaddr, + ascb->dma_scb.dma_handle); + kmem_cache_free(asd_ascb_cache, ascb); + ASD_DPRINTK("no index for ascb\n"); + return NULL; +} + +/** + * asd_ascb_alloc_list -- allocate a list of aSCBs + * @asd_ha: pointer to host 
adapter structure
+ * @num: pointer to integer number of aSCBs
+ * @gfp_flags: GFP_ flags.
+ *
+ * This is the only function which is used to allocate aSCBs.
+ * It can allocate one or many. If more than one, then they form
+ * a linked list in two ways: by their list field of the ascb struct
+ * and by the next_scb field of the scb_header.
+ *
+ * Returns NULL if no memory was available, else pointer to a list
+ * of ascbs. When this function returns, @num is the number of
+ * SCBs which could not be allocated, or 0 if all requested SCBs
+ * were allocated.
+ */
+struct asd_ascb *asd_ascb_alloc_list(struct asd_ha_struct
+ *asd_ha, int *num,
+ unsigned int gfp_flags)
+{
+ struct asd_ascb *first = NULL;
+
+ for ( ; *num > 0; --*num) {
+ struct asd_ascb *ascb = asd_ascb_alloc(asd_ha, gfp_flags);
+
+ if (!ascb)
+ break;
+ else if (!first)
+ first = ascb;
+ else {
+ struct asd_ascb *last = list_entry(first->list.prev,
+ struct asd_ascb,
+ list);
+ list_add_tail(&ascb->list, &first->list);
+ last->scb->header.next_scb =
+ cpu_to_le64(((u64)ascb->dma_scb.dma_handle));
+ }
+ }
+
+ return first;
+}
+
+/**
+ * asd_swap_head_scb -- swap the head scb
+ * @asd_ha: pointer to host adapter structure
+ * @ascb: pointer to the head of an ascb list
+ *
+ * The sequencer knows the DMA address of the next SCB to be DMAed to
+ * the host adapter, from initialization or from the last list DMAed.
+ * seq->next_scb keeps the address of this SCB. The sequencer will
+ * DMA to the host adapter this list of SCBs. But the head (first
+ * element) of this list is not known to the sequencer. Here we swap
+ * the head of the list with the known SCB (memcpy()).
+ * Only one memcpy() is required per list so it is in our interest
+ * to keep the list of SCB as long as possible so that the ratio
+ * of number of memcpy calls to the number of SCB DMA-ed is as small
+ * as possible.
+ *
+ * LOCKING: called with the pending list lock held.
+ */
+static inline void asd_swap_head_scb(struct asd_ha_struct *asd_ha,
+ struct asd_ascb *ascb)
+{
+ struct asd_seq_data *seq = &asd_ha->seq;
+ struct asd_ascb *last = list_entry(ascb->list.prev,
+ struct asd_ascb,
+ list);
+ struct asd_dma_tok t = ascb->dma_scb;
+
+ memcpy(seq->next_scb.vaddr, ascb->scb, sizeof(*ascb->scb));
+ ascb->dma_scb = seq->next_scb;
+ ascb->scb = ascb->dma_scb.vaddr;
+ seq->next_scb = t;
+ last->scb->header.next_scb =
+ cpu_to_le64(((u64)seq->next_scb.dma_handle));
+}
+
+/**
+ * asd_start_scb_timers -- (add and) start timers of SCBs
+ * @list: pointer to struct list_head of the scbs
+ *
+ * If an SCB in the @list has no timer function, assign the default
+ * one, then start the timer of the SCB. This function is
+ * intended to be called from asd_post_ascb_list(), just prior to
+ * posting the SCBs to the sequencer.
+ */
+static inline void asd_start_scb_timers(struct list_head *list)
+{
+ struct asd_ascb *ascb;
+ list_for_each_entry(ascb, list, list) {
+ if (!ascb->uldd_timer) {
+ ascb->timer.data = (unsigned long) ascb;
+ ascb->timer.function = asd_ascb_timedout;
+ ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT;
+ add_timer(&ascb->timer);
+ }
+ }
+}
+
+/**
+ * asd_post_ascb_list -- post a list of 1 or more aSCBs to the host adapter
+ * @asd_ha: pointer to a host adapter structure
+ * @ascb: pointer to the first aSCB in the list
+ * @num: number of aSCBs in the list (to be posted)
+ *
+ * See queueing comment in asd_post_escb_list().
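+ *
+ * For example, with hw_prof.max_scbs = 512 and seq.pending = 510,
+ * posting num = 2 ascbs succeeds, while num = 3 zeroes can_queue
+ * and the call fails with -SAS_QUEUE_FULL (case B below).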
+ *
+ * Additional note on queuing: In order to minimize the ratio of memcpy()
+ * to the number of ascbs sent, we try to batch-send as many ascbs as possible
+ * in one go.
+ * Two cases are possible:
+ * A) can_queue >= num,
+ * B) can_queue < num.
+ * Case A: we can send the whole batch at once. Increment "pending"
+ * in the beginning of this function, when it is checked, in order to
+ * eliminate races when this function is called by multiple processes.
+ * Case B: should never happen if the managing layer considers
+ * lldd_queue_size.
+ */
+int asd_post_ascb_list(struct asd_ha_struct *asd_ha, struct asd_ascb *ascb,
+ int num)
+{
+ unsigned long flags;
+ LIST_HEAD(list);
+ int can_queue;
+
+ spin_lock_irqsave(&asd_ha->seq.pend_q_lock, flags);
+ can_queue = asd_ha->hw_prof.max_scbs - asd_ha->seq.pending;
+ if (can_queue >= num)
+ asd_ha->seq.pending += num;
+ else
+ can_queue = 0;
+
+ if (!can_queue) {
+ spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags);
+ asd_printk("%s: scb queue full\n", pci_name(asd_ha->pcidev));
+ return -SAS_QUEUE_FULL;
+ }
+
+ asd_swap_head_scb(asd_ha, ascb);
+
+ __list_add(&list, ascb->list.prev, &ascb->list);
+
+ asd_start_scb_timers(&list);
+
+ asd_ha->seq.scbpro += num;
+ list_splice_init(&list, asd_ha->seq.pend_q.prev);
+ asd_write_reg_dword(asd_ha, SCBPRO, (u32)asd_ha->seq.scbpro);
+ spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags);
+
+ return 0;
+}
+
+/**
+ * asd_post_escb_list -- post a list of 1 or more empty SCBs
+ * @asd_ha: pointer to a host adapter structure
+ * @ascb: pointer to the first empty SCB in the list
+ * @num: number of aSCBs in the list (to be posted)
+ *
+ * This is essentially the same as asd_post_ascb_list, but we do not
+ * increment pending, add those to the pending list or get indexes.
+ * See asd_init_escbs() and asd_init_post_escbs().
+ *
+ * Since sending a list of ascbs is a superset of sending a single
+ * ascb, this function exists to generalize this. More specifically,
+ * when sending a list of those, we want to do only a _single_
+ * memcpy() at swap head, as opposed to for each ascb sent (in the
+ * case of sending them one by one). That is, we want to minimize the
+ * ratio of memcpy() operations to the number of ascbs sent. The same
+ * logic applies to asd_post_ascb_list().
+ */
+int asd_post_escb_list(struct asd_ha_struct *asd_ha, struct asd_ascb *ascb,
+ int num)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&asd_ha->seq.pend_q_lock, flags);
+ asd_swap_head_scb(asd_ha, ascb);
+ asd_ha->seq.scbpro += num;
+ asd_write_reg_dword(asd_ha, SCBPRO, (u32)asd_ha->seq.scbpro);
+ spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags);
+
+ return 0;
+}
+
+/* ---------- LED ---------- */
+
+/**
+ * asd_turn_led -- turn on/off an LED
+ * @asd_ha: pointer to host adapter structure
+ * @phy_id: the PHY id whose LED we want to manipulate
+ * @op: 1 to turn on, 0 to turn off
+ */
+void asd_turn_led(struct asd_ha_struct *asd_ha, int phy_id, int op)
+{
+ if (phy_id < ASD_MAX_PHYS) {
+ u32 v = asd_read_reg_dword(asd_ha, LmCONTROL(phy_id));
+ if (op)
+ v |= LEDPOL;
+ else
+ v &= ~LEDPOL;
+ asd_write_reg_dword(asd_ha, LmCONTROL(phy_id), v);
+ }
+}
+
+/**
+ * asd_control_led -- enable/disable an LED on the board
+ * @asd_ha: pointer to host adapter structure
+ * @phy_id: integer, the phy id
+ * @op: integer, 1 to enable, 0 to disable the LED
+ *
+ * First we output enable the LED, then we set the source
+ * to be an external module.
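+ *
+ * For example, asd_control_led(asd_ha, 2, 1) sets bit 2 in both
+ * GPIOOER and GPIOCNFGR, handing LED 2 to the external source;
+ * asd_turn_led() above can then drive it through LEDPOL.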
+ */ +void asd_control_led(struct asd_ha_struct *asd_ha, int phy_id, int op) +{ + if (phy_id < ASD_MAX_PHYS) { + u32 v; + + v = asd_read_reg_dword(asd_ha, GPIOOER); + if (op) + v |= (1 << phy_id); + else + v &= ~(1 << phy_id); + asd_write_reg_dword(asd_ha, GPIOOER, v); + + v = asd_read_reg_dword(asd_ha, GPIOCNFGR); + if (op) + v |= (1 << phy_id); + else + v &= ~(1 << phy_id); + asd_write_reg_dword(asd_ha, GPIOCNFGR, v); + } +} + +/* ---------- PHY enable ---------- */ + +static int asd_enable_phy(struct asd_ha_struct *asd_ha, int phy_id) +{ + struct asd_phy *phy = &asd_ha->phys[phy_id]; + + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, INT_ENABLE_2), 0); + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, HOT_PLUG_DELAY), + HOTPLUG_DELAY_TIMEOUT); + + /* Get defaults from manuf. sector */ + /* XXX we need defaults for those in case MS is broken. */ + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, PHY_CONTROL_0), + phy->phy_desc->phy_control_0); + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, PHY_CONTROL_1), + phy->phy_desc->phy_control_1); + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, PHY_CONTROL_2), + phy->phy_desc->phy_control_2); + asd_write_reg_byte(asd_ha, LmSEQ_OOB_REG(phy_id, PHY_CONTROL_3), + phy->phy_desc->phy_control_3); + + asd_write_reg_dword(asd_ha, LmSEQ_TEN_MS_COMINIT_TIMEOUT(phy_id), + ASD_COMINIT_TIMEOUT); + + asd_write_reg_addr(asd_ha, LmSEQ_TX_ID_ADDR_FRAME(phy_id), + phy->id_frm_tok->dma_handle); + + asd_control_led(asd_ha, phy_id, 1); + + return 0; +} + +int asd_enable_phys(struct asd_ha_struct *asd_ha, const u8 phy_mask) +{ + u8 phy_m; + u8 i; + int num = 0, k; + struct asd_ascb *ascb; + struct asd_ascb *ascb_list; + + if (!phy_mask) { + asd_printk("%s called with phy_mask of 0!?\n", __FUNCTION__); + return 0; + } + + for_each_phy(phy_mask, phy_m, i) { + num++; + asd_enable_phy(asd_ha, i); + } + + k = num; + ascb_list = asd_ascb_alloc_list(asd_ha, &k, GFP_KERNEL); + if (!ascb_list) { + asd_printk("no memory for control phy ascb list\n"); + return -ENOMEM; + } + num -= k; + + ascb = ascb_list; + for_each_phy(phy_mask, phy_m, i) { + asd_build_control_phy(ascb, i, ENABLE_PHY); + ascb = list_entry(ascb->list.next, struct asd_ascb, list); + } + ASD_DPRINTK("posting %d control phy scbs\n", num); + k = asd_post_ascb_list(asd_ha, ascb_list, num); + if (k) + asd_ascb_free_list(ascb_list); + + return k; +} diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_hwi.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_hwi.h --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_hwi.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_hwi.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,397 @@ +/* + * Aic94xx SAS/SATA driver hardware interface header file. + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef _AIC94XX_HWI_H_
+#define _AIC94XX_HWI_H_
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+
+#include <scsi/libsas.h>
+
+#include "aic94xx.h"
+#include "aic94xx_sas.h"
+
+/* Define ASD_MAX_PHYS to the maximum phys ever. Currently 8. */
+#define ASD_MAX_PHYS 8
+#define ASD_PCBA_SN_SIZE 12
+
+/* Those are to be further named properly, the "RAZORx" part, and
+ * subsequently included in include/linux/pci_ids.h.
+ */
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR10 0x410
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR12 0x412
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR1E 0x41E
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR30 0x430
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR32 0x432
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR3E 0x43E
+#define PCI_DEVICE_ID_ADAPTEC2_RAZOR3F 0x43F
+
+struct asd_ha_addrspace {
+ void __iomem *addr;
+ unsigned long start; /* pci resource start */
+ unsigned long len; /* pci resource len */
+ unsigned long flags; /* pci resource flags */
+
+ /* addresses internal to the host adapter */
+ u32 swa_base; /* mmspace 1 (MBAR1) uses this only */
+ u32 swb_base;
+ u32 swc_base;
+};
+
+struct bios_struct {
+ int present;
+ u8 maj;
+ u8 min;
+ u32 bld;
+};
+
+struct unit_element_struct {
+ u16 num;
+ u16 size;
+ void *area;
+};
+
+struct flash_struct {
+ u32 bar;
+ int present;
+ int wide;
+ u8 manuf;
+ u8 dev_id;
+ u8 sec_prot;
+
+ u32 dir_offs;
+};
+
+struct asd_phy_desc {
+ /* From CTRL-A settings, then set to what is appropriate */
+ u8 sas_addr[SAS_ADDR_SIZE];
+ u8 max_sas_lrate;
+ u8 min_sas_lrate;
+ u8 max_sata_lrate;
+ u8 min_sata_lrate;
+ u8 flags;
+#define ASD_CRC_DIS 1
+#define ASD_SATA_SPINUP_HOLD 2
+
+ u8 phy_control_0; /* mode 5 reg 0x160 */
+ u8 phy_control_1; /* mode 5 reg 0x161 */
+ u8 phy_control_2; /* mode 5 reg 0x162 */
+ u8 phy_control_3; /* mode 5 reg 0x163 */
+};
+
+struct asd_dma_tok {
+ void *vaddr;
+ dma_addr_t dma_handle;
+ size_t size;
+};
+
+struct hw_profile {
+ struct bios_struct bios;
+ struct unit_element_struct ue;
+ struct flash_struct flash;
+
+ u8 sas_addr[SAS_ADDR_SIZE];
+ char pcba_sn[ASD_PCBA_SN_SIZE+1];
+
+ u8 enabled_phys; /* mask of enabled phys */
+ struct asd_phy_desc phy_desc[ASD_MAX_PHYS];
+ u32 max_scbs; /* absolute sequencer scb queue size */
+ struct asd_dma_tok *scb_ext;
+ u32 max_ddbs;
+ struct asd_dma_tok *ddb_ext;
+
+ spinlock_t ddb_lock;
+ void *ddb_bitmap;
+
+ int num_phys; /* ENABLEABLE */
+ int max_phys; /* REPORTED + ENABLEABLE */
+
+ unsigned addr_range; /* max # of addrs; max # of possible ports */
+ unsigned port_name_base;
+ unsigned dev_name_base;
+ unsigned sata_name_base;
+};
+
+struct asd_ascb {
+ struct list_head list;
+ struct asd_ha_struct *ha;
+
+ struct scb *scb; /* equals dma_scb->vaddr */
+ struct asd_dma_tok dma_scb;
+ struct asd_dma_tok *sg_arr;
+
+ void (*tasklet_complete)(struct asd_ascb *, struct done_list_struct *);
+ u8 uldd_timer:1;
+
+ /* internally generated command */
+ struct timer_list timer;
+ struct completion completion;
+ u8 tag_valid:1;
+ __be16 tag; /* error recovery only */
+
+ /* If this is an Empty SCB, index of first edb in seq->edb_arr. */
+ int edb_index;
+
+ /* Used by the timer timeout function.
*/ + int tc_index; + + void *uldd_task; +}; + +#define ASD_DL_SIZE_BITS 0x8 +#define ASD_DL_SIZE (1<<(2+ASD_DL_SIZE_BITS)) +#define ASD_DEF_DL_TOGGLE 0x01 + +struct asd_seq_data { + spinlock_t pend_q_lock; + u16 scbpro; + int pending; + struct list_head pend_q; + int can_queue; /* per adapter */ + struct asd_dma_tok next_scb; /* next scb to be delivered to CSEQ */ + + spinlock_t tc_index_lock; + void **tc_index_array; + void *tc_index_bitmap; + int tc_index_bitmap_bits; + + struct tasklet_struct dl_tasklet; + struct done_list_struct *dl; /* array of done list entries, equals */ + struct asd_dma_tok *actual_dl; /* actual_dl->vaddr */ + int dl_toggle; + int dl_next; + + int num_edbs; + struct asd_dma_tok **edb_arr; + int num_escbs; + struct asd_ascb **escb_arr; /* array of pointers to escbs */ +}; + +/* This is the Host Adapter structure. It describes the hardware + * SAS adapter. + */ +struct asd_ha_struct { + struct pci_dev *pcidev; + const char *name; + + struct sas_ha_struct sas_ha; + + u8 revision_id; + + int iospace; + spinlock_t iolock; + struct asd_ha_addrspace io_handle[2]; + + struct hw_profile hw_prof; + + struct asd_phy phys[ASD_MAX_PHYS]; + struct asd_sas_port ports[ASD_MAX_PHYS]; + + struct dma_pool *scb_pool; + + struct asd_seq_data seq; /* sequencer related */ +}; + +/* ---------- Common macros ---------- */ + +#define ASD_BUSADDR_LO(__dma_handle) ((u32)(__dma_handle)) +#define ASD_BUSADDR_HI(__dma_handle) (((sizeof(dma_addr_t))==8) \ + ? ((u32)((__dma_handle) >> 32)) \ + : ((u32)0)) + +#define dev_to_asd_ha(__dev) pci_get_drvdata(to_pci_dev(__dev)) +#define SCB_SITE_VALID(__site_no) (((__site_no) & 0xF0FF) != 0x00FF \ + && ((__site_no) & 0xF0FF) > 0x001F) +/* For each bit set in __lseq_mask, set __lseq to equal the bit + * position of the set bit and execute the statement following. + * __mc is the temporary mask, used as a mask "counter". 
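+ *
+ * For example, with __lseq_mask = 0x05 the controlled statement runs
+ * exactly twice, with __lseq = 0 and __lseq = 2:
+ *
+ *	for_each_sequencer(0x05, mc, lseq)
+ *		printk("LSEQ%d\n", lseq);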
+ */
+#define for_each_sequencer(__lseq_mask, __mc, __lseq) \
+ for ((__mc)=(__lseq_mask),(__lseq)=0;(__mc)!=0;(__lseq++),(__mc)>>=1)\
+ if (((__mc) & 1))
+#define for_each_phy(__lseq_mask, __mc, __lseq) \
+ for ((__mc)=(__lseq_mask),(__lseq)=0;(__mc)!=0;(__lseq++),(__mc)>>=1)\
+ if (((__mc) & 1))
+
+#define PHY_ENABLED(_HA, _I) ((_HA)->hw_prof.enabled_phys & (1<<(_I)))
+
+/* ---------- DMA allocs ---------- */
+
+static inline struct asd_dma_tok *asd_dmatok_alloc(unsigned int flags)
+{
+ return kmem_cache_alloc(asd_dma_token_cache, flags);
+}
+
+static inline void asd_dmatok_free(struct asd_dma_tok *token)
+{
+ kmem_cache_free(asd_dma_token_cache, token);
+}
+
+static inline struct asd_dma_tok *asd_alloc_coherent(struct asd_ha_struct *
+ asd_ha, size_t size,
+ unsigned int flags)
+{
+ struct asd_dma_tok *token = asd_dmatok_alloc(flags);
+ if (token) {
+ token->size = size;
+ token->vaddr = dma_alloc_coherent(&asd_ha->pcidev->dev,
+ token->size,
+ &token->dma_handle,
+ flags);
+ if (!token->vaddr) {
+ asd_dmatok_free(token);
+ token = NULL;
+ }
+ }
+ return token;
+}
+
+static inline void asd_free_coherent(struct asd_ha_struct *asd_ha,
+ struct asd_dma_tok *token)
+{
+ if (token) {
+ dma_free_coherent(&asd_ha->pcidev->dev, token->size,
+ token->vaddr, token->dma_handle);
+ asd_dmatok_free(token);
+ }
+}
+
+static inline void asd_init_ascb(struct asd_ha_struct *asd_ha,
+ struct asd_ascb *ascb)
+{
+ INIT_LIST_HEAD(&ascb->list);
+ ascb->scb = ascb->dma_scb.vaddr;
+ ascb->ha = asd_ha;
+ ascb->timer.function = NULL;
+ init_timer(&ascb->timer);
+ ascb->tc_index = -1;
+ init_completion(&ascb->completion);
+}
+
+/* Must be called with the tc_index_lock held!
+ */
+static inline void asd_tc_index_release(struct asd_seq_data *seq, int index)
+{
+ seq->tc_index_array[index] = NULL;
+ clear_bit(index, seq->tc_index_bitmap);
+}
+
+/* Must be called with the tc_index_lock held!
+ */
+static inline int asd_tc_index_get(struct asd_seq_data *seq, void *ptr)
+{
+ int index;
+
+ index = find_first_zero_bit(seq->tc_index_bitmap,
+ seq->tc_index_bitmap_bits);
+ if (index == seq->tc_index_bitmap_bits)
+ return -1;
+
+ seq->tc_index_array[index] = ptr;
+ set_bit(index, seq->tc_index_bitmap);
+
+ return index;
+}
+
+/* Must be called with the tc_index_lock held!
+ */
+static inline void *asd_tc_index_find(struct asd_seq_data *seq, int index)
+{
+ return seq->tc_index_array[index];
+}
+
+/**
+ * asd_ascb_free -- free a single aSCB after it has completed
+ * @ascb: pointer to the aSCB of interest
+ *
+ * This frees an aSCB after it has been executed/completed by
+ * the sequencer.
+ */
+static inline void asd_ascb_free(struct asd_ascb *ascb)
+{
+ if (ascb) {
+ struct asd_ha_struct *asd_ha = ascb->ha;
+ unsigned long flags;
+
+ BUG_ON(!list_empty(&ascb->list));
+ spin_lock_irqsave(&ascb->ha->seq.tc_index_lock, flags);
+ asd_tc_index_release(&ascb->ha->seq, ascb->tc_index);
+ spin_unlock_irqrestore(&ascb->ha->seq.tc_index_lock, flags);
+ dma_pool_free(asd_ha->scb_pool, ascb->dma_scb.vaddr,
+ ascb->dma_scb.dma_handle);
+ kmem_cache_free(asd_ascb_cache, ascb);
+ }
+}
+
+/**
+ * asd_ascb_free_list -- free a list of ascbs
+ * @ascb_list: a list of ascbs
+ *
+ * This function will free a list of ascbs allocated by asd_ascb_alloc_list.
+ * It is used when, say, the scb queueing function returned QUEUE_FULL,
+ * and we do not need the ascbs any more.
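+ *
+ * Typical use, mirroring asd_enable_phys(): if posting fails, the
+ * whole allocated list is handed back in one call:
+ *
+ *	if (asd_post_ascb_list(asd_ha, ascb_list, num))
+ *		asd_ascb_free_list(ascb_list);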
+ */
+static inline void asd_ascb_free_list(struct asd_ascb *ascb_list)
+{
+ LIST_HEAD(list);
+ struct list_head *n, *pos;
+
+ __list_add(&list, ascb_list->list.prev, &ascb_list->list);
+ list_for_each_safe(pos, n, &list) {
+ list_del_init(pos);
+ asd_ascb_free(list_entry(pos, struct asd_ascb, list));
+ }
+}
+
+/* ---------- Function declarations ---------- */
+
+int asd_init_hw(struct asd_ha_struct *asd_ha);
+irqreturn_t asd_hw_isr(int irq, void *dev_id, struct pt_regs *regs);
+
+
+struct asd_ascb *asd_ascb_alloc_list(struct asd_ha_struct
+ *asd_ha, int *num,
+ unsigned int gfp_flags);
+
+int asd_post_ascb_list(struct asd_ha_struct *asd_ha, struct asd_ascb *ascb,
+ int num);
+int asd_post_escb_list(struct asd_ha_struct *asd_ha, struct asd_ascb *ascb,
+ int num);
+
+int asd_init_post_escbs(struct asd_ha_struct *asd_ha);
+void asd_build_control_phy(struct asd_ascb *ascb, int phy_id, u8 subfunc);
+void asd_control_led(struct asd_ha_struct *asd_ha, int phy_id, int op);
+void asd_turn_led(struct asd_ha_struct *asd_ha, int phy_id, int op);
+int asd_enable_phys(struct asd_ha_struct *asd_ha, const u8 phy_mask);
+void asd_build_initiate_link_adm_task(struct asd_ascb *ascb, int phy_id,
+ u8 subfunc);
+
+void asd_ascb_timedout(unsigned long data);
+int asd_chip_hardrst(struct asd_ha_struct *asd_ha);
+
+#endif
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_init.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_init.c
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_init.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_init.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,861 @@
+/*
+ * Aic94xx SAS/SATA driver initialization.
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include <scsi/scsi_host.h>
+
+#include "aic94xx.h"
+#include "aic94xx_reg.h"
+#include "aic94xx_hwi.h"
+#include "aic94xx_seq.h"
+
+/* The format is "version.release.patchlevel" */
+#define ASD_DRIVER_VERSION "1.0.2"
+
+static int use_msi = 0;
+module_param_named(use_msi, use_msi, int, S_IRUGO);
+MODULE_PARM_DESC(use_msi, "\n"
+ "\tEnable(1) or disable(0) using PCI MSI.\n"
+ "\tDefault: 0");
+
+static int lldd_max_execute_num = 0;
+module_param_named(collector, lldd_max_execute_num, int, S_IRUGO);
+MODULE_PARM_DESC(collector, "\n"
+ "\tIf greater than one, tells the SAS Layer to run in Task Collector\n"
+ "\tMode. 
If 1 or 0, tells the SAS Layer to run in Direct Mode.\n" + "\tThe aic94xx SAS LLDD supports both modes.\n" + "\tDefault: 0 (Direct Mode).\n"); + +char sas_addr_str[2*SAS_ADDR_SIZE + 1] = ""; + +static struct scsi_transport_template *aic94xx_transport_template; + +static struct scsi_host_template aic94xx_sht = { + .module = THIS_MODULE, + /* .name is initialized */ + .name = "aic94xx", + .queuecommand = sas_queuecommand, + .target_alloc = sas_target_alloc, + .slave_configure = sas_slave_configure, + .slave_destroy = sas_slave_destroy, + .change_queue_depth = sas_change_queue_depth, + .change_queue_type = sas_change_queue_type, + .bios_param = sas_bios_param, + .can_queue = 1, + .cmd_per_lun = 1, + .this_id = -1, + .sg_tablesize = SG_ALL, + .max_sectors = SCSI_DEFAULT_MAX_SECTORS, + .use_clustering = ENABLE_CLUSTERING, +}; + +static int __devinit asd_map_memio(struct asd_ha_struct *asd_ha) +{ + int err, i; + struct asd_ha_addrspace *io_handle; + + asd_ha->iospace = 0; + for (i = 0; i < 3; i += 2) { + io_handle = &asd_ha->io_handle[i==0?0:1]; + io_handle->start = pci_resource_start(asd_ha->pcidev, i); + io_handle->len = pci_resource_len(asd_ha->pcidev, i); + io_handle->flags = pci_resource_flags(asd_ha->pcidev, i); + err = -ENODEV; + if (!io_handle->start || !io_handle->len) { + asd_printk("MBAR%d start or length for %s is 0.\n", + i==0?0:1, pci_name(asd_ha->pcidev)); + goto Err; + } + err = pci_request_region(asd_ha->pcidev, i, ASD_DRIVER_NAME); + if (err) { + asd_printk("couldn't reserve memory region for %s\n", + pci_name(asd_ha->pcidev)); + goto Err; + } + if (io_handle->flags & IORESOURCE_CACHEABLE) + io_handle->addr = ioremap(io_handle->start, + io_handle->len); + else + io_handle->addr = ioremap_nocache(io_handle->start, + io_handle->len); + if (!io_handle->addr) { + asd_printk("couldn't map MBAR%d of %s\n", i==0?0:1, + pci_name(asd_ha->pcidev)); + goto Err_unreq; + } + } + + return 0; +Err_unreq: + pci_release_region(asd_ha->pcidev, i); +Err: + if (i > 0) { + io_handle = &asd_ha->io_handle[0]; + iounmap(io_handle->addr); + pci_release_region(asd_ha->pcidev, 0); + } + return err; +} + +static void __devexit asd_unmap_memio(struct asd_ha_struct *asd_ha) +{ + struct asd_ha_addrspace *io_handle; + + io_handle = &asd_ha->io_handle[1]; + iounmap(io_handle->addr); + pci_release_region(asd_ha->pcidev, 2); + + io_handle = &asd_ha->io_handle[0]; + iounmap(io_handle->addr); + pci_release_region(asd_ha->pcidev, 0); +} + +static int __devinit asd_map_ioport(struct asd_ha_struct *asd_ha) +{ + int i = PCI_IOBAR_OFFSET, err; + struct asd_ha_addrspace *io_handle = &asd_ha->io_handle[0]; + + asd_ha->iospace = 1; + io_handle->start = pci_resource_start(asd_ha->pcidev, i); + io_handle->len = pci_resource_len(asd_ha->pcidev, i); + io_handle->flags = pci_resource_flags(asd_ha->pcidev, i); + io_handle->addr = (void __iomem *) io_handle->start; + if (!io_handle->start || !io_handle->len) { + asd_printk("couldn't get IO ports for %s\n", + pci_name(asd_ha->pcidev)); + return -ENODEV; + } + err = pci_request_region(asd_ha->pcidev, i, ASD_DRIVER_NAME); + if (err) { + asd_printk("couldn't reserve io space for %s\n", + pci_name(asd_ha->pcidev)); + } + + return err; +} + +static void __devexit asd_unmap_ioport(struct asd_ha_struct *asd_ha) +{ + pci_release_region(asd_ha->pcidev, PCI_IOBAR_OFFSET); +} + +static int __devinit asd_map_ha(struct asd_ha_struct *asd_ha) +{ + int err; + u16 cmd_reg; + + err = pci_read_config_word(asd_ha->pcidev, PCI_COMMAND, &cmd_reg); + if (err) { + asd_printk("couldn't read command 
register of %s\n", + pci_name(asd_ha->pcidev)); + goto Err; + } + + err = -ENODEV; + if (cmd_reg & PCI_COMMAND_MEMORY) { + if ((err = asd_map_memio(asd_ha))) + goto Err; + } else if (cmd_reg & PCI_COMMAND_IO) { + if ((err = asd_map_ioport(asd_ha))) + goto Err; + asd_printk("%s ioport mapped -- upgrade your hardware\n", + pci_name(asd_ha->pcidev)); + } else { + asd_printk("no proper device access to %s\n", + pci_name(asd_ha->pcidev)); + goto Err; + } + + return 0; +Err: + return err; +} + +static void __devexit asd_unmap_ha(struct asd_ha_struct *asd_ha) +{ + if (asd_ha->iospace) + asd_unmap_ioport(asd_ha); + else + asd_unmap_memio(asd_ha); +} + +static const char *asd_dev_rev[30] = { + [0] = "A0", + [1] = "A1", + [8] = "B0", +}; + +static int __devinit asd_common_setup(struct asd_ha_struct *asd_ha) +{ + int err, i; + + err = pci_read_config_byte(asd_ha->pcidev, PCI_REVISION_ID, + &asd_ha->revision_id); + if (err) { + asd_printk("couldn't read REVISION ID register of %s\n", + pci_name(asd_ha->pcidev)); + goto Err; + } + err = -ENODEV; + if (asd_ha->revision_id < AIC9410_DEV_REV_B0) { + asd_printk("%s is revision %s (%X), which is not supported\n", + pci_name(asd_ha->pcidev), + asd_dev_rev[asd_ha->revision_id], + asd_ha->revision_id); + goto Err; + } + /* Provide some sane default values. */ + asd_ha->hw_prof.max_scbs = 512; + asd_ha->hw_prof.max_ddbs = 128; + asd_ha->hw_prof.num_phys = ASD_MAX_PHYS; + /* All phys are enabled, by default. */ + asd_ha->hw_prof.enabled_phys = 0xFF; + for (i = 0; i < ASD_MAX_PHYS; i++) { + asd_ha->hw_prof.phy_desc[i].max_sas_lrate = PHY_LINKRATE_3; + asd_ha->hw_prof.phy_desc[i].min_sas_lrate = PHY_LINKRATE_1_5; + asd_ha->hw_prof.phy_desc[i].max_sata_lrate= PHY_LINKRATE_1_5; + asd_ha->hw_prof.phy_desc[i].min_sata_lrate= PHY_LINKRATE_1_5; + } + + return 0; +Err: + return err; +} + +static int __devinit asd_aic9410_setup(struct asd_ha_struct *asd_ha) +{ + int err = asd_common_setup(asd_ha); + + if (err) + return err; + + asd_ha->hw_prof.addr_range = 8; + asd_ha->hw_prof.port_name_base = 0; + asd_ha->hw_prof.dev_name_base = 8; + asd_ha->hw_prof.sata_name_base = 16; + + return 0; +} + +static int __devinit asd_aic9405_setup(struct asd_ha_struct *asd_ha) +{ + int err = asd_common_setup(asd_ha); + + if (err) + return err; + + asd_ha->hw_prof.addr_range = 4; + asd_ha->hw_prof.port_name_base = 0; + asd_ha->hw_prof.dev_name_base = 4; + asd_ha->hw_prof.sata_name_base = 8; + + return 0; +} + +static ssize_t asd_show_dev_rev(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct asd_ha_struct *asd_ha = dev_to_asd_ha(dev); + return snprintf(buf, PAGE_SIZE, "%s\n", + asd_dev_rev[asd_ha->revision_id]); +} +static DEVICE_ATTR(revision, S_IRUGO, asd_show_dev_rev, NULL); + +static ssize_t asd_show_dev_bios_build(struct device *dev, + struct device_attribute *attr,char *buf) +{ + struct asd_ha_struct *asd_ha = dev_to_asd_ha(dev); + return snprintf(buf, PAGE_SIZE, "%d\n", asd_ha->hw_prof.bios.bld); +} +static DEVICE_ATTR(bios_build, S_IRUGO, asd_show_dev_bios_build, NULL); + +static ssize_t asd_show_dev_pcba_sn(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct asd_ha_struct *asd_ha = dev_to_asd_ha(dev); + return snprintf(buf, PAGE_SIZE, "%s\n", asd_ha->hw_prof.pcba_sn); +} +static DEVICE_ATTR(pcba_sn, S_IRUGO, asd_show_dev_pcba_sn, NULL); + +static void asd_create_dev_attrs(struct asd_ha_struct *asd_ha) +{ + device_create_file(&asd_ha->pcidev->dev, &dev_attr_revision); + device_create_file(&asd_ha->pcidev->dev, &dev_attr_bios_build); + 
device_create_file(&asd_ha->pcidev->dev, &dev_attr_pcba_sn); +} + +static void asd_remove_dev_attrs(struct asd_ha_struct *asd_ha) +{ + device_remove_file(&asd_ha->pcidev->dev, &dev_attr_revision); + device_remove_file(&asd_ha->pcidev->dev, &dev_attr_bios_build); + device_remove_file(&asd_ha->pcidev->dev, &dev_attr_pcba_sn); +} + +/* The first entry, 0, is used for dynamic ids, the rest for devices + * we know about. + */ +static struct asd_pcidev_struct { + const char * name; + int (*setup)(struct asd_ha_struct *asd_ha); +} asd_pcidev_data[] = { + /* Id 0 is used for dynamic ids. */ + { .name = "Adaptec AIC-94xx SAS/SATA Host Adapter", + .setup = asd_aic9410_setup + }, + { .name = "Adaptec AIC-9410W SAS/SATA Host Adapter", + .setup = asd_aic9410_setup + }, + { .name = "Adaptec AIC-9405W SAS/SATA Host Adapter", + .setup = asd_aic9405_setup + }, +}; + +static inline int asd_create_ha_caches(struct asd_ha_struct *asd_ha) +{ + asd_ha->scb_pool = dma_pool_create(ASD_DRIVER_NAME "_scb_pool", + &asd_ha->pcidev->dev, + sizeof(struct scb), + 8, 0); + if (!asd_ha->scb_pool) { + asd_printk("couldn't create scb pool\n"); + return -ENOMEM; + } + + return 0; +} + +/** + * asd_free_edbs -- free empty data buffers + * asd_ha: pointer to host adapter structure + */ +static inline void asd_free_edbs(struct asd_ha_struct *asd_ha) +{ + struct asd_seq_data *seq = &asd_ha->seq; + int i; + + for (i = 0; i < seq->num_edbs; i++) + asd_free_coherent(asd_ha, seq->edb_arr[i]); + kfree(seq->edb_arr); + seq->edb_arr = NULL; +} + +static inline void asd_free_escbs(struct asd_ha_struct *asd_ha) +{ + struct asd_seq_data *seq = &asd_ha->seq; + int i; + + for (i = 0; i < seq->num_escbs; i++) { + if (!list_empty(&seq->escb_arr[i]->list)) + list_del_init(&seq->escb_arr[i]->list); + + asd_ascb_free(seq->escb_arr[i]); + } + kfree(seq->escb_arr); + seq->escb_arr = NULL; +} + +static inline void asd_destroy_ha_caches(struct asd_ha_struct *asd_ha) +{ + int i; + + if (asd_ha->hw_prof.ddb_ext) + asd_free_coherent(asd_ha, asd_ha->hw_prof.ddb_ext); + if (asd_ha->hw_prof.scb_ext) + asd_free_coherent(asd_ha, asd_ha->hw_prof.scb_ext); + + if (asd_ha->hw_prof.ddb_bitmap) + kfree(asd_ha->hw_prof.ddb_bitmap); + asd_ha->hw_prof.ddb_bitmap = NULL; + + for (i = 0; i < ASD_MAX_PHYS; i++) { + struct asd_phy *phy = &asd_ha->phys[i]; + + asd_free_coherent(asd_ha, phy->id_frm_tok); + } + if (asd_ha->seq.escb_arr) + asd_free_escbs(asd_ha); + if (asd_ha->seq.edb_arr) + asd_free_edbs(asd_ha); + if (asd_ha->hw_prof.ue.area) { + kfree(asd_ha->hw_prof.ue.area); + asd_ha->hw_prof.ue.area = NULL; + } + if (asd_ha->seq.tc_index_array) { + kfree(asd_ha->seq.tc_index_array); + kfree(asd_ha->seq.tc_index_bitmap); + asd_ha->seq.tc_index_array = NULL; + asd_ha->seq.tc_index_bitmap = NULL; + } + if (asd_ha->seq.actual_dl) { + asd_free_coherent(asd_ha, asd_ha->seq.actual_dl); + asd_ha->seq.actual_dl = NULL; + asd_ha->seq.dl = NULL; + } + if (asd_ha->seq.next_scb.vaddr) { + dma_pool_free(asd_ha->scb_pool, asd_ha->seq.next_scb.vaddr, + asd_ha->seq.next_scb.dma_handle); + asd_ha->seq.next_scb.vaddr = NULL; + } + dma_pool_destroy(asd_ha->scb_pool); + asd_ha->scb_pool = NULL; +} + +kmem_cache_t *asd_dma_token_cache; +kmem_cache_t *asd_ascb_cache; + +static int asd_create_global_caches(void) +{ + if (!asd_dma_token_cache) { + asd_dma_token_cache + = kmem_cache_create(ASD_DRIVER_NAME "_dma_token", + sizeof(struct asd_dma_tok), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!asd_dma_token_cache) { + asd_printk("couldn't create dma token cache\n"); + return -ENOMEM; + } + 
} + + if (!asd_ascb_cache) { + asd_ascb_cache = kmem_cache_create(ASD_DRIVER_NAME "_ascb", + sizeof(struct asd_ascb), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!asd_ascb_cache) { + asd_printk("couldn't create ascb cache\n"); + goto Err; + } + } + + return 0; +Err: + kmem_cache_destroy(asd_dma_token_cache); + asd_dma_token_cache = NULL; + return -ENOMEM; +} + +static void asd_destroy_global_caches(void) +{ + if (asd_dma_token_cache) + kmem_cache_destroy(asd_dma_token_cache); + asd_dma_token_cache = NULL; + + if (asd_ascb_cache) + kmem_cache_destroy(asd_ascb_cache); + asd_ascb_cache = NULL; +} + +static int asd_register_sas_ha(struct asd_ha_struct *asd_ha) +{ + int i; + struct asd_sas_phy **sas_phys = + kmalloc(ASD_MAX_PHYS * sizeof(struct asd_sas_phy), GFP_KERNEL); + struct asd_sas_port **sas_ports = + kmalloc(ASD_MAX_PHYS * sizeof(struct asd_sas_port), GFP_KERNEL); + + if (!sas_phys || !sas_ports) { + kfree(sas_phys); + kfree(sas_ports); + return -ENOMEM; + } + + asd_ha->sas_ha.sas_ha_name = (char *) asd_ha->name; + asd_ha->sas_ha.lldd_module = THIS_MODULE; + asd_ha->sas_ha.sas_addr = &asd_ha->hw_prof.sas_addr[0]; + + for (i = 0; i < ASD_MAX_PHYS; i++) { + sas_phys[i] = &asd_ha->phys[i].sas_phy; + sas_ports[i] = &asd_ha->ports[i]; + } + + asd_ha->sas_ha.sas_phy = sas_phys; + asd_ha->sas_ha.sas_port= sas_ports; + asd_ha->sas_ha.num_phys= ASD_MAX_PHYS; + + asd_ha->sas_ha.lldd_queue_size = asd_ha->seq.can_queue; + + return sas_register_ha(&asd_ha->sas_ha); +} + +static int asd_unregister_sas_ha(struct asd_ha_struct *asd_ha) +{ + int err; + + err = sas_unregister_ha(&asd_ha->sas_ha); + + sas_remove_host(asd_ha->sas_ha.core.shost); + scsi_remove_host(asd_ha->sas_ha.core.shost); + scsi_host_put(asd_ha->sas_ha.core.shost); + + kfree(asd_ha->sas_ha.sas_phy); + kfree(asd_ha->sas_ha.sas_port); + + return err; +} + +static int __devinit asd_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct asd_pcidev_struct *asd_dev; + unsigned asd_id = (unsigned) id->driver_data; + struct asd_ha_struct *asd_ha; + struct Scsi_Host *shost; + int err; + + if (asd_id >= ARRAY_SIZE(asd_pcidev_data)) { + asd_printk("wrong driver_data in PCI table\n"); + return -ENODEV; + } + + if ((err = pci_enable_device(dev))) { + asd_printk("couldn't enable device %s\n", pci_name(dev)); + return err; + } + + pci_set_master(dev); + + err = -ENOMEM; + + shost = scsi_host_alloc(&aic94xx_sht, sizeof(void *)); + if (!shost) + goto Err; + + asd_dev = &asd_pcidev_data[asd_id]; + + asd_ha = kzalloc(sizeof(*asd_ha), GFP_KERNEL); + if (!asd_ha) { + asd_printk("out of memory\n"); + goto Err; + } + asd_ha->pcidev = dev; + asd_ha->sas_ha.pcidev = asd_ha->pcidev; + asd_ha->sas_ha.lldd_ha = asd_ha; + + asd_ha->name = asd_dev->name; + asd_printk("found %s, device %s\n", asd_ha->name, pci_name(dev)); + + SHOST_TO_SAS_HA(shost) = &asd_ha->sas_ha; + asd_ha->sas_ha.core.shost = shost; + shost->transportt = aic94xx_transport_template; + shost->max_id = ~0; + shost->max_lun = ~0; + shost->max_cmd_len = 16; + + err = scsi_add_host(shost, &dev->dev); + if (err) { + scsi_host_put(shost); + goto Err_free; + } + + + + err = asd_dev->setup(asd_ha); + if (err) + goto Err_free; + + err = -ENODEV; + if (!pci_set_dma_mask(dev, DMA_64BIT_MASK) + && !pci_set_consistent_dma_mask(dev, DMA_64BIT_MASK)) + ; + else if (!pci_set_dma_mask(dev, DMA_32BIT_MASK) + && !pci_set_consistent_dma_mask(dev, DMA_32BIT_MASK)) + ; + else { + asd_printk("no suitable DMA mask for %s\n", pci_name(dev)); + goto Err_free; + } + + pci_set_drvdata(dev, asd_ha); + 
+ err = asd_map_ha(asd_ha);
+ if (err)
+ goto Err_free;
+
+ err = asd_create_ha_caches(asd_ha);
+ if (err)
+ goto Err_unmap;
+
+ err = asd_init_hw(asd_ha);
+ if (err)
+ goto Err_free_cache;
+
+ asd_printk("device %s: SAS addr %llx, PCBA SN %s, %d phys, %d enabled "
+ "phys, flash %s, BIOS %s%d\n",
+ pci_name(dev), SAS_ADDR(asd_ha->hw_prof.sas_addr),
+ asd_ha->hw_prof.pcba_sn, asd_ha->hw_prof.max_phys,
+ asd_ha->hw_prof.num_phys,
+ asd_ha->hw_prof.flash.present ? "present" : "not present",
+ asd_ha->hw_prof.bios.present ? "build " : "not present",
+ asd_ha->hw_prof.bios.bld);
+
+ shost->can_queue = asd_ha->seq.can_queue;
+
+ if (use_msi)
+ pci_enable_msi(asd_ha->pcidev);
+
+ err = request_irq(asd_ha->pcidev->irq, asd_hw_isr, SA_SHIRQ,
+ ASD_DRIVER_NAME, asd_ha);
+ if (err) {
+ asd_printk("couldn't get irq %d for %s\n",
+ asd_ha->pcidev->irq, pci_name(asd_ha->pcidev));
+ goto Err_irq;
+ }
+ asd_enable_ints(asd_ha);
+
+ err = asd_init_post_escbs(asd_ha);
+ if (err) {
+ asd_printk("couldn't post escbs for %s\n",
+ pci_name(asd_ha->pcidev));
+ goto Err_escbs;
+ }
+ ASD_DPRINTK("escbs posted\n");
+
+ asd_create_dev_attrs(asd_ha);
+
+ err = asd_register_sas_ha(asd_ha);
+ if (err)
+ goto Err_reg_sas;
+
+ err = asd_enable_phys(asd_ha, asd_ha->hw_prof.enabled_phys);
+ if (err) {
+ asd_printk("couldn't enable phys, err:%d\n", err);
+ goto Err_en_phys;
+ }
+ ASD_DPRINTK("enabled phys\n");
+ /* give the phy enabling interrupt event time to come in (1s
+ * is empirically about all it takes) */
+ ssleep(1);
+ /* Wait for discovery to finish */
+ scsi_flush_work(asd_ha->sas_ha.core.shost);
+
+ return 0;
+Err_en_phys:
+ asd_unregister_sas_ha(asd_ha);
+Err_reg_sas:
+ asd_remove_dev_attrs(asd_ha);
+Err_escbs:
+ asd_disable_ints(asd_ha);
+ free_irq(dev->irq, asd_ha);
+Err_irq:
+ if (use_msi)
+ pci_disable_msi(dev);
+ asd_chip_hardrst(asd_ha);
+Err_free_cache:
+ asd_destroy_ha_caches(asd_ha);
+Err_unmap:
+ asd_unmap_ha(asd_ha);
+Err_free:
+ kfree(asd_ha);
+ scsi_remove_host(shost);
+Err:
+ pci_disable_device(dev);
+ return err;
+}
+
+static void asd_free_queues(struct asd_ha_struct *asd_ha)
+{
+ unsigned long flags;
+ LIST_HEAD(pending);
+ struct list_head *n, *pos;
+
+ spin_lock_irqsave(&asd_ha->seq.pend_q_lock, flags);
+ asd_ha->seq.pending = 0;
+ list_splice_init(&asd_ha->seq.pend_q, &pending);
+ spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags);
+
+ if (!list_empty(&pending))
+ ASD_DPRINTK("Uh-oh! Pending is not empty!\n");
+
+ list_for_each_safe(pos, n, &pending) {
+ struct asd_ascb *ascb = list_entry(pos, struct asd_ascb, list);
+ list_del_init(pos);
+ ASD_DPRINTK("freeing from pending\n");
+ asd_ascb_free(ascb);
+ }
+}
+
+static void asd_turn_off_leds(struct asd_ha_struct *asd_ha)
+{
+ u8 phy_mask = asd_ha->hw_prof.enabled_phys;
+ u8 i;
+
+ for_each_phy(phy_mask, phy_mask, i) {
+ asd_turn_led(asd_ha, i, 0);
+ asd_control_led(asd_ha, i, 0);
+ }
+}
+
+static void __devexit asd_pci_remove(struct pci_dev *dev)
+{
+ struct asd_ha_struct *asd_ha = pci_get_drvdata(dev);
+
+ if (!asd_ha)
+ return;
+
+ asd_unregister_sas_ha(asd_ha);
+
+ asd_disable_ints(asd_ha);
+
+ asd_remove_dev_attrs(asd_ha);
+
+ /* XXX more here as needed */
+
+ free_irq(dev->irq, asd_ha);
+ if (use_msi)
+ pci_disable_msi(asd_ha->pcidev);
+ asd_turn_off_leds(asd_ha);
+ asd_chip_hardrst(asd_ha);
+ asd_free_queues(asd_ha);
+ asd_destroy_ha_caches(asd_ha);
+ asd_unmap_ha(asd_ha);
+ kfree(asd_ha);
+ pci_disable_device(dev);
+ return;
+}
+
+static ssize_t asd_version_show(struct device_driver *driver, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", ASD_DRIVER_VERSION);
+}
+static DRIVER_ATTR(version, S_IRUGO, asd_version_show, NULL);
+
+static void asd_create_driver_attrs(struct device_driver *driver)
+{
+ driver_create_file(driver, &driver_attr_version);
+}
+
+static void asd_remove_driver_attrs(struct device_driver *driver)
+{
+ driver_remove_file(driver, &driver_attr_version);
+}
+
+static struct sas_domain_function_template aic94xx_transport_functions = {
+ .lldd_port_formed = asd_update_port_links,
+
+ .lldd_dev_found = asd_dev_found,
+ .lldd_dev_gone = asd_dev_gone,
+
+ .lldd_execute_task = asd_execute_task,
+
+ .lldd_abort_task = asd_abort_task,
+ .lldd_abort_task_set = asd_abort_task_set,
+ .lldd_clear_aca = asd_clear_aca,
+ .lldd_clear_task_set = asd_clear_task_set,
+ .lldd_I_T_nexus_reset = NULL,
+ .lldd_lu_reset = asd_lu_reset,
+ .lldd_query_task = asd_query_task,
+
+ .lldd_clear_nexus_port = asd_clear_nexus_port,
+ .lldd_clear_nexus_ha = asd_clear_nexus_ha,
+
+ .lldd_control_phy = asd_control_phy,
+};
+
+static const struct pci_device_id aic94xx_pci_table[] __devinitdata = {
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR10),
+ 0, 0, 1},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR12),
+ 0, 0, 1},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR1E),
+ 0, 0, 1},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR30),
+ 0, 0, 2},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR32),
+ 0, 0, 2},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR3E),
+ 0, 0, 2},
+ {PCI_DEVICE(PCI_VENDOR_ID_ADAPTEC2, PCI_DEVICE_ID_ADAPTEC2_RAZOR3F),
+ 0, 0, 2},
+ {}
+};
+
+MODULE_DEVICE_TABLE(pci, aic94xx_pci_table);
+
+static struct pci_driver aic94xx_pci_driver = {
+ .name = ASD_DRIVER_NAME,
+ .id_table = aic94xx_pci_table,
+ .probe = asd_pci_probe,
+ .remove = __devexit_p(asd_pci_remove),
+};
+
+static int __init aic94xx_init(void)
+{
+ int err;
+
+
+ asd_printk("%s version %s loaded\n", ASD_DRIVER_DESCRIPTION,
+ ASD_DRIVER_VERSION);
+
+ err = asd_create_global_caches();
+ if (err)
+ return err;
+
+ aic94xx_transport_template =
+ sas_domain_attach_transport(&aic94xx_transport_functions);
+ if (!aic94xx_transport_template) {
+ err = -ENOMEM;
+ goto out_destroy_caches;
+ }
+
+ err = pci_register_driver(&aic94xx_pci_driver);
+ if (err)
+ goto out_release_transport;
+
+ asd_create_driver_attrs(&aic94xx_pci_driver.driver);
+
+ return err;
+
+ out_release_transport:
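+ /* Editorial note: the unwind below runs in the reverse order of the
+ * setup above -- release the SAS transport first, then destroy the
+ * global caches. */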
+ sas_release_transport(aic94xx_transport_template);
+ out_destroy_caches:
+ asd_destroy_global_caches();
+
+ return err;
+}
+
+static void __exit aic94xx_exit(void)
+{
+ asd_remove_driver_attrs(&aic94xx_pci_driver.driver);
+ pci_unregister_driver(&aic94xx_pci_driver);
+ sas_release_transport(aic94xx_transport_template);
+ asd_destroy_global_caches();
+ asd_printk("%s version %s unloaded\n", ASD_DRIVER_DESCRIPTION,
+ ASD_DRIVER_VERSION);
+}
+
+module_init(aic94xx_init);
+module_exit(aic94xx_exit);
+
+MODULE_AUTHOR("Luben Tuikov <luben_tuikov@adaptec.com>");
+MODULE_DESCRIPTION(ASD_DRIVER_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(ASD_DRIVER_VERSION);
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg.c
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,332 @@
+/*
+ * Aic94xx SAS/SATA driver register access.
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include "aic94xx_reg.h"
+#include "aic94xx.h"
+
+/* Writing to device address space.
+ * Offset comes before value to remind that the operation of
+ * this function is *offs = val.
+ */
+static inline void asd_write_byte(struct asd_ha_struct *asd_ha,
+ unsigned long offs, u8 val)
+{
+ if (unlikely(asd_ha->iospace))
+ outb(val,
+ (unsigned long)asd_ha->io_handle[0].addr + (offs & 0xFF));
+ else
+ writeb(val, asd_ha->io_handle[0].addr + offs);
+ wmb();
+}
+
+static inline void asd_write_word(struct asd_ha_struct *asd_ha,
+ unsigned long offs, u16 val)
+{
+ if (unlikely(asd_ha->iospace))
+ outw(val,
+ (unsigned long)asd_ha->io_handle[0].addr + (offs & 0xFF));
+ else
+ writew(val, asd_ha->io_handle[0].addr + offs);
+ wmb();
+}
+
+static inline void asd_write_dword(struct asd_ha_struct *asd_ha,
+ unsigned long offs, u32 val)
+{
+ if (unlikely(asd_ha->iospace))
+ outl(val,
+ (unsigned long)asd_ha->io_handle[0].addr + (offs & 0xFF));
+ else
+ writel(val, asd_ha->io_handle[0].addr + offs);
+ wmb();
+}
+
+/* Reading from device address space.
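+ * The mirror of the note above: the operation of these functions is
+ * "return *offs". As with the writes, port I/O (inb/inw/inl) is used
+ * when the chip was mapped through the I/O BAR, MMIO (readb/readw/readl)
+ * otherwise; the rmb() orders the read before subsequent device accesses.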
+ */ +static inline u8 asd_read_byte(struct asd_ha_struct *asd_ha, + unsigned long offs) +{ + u8 val; + if (unlikely(asd_ha->iospace)) + val = inb((unsigned long) asd_ha->io_handle[0].addr + + (offs & 0xFF)); + else + val = readb(asd_ha->io_handle[0].addr + offs); + rmb(); + return val; +} + +static inline u16 asd_read_word(struct asd_ha_struct *asd_ha, + unsigned long offs) +{ + u16 val; + if (unlikely(asd_ha->iospace)) + val = inw((unsigned long)asd_ha->io_handle[0].addr + + (offs & 0xFF)); + else + val = readw(asd_ha->io_handle[0].addr + offs); + rmb(); + return val; +} + +static inline u32 asd_read_dword(struct asd_ha_struct *asd_ha, + unsigned long offs) +{ + u32 val; + if (unlikely(asd_ha->iospace)) + val = inl((unsigned long) asd_ha->io_handle[0].addr + + (offs & 0xFF)); + else + val = readl(asd_ha->io_handle[0].addr + offs); + rmb(); + return val; +} + +static inline u32 asd_mem_offs_swa(void) +{ + return 0; +} + +static inline u32 asd_mem_offs_swc(void) +{ + return asd_mem_offs_swa() + MBAR0_SWA_SIZE; +} + +static inline u32 asd_mem_offs_swb(void) +{ + return asd_mem_offs_swc() + MBAR0_SWC_SIZE + 0x20; +} + +/* We know that the register wanted is in the range + * of the sliding window. + */ +#define ASD_READ_SW(ww, type, ord) \ +static inline type asd_read_##ww##_##ord (struct asd_ha_struct *asd_ha,\ + u32 reg) \ +{ \ + struct asd_ha_addrspace *io_handle = &asd_ha->io_handle[0]; \ + u32 map_offs=(reg - io_handle-> ww##_base )+asd_mem_offs_##ww ();\ + return asd_read_##ord (asd_ha, (unsigned long) map_offs); \ +} + +#define ASD_WRITE_SW(ww, type, ord) \ +static inline void asd_write_##ww##_##ord (struct asd_ha_struct *asd_ha,\ + u32 reg, type val) \ +{ \ + struct asd_ha_addrspace *io_handle = &asd_ha->io_handle[0]; \ + u32 map_offs=(reg - io_handle-> ww##_base )+asd_mem_offs_##ww ();\ + asd_write_##ord (asd_ha, (unsigned long) map_offs, val); \ +} + +ASD_READ_SW(swa, u8, byte); +ASD_READ_SW(swa, u16, word); +ASD_READ_SW(swa, u32, dword); + +ASD_READ_SW(swb, u8, byte); +ASD_READ_SW(swb, u16, word); +ASD_READ_SW(swb, u32, dword); + +ASD_READ_SW(swc, u8, byte); +ASD_READ_SW(swc, u16, word); +ASD_READ_SW(swc, u32, dword); + +ASD_WRITE_SW(swa, u8, byte); +ASD_WRITE_SW(swa, u16, word); +ASD_WRITE_SW(swa, u32, dword); + +ASD_WRITE_SW(swb, u8, byte); +ASD_WRITE_SW(swb, u16, word); +ASD_WRITE_SW(swb, u32, dword); + +ASD_WRITE_SW(swc, u8, byte); +ASD_WRITE_SW(swc, u16, word); +ASD_WRITE_SW(swc, u32, dword); + +/* + * A word about sliding windows: + * MBAR0 is divided into sliding windows A, C and B, in that order. + * SWA starts at offset 0 of MBAR0, up to 0x57, with size 0x58 bytes. + * SWC starts at offset 0x58 of MBAR0, up to 0x60, with size 0x8 bytes. + * From 0x60 to 0x7F, we have a copy of PCI config space 0x60-0x7F. + * SWB starts at offset 0x80 of MBAR0 and extends to the end of MBAR0. + * See asd_init_sw() in aic94xx_hwi.c + * + * We map the most common registers we'd access of the internal 4GB + * host adapter memory space. If a register/internal memory location + * is wanted which is not mapped, we slide SWB, by paging it, + * see asd_move_swb() in aic94xx_reg.c. 
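+ *
+ * A worked example of the window logic (addresses per aic94xx_reg.h):
+ * an access to an EXSI register at REG_BASE_ADDR_EXSI (0xB8042800)
+ * that lies outside the current SWA/SWB/SWC ranges falls through to
+ * the "else" branch of the accessors below; asd_move_swb() then writes
+ * (reg & ~(MBAR0_SWB_SIZE - 1)) to PCI_CONF_MBAR0_SWB, records the new
+ * swb_base, and the access is performed through window B.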
+ */ + +/** + * asd_move_swb -- move sliding window B + * @asd_ha: pointer to host adapter structure + * @reg: register desired to be within range of the new window + */ +static inline void asd_move_swb(struct asd_ha_struct *asd_ha, u32 reg) +{ + u32 base = reg & ~(MBAR0_SWB_SIZE-1); + pci_write_config_dword(asd_ha->pcidev, PCI_CONF_MBAR0_SWB, base); + asd_ha->io_handle[0].swb_base = base; +} + +static void __asd_write_reg_byte(struct asd_ha_struct *asd_ha, u32 reg, u8 val) +{ + struct asd_ha_addrspace *io_handle=&asd_ha->io_handle[0]; + BUG_ON(reg >= 0xC0000000 || reg < ALL_BASE_ADDR); + if (io_handle->swa_base <= reg + && reg < io_handle->swa_base + MBAR0_SWA_SIZE) + asd_write_swa_byte (asd_ha, reg,val); + else if (io_handle->swb_base <= reg + && reg < io_handle->swb_base + MBAR0_SWB_SIZE) + asd_write_swb_byte (asd_ha, reg, val); + else if (io_handle->swc_base <= reg + && reg < io_handle->swc_base + MBAR0_SWC_SIZE) + asd_write_swc_byte (asd_ha, reg, val); + else { + /* Ok, we have to move SWB */ + asd_move_swb(asd_ha, reg); + asd_write_swb_byte (asd_ha, reg, val); + } +} + +#define ASD_WRITE_REG(type, ord) \ +void asd_write_reg_##ord (struct asd_ha_struct *asd_ha, u32 reg, type val)\ +{ \ + struct asd_ha_addrspace *io_handle=&asd_ha->io_handle[0]; \ + unsigned long flags; \ + BUG_ON(reg >= 0xC0000000 || reg < ALL_BASE_ADDR); \ + spin_lock_irqsave(&asd_ha->iolock, flags); \ + if (io_handle->swa_base <= reg \ + && reg < io_handle->swa_base + MBAR0_SWA_SIZE) \ + asd_write_swa_##ord (asd_ha, reg,val); \ + else if (io_handle->swb_base <= reg \ + && reg < io_handle->swb_base + MBAR0_SWB_SIZE) \ + asd_write_swb_##ord (asd_ha, reg, val); \ + else if (io_handle->swc_base <= reg \ + && reg < io_handle->swc_base + MBAR0_SWC_SIZE) \ + asd_write_swc_##ord (asd_ha, reg, val); \ + else { \ + /* Ok, we have to move SWB */ \ + asd_move_swb(asd_ha, reg); \ + asd_write_swb_##ord (asd_ha, reg, val); \ + } \ + spin_unlock_irqrestore(&asd_ha->iolock, flags); \ +} + +ASD_WRITE_REG(u8, byte); +ASD_WRITE_REG(u16,word); +ASD_WRITE_REG(u32,dword); + +static u8 __asd_read_reg_byte(struct asd_ha_struct *asd_ha, u32 reg) +{ + struct asd_ha_addrspace *io_handle=&asd_ha->io_handle[0]; + u8 val; + BUG_ON(reg >= 0xC0000000 || reg < ALL_BASE_ADDR); + if (io_handle->swa_base <= reg + && reg < io_handle->swa_base + MBAR0_SWA_SIZE) + val = asd_read_swa_byte (asd_ha, reg); + else if (io_handle->swb_base <= reg + && reg < io_handle->swb_base + MBAR0_SWB_SIZE) + val = asd_read_swb_byte (asd_ha, reg); + else if (io_handle->swc_base <= reg + && reg < io_handle->swc_base + MBAR0_SWC_SIZE) + val = asd_read_swc_byte (asd_ha, reg); + else { + /* Ok, we have to move SWB */ + asd_move_swb(asd_ha, reg); + val = asd_read_swb_byte (asd_ha, reg); + } + return val; +} + +#define ASD_READ_REG(type, ord) \ +type asd_read_reg_##ord (struct asd_ha_struct *asd_ha, u32 reg) \ +{ \ + struct asd_ha_addrspace *io_handle=&asd_ha->io_handle[0]; \ + type val; \ + unsigned long flags; \ + BUG_ON(reg >= 0xC0000000 || reg < ALL_BASE_ADDR); \ + spin_lock_irqsave(&asd_ha->iolock, flags); \ + if (io_handle->swa_base <= reg \ + && reg < io_handle->swa_base + MBAR0_SWA_SIZE) \ + val = asd_read_swa_##ord (asd_ha, reg); \ + else if (io_handle->swb_base <= reg \ + && reg < io_handle->swb_base + MBAR0_SWB_SIZE) \ + val = asd_read_swb_##ord (asd_ha, reg); \ + else if (io_handle->swc_base <= reg \ + && reg < io_handle->swc_base + MBAR0_SWC_SIZE) \ + val = asd_read_swc_##ord (asd_ha, reg); \ + else { \ + /* Ok, we have to move SWB */ \ + asd_move_swb(asd_ha, reg); \ 
+ val = asd_read_swb_##ord (asd_ha, reg); \
+ } \
+ spin_unlock_irqrestore(&asd_ha->iolock, flags); \
+ return val; \
+}
+
+ASD_READ_REG(u8, byte);
+ASD_READ_REG(u16,word);
+ASD_READ_REG(u32,dword);
+
+/**
+ * asd_read_reg_string -- read a string of bytes from io space memory
+ * @asd_ha: pointer to host adapter structure
+ * @dst: pointer to a destination buffer where data will be written to
+ * @offs: start offset (register) to read from
+ * @count: number of bytes to read
+ */
+void asd_read_reg_string(struct asd_ha_struct *asd_ha, void *dst,
+ u32 offs, int count)
+{
+ u8 *p = dst;
+ unsigned long flags;
+
+ spin_lock_irqsave(&asd_ha->iolock, flags);
+ for ( ; count > 0; count--, offs++, p++)
+ *p = __asd_read_reg_byte(asd_ha, offs);
+ spin_unlock_irqrestore(&asd_ha->iolock, flags);
+}
+
+/**
+ * asd_write_reg_string -- write a string of bytes to io space memory
+ * @asd_ha: pointer to host adapter structure
+ * @src: pointer to source buffer where data will be read from
+ * @offs: start offset (register) to write to
+ * @count: number of bytes to write
+ */
+void asd_write_reg_string(struct asd_ha_struct *asd_ha, void *src,
+ u32 offs, int count)
+{
+ u8 *p = src;
+ unsigned long flags;
+
+ spin_lock_irqsave(&asd_ha->iolock, flags);
+ for ( ; count > 0; count--, offs++, p++)
+ __asd_write_reg_byte(asd_ha, offs, *p);
+ spin_unlock_irqrestore(&asd_ha->iolock, flags);
+}
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg.h
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg.h 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,302 @@
+/*
+ * Aic94xx SAS/SATA driver hardware registers definitions.
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef _AIC94XX_REG_H_
+#define _AIC94XX_REG_H_
+
+#include <linux/compiler.h>
+#include "aic94xx_hwi.h"
+
+/* Values */
+#define AIC9410_DEV_REV_B0 0x8
+
+/* MBAR0, SWA, SWB, SWC, internal memory space addresses */
+#define REG_BASE_ADDR 0xB8000000
+#define REG_BASE_ADDR_CSEQCIO 0xB8002000
+#define REG_BASE_ADDR_EXSI 0xB8042800
+
+#define MBAR0_SWA_SIZE 0x58
+extern u32 MBAR0_SWB_SIZE;
+#define MBAR0_SWC_SIZE 0x8
+
+/* MBAR1, points to On Chip Memory */
+#define OCM_BASE_ADDR 0xA0000000
+#define OCM_MAX_SIZE 0x20000
+
+/* Smallest address possible to reference */
+#define ALL_BASE_ADDR OCM_BASE_ADDR
+
+/* PCI configuration space registers */
+#define PCI_IOBAR_OFFSET 4
+
+#define PCI_CONF_MBAR1 0x6C
+#define PCI_CONF_MBAR0_SWA 0x70
+#define PCI_CONF_MBAR0_SWB 0x74
+#define PCI_CONF_MBAR0_SWC 0x78
+#define PCI_CONF_MBAR_KEY 0x7C
+#define PCI_CONF_FLSH_BAR 0xB8
+
+#include "aic94xx_reg_def.h"
+
+u8 asd_read_reg_byte(struct asd_ha_struct *asd_ha, u32 reg);
+u16 asd_read_reg_word(struct asd_ha_struct *asd_ha, u32 reg);
+u32 asd_read_reg_dword(struct asd_ha_struct *asd_ha, u32 reg);
+
+void asd_write_reg_byte(struct asd_ha_struct *asd_ha, u32 reg, u8 val);
+void asd_write_reg_word(struct asd_ha_struct *asd_ha, u32 reg, u16 val);
+void asd_write_reg_dword(struct asd_ha_struct *asd_ha, u32 reg, u32 val);
+
+void asd_read_reg_string(struct asd_ha_struct *asd_ha, void *dst,
+ u32 offs, int count);
+void asd_write_reg_string(struct asd_ha_struct *asd_ha, void *src,
+ u32 offs, int count);
+
+#define ASD_READ_OCM(type, ord, S) \
+static inline type asd_read_ocm_##ord (struct asd_ha_struct *asd_ha, \
+ u32 offs) \
+{ \
+ struct asd_ha_addrspace *io_handle = &asd_ha->io_handle[1]; \
+ type val = read##S (io_handle->addr + (unsigned long) offs); \
+ rmb(); \
+ return val; \
+}
+
+ASD_READ_OCM(u8, byte, b);
+ASD_READ_OCM(u16,word, w);
+ASD_READ_OCM(u32,dword,l);
+
+#define ASD_WRITE_OCM(type, ord, S) \
+static inline void asd_write_ocm_##ord (struct asd_ha_struct *asd_ha, \
+ u32 offs, type val) \
+{ \
+ struct asd_ha_addrspace *io_handle = &asd_ha->io_handle[1]; \
+ write##S (val, io_handle->addr + (unsigned long) offs); \
+ return; \
+}
+
+ASD_WRITE_OCM(u8, byte, b);
+ASD_WRITE_OCM(u16,word, w);
+ASD_WRITE_OCM(u32,dword,l);
+
+#define ASD_DDBSITE_READ(type, ord) \
+static inline type asd_ddbsite_read_##ord (struct asd_ha_struct *asd_ha, \
+ u16 ddb_site_no, \
+ u16 offs) \
+{ \
+ asd_write_reg_word(asd_ha, ALTCIOADR, MnDDB_SITE + offs); \
+ asd_write_reg_word(asd_ha, ADDBPTR, ddb_site_no); \
+ return asd_read_reg_##ord (asd_ha, CTXACCESS); \
+}
+
+ASD_DDBSITE_READ(u32, dword);
+ASD_DDBSITE_READ(u16, word);
+
+static inline u8 asd_ddbsite_read_byte(struct asd_ha_struct *asd_ha,
+ u16 ddb_site_no,
+ u16 offs)
+{
+ if (offs & 1)
+ return asd_ddbsite_read_word(asd_ha, ddb_site_no,
+ offs & ~1) >> 8;
+ else
+ return asd_ddbsite_read_word(asd_ha, ddb_site_no,
+ offs) & 0xFF;
+}
+
+
+#define ASD_DDBSITE_WRITE(type, ord) \
+static inline void asd_ddbsite_write_##ord (struct asd_ha_struct *asd_ha, \
+ u16 ddb_site_no, \
+ u16 offs, type val) \
+{ \
+ asd_write_reg_word(asd_ha, ALTCIOADR, MnDDB_SITE + offs); \
+ asd_write_reg_word(asd_ha, ADDBPTR, ddb_site_no); \
+ asd_write_reg_##ord (asd_ha, CTXACCESS, val); \
+}
+
+ASD_DDBSITE_WRITE(u32, dword);
+ASD_DDBSITE_WRITE(u16, word);
+
+static inline void asd_ddbsite_write_byte(struct asd_ha_struct *asd_ha,
+ u16 ddb_site_no,
+ u16 offs, u8 val)
+{
+ u16 base = offs & ~1;
+ u16 rval = asd_ddbsite_read_word(asd_ha, ddb_site_no, base);
+ if (offs & 1)
+ rval = (val << 8) | (rval & 0xFF);
+ else
+ rval = (rval & 0xFF00) | val;
+ asd_ddbsite_write_word(asd_ha, ddb_site_no, base, rval);
+}
+
+
+#define ASD_SCBSITE_READ(type, ord) \
+static inline type asd_scbsite_read_##ord (struct asd_ha_struct *asd_ha, \
+ u16 scb_site_no, \
+ u16 offs) \
+{ \
+ asd_write_reg_word(asd_ha, ALTCIOADR, MnSCB_SITE + offs); \
+ asd_write_reg_word(asd_ha, ASCBPTR, scb_site_no); \
+ return asd_read_reg_##ord (asd_ha, CTXACCESS); \
+}
+
+ASD_SCBSITE_READ(u32, dword);
+ASD_SCBSITE_READ(u16, word);
+
+static inline u8 asd_scbsite_read_byte(struct asd_ha_struct *asd_ha,
+ u16 scb_site_no,
+ u16 offs)
+{
+ if (offs & 1)
+ return asd_scbsite_read_word(asd_ha, scb_site_no,
+ offs & ~1) >> 8;
+ else
+ return asd_scbsite_read_word(asd_ha, scb_site_no,
+ offs) & 0xFF;
+}
+
+
+#define ASD_SCBSITE_WRITE(type, ord) \
+static inline void asd_scbsite_write_##ord (struct asd_ha_struct *asd_ha, \
+ u16 scb_site_no, \
+ u16 offs, type val) \
+{ \
+ asd_write_reg_word(asd_ha, ALTCIOADR, MnSCB_SITE + offs); \
+ asd_write_reg_word(asd_ha, ASCBPTR, scb_site_no); \
+ asd_write_reg_##ord (asd_ha, CTXACCESS, val); \
+}
+
+ASD_SCBSITE_WRITE(u32, dword);
+ASD_SCBSITE_WRITE(u16, word);
+
+static inline void asd_scbsite_write_byte(struct asd_ha_struct *asd_ha,
+ u16 scb_site_no,
+ u16 offs, u8 val)
+{
+ u16 base = offs & ~1;
+ u16 rval = asd_scbsite_read_word(asd_ha, scb_site_no, base);
+ if (offs & 1)
+ rval = (val << 8) | (rval & 0xFF);
+ else
+ rval = (rval & 0xFF00) | val;
+ asd_scbsite_write_word(asd_ha, scb_site_no, base, rval);
+}
+
+/**
+ * asd_ddbsite_update_word -- atomically update a word in a ddb site
+ * @asd_ha: pointer to host adapter structure
+ * @ddb_site_no: the DDB site number
+ * @offs: the offset into the DDB
+ * @oldval: old value found in that offset
+ * @newval: the new value to replace it
+ *
+ * This function is used when the sequencers are running and we need to
+ * update a DDB site atomically without expensive pausing and unpausing
+ * of the sequencers and accessing the DDB site through the CIO bus.
+ *
+ * Return 0 on success; -EFAULT on parity error; -EAGAIN if the old value
+ * is different from the current value at that offset.
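+ *
+ * A caller would normally loop until the compare-and-swap wins. A
+ * minimal sketch (illustrative only; "site", "offs" and "flag" are
+ * made-up names, not part of this patch):
+ *
+ *     int res;
+ *     do {
+ *             u16 w = asd_ddbsite_read_word(asd_ha, site, offs);
+ *             res = asd_ddbsite_update_word(asd_ha, site, offs,
+ *                                           w, w | flag);
+ *     } while (res == -EAGAIN);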
+ */
+static inline int asd_ddbsite_update_word(struct asd_ha_struct *asd_ha,
+ u16 ddb_site_no, u16 offs,
+ u16 oldval, u16 newval)
+{
+ u8 done;
+ u16 oval = asd_ddbsite_read_word(asd_ha, ddb_site_no, offs);
+ if (oval != oldval)
+ return -EAGAIN;
+ asd_write_reg_word(asd_ha, AOLDDATA, oldval);
+ asd_write_reg_word(asd_ha, ANEWDATA, newval);
+ do {
+ done = asd_read_reg_byte(asd_ha, ATOMICSTATCTL);
+ } while (!(done & ATOMICDONE));
+ if (done & ATOMICERR)
+ return -EFAULT; /* parity error */
+ else if (done & ATOMICWIN)
+ return 0; /* success */
+ else
+ return -EAGAIN; /* oldval different from current value */
+}
+
+static inline int asd_ddbsite_update_byte(struct asd_ha_struct *asd_ha,
+ u16 ddb_site_no, u16 offs,
+ u8 _oldval, u8 _newval)
+{
+ u16 base = offs & ~1;
+ u16 oval;
+ u16 nval = asd_ddbsite_read_word(asd_ha, ddb_site_no, base);
+ if (offs & 1) {
+ if ((nval >> 8) != _oldval)
+ return -EAGAIN;
+ nval = (_newval << 8) | (nval & 0xFF);
+ oval = (_oldval << 8) | (nval & 0xFF);
+ } else {
+ if ((nval & 0xFF) != _oldval)
+ return -EAGAIN;
+ nval = (nval & 0xFF00) | _newval;
+ oval = (nval & 0xFF00) | _oldval;
+ }
+ return asd_ddbsite_update_word(asd_ha, ddb_site_no, base, oval, nval);
+}
+
+static inline void asd_write_reg_addr(struct asd_ha_struct *asd_ha, u32 reg,
+ dma_addr_t dma_handle)
+{
+ asd_write_reg_dword(asd_ha, reg, ASD_BUSADDR_LO(dma_handle));
+ asd_write_reg_dword(asd_ha, reg+4, ASD_BUSADDR_HI(dma_handle));
+}
+
+static inline u32 asd_get_cmdctx_size(struct asd_ha_struct *asd_ha)
+{
+ /* DCHREVISION returns 0, possibly broken */
+ u32 ctxmemsize = asd_read_reg_dword(asd_ha, LmMnINT(0,0)) & CTXMEMSIZE;
+ return ctxmemsize ? 65536 : 32768;
+}
+
+static inline u32 asd_get_devctx_size(struct asd_ha_struct *asd_ha)
+{
+ u32 ctxmemsize = asd_read_reg_dword(asd_ha, LmMnINT(0,0)) & CTXMEMSIZE;
+ return ctxmemsize ? 8192 : 4096;
+}
+
+static inline void asd_disable_ints(struct asd_ha_struct *asd_ha)
+{
+ asd_write_reg_dword(asd_ha, CHIMINTEN, RST_CHIMINTEN);
+}
+
+static inline void asd_enable_ints(struct asd_ha_struct *asd_ha)
+{
+ /* Enable COM SAS interrupt on errors, COMSTAT */
+ asd_write_reg_dword(asd_ha, COMSTATEN,
+ EN_CSBUFPERR | EN_CSERR | EN_OVLYERR);
+ /* Enable DCH SAS CFIFTOERR */
+ asd_write_reg_dword(asd_ha, DCHSTATUS, EN_CFIFTOERR);
+ /* Enable Host Device interrupts */
+ asd_write_reg_dword(asd_ha, CHIMINTEN, SET_CHIMINTEN);
+}
+
+#endif
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg_def.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg_def.h
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_reg_def.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_reg_def.h 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,2398 @@
+/*
+ * Aic94xx SAS/SATA driver hardware registers definitions.
+ *
+ * Copyright (C) 2004 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2004 David Chaw <david_chaw@adaptec.com>
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * Luben Tuikov: Some register value updates to make it work with the window
+ * agnostic register r/w functions. Some register corrections, sizes,
+ * etc.
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * $Id: 0001-2.6.18-openvz-combined-028.035.patch,v 1.1 2007-07-23 23:01:52 niro Exp $ + * + */ + +#ifndef _ADP94XX_REG_DEF_H_ +#define _ADP94XX_REG_DEF_H_ + +/* + * Common definitions. + */ +#define CSEQ_MODE_PAGE_SIZE 0x200 /* CSEQ mode page size */ +#define LmSEQ_MODE_PAGE_SIZE 0x200 /* LmSEQ mode page size */ +#define LmSEQ_HOST_REG_SIZE 0x4000 /* LmSEQ Host Register size */ + +/********************* COM_SAS registers definition *************************/ + +/* The base is REG_BASE_ADDR, defined in aic94xx_reg.h. + */ + +/* + * CHIM Registers, Address Range : (0x00-0xFF) + */ +#define COMBIST (REG_BASE_ADDR + 0x00) + +/* bits 31:24 */ +#define L7BLKRST 0x80000000 +#define L6BLKRST 0x40000000 +#define L5BLKRST 0x20000000 +#define L4BLKRST 0x10000000 +#define L3BLKRST 0x08000000 +#define L2BLKRST 0x04000000 +#define L1BLKRST 0x02000000 +#define L0BLKRST 0x01000000 +#define LmBLKRST 0xFF000000 +#define LmBLKRST_COMBIST(phyid) (1 << (24 + phyid)) + +#define OCMBLKRST 0x00400000 +#define CTXMEMBLKRST 0x00200000 +#define CSEQBLKRST 0x00100000 +#define EXSIBLKRST 0x00040000 +#define DPIBLKRST 0x00020000 +#define DFIFBLKRST 0x00010000 +#define HARDRST 0x00000200 +#define COMBLKRST 0x00000100 +#define FRCDFPERR 0x00000080 +#define FRCCIOPERR 0x00000020 +#define FRCBISTERR 0x00000010 +#define COMBISTEN 0x00000004 +#define COMBISTDONE 0x00000002 /* ro */ +#define COMBISTFAIL 0x00000001 /* ro */ + +#define COMSTAT (REG_BASE_ADDR + 0x04) + +#define REQMBXREAD 0x00000040 +#define RSPMBXAVAIL 0x00000020 +#define CSBUFPERR 0x00000008 +#define OVLYERR 0x00000004 +#define CSERR 0x00000002 +#define OVLYDMADONE 0x00000001 + +#define COMSTAT_MASK (REQMBXREAD | RSPMBXAVAIL | \ + CSBUFPERR | OVLYERR | CSERR |\ + OVLYDMADONE) + +#define COMSTATEN (REG_BASE_ADDR + 0x08) + +#define EN_REQMBXREAD 0x00000040 +#define EN_RSPMBXAVAIL 0x00000020 +#define EN_CSBUFPERR 0x00000008 +#define EN_OVLYERR 0x00000004 +#define EN_CSERR 0x00000002 +#define EN_OVLYDONE 0x00000001 + +#define SCBPRO (REG_BASE_ADDR + 0x0C) + +#define SCBCONS_MASK 0xFFFF0000 +#define SCBPRO_MASK 0x0000FFFF + +#define CHIMREQMBX (REG_BASE_ADDR + 0x10) + +#define CHIMRSPMBX (REG_BASE_ADDR + 0x14) + +#define CHIMINT (REG_BASE_ADDR + 0x18) + +#define EXT_INT0 0x00000800 +#define EXT_INT1 0x00000400 +#define PORRSTDET 0x00000200 +#define HARDRSTDET 0x00000100 +#define DLAVAILQ 0x00000080 /* ro */ +#define HOSTERR 0x00000040 +#define INITERR 0x00000020 +#define DEVINT 0x00000010 +#define COMINT 0x00000008 +#define DEVTIMER2 0x00000004 +#define DEVTIMER1 0x00000002 +#define DLAVAIL 0x00000001 + +#define CHIMINT_MASK (HOSTERR | INITERR | DEVINT | COMINT |\ + DEVTIMER2 | DEVTIMER1 | DLAVAIL) + +#define DEVEXCEPT_MASK (HOSTERR | INITERR | DEVINT | COMINT) + +#define CHIMINTEN (REG_BASE_ADDR + 0x1C) + +#define RST_EN_EXT_INT1 0x01000000 +#define RST_EN_EXT_INT0 0x00800000 +#define RST_EN_HOSTERR 0x00400000 +#define RST_EN_INITERR 0x00200000 +#define RST_EN_DEVINT 0x00100000 +#define RST_EN_COMINT 0x00080000 +#define RST_EN_DEVTIMER2 0x00040000 +#define RST_EN_DEVTIMER1 0x00020000 
+#define RST_EN_DLAVAIL 0x00010000 +#define SET_EN_EXT_INT1 0x00000100 +#define SET_EN_EXT_INT0 0x00000080 +#define SET_EN_HOSTERR 0x00000040 +#define SET_EN_INITERR 0x00000020 +#define SET_EN_DEVINT 0x00000010 +#define SET_EN_COMINT 0x00000008 +#define SET_EN_DEVTIMER2 0x00000004 +#define SET_EN_DEVTIMER1 0x00000002 +#define SET_EN_DLAVAIL 0x00000001 + +#define RST_CHIMINTEN (RST_EN_HOSTERR | RST_EN_INITERR | \ + RST_EN_DEVINT | RST_EN_COMINT | \ + RST_EN_DEVTIMER2 | RST_EN_DEVTIMER1 |\ + RST_EN_DLAVAIL) + +#define SET_CHIMINTEN (SET_EN_HOSTERR | SET_EN_INITERR |\ + SET_EN_DEVINT | SET_EN_COMINT |\ + SET_EN_DLAVAIL) + +#define OVLYDMACTL (REG_BASE_ADDR + 0x20) + +#define OVLYADR_MASK 0x07FF0000 +#define OVLYLSEQ_MASK 0x0000FF00 +#define OVLYCSEQ 0x00000080 +#define OVLYHALTERR 0x00000040 +#define PIOCMODE 0x00000020 +#define RESETOVLYDMA 0x00000008 /* wo */ +#define STARTOVLYDMA 0x00000004 +#define STOPOVLYDMA 0x00000002 /* wo */ +#define OVLYDMAACT 0x00000001 /* ro */ + +#define OVLYDMACNT (REG_BASE_ADDR + 0x24) + +#define OVLYDOMAIN1 0x20000000 /* ro */ +#define OVLYDOMAIN0 0x10000000 +#define OVLYBUFADR_MASK 0x007F0000 +#define OVLYDMACNT_MASK 0x00003FFF + +#define OVLYDMAADR (REG_BASE_ADDR + 0x28) + +#define DMAERR (REG_BASE_ADDR + 0x30) + +#define OVLYERRSTAT_MASK 0x0000FF00 /* ro */ +#define CSERRSTAT_MASK 0x000000FF /* ro */ + +#define SPIODATA (REG_BASE_ADDR + 0x34) + +/* 0x38 - 0x3C are reserved */ + +#define T1CNTRLR (REG_BASE_ADDR + 0x40) + +#define T1DONE 0x00010000 /* ro */ +#define TIMER64 0x00000400 +#define T1ENABLE 0x00000200 +#define T1RELOAD 0x00000100 +#define T1PRESCALER_MASK 0x00000003 + +#define T1CMPR (REG_BASE_ADDR + 0x44) + +#define T1CNTR (REG_BASE_ADDR + 0x48) + +#define T2CNTRLR (REG_BASE_ADDR + 0x4C) + +#define T2DONE 0x00010000 /* ro */ +#define T2ENABLE 0x00000200 +#define T2RELOAD 0x00000100 +#define T2PRESCALER_MASK 0x00000003 + +#define T2CMPR (REG_BASE_ADDR + 0x50) + +#define T2CNTR (REG_BASE_ADDR + 0x54) + +/* 0x58h - 0xFCh are reserved */ + +/* + * DCH_SAS Registers, Address Range : (0x800-0xFFF) + */ +#define CMDCTXBASE (REG_BASE_ADDR + 0x800) + +#define DEVCTXBASE (REG_BASE_ADDR + 0x808) + +#define CTXDOMAIN (REG_BASE_ADDR + 0x810) + +#define DEVCTXDOMAIN1 0x00000008 /* ro */ +#define DEVCTXDOMAIN0 0x00000004 +#define CMDCTXDOMAIN1 0x00000002 /* ro */ +#define CMDCTXDOMAIN0 0x00000001 + +#define DCHCTL (REG_BASE_ADDR + 0x814) + +#define OCMBISTREPAIR 0x00080000 +#define OCMBISTEN 0x00040000 +#define OCMBISTDN 0x00020000 /* ro */ +#define OCMBISTFAIL 0x00010000 /* ro */ +#define DDBBISTEN 0x00004000 +#define DDBBISTDN 0x00002000 /* ro */ +#define DDBBISTFAIL 0x00001000 /* ro */ +#define SCBBISTEN 0x00000400 +#define SCBBISTDN 0x00000200 /* ro */ +#define SCBBISTFAIL 0x00000100 /* ro */ + +#define MEMSEL_MASK 0x000000E0 +#define MEMSEL_CCM_LSEQ 0x00000000 +#define MEMSEL_CCM_IOP 0x00000020 +#define MEMSEL_CCM_SASCTL 0x00000040 +#define MEMSEL_DCM_LSEQ 0x00000060 +#define MEMSEL_DCM_IOP 0x00000080 +#define MEMSEL_OCM 0x000000A0 + +#define FRCERR 0x00000010 +#define AUTORLS 0x00000001 + +#define DCHREVISION (REG_BASE_ADDR + 0x818) + +#define DCHREVISION_MASK 0x000000FF + +#define DCHSTATUS (REG_BASE_ADDR + 0x81C) + +#define EN_CFIFTOERR 0x00020000 +#define CFIFTOERR 0x00000200 +#define CSEQINT 0x00000100 /* ro */ +#define LSEQ7INT 0x00000080 /* ro */ +#define LSEQ6INT 0x00000040 /* ro */ +#define LSEQ5INT 0x00000020 /* ro */ +#define LSEQ4INT 0x00000010 /* ro */ +#define LSEQ3INT 0x00000008 /* ro */ +#define LSEQ2INT 0x00000004 /* ro */ +#define 
LSEQ1INT 0x00000002 /* ro */ +#define LSEQ0INT 0x00000001 /* ro */ + +#define LSEQINT_MASK (LSEQ7INT | LSEQ6INT | LSEQ5INT |\ + LSEQ4INT | LSEQ3INT | LSEQ2INT |\ + LSEQ1INT | LSEQ0INT) + +#define DCHDFIFDEBUG (REG_BASE_ADDR + 0x820) +#define ENFAIRMST 0x00FF0000 +#define DISWRMST9 0x00000200 +#define DISWRMST8 0x00000100 +#define DISRDMST 0x000000FF + +#define ATOMICSTATCTL (REG_BASE_ADDR + 0x824) +/* 8 bit wide */ +#define AUTOINC 0x80 +#define ATOMICERR 0x04 +#define ATOMICWIN 0x02 +#define ATOMICDONE 0x01 + + +#define ALTCIOADR (REG_BASE_ADDR + 0x828) +/* 16 bit; bits 8:0 define CIO addr space of CSEQ */ + +#define ASCBPTR (REG_BASE_ADDR + 0x82C) +/* 16 bit wide */ + +#define ADDBPTR (REG_BASE_ADDR + 0x82E) +/* 16 bit wide */ + +#define ANEWDATA (REG_BASE_ADDR + 0x830) +/* 16 bit */ + +#define AOLDDATA (REG_BASE_ADDR + 0x834) +/* 16 bit */ + +#define CTXACCESS (REG_BASE_ADDR + 0x838) +/* 32 bit */ + +/* 0x83Ch - 0xFFCh are reserved */ + +/* + * ARP2 External Processor Registers, Address Range : (0x00-0x1F) + */ +#define ARP2CTL 0x00 + +#define FRCSCRPERR 0x00040000 +#define FRCARP2PERR 0x00020000 +#define FRCARP2ILLOPC 0x00010000 +#define ENWAITTO 0x00008000 +#define PERRORDIS 0x00004000 +#define FAILDIS 0x00002000 +#define CIOPERRDIS 0x00001000 +#define BREAKEN3 0x00000800 +#define BREAKEN2 0x00000400 +#define BREAKEN1 0x00000200 +#define BREAKEN0 0x00000100 +#define EPAUSE 0x00000008 +#define PAUSED 0x00000004 /* ro */ +#define STEP 0x00000002 +#define ARP2RESET 0x00000001 /* wo */ + +#define ARP2INT 0x04 + +#define HALTCODE_MASK 0x00FF0000 /* ro */ +#define ARP2WAITTO 0x00000100 +#define ARP2HALTC 0x00000080 +#define ARP2ILLOPC 0x00000040 +#define ARP2PERR 0x00000020 +#define ARP2CIOPERR 0x00000010 +#define ARP2BREAK3 0x00000008 +#define ARP2BREAK2 0x00000004 +#define ARP2BREAK1 0x00000002 +#define ARP2BREAK0 0x00000001 + +#define ARP2INTEN 0x08 + +#define EN_ARP2WAITTO 0x00000100 +#define EN_ARP2HALTC 0x00000080 +#define EN_ARP2ILLOPC 0x00000040 +#define EN_ARP2PERR 0x00000020 +#define EN_ARP2CIOPERR 0x00000010 +#define EN_ARP2BREAK3 0x00000008 +#define EN_ARP2BREAK2 0x00000004 +#define EN_ARP2BREAK1 0x00000002 +#define EN_ARP2BREAK0 0x00000001 + +#define ARP2BREAKADR01 0x0C + +#define BREAKADR1_MASK 0x0FFF0000 +#define BREAKADR0_MASK 0x00000FFF + +#define ARP2BREAKADR23 0x10 + +#define BREAKADR3_MASK 0x0FFF0000 +#define BREAKADR2_MASK 0x00000FFF + +/* 0x14h - 0x1Ch are reserved */ + +/* + * ARP2 Registers, Address Range : (0x00-0x1F) + * The definitions have the same address offset for CSEQ and LmSEQ + * CIO Bus Registers. + */ +#define MODEPTR 0x00 + +#define DSTMODE 0xF0 +#define SRCMODE 0x0F + +#define ALTMODE 0x01 + +#define ALTDMODE 0xF0 +#define ALTSMODE 0x0F + +#define ATOMICXCHG 0x02 + +#define FLAG 0x04 + +#define INTCODE_MASK 0xF0 +#define ALTMODEV2 0x04 +#define CARRY_INT 0x02 +#define CARRY 0x01 + +#define ARP2INTCTL 0x05 + +#define PAUSEDIS 0x80 +#define RSTINTCTL 0x40 +#define POPALTMODE 0x08 +#define ALTMODEV 0x04 +#define INTMASK 0x02 +#define IRET 0x01 + +#define STACK 0x06 + +#define FUNCTION1 0x07 + +#define PRGMCNT 0x08 + +#define ACCUM 0x0A + +#define SINDEX 0x0C + +#define DINDEX 0x0E + +#define ALLONES 0x10 + +#define ALLZEROS 0x11 + +#define SINDIR 0x12 + +#define DINDIR 0x13 + +#define JUMLDIR 0x14 + +#define ARP2HALTCODE 0x15 + +#define CURRADDR 0x16 + +#define LASTADDR 0x18 + +#define NXTLADDR 0x1A + +#define DBGPORTPTR 0x1C + +#define DBGPORT 0x1D + +/* + * CIO Registers. 
+ * The definitions have the same address offset for CSEQ and LmSEQ + * CIO Bus Registers. + */ +#define MnSCBPTR 0x20 + +#define MnDDBPTR 0x22 + +#define SCRATCHPAGE 0x24 + +#define MnSCRATCHPAGE 0x25 + +#define SCRATCHPAGESV 0x26 + +#define MnSCRATCHPAGESV 0x27 + +#define MnDMAERRS 0x46 + +#define MnSGDMAERRS 0x47 + +#define MnSGBUF 0x53 + +#define MnSGDMASTAT 0x5b + +#define MnDDMACTL 0x5c /* RAZOR.rspec.fm rev 1.5 is wrong */ + +#define MnDDMASTAT 0x5d /* RAZOR.rspec.fm rev 1.5 is wrong */ + +#define MnDDMAMODE 0x5e /* RAZOR.rspec.fm rev 1.5 is wrong */ + +#define MnDMAENG 0x60 + +#define MnPIPECTL 0x61 + +#define MnSGBADR 0x65 + +#define MnSCB_SITE 0x100 + +#define MnDDB_SITE 0x180 + +/* + * The common definitions below have the same address offset for both + * CSEQ and LmSEQ. + */ +#define BISTCTL0 0x4C + +#define BISTCTL1 0x50 + +#define MAPPEDSCR 0x800 + +/* + * CSEQ Host Register, Address Range : (0x000-0xFFC) + */ +#define CSEQ_HOST_REG_BASE_ADR 0xB8001000 + +#define CARP2CTL (CSEQ_HOST_REG_BASE_ADR + ARP2CTL) + +#define CARP2INT (CSEQ_HOST_REG_BASE_ADR + ARP2INT) + +#define CARP2INTEN (CSEQ_HOST_REG_BASE_ADR + ARP2INTEN) + +#define CARP2BREAKADR01 (CSEQ_HOST_REG_BASE_ADR+ARP2BREAKADR01) + +#define CARP2BREAKADR23 (CSEQ_HOST_REG_BASE_ADR+ARP2BREAKADR23) + +#define CBISTCTL (CSEQ_HOST_REG_BASE_ADR + BISTCTL1) + +#define CSEQRAMBISTEN 0x00000040 +#define CSEQRAMBISTDN 0x00000020 /* ro */ +#define CSEQRAMBISTFAIL 0x00000010 /* ro */ +#define CSEQSCRBISTEN 0x00000004 +#define CSEQSCRBISTDN 0x00000002 /* ro */ +#define CSEQSCRBISTFAIL 0x00000001 /* ro */ + +#define CMAPPEDSCR (CSEQ_HOST_REG_BASE_ADR + MAPPEDSCR) + +/* + * CSEQ CIO Bus Registers, Address Range : (0x0000-0x1FFC) + * 16 modes, each mode is 512 bytes. + * Unless specified, the register should valid for all modes. 
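+ *
+ * Example (per the CSEQm_CIO_REG macro below): the mode 8 register at
+ * offset 0x34 (CSEQCOMCTL) resolves to CSEQ_CIO_REG_BASE_ADR +
+ * 8 * CSEQ_MODE_PAGE_SIZE + 0x34 = 0xB8002000 + 0x1000 + 0x34 =
+ * 0xB8003034.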
+ */ +#define CSEQ_CIO_REG_BASE_ADR REG_BASE_ADDR_CSEQCIO + +#define CSEQm_CIO_REG(Mode, Reg) \ + (CSEQ_CIO_REG_BASE_ADR + \ + ((u32) (Mode) * CSEQ_MODE_PAGE_SIZE) + (u32) (Reg)) + +#define CMODEPTR (CSEQ_CIO_REG_BASE_ADR + MODEPTR) + +#define CALTMODE (CSEQ_CIO_REG_BASE_ADR + ALTMODE) + +#define CATOMICXCHG (CSEQ_CIO_REG_BASE_ADR + ATOMICXCHG) + +#define CFLAG (CSEQ_CIO_REG_BASE_ADR + FLAG) + +#define CARP2INTCTL (CSEQ_CIO_REG_BASE_ADR + ARP2INTCTL) + +#define CSTACK (CSEQ_CIO_REG_BASE_ADR + STACK) + +#define CFUNCTION1 (CSEQ_CIO_REG_BASE_ADR + FUNCTION1) + +#define CPRGMCNT (CSEQ_CIO_REG_BASE_ADR + PRGMCNT) + +#define CACCUM (CSEQ_CIO_REG_BASE_ADR + ACCUM) + +#define CSINDEX (CSEQ_CIO_REG_BASE_ADR + SINDEX) + +#define CDINDEX (CSEQ_CIO_REG_BASE_ADR + DINDEX) + +#define CALLONES (CSEQ_CIO_REG_BASE_ADR + ALLONES) + +#define CALLZEROS (CSEQ_CIO_REG_BASE_ADR + ALLZEROS) + +#define CSINDIR (CSEQ_CIO_REG_BASE_ADR + SINDIR) + +#define CDINDIR (CSEQ_CIO_REG_BASE_ADR + DINDIR) + +#define CJUMLDIR (CSEQ_CIO_REG_BASE_ADR + JUMLDIR) + +#define CARP2HALTCODE (CSEQ_CIO_REG_BASE_ADR + ARP2HALTCODE) + +#define CCURRADDR (CSEQ_CIO_REG_BASE_ADR + CURRADDR) + +#define CLASTADDR (CSEQ_CIO_REG_BASE_ADR + LASTADDR) + +#define CNXTLADDR (CSEQ_CIO_REG_BASE_ADR + NXTLADDR) + +#define CDBGPORTPTR (CSEQ_CIO_REG_BASE_ADR + DBGPORTPTR) + +#define CDBGPORT (CSEQ_CIO_REG_BASE_ADR + DBGPORT) + +#define CSCRATCHPAGE (CSEQ_CIO_REG_BASE_ADR + SCRATCHPAGE) + +#define CMnSCBPTR(Mode) CSEQm_CIO_REG(Mode, MnSCBPTR) + +#define CMnDDBPTR(Mode) CSEQm_CIO_REG(Mode, MnDDBPTR) + +#define CMnSCRATCHPAGE(Mode) CSEQm_CIO_REG(Mode, MnSCRATCHPAGE) + +#define CLINKCON (CSEQ_CIO_REG_BASE_ADR + 0x28) + +#define CCIOAACESS (CSEQ_CIO_REG_BASE_ADR + 0x2C) + +/* mode 0-7 */ +#define MnREQMBX 0x30 +#define CMnREQMBX(Mode) CSEQm_CIO_REG(Mode, 0x30) + +/* mode 8 */ +#define CSEQCON CSEQm_CIO_REG(8, 0x30) + +/* mode 0-7 */ +#define MnRSPMBX 0x34 +#define CMnRSPMBX(Mode) CSEQm_CIO_REG(Mode, 0x34) + +/* mode 8 */ +#define CSEQCOMCTL CSEQm_CIO_REG(8, 0x34) + +/* mode 8 */ +#define CSEQCOMSTAT CSEQm_CIO_REG(8, 0x35) + +/* mode 8 */ +#define CSEQCOMINTEN CSEQm_CIO_REG(8, 0x36) + +/* mode 8 */ +#define CSEQCOMDMACTL CSEQm_CIO_REG(8, 0x37) + +#define CSHALTERR 0x10 +#define RESETCSDMA 0x08 /* wo */ +#define STARTCSDMA 0x04 +#define STOPCSDMA 0x02 /* wo */ +#define CSDMAACT 0x01 /* ro */ + +/* mode 0-7 */ +#define MnINT 0x38 +#define CMnINT(Mode) CSEQm_CIO_REG(Mode, 0x38) + +#define CMnREQMBXE 0x02 +#define CMnRSPMBXF 0x01 +#define CMnINT_MASK 0x00000003 + +/* mode 8 */ +#define CSEQREQMBX CSEQm_CIO_REG(8, 0x38) + +/* mode 0-7 */ +#define MnINTEN 0x3C +#define CMnINTEN(Mode) CSEQm_CIO_REG(Mode, 0x3C) + +#define EN_CMnRSPMBXF 0x01 + +/* mode 8 */ +#define CSEQRSPMBX CSEQm_CIO_REG(8, 0x3C) + +/* mode 8 */ +#define CSDMAADR CSEQm_CIO_REG(8, 0x40) + +/* mode 8 */ +#define CSDMACNT CSEQm_CIO_REG(8, 0x48) + +/* mode 8 */ +#define CSEQDLCTL CSEQm_CIO_REG(8, 0x4D) + +#define DONELISTEND 0x10 +#define DONELISTSIZE_MASK 0x0F +#define DONELISTSIZE_8ELEM 0x01 +#define DONELISTSIZE_16ELEM 0x02 +#define DONELISTSIZE_32ELEM 0x03 +#define DONELISTSIZE_64ELEM 0x04 +#define DONELISTSIZE_128ELEM 0x05 +#define DONELISTSIZE_256ELEM 0x06 +#define DONELISTSIZE_512ELEM 0x07 +#define DONELISTSIZE_1024ELEM 0x08 +#define DONELISTSIZE_2048ELEM 0x09 +#define DONELISTSIZE_4096ELEM 0x0A +#define DONELISTSIZE_8192ELEM 0x0B +#define DONELISTSIZE_16384ELEM 0x0C + +/* mode 8 */ +#define CSEQDLOFFS CSEQm_CIO_REG(8, 0x4E) + +/* mode 11 */ +#define CM11INTVEC0 CSEQm_CIO_REG(11, 0x50) + +/* 
mode 11 */ +#define CM11INTVEC1 CSEQm_CIO_REG(11, 0x52) + +/* mode 11 */ +#define CM11INTVEC2 CSEQm_CIO_REG(11, 0x54) + +#define CCONMSK (CSEQ_CIO_REG_BASE_ADR + 0x60) + +#define CCONEXIST (CSEQ_CIO_REG_BASE_ADR + 0x61) + +#define CCONMODE (CSEQ_CIO_REG_BASE_ADR + 0x62) + +#define CTIMERCALC (CSEQ_CIO_REG_BASE_ADR + 0x64) + +#define CINTDIS (CSEQ_CIO_REG_BASE_ADR + 0x68) + +/* mode 8, 32x32 bits, 128 bytes of mapped buffer */ +#define CSBUFFER CSEQm_CIO_REG(8, 0x80) + +#define CSCRATCH (CSEQ_CIO_REG_BASE_ADR + 0x1C0) + +/* mode 0-8 */ +#define CMnSCRATCH(Mode) CSEQm_CIO_REG(Mode, 0x1E0) + +/* + * CSEQ Mapped Instruction RAM Page, Address Range : (0x0000-0x1FFC) + */ +#define CSEQ_RAM_REG_BASE_ADR 0xB8004000 + +/* + * The common definitions below have the same address offset for all the Link + * sequencers. + */ +#define MODECTL 0x40 + +#define DBGMODE 0x44 + +#define CONTROL 0x48 +#define LEDTIMER 0x00010000 +#define LEDTIMERS_10us 0x00000000 +#define LEDTIMERS_1ms 0x00000800 +#define LEDTIMERS_100ms 0x00001000 +#define LEDMODE_TXRX 0x00000000 +#define LEDMODE_CONNECTED 0x00000200 +#define LEDPOL 0x00000100 + +#define LSEQRAM 0x1000 + +/* + * LmSEQ Host Registers, Address Range : (0x0000-0x3FFC) + */ +#define LSEQ0_HOST_REG_BASE_ADR 0xB8020000 +#define LSEQ1_HOST_REG_BASE_ADR 0xB8024000 +#define LSEQ2_HOST_REG_BASE_ADR 0xB8028000 +#define LSEQ3_HOST_REG_BASE_ADR 0xB802C000 +#define LSEQ4_HOST_REG_BASE_ADR 0xB8030000 +#define LSEQ5_HOST_REG_BASE_ADR 0xB8034000 +#define LSEQ6_HOST_REG_BASE_ADR 0xB8038000 +#define LSEQ7_HOST_REG_BASE_ADR 0xB803C000 + +#define LmARP2CTL(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + ARP2CTL) + +#define LmARP2INT(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + ARP2INT) + +#define LmARP2INTEN(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + ARP2INTEN) + +#define LmDBGMODE(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + DBGMODE) + +#define LmCONTROL(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + CONTROL) + +#define LmARP2BREAKADR01(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + ARP2BREAKADR01) + +#define LmARP2BREAKADR23(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + ARP2BREAKADR23) + +#define LmMODECTL(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + MODECTL) + +#define LmAUTODISCI 0x08000000 +#define LmDSBLBITLT 0x04000000 +#define LmDSBLANTT 0x02000000 +#define LmDSBLCRTT 0x01000000 +#define LmDSBLCONT 0x00000100 +#define LmPRIMODE 0x00000080 +#define LmDSBLHOLD 0x00000040 +#define LmDISACK 0x00000020 +#define LmBLIND48 0x00000010 +#define LmRCVMODE_MASK 0x0000000C +#define LmRCVMODE_PLD 0x00000000 +#define LmRCVMODE_HPC 0x00000004 + +#define LmDBGMODE(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \ + ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \ + DBGMODE) + +#define LmFRCPERR 0x80000000 +#define LmMEMSEL_MASK 0x30000000 +#define LmFRCRBPERR 0x00000000 +#define LmFRCTBPERR 0x10000000 +#define LmFRCSGBPERR 0x20000000 +#define LmFRCARBPERR 0x30000000 +#define LmRCVIDW 0x00080000 +#define LmINVDWERR 0x00040000 +#define LmRCVDISP 0x00004000 +#define LmDISPERR 0x00002000 +#define LmDSBLDSCR 0x00000800 +#define LmDSBLSCR 0x00000400 +#define LmFRCNAK 0x00000200 +#define LmFRCROFS 0x00000100 +#define LmFRCCRC 0x00000080 +#define LmFRMTYPE_MASK 0x00000070 +#define LmSG_DATA 0x00000000 +#define LmSG_COMMAND 0x00000010 +#define LmSG_TASK 0x00000020 
+#define LmSG_TGTXFER 0x00000030
+#define LmSG_RESPONSE 0x00000040
+#define LmSG_IDENADDR 0x00000050
+#define LmSG_OPENADDR 0x00000060
+#define LmDISCRCGEN 0x00000008
+#define LmDISCRCCHK 0x00000004
+#define LmSSXMTFRM 0x00000002
+#define LmSSRCVFRM 0x00000001
+
+#define LmCONTROL(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \
+ ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \
+ CONTROL)
+
+#define LmSTEPXMTFRM 0x00000002
+#define LmSTEPRCVFRM 0x00000001
+
+#define LmBISTCTL0(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \
+ ((LinkNum)*LmSEQ_HOST_REG_SIZE) + \
+ BISTCTL0)
+
+#define ARBBISTEN 0x40000000
+#define ARBBISTDN 0x20000000 /* ro */
+#define ARBBISTFAIL 0x10000000 /* ro */
+#define TBBISTEN 0x00000400
+#define TBBISTDN 0x00000200 /* ro */
+#define TBBISTFAIL 0x00000100 /* ro */
+#define RBBISTEN 0x00000040
+#define RBBISTDN 0x00000020 /* ro */
+#define RBBISTFAIL 0x00000010 /* ro */
+#define SGBISTEN 0x00000004
+#define SGBISTDN 0x00000002 /* ro */
+#define SGBISTFAIL 0x00000001 /* ro */
+
+#define LmBISTCTL1(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \
+ ((LinkNum)*LmSEQ_HOST_REG_SIZE) +\
+ BISTCTL1)
+
+#define LmRAMPAGE1 0x00000200
+#define LmRAMPAGE0 0x00000100
+#define LmIMEMBISTEN 0x00000040
+#define LmIMEMBISTDN 0x00000020 /* ro */
+#define LmIMEMBISTFAIL 0x00000010 /* ro */
+#define LmSCRBISTEN 0x00000004
+#define LmSCRBISTDN 0x00000002 /* ro */
+#define LmSCRBISTFAIL 0x00000001 /* ro */
+#define LmRAMPAGE (LmRAMPAGE1 + LmRAMPAGE0)
+#define LmRAMPAGE_LSHIFT 0x8
+
+#define LmSCRATCH(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \
+ ((LinkNum) * LmSEQ_HOST_REG_SIZE) +\
+ MAPPEDSCR)
+
+#define LmSEQRAM(LinkNum) (LSEQ0_HOST_REG_BASE_ADR + \
+ ((LinkNum) * LmSEQ_HOST_REG_SIZE) +\
+ LSEQRAM)
+
+/*
+ * LmSEQ CIO Bus Register, Address Range : (0x0000-0xFFC)
+ * 8 modes, each mode is 512 bytes.
+ * Unless specified, the register should be valid for all modes.
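+ */
+
+/*
+ * Illustrative sketch (not part of the original patch): the
+ * LmSEQ_PHY_REG() macro defined just below computes a CIO bus register
+ * address from three parts.  The open-coded helper here only makes the
+ * arithmetic explicit; LmSEQ_HOST_REG_SIZE and LmSEQ_MODE_PAGE_SIZE are
+ * defined elsewhere in the driver (the LSEQn base addresses and the
+ * mode page comment above imply 0x4000 and 0x200 respectively).
+ */
+static inline u32 lmseq_cio_reg_addr(u32 mode, u32 phy_id, u32 reg)
+{
+	return LSEQ0_HOST_REG_BASE_ADR +	/* host window of LSEQ0 */
+		phy_id * LmSEQ_HOST_REG_SIZE +	/* this phy's LSEQ window */
+		LmSEQ_CIOBUS_REG_BASE +		/* CIO bus region */
+		mode * LmSEQ_MODE_PAGE_SIZE +	/* 512-byte mode page */
+		reg;				/* offset within the page */
+}
+
+/*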
+ */ +#define LmSEQ_CIOBUS_REG_BASE 0x2000 + +#define LmSEQ_PHY_BASE(Mode, LinkNum) \ + (LSEQ0_HOST_REG_BASE_ADR + \ + (LmSEQ_HOST_REG_SIZE * (u32) (LinkNum)) + \ + LmSEQ_CIOBUS_REG_BASE + \ + ((u32) (Mode) * LmSEQ_MODE_PAGE_SIZE)) + +#define LmSEQ_PHY_REG(Mode, LinkNum, Reg) \ + (LmSEQ_PHY_BASE(Mode, LinkNum) + (u32) (Reg)) + +#define LmMODEPTR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, MODEPTR) + +#define LmALTMODE(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ALTMODE) + +#define LmATOMICXCHG(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ATOMICXCHG) + +#define LmFLAG(LinkNum) LmSEQ_PHY_REG(0, LinkNum, FLAG) + +#define LmARP2INTCTL(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ARP2INTCTL) + +#define LmSTACK(LinkNum) LmSEQ_PHY_REG(0, LinkNum, STACK) + +#define LmFUNCTION1(LinkNum) LmSEQ_PHY_REG(0, LinkNum, FUNCTION1) + +#define LmPRGMCNT(LinkNum) LmSEQ_PHY_REG(0, LinkNum, PRGMCNT) + +#define LmACCUM(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ACCUM) + +#define LmSINDEX(LinkNum) LmSEQ_PHY_REG(0, LinkNum, SINDEX) + +#define LmDINDEX(LinkNum) LmSEQ_PHY_REG(0, LinkNum, DINDEX) + +#define LmALLONES(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ALLONES) + +#define LmALLZEROS(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ALLZEROS) + +#define LmSINDIR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, SINDIR) + +#define LmDINDIR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, DINDIR) + +#define LmJUMLDIR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, JUMLDIR) + +#define LmARP2HALTCODE(LinkNum) LmSEQ_PHY_REG(0, LinkNum, ARP2HALTCODE) + +#define LmCURRADDR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, CURRADDR) + +#define LmLASTADDR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, LASTADDR) + +#define LmNXTLADDR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, NXTLADDR) + +#define LmDBGPORTPTR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, DBGPORTPTR) + +#define LmDBGPORT(LinkNum) LmSEQ_PHY_REG(0, LinkNum, DBGPORT) + +#define LmSCRATCHPAGE(LinkNum) LmSEQ_PHY_REG(0, LinkNum, SCRATCHPAGE) + +#define LmMnSCRATCHPAGE(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, \ + MnSCRATCHPAGE) + +#define LmTIMERCALC(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x28) + +#define LmREQMBX(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x30) + +#define LmRSPMBX(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x34) + +#define LmMnINT(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x38) + +#define CTXMEMSIZE 0x80000000 /* ro */ +#define LmACKREQ 0x08000000 +#define LmNAKREQ 0x04000000 +#define LmMnXMTERR 0x02000000 +#define LmM5OOBSVC 0x01000000 +#define LmHWTINT 0x00800000 +#define LmMnCTXDONE 0x00100000 +#define LmM2REQMBXF 0x00080000 +#define LmM2RSPMBXE 0x00040000 +#define LmMnDMAERR 0x00020000 +#define LmRCVPRIM 0x00010000 +#define LmRCVERR 0x00008000 +#define LmADDRRCV 0x00004000 +#define LmMnHDRMISS 0x00002000 +#define LmMnWAITSCB 0x00001000 +#define LmMnRLSSCB 0x00000800 +#define LmMnSAVECTX 0x00000400 +#define LmMnFETCHSG 0x00000200 +#define LmMnLOADCTX 0x00000100 +#define LmMnCFGICL 0x00000080 +#define LmMnCFGSATA 0x00000040 +#define LmMnCFGEXPSATA 0x00000020 +#define LmMnCFGCMPLT 0x00000010 +#define LmMnCFGRBUF 0x00000008 +#define LmMnSAVETTR 0x00000004 +#define LmMnCFGRDAT 0x00000002 +#define LmMnCFGHDR 0x00000001 + +#define LmMnINTEN(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x3C) + +#define EN_LmACKREQ 0x08000000 +#define EN_LmNAKREQ 0x04000000 +#define EN_LmMnXMTERR 0x02000000 +#define EN_LmM5OOBSVC 0x01000000 +#define EN_LmHWTINT 0x00800000 +#define EN_LmMnCTXDONE 0x00100000 +#define EN_LmM2REQMBXF 0x00080000 +#define EN_LmM2RSPMBXE 0x00040000 +#define EN_LmMnDMAERR 0x00020000 +#define EN_LmRCVPRIM 0x00010000 +#define EN_LmRCVERR 0x00008000 +#define EN_LmADDRRCV 0x00004000 +#define EN_LmMnHDRMISS 
0x00002000 +#define EN_LmMnWAITSCB 0x00001000 +#define EN_LmMnRLSSCB 0x00000800 +#define EN_LmMnSAVECTX 0x00000400 +#define EN_LmMnFETCHSG 0x00000200 +#define EN_LmMnLOADCTX 0x00000100 +#define EN_LmMnCFGICL 0x00000080 +#define EN_LmMnCFGSATA 0x00000040 +#define EN_LmMnCFGEXPSATA 0x00000020 +#define EN_LmMnCFGCMPLT 0x00000010 +#define EN_LmMnCFGRBUF 0x00000008 +#define EN_LmMnSAVETTR 0x00000004 +#define EN_LmMnCFGRDAT 0x00000002 +#define EN_LmMnCFGHDR 0x00000001 + +#define LmM0INTEN_MASK (EN_LmMnCFGCMPLT | EN_LmMnCFGRBUF | \ + EN_LmMnSAVETTR | EN_LmMnCFGRDAT | \ + EN_LmMnCFGHDR | EN_LmRCVERR | \ + EN_LmADDRRCV | EN_LmMnHDRMISS | \ + EN_LmMnRLSSCB | EN_LmMnSAVECTX | \ + EN_LmMnFETCHSG | EN_LmMnLOADCTX | \ + EN_LmHWTINT | EN_LmMnCTXDONE | \ + EN_LmRCVPRIM | EN_LmMnCFGSATA | \ + EN_LmMnCFGEXPSATA | EN_LmMnDMAERR) + +#define LmM1INTEN_MASK (EN_LmMnCFGCMPLT | EN_LmADDRRCV | \ + EN_LmMnRLSSCB | EN_LmMnSAVECTX | \ + EN_LmMnFETCHSG | EN_LmMnLOADCTX | \ + EN_LmMnXMTERR | EN_LmHWTINT | \ + EN_LmMnCTXDONE | EN_LmRCVPRIM | \ + EN_LmRCVERR | EN_LmMnDMAERR) + +#define LmM2INTEN_MASK (EN_LmADDRRCV | EN_LmHWTINT | \ + EN_LmM2REQMBXF | EN_LmRCVPRIM | \ + EN_LmRCVERR) + +#define LmM5INTEN_MASK (EN_LmADDRRCV | EN_LmM5OOBSVC | \ + EN_LmHWTINT | EN_LmRCVPRIM | \ + EN_LmRCVERR) + +#define LmXMTPRIMD(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x40) + +#define LmXMTPRIMCS(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x44) + +#define LmCONSTAT(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x45) + +#define LmMnDMAERRS(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x46) + +#define LmMnSGDMAERRS(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x47) + +#define LmM0EXPHDRP(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x48) + +#define LmM1SASALIGN(LinkNum) LmSEQ_PHY_REG(1, LinkNum, 0x48) +#define SAS_ALIGN_DEFAULT 0xFF + +#define LmM0MSKHDRP(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x49) + +#define LmM1STPALIGN(LinkNum) LmSEQ_PHY_REG(1, LinkNum, 0x49) +#define STP_ALIGN_DEFAULT 0x1F + +#define LmM0RCVHDRP(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x4A) + +#define LmM1XMTHDRP(LinkNum) LmSEQ_PHY_REG(1, LinkNum, 0x4A) + +#define LmM0ICLADR(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x4B) + +#define LmM1ALIGNMODE(LinkNum) LmSEQ_PHY_REG(1, LinkNum, 0x4B) + +#define LmDISALIGN 0x20 +#define LmROTSTPALIGN 0x10 +#define LmSTPALIGN 0x08 +#define LmROTNOTIFY 0x04 +#define LmDUALALIGN 0x02 +#define LmROTALIGN 0x01 + +#define LmM0EXPRCVNT(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x4C) + +#define LmM1XMTCNT(LinkNum) LmSEQ_PHY_REG(1, LinkNum, 0x4C) + +#define LmMnBUFSTAT(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x4E) + +#define LmMnBUFPERR 0x01 + +/* mode 0-1 */ +#define LmMnXFRLVL(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x59) + +#define LmMnXFRLVL_128 0x05 +#define LmMnXFRLVL_256 0x04 +#define LmMnXFRLVL_512 0x03 +#define LmMnXFRLVL_1024 0x02 +#define LmMnXFRLVL_1536 0x01 +#define LmMnXFRLVL_2048 0x00 + + /* mode 0-1 */ +#define LmMnSGDMACTL(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x5A) + +#define LmMnRESETSG 0x04 +#define LmMnSTOPSG 0x02 +#define LmMnSTARTSG 0x01 + +/* mode 0-1 */ +#define LmMnSGDMASTAT(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x5B) + +/* mode 0-1 */ +#define LmMnDDMACTL(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x5C) + +#define LmMnFLUSH 0x40 /* wo */ +#define LmMnRLSRTRY 0x20 /* wo */ +#define LmMnDISCARD 0x10 /* wo */ +#define LmMnRESETDAT 0x08 /* wo */ +#define LmMnSUSDAT 0x04 /* wo */ +#define LmMnSTOPDAT 0x02 /* wo */ +#define LmMnSTARTDAT 0x01 /* wo */ + +/* mode 0-1 */ +#define LmMnDDMASTAT(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x5D) + +#define LmMnDPEMPTY 
0x80 +#define LmMnFLUSHING 0x40 +#define LmMnDDMAREQ 0x20 +#define LmMnHDMAREQ 0x10 +#define LmMnDATFREE 0x08 +#define LmMnDATSUS 0x04 +#define LmMnDATACT 0x02 +#define LmMnDATEN 0x01 + +/* mode 0-1 */ +#define LmMnDDMAMODE(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x5E) + +#define LmMnDMATYPE_NORMAL 0x0000 +#define LmMnDMATYPE_HOST_ONLY_TX 0x0001 +#define LmMnDMATYPE_DEVICE_ONLY_TX 0x0002 +#define LmMnDMATYPE_INVALID 0x0003 +#define LmMnDMATYPE_MASK 0x0003 + +#define LmMnDMAWRAP 0x0004 +#define LmMnBITBUCKET 0x0008 +#define LmMnDISHDR 0x0010 +#define LmMnSTPCRC 0x0020 +#define LmXTEST 0x0040 +#define LmMnDISCRC 0x0080 +#define LmMnENINTLK 0x0100 +#define LmMnADDRFRM 0x0400 +#define LmMnENXMTCRC 0x0800 + +/* mode 0-1 */ +#define LmMnXFRCNT(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x70) + +/* mode 0-1 */ +#define LmMnDPSEL(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x7B) +#define LmMnDPSEL_MASK 0x07 +#define LmMnEOLPRE 0x40 +#define LmMnEOSPRE 0x80 + +/* Registers used in conjunction with LmMnDPSEL and LmMnDPACC registers */ +/* Receive Mode n = 0 */ +#define LmMnHRADDR 0x00 +#define LmMnHBYTECNT 0x01 +#define LmMnHREWIND 0x02 +#define LmMnDWADDR 0x03 +#define LmMnDSPACECNT 0x04 +#define LmMnDFRMSIZE 0x05 + +/* Registers used in conjunction with LmMnDPSEL and LmMnDPACC registers */ +/* Transmit Mode n = 1 */ +#define LmMnHWADDR 0x00 +#define LmMnHSPACECNT 0x01 +/* #define LmMnHREWIND 0x02 */ +#define LmMnDRADDR 0x03 +#define LmMnDBYTECNT 0x04 +/* #define LmMnDFRMSIZE 0x05 */ + +/* mode 0-1 */ +#define LmMnDPACC(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x78) +#define LmMnDPACC_MASK 0x00FFFFFF + +/* mode 0-1 */ +#define LmMnHOLDLVL(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x7D) + +#define LmPRMSTAT0(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x80) +#define LmPRMSTAT0BYTE0 0x80 +#define LmPRMSTAT0BYTE1 0x81 +#define LmPRMSTAT0BYTE2 0x82 +#define LmPRMSTAT0BYTE3 0x83 + +#define LmFRAMERCVD 0x80000000 +#define LmXFRRDYRCVD 0x40000000 +#define LmUNKNOWNP 0x20000000 +#define LmBREAK 0x10000000 +#define LmDONE 0x08000000 +#define LmOPENACPT 0x04000000 +#define LmOPENRJCT 0x02000000 +#define LmOPENRTRY 0x01000000 +#define LmCLOSERV1 0x00800000 +#define LmCLOSERV0 0x00400000 +#define LmCLOSENORM 0x00200000 +#define LmCLOSECLAF 0x00100000 +#define LmNOTIFYRV2 0x00080000 +#define LmNOTIFYRV1 0x00040000 +#define LmNOTIFYRV0 0x00020000 +#define LmNOTIFYSPIN 0x00010000 +#define LmBROADRV4 0x00008000 +#define LmBROADRV3 0x00004000 +#define LmBROADRV2 0x00002000 +#define LmBROADRV1 0x00001000 +#define LmBROADSES 0x00000800 +#define LmBROADRVCH1 0x00000400 +#define LmBROADRVCH0 0x00000200 +#define LmBROADCH 0x00000100 +#define LmAIPRVWP 0x00000080 +#define LmAIPWP 0x00000040 +#define LmAIPWD 0x00000020 +#define LmAIPWC 0x00000010 +#define LmAIPRV2 0x00000008 +#define LmAIPRV1 0x00000004 +#define LmAIPRV0 0x00000002 +#define LmAIPNRML 0x00000001 + +#define LmBROADCAST_MASK (LmBROADCH | LmBROADRVCH0 | \ + LmBROADRVCH1) + +#define LmPRMSTAT1(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0x84) +#define LmPRMSTAT1BYTE0 0x84 +#define LmPRMSTAT1BYTE1 0x85 +#define LmPRMSTAT1BYTE2 0x86 +#define LmPRMSTAT1BYTE3 0x87 + +#define LmFRMRCVDSTAT 0x80000000 +#define LmBREAK_DET 0x04000000 +#define LmCLOSE_DET 0x02000000 +#define LmDONE_DET 0x01000000 +#define LmXRDY 0x00040000 +#define LmSYNCSRST 0x00020000 +#define LmSYNC 0x00010000 +#define LmXHOLD 0x00008000 +#define LmRRDY 0x00004000 +#define LmHOLD 0x00002000 +#define LmROK 0x00001000 +#define LmRIP 0x00000800 +#define LmCRBLK 0x00000400 +#define LmACK 0x00000200 +#define 
LmNAK 0x00000100 +#define LmHARDRST 0x00000080 +#define LmERROR 0x00000040 +#define LmRERR 0x00000020 +#define LmPMREQP 0x00000010 +#define LmPMREQS 0x00000008 +#define LmPMACK 0x00000004 +#define LmPMNAK 0x00000002 +#define LmDMAT 0x00000001 + +/* mode 1 */ +#define LmMnSATAFS(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x7E) +#define LmMnXMTSIZE(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0x93) + +/* mode 0 */ +#define LmMnFRMERR(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0xB0) + +#define LmACRCERR 0x00000800 +#define LmPHYOVRN 0x00000400 +#define LmOBOVRN 0x00000200 +#define LmMnZERODATA 0x00000100 +#define LmSATAINTLK 0x00000080 +#define LmMnCRCERR 0x00000020 +#define LmRRDYOVRN 0x00000010 +#define LmMISSSOAF 0x00000008 +#define LmMISSSOF 0x00000004 +#define LmMISSEOAF 0x00000002 +#define LmMISSEOF 0x00000001 + +#define LmFRMERREN(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xB4) + +#define EN_LmACRCERR 0x00000800 +#define EN_LmPHYOVRN 0x00000400 +#define EN_LmOBOVRN 0x00000200 +#define EN_LmMnZERODATA 0x00000100 +#define EN_LmSATAINTLK 0x00000080 +#define EN_LmFRMBAD 0x00000040 +#define EN_LmMnCRCERR 0x00000020 +#define EN_LmRRDYOVRN 0x00000010 +#define EN_LmMISSSOAF 0x00000008 +#define EN_LmMISSSOF 0x00000004 +#define EN_LmMISSEOAF 0x00000002 +#define EN_LmMISSEOF 0x00000001 + +#define LmFRMERREN_MASK (EN_LmSATAINTLK | EN_LmMnCRCERR | \ + EN_LmRRDYOVRN | EN_LmMISSSOF | \ + EN_LmMISSEOAF | EN_LmMISSEOF | \ + EN_LmACRCERR | LmPHYOVRN | \ + EN_LmOBOVRN | EN_LmMnZERODATA) + +#define LmHWTSTATEN(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xC5) + +#define EN_LmDONETO 0x80 +#define EN_LmINVDISP 0x40 +#define EN_LmINVDW 0x20 +#define EN_LmDWSEVENT 0x08 +#define EN_LmCRTTTO 0x04 +#define EN_LmANTTTO 0x02 +#define EN_LmBITLTTO 0x01 + +#define LmHWTSTATEN_MASK (EN_LmINVDISP | EN_LmINVDW | \ + EN_LmDWSEVENT | EN_LmCRTTTO | \ + EN_LmANTTTO | EN_LmDONETO | \ + EN_LmBITLTTO) + +#define LmHWTSTAT(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xC7) + +#define LmDONETO 0x80 +#define LmINVDISP 0x40 +#define LmINVDW 0x20 +#define LmDWSEVENT 0x08 +#define LmCRTTTO 0x04 +#define LmANTTTO 0x02 +#define LmBITLTTO 0x01 + +#define LmMnDATABUFADR(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0xC8) +#define LmDATABUFADR_MASK 0x0FFF + +#define LmMnDATABUF(LinkNum, Mode) LmSEQ_PHY_REG(Mode, LinkNum, 0xCA) + +#define LmPRIMSTAT0EN(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xE0) + +#define EN_LmUNKNOWNP 0x20000000 +#define EN_LmBREAK 0x10000000 +#define EN_LmDONE 0x08000000 +#define EN_LmOPENACPT 0x04000000 +#define EN_LmOPENRJCT 0x02000000 +#define EN_LmOPENRTRY 0x01000000 +#define EN_LmCLOSERV1 0x00800000 +#define EN_LmCLOSERV0 0x00400000 +#define EN_LmCLOSENORM 0x00200000 +#define EN_LmCLOSECLAF 0x00100000 +#define EN_LmNOTIFYRV2 0x00080000 +#define EN_LmNOTIFYRV1 0x00040000 +#define EN_LmNOTIFYRV0 0x00020000 +#define EN_LmNOTIFYSPIN 0x00010000 +#define EN_LmBROADRV4 0x00008000 +#define EN_LmBROADRV3 0x00004000 +#define EN_LmBROADRV2 0x00002000 +#define EN_LmBROADRV1 0x00001000 +#define EN_LmBROADRV0 0x00000800 +#define EN_LmBROADRVCH1 0x00000400 +#define EN_LmBROADRVCH0 0x00000200 +#define EN_LmBROADCH 0x00000100 +#define EN_LmAIPRVWP 0x00000080 +#define EN_LmAIPWP 0x00000040 +#define EN_LmAIPWD 0x00000020 +#define EN_LmAIPWC 0x00000010 +#define EN_LmAIPRV2 0x00000008 +#define EN_LmAIPRV1 0x00000004 +#define EN_LmAIPRV0 0x00000002 +#define EN_LmAIPNRML 0x00000001 + +#define LmPRIMSTAT0EN_MASK (EN_LmBREAK | \ + EN_LmDONE | EN_LmOPENACPT | \ + EN_LmOPENRJCT | EN_LmOPENRTRY | \ + EN_LmCLOSERV1 | EN_LmCLOSERV0 | \ + EN_LmCLOSENORM | EN_LmCLOSECLAF | 
\ + EN_LmBROADRV4 | EN_LmBROADRV3 | \ + EN_LmBROADRV2 | EN_LmBROADRV1 | \ + EN_LmBROADRV0 | EN_LmBROADRVCH1 | \ + EN_LmBROADRVCH0 | EN_LmBROADCH | \ + EN_LmAIPRVWP | EN_LmAIPWP | \ + EN_LmAIPWD | EN_LmAIPWC | \ + EN_LmAIPRV2 | EN_LmAIPRV1 | \ + EN_LmAIPRV0 | EN_LmAIPNRML) + +#define LmPRIMSTAT1EN(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xE4) + +#define EN_LmXRDY 0x00040000 +#define EN_LmSYNCSRST 0x00020000 +#define EN_LmSYNC 0x00010000 +#define EN_LmXHOLD 0x00008000 +#define EN_LmRRDY 0x00004000 +#define EN_LmHOLD 0x00002000 +#define EN_LmROK 0x00001000 +#define EN_LmRIP 0x00000800 +#define EN_LmCRBLK 0x00000400 +#define EN_LmACK 0x00000200 +#define EN_LmNAK 0x00000100 +#define EN_LmHARDRST 0x00000080 +#define EN_LmERROR 0x00000040 +#define EN_LmRERR 0x00000020 +#define EN_LmPMREQP 0x00000010 +#define EN_LmPMREQS 0x00000008 +#define EN_LmPMACK 0x00000004 +#define EN_LmPMNAK 0x00000002 +#define EN_LmDMAT 0x00000001 + +#define LmPRIMSTAT1EN_MASK (EN_LmHARDRST | \ + EN_LmSYNCSRST | \ + EN_LmPMREQP | EN_LmPMREQS | \ + EN_LmPMACK | EN_LmPMNAK) + +#define LmSMSTATE(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xE8) + +#define LmSMSTATEBRK(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xEC) + +#define LmSMDBGCTL(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xF0) + + +/* + * LmSEQ CIO Bus Mode 3 Register. + * Mode 3: Configuration and Setup, IOP Context SCB. + */ +#define LmM3SATATIMER(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x48) + +#define LmM3INTVEC0(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x90) + +#define LmM3INTVEC1(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x92) + +#define LmM3INTVEC2(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x94) + +#define LmM3INTVEC3(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x96) + +#define LmM3INTVEC4(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x98) + +#define LmM3INTVEC5(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x9A) + +#define LmM3INTVEC6(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x9C) + +#define LmM3INTVEC7(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0x9E) + +#define LmM3INTVEC8(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0xA4) + +#define LmM3INTVEC9(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0xA6) + +#define LmM3INTVEC10(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0xB0) + +#define LmM3FRMGAP(LinkNum) LmSEQ_PHY_REG(3, LinkNum, 0xB4) + +#define LmBITL_TIMER(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xA2) + +#define LmWWN(LinkNum) LmSEQ_PHY_REG(0, LinkNum, 0xA8) + + +/* + * LmSEQ CIO Bus Mode 5 Registers. + * Mode 5: Phy/OOB Control and Status. 
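+ */
+
+/*
+ * Illustrative sketch (not part of the original patch): one way the
+ * CURRENT_STATUS bits defined in this section might be classified.  It
+ * assumes the driver's asd_read_reg_byte() helper and struct
+ * asd_ha_struct; in real code this would live in a .c file that
+ * includes this header.
+ */
+static inline int asd_oob_event(struct asd_ha_struct *asd_ha, u8 phy_id)
+{
+	u8 status = asd_read_reg_byte(asd_ha,
+				      LmSEQ_OOB_REG(phy_id, CURRENT_STATUS));
+
+	if ((status & DEVICE_ADDED_W_CNT) == DEVICE_ADDED_W_CNT)
+		return 2;	/* device added, hot-plug connect seen */
+	if ((status & DEVICE_ADDED_WO_CNT) == DEVICE_ADDED_WO_CNT)
+		return 1;	/* device added */
+	if (status & DEVICE_REMOVED)
+		return -1;	/* loss of signal, device removed */
+	return 0;		/* nothing of interest */
+}
+
+/*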
+ */ +#define LmSEQ_OOB_REG(phy_id, reg) LmSEQ_PHY_REG(5, (phy_id), (reg)) + +#define OOB_BFLTR 0x100 + +#define BFLTR_THR_MASK 0xF0 +#define BFLTR_TC_MASK 0x0F + +#define OOB_INIT_MIN 0x102 + +#define OOB_INIT_MAX 0x104 + +#define OOB_INIT_NEG 0x106 + +#define OOB_SAS_MIN 0x108 + +#define OOB_SAS_MAX 0x10A + +#define OOB_SAS_NEG 0x10C + +#define OOB_WAKE_MIN 0x10E + +#define OOB_WAKE_MAX 0x110 + +#define OOB_WAKE_NEG 0x112 + +#define OOB_IDLE_MAX 0x114 + +#define OOB_BURST_MAX 0x116 + +#define OOB_DATA_KBITS 0x126 + +#define OOB_ALIGN_0_DATA 0x12C + +#define OOB_ALIGN_1_DATA 0x130 + +#define D10_2_DATA_k 0x00 +#define SYNC_DATA_k 0x02 +#define ALIGN_1_DATA_k 0x04 +#define ALIGN_0_DATA_k 0x08 +#define BURST_DATA_k 0x10 + +#define OOB_PHY_RESET_COUNT 0x13C + +#define OOB_SIG_GEN 0x140 + +#define START_OOB 0x80 +#define START_DWS 0x40 +#define ALIGN_CNT3 0x30 +#define ALIGN_CNT2 0x20 +#define ALIGN_CNT1 0x10 +#define ALIGN_CNT4 0x00 +#define STOP_DWS 0x08 +#define SEND_COMSAS 0x04 +#define SEND_COMINIT 0x02 +#define SEND_COMWAKE 0x01 + +#define OOB_XMIT 0x141 + +#define TX_ENABLE 0x80 +#define XMIT_OOB_BURST 0x10 +#define XMIT_D10_2 0x08 +#define XMIT_SYNC 0x04 +#define XMIT_ALIGN_1 0x02 +#define XMIT_ALIGN_0 0x01 + +#define FUNCTION_MASK 0x142 + +#define SAS_MODE_DIS 0x80 +#define SATA_MODE_DIS 0x40 +#define SPINUP_HOLD_DIS 0x20 +#define HOT_PLUG_DIS 0x10 +#define SATA_PS_DIS 0x08 +#define FUNCTION_MASK_DEFAULT (SPINUP_HOLD_DIS | SATA_PS_DIS) + +#define OOB_MODE 0x143 + +#define SAS_MODE 0x80 +#define SATA_MODE 0x40 +#define SLOW_CLK 0x20 +#define FORCE_XMIT_15 0x08 +#define PHY_SPEED_60 0x04 +#define PHY_SPEED_30 0x02 +#define PHY_SPEED_15 0x01 + +#define CURRENT_STATUS 0x144 + +#define CURRENT_OOB_DONE 0x80 +#define CURRENT_LOSS_OF_SIGNAL 0x40 +#define CURRENT_SPINUP_HOLD 0x20 +#define CURRENT_HOT_PLUG_CNCT 0x10 +#define CURRENT_GTO_TIMEOUT 0x08 +#define CURRENT_OOB_TIMEOUT 0x04 +#define CURRENT_DEVICE_PRESENT 0x02 +#define CURRENT_OOB_ERROR 0x01 + +#define CURRENT_OOB1_ERROR (CURRENT_HOT_PLUG_CNCT | \ + CURRENT_GTO_TIMEOUT) + +#define CURRENT_OOB2_ERROR (CURRENT_HOT_PLUG_CNCT | \ + CURRENT_OOB_ERROR) + +#define DEVICE_ADDED_W_CNT (CURRENT_OOB_DONE | \ + CURRENT_HOT_PLUG_CNCT | \ + CURRENT_DEVICE_PRESENT) + +#define DEVICE_ADDED_WO_CNT (CURRENT_OOB_DONE | \ + CURRENT_DEVICE_PRESENT) + +#define DEVICE_REMOVED CURRENT_LOSS_OF_SIGNAL + +#define CURRENT_PHY_MASK (CURRENT_OOB_DONE | \ + CURRENT_LOSS_OF_SIGNAL | \ + CURRENT_SPINUP_HOLD | \ + CURRENT_HOT_PLUG_CNCT | \ + CURRENT_GTO_TIMEOUT | \ + CURRENT_DEVICE_PRESENT | \ + CURRENT_OOB_ERROR ) + +#define CURRENT_ERR_MASK (CURRENT_LOSS_OF_SIGNAL | \ + CURRENT_GTO_TIMEOUT | \ + CURRENT_OOB_TIMEOUT | \ + CURRENT_OOB_ERROR ) + +#define SPEED_MASK 0x145 + +#define SATA_SPEED_30_DIS 0x10 +#define SATA_SPEED_15_DIS 0x08 +#define SAS_SPEED_60_DIS 0x04 +#define SAS_SPEED_30_DIS 0x02 +#define SAS_SPEED_15_DIS 0x01 +#define SAS_SPEED_MASK_DEFAULT 0x00 + +#define OOB_TIMER_ENABLE 0x14D + +#define HOT_PLUG_EN 0x80 +#define RCD_EN 0x40 +#define COMTIMER_EN 0x20 +#define SNTT_EN 0x10 +#define SNLT_EN 0x04 +#define SNWT_EN 0x02 +#define ALIGN_EN 0x01 + +#define OOB_STATUS 0x14E + +#define OOB_DONE 0x80 +#define LOSS_OF_SIGNAL 0x40 /* ro */ +#define SPINUP_HOLD 0x20 +#define HOT_PLUG_CNCT 0x10 /* ro */ +#define GTO_TIMEOUT 0x08 /* ro */ +#define OOB_TIMEOUT 0x04 /* ro */ +#define DEVICE_PRESENT 0x02 /* ro */ +#define OOB_ERROR 0x01 /* ro */ + +#define OOB_STATUS_ERROR_MASK (LOSS_OF_SIGNAL | GTO_TIMEOUT | \ + OOB_TIMEOUT | OOB_ERROR) + +#define OOB_STATUS_CLEAR 
0x14F + +#define OOB_DONE_CLR 0x80 +#define LOSS_OF_SIGNAL_CLR 0x40 +#define SPINUP_HOLD_CLR 0x20 +#define HOT_PLUG_CNCT_CLR 0x10 +#define GTO_TIMEOUT_CLR 0x08 +#define OOB_TIMEOUT_CLR 0x04 +#define OOB_ERROR_CLR 0x01 + +#define HOT_PLUG_DELAY 0x150 +/* In 5 ms units. 20 = 100 ms. */ +#define HOTPLUG_DELAY_TIMEOUT 20 + + +#define INT_ENABLE_2 0x15A + +#define OOB_DONE_EN 0x80 +#define LOSS_OF_SIGNAL_EN 0x40 +#define SPINUP_HOLD_EN 0x20 +#define HOT_PLUG_CNCT_EN 0x10 +#define GTO_TIMEOUT_EN 0x08 +#define OOB_TIMEOUT_EN 0x04 +#define DEVICE_PRESENT_EN 0x02 +#define OOB_ERROR_EN 0x01 + +#define PHY_CONTROL_0 0x160 + +#define PHY_LOWPWREN_TX 0x80 +#define PHY_LOWPWREN_RX 0x40 +#define SPARE_REG_160_B5 0x20 +#define OFFSET_CANCEL_RX 0x10 + +/* bits 3:2 */ +#define PHY_RXCOMCENTER_60V 0x00 +#define PHY_RXCOMCENTER_70V 0x04 +#define PHY_RXCOMCENTER_80V 0x08 +#define PHY_RXCOMCENTER_90V 0x0C +#define PHY_RXCOMCENTER_MASK 0x0C + +#define PHY_RESET 0x02 +#define SAS_DEFAULT_SEL 0x01 + +#define PHY_CONTROL_1 0x161 + +/* bits 2:0 */ +#define SATA_PHY_DETLEVEL_50mv 0x00 +#define SATA_PHY_DETLEVEL_75mv 0x01 +#define SATA_PHY_DETLEVEL_100mv 0x02 +#define SATA_PHY_DETLEVEL_125mv 0x03 +#define SATA_PHY_DETLEVEL_150mv 0x04 +#define SATA_PHY_DETLEVEL_175mv 0x05 +#define SATA_PHY_DETLEVEL_200mv 0x06 +#define SATA_PHY_DETLEVEL_225mv 0x07 +#define SATA_PHY_DETLEVEL_MASK 0x07 + +/* bits 5:3 */ +#define SAS_PHY_DETLEVEL_50mv 0x00 +#define SAS_PHY_DETLEVEL_75mv 0x08 +#define SAS_PHY_DETLEVEL_100mv 0x10 +#define SAS_PHY_DETLEVEL_125mv 0x11 +#define SAS_PHY_DETLEVEL_150mv 0x20 +#define SAS_PHY_DETLEVEL_175mv 0x21 +#define SAS_PHY_DETLEVEL_200mv 0x30 +#define SAS_PHY_DETLEVEL_225mv 0x31 +#define SAS_PHY_DETLEVEL_MASK 0x38 + +#define PHY_CONTROL_2 0x162 + +/* bits 7:5 */ +#define SATA_PHY_DRV_400mv 0x00 +#define SATA_PHY_DRV_450mv 0x20 +#define SATA_PHY_DRV_500mv 0x40 +#define SATA_PHY_DRV_550mv 0x60 +#define SATA_PHY_DRV_600mv 0x80 +#define SATA_PHY_DRV_650mv 0xA0 +#define SATA_PHY_DRV_725mv 0xC0 +#define SATA_PHY_DRV_800mv 0xE0 +#define SATA_PHY_DRV_MASK 0xE0 + +/* bits 4:3 */ +#define SATA_PREEMP_0 0x00 +#define SATA_PREEMP_1 0x08 +#define SATA_PREEMP_2 0x10 +#define SATA_PREEMP_3 0x18 +#define SATA_PREEMP_MASK 0x18 + +#define SATA_CMSH1P5 0x04 + +/* bits 1:0 */ +#define SATA_SLEW_0 0x00 +#define SATA_SLEW_1 0x01 +#define SATA_SLEW_2 0x02 +#define SATA_SLEW_3 0x03 +#define SATA_SLEW_MASK 0x03 + +#define PHY_CONTROL_3 0x163 + +/* bits 7:5 */ +#define SAS_PHY_DRV_400mv 0x00 +#define SAS_PHY_DRV_450mv 0x20 +#define SAS_PHY_DRV_500mv 0x40 +#define SAS_PHY_DRV_550mv 0x60 +#define SAS_PHY_DRV_600mv 0x80 +#define SAS_PHY_DRV_650mv 0xA0 +#define SAS_PHY_DRV_725mv 0xC0 +#define SAS_PHY_DRV_800mv 0xE0 +#define SAS_PHY_DRV_MASK 0xE0 + +/* bits 4:3 */ +#define SAS_PREEMP_0 0x00 +#define SAS_PREEMP_1 0x08 +#define SAS_PREEMP_2 0x10 +#define SAS_PREEMP_3 0x18 +#define SAS_PREEMP_MASK 0x18 + +#define SAS_CMSH1P5 0x04 + +/* bits 1:0 */ +#define SAS_SLEW_0 0x00 +#define SAS_SLEW_1 0x01 +#define SAS_SLEW_2 0x02 +#define SAS_SLEW_3 0x03 +#define SAS_SLEW_MASK 0x03 + +#define PHY_CONTROL_4 0x168 + +#define PHY_DONE_CAL_TX 0x80 +#define PHY_DONE_CAL_RX 0x40 +#define RX_TERM_LOAD_DIS 0x20 +#define TX_TERM_LOAD_DIS 0x10 +#define AUTO_TERM_CAL_DIS 0x08 +#define PHY_SIGDET_FLTR_EN 0x04 +#define OSC_FREQ 0x02 +#define PHY_START_CAL 0x01 + +/* + * HST_PCIX2 Registers, Addresss Range: (0x00-0xFC) + */ +#define PCIX_REG_BASE_ADR 0xB8040000 + +#define PCIC_VENDOR_ID 0x00 + +#define PCIC_DEVICE_ID 0x02 + +#define PCIC_COMMAND 0x04 + +#define 
INT_DIS 0x0400 +#define FBB_EN 0x0200 /* ro */ +#define SERR_EN 0x0100 +#define STEP_EN 0x0080 /* ro */ +#define PERR_EN 0x0040 +#define VGA_EN 0x0020 /* ro */ +#define MWI_EN 0x0010 +#define SPC_EN 0x0008 +#define MST_EN 0x0004 +#define MEM_EN 0x0002 +#define IO_EN 0x0001 + +#define PCIC_STATUS 0x06 + +#define PERR_DET 0x8000 +#define SERR_GEN 0x4000 +#define MABT_DET 0x2000 +#define TABT_DET 0x1000 +#define TABT_GEN 0x0800 +#define DPERR_DET 0x0100 +#define CAP_LIST 0x0010 +#define INT_STAT 0x0008 + +#define PCIC_DEVREV_ID 0x08 + +#define PCIC_CLASS_CODE 0x09 + +#define PCIC_CACHELINE_SIZE 0x0C + +#define PCIC_MBAR0 0x10 + +#define PCIC_MBAR0_OFFSET 0 + +#define PCIC_MBAR1 0x18 + +#define PCIC_MBAR1_OFFSET 2 + +#define PCIC_IOBAR 0x20 + +#define PCIC_IOBAR_OFFSET 4 + +#define PCIC_SUBVENDOR_ID 0x2C + +#define PCIC_SUBSYTEM_ID 0x2E + +#define PCIX_STATUS 0x44 +#define RCV_SCE 0x20000000 +#define UNEXP_SC 0x00080000 +#define SC_DISCARD 0x00040000 + +#define ECC_CTRL_STAT 0x48 +#define UNCOR_ECCERR 0x00000008 + +#define PCIC_PM_CSR 0x5C + +#define PWR_STATE_D0 0 +#define PWR_STATE_D1 1 /* not supported */ +#define PWR_STATE_D2 2 /* not supported */ +#define PWR_STATE_D3 3 + +#define PCIC_BASE1 0x6C /* internal use only */ + +#define BASE1_RSVD 0xFFFFFFF8 + +#define PCIC_BASEA 0x70 /* internal use only */ + +#define BASEA_RSVD 0xFFFFFFC0 +#define BASEA_START 0 + +#define PCIC_BASEB 0x74 /* internal use only */ + +#define BASEB_RSVD 0xFFFFFF80 +#define BASEB_IOMAP_MASK 0x7F +#define BASEB_START 0x80 + +#define PCIC_BASEC 0x78 /* internal use only */ + +#define BASEC_RSVD 0xFFFFFFFC +#define BASEC_MASK 0x03 +#define BASEC_START 0x58 + +#define PCIC_MBAR_KEY 0x7C /* internal use only */ + +#define MBAR_KEY_MASK 0xFFFFFFFF + +#define PCIC_HSTPCIX_CNTRL 0xA0 + +#define REWIND_DIS 0x0800 +#define SC_TMR_DIS 0x04000000 + +#define PCIC_MBAR0_MASK 0xA8 +#define PCIC_MBAR0_SIZE_MASK 0x1FFFE000 +#define PCIC_MBAR0_SIZE_SHIFT 13 +#define PCIC_MBAR0_SIZE(val) \ + (((val) & PCIC_MBAR0_SIZE_MASK) >> PCIC_MBAR0_SIZE_SHIFT) + +#define PCIC_FLASH_MBAR 0xB8 + +#define PCIC_INTRPT_STAT 0xD4 + +#define PCIC_TP_CTRL 0xFC + +/* + * EXSI Registers, Addresss Range: (0x00-0xFC) + */ +#define EXSI_REG_BASE_ADR REG_BASE_ADDR_EXSI + +#define EXSICNFGR (EXSI_REG_BASE_ADR + 0x00) + +#define OCMINITIALIZED 0x80000000 +#define ASIEN 0x00400000 +#define HCMODE 0x00200000 +#define PCIDEF 0x00100000 +#define COMSTOCK 0x00080000 +#define SEEPROMEND 0x00040000 +#define MSTTIMEN 0x00020000 +#define XREGEX 0x00000200 +#define NVRAMW 0x00000100 +#define NVRAMEX 0x00000080 +#define SRAMW 0x00000040 +#define SRAMEX 0x00000020 +#define FLASHW 0x00000010 +#define FLASHEX 0x00000008 +#define SEEPROMCFG 0x00000004 +#define SEEPROMTYP 0x00000002 +#define SEEPROMEX 0x00000001 + + +#define EXSICNTRLR (EXSI_REG_BASE_ADR + 0x04) + +#define MODINT_EN 0x00000001 + + +#define PMSTATR (EXSI_REG_BASE_ADR + 0x10) + +#define FLASHRST 0x00000002 +#define FLASHRDY 0x00000001 + + +#define FLCNFGR (EXSI_REG_BASE_ADR + 0x14) + +#define FLWEH_MASK 0x30000000 +#define FLWESU_MASK 0x0C000000 +#define FLWEPW_MASK 0x03F00000 +#define FLOEH_MASK 0x000C0000 +#define FLOESU_MASK 0x00030000 +#define FLOEPW_MASK 0x0000FC00 +#define FLCSH_MASK 0x00000300 +#define FLCSSU_MASK 0x000000C0 +#define FLCSPW_MASK 0x0000003F + +#define SRCNFGR (EXSI_REG_BASE_ADR + 0x18) + +#define SRWEH_MASK 0x30000000 +#define SRWESU_MASK 0x0C000000 +#define SRWEPW_MASK 0x03F00000 + +#define SROEH_MASK 0x000C0000 +#define SROESU_MASK 0x00030000 +#define SROEPW_MASK 0x0000FC00 +#define 
SRCSH_MASK 0x00000300 +#define SRCSSU_MASK 0x000000C0 +#define SRCSPW_MASK 0x0000003F + +#define NVCNFGR (EXSI_REG_BASE_ADR + 0x1C) + +#define NVWEH_MASK 0x30000000 +#define NVWESU_MASK 0x0C000000 +#define NVWEPW_MASK 0x03F00000 +#define NVOEH_MASK 0x000C0000 +#define NVOESU_MASK 0x00030000 +#define NVOEPW_MASK 0x0000FC00 +#define NVCSH_MASK 0x00000300 +#define NVCSSU_MASK 0x000000C0 +#define NVCSPW_MASK 0x0000003F + +#define XRCNFGR (EXSI_REG_BASE_ADR + 0x20) + +#define XRWEH_MASK 0x30000000 +#define XRWESU_MASK 0x0C000000 +#define XRWEPW_MASK 0x03F00000 +#define XROEH_MASK 0x000C0000 +#define XROESU_MASK 0x00030000 +#define XROEPW_MASK 0x0000FC00 +#define XRCSH_MASK 0x00000300 +#define XRCSSU_MASK 0x000000C0 +#define XRCSPW_MASK 0x0000003F + +#define XREGADDR (EXSI_REG_BASE_ADR + 0x24) + +#define XRADDRINCEN 0x80000000 +#define XREGADD_MASK 0x007FFFFF + + +#define XREGDATAR (EXSI_REG_BASE_ADR + 0x28) + +#define XREGDATA_MASK 0x0000FFFF + +#define GPIOOER (EXSI_REG_BASE_ADR + 0x40) + +#define GPIOODENR (EXSI_REG_BASE_ADR + 0x44) + +#define GPIOINVR (EXSI_REG_BASE_ADR + 0x48) + +#define GPIODATAOR (EXSI_REG_BASE_ADR + 0x4C) + +#define GPIODATAIR (EXSI_REG_BASE_ADR + 0x50) + +#define GPIOCNFGR (EXSI_REG_BASE_ADR + 0x54) + +#define GPIO_EXTSRC 0x00000001 + +#define SCNTRLR (EXSI_REG_BASE_ADR + 0xA0) + +#define SXFERDONE 0x00000100 +#define SXFERCNT_MASK 0x000000E0 +#define SCMDTYP_MASK 0x0000001C +#define SXFERSTART 0x00000002 +#define SXFEREN 0x00000001 + +#define SRATER (EXSI_REG_BASE_ADR + 0xA4) + +#define SADDRR (EXSI_REG_BASE_ADR + 0xA8) + +#define SADDR_MASK 0x0000FFFF + +#define SDATAOR (EXSI_REG_BASE_ADR + 0xAC) + +#define SDATAOR0 (EXSI_REG_BASE_ADR + 0xAC) +#define SDATAOR1 (EXSI_REG_BASE_ADR + 0xAD) +#define SDATAOR2 (EXSI_REG_BASE_ADR + 0xAE) +#define SDATAOR3 (EXSI_REG_BASE_ADR + 0xAF) + +#define SDATAIR (EXSI_REG_BASE_ADR + 0xB0) + +#define SDATAIR0 (EXSI_REG_BASE_ADR + 0xB0) +#define SDATAIR1 (EXSI_REG_BASE_ADR + 0xB1) +#define SDATAIR2 (EXSI_REG_BASE_ADR + 0xB2) +#define SDATAIR3 (EXSI_REG_BASE_ADR + 0xB3) + +#define ASISTAT0R (EXSI_REG_BASE_ADR + 0xD0) +#define ASIFMTERR 0x00000400 +#define ASISEECHKERR 0x00000200 +#define ASIERR 0x00000100 + +#define ASISTAT1R (EXSI_REG_BASE_ADR + 0xD4) +#define CHECKSUM_MASK 0x0000FFFF + +#define ASIERRADDR (EXSI_REG_BASE_ADR + 0xD8) +#define ASIERRDATAR (EXSI_REG_BASE_ADR + 0xDC) +#define ASIERRSTATR (EXSI_REG_BASE_ADR + 0xE0) +#define CPI2ASIBYTECNT_MASK 0x00070000 +#define CPI2ASIBYTEEN_MASK 0x0000F000 +#define CPI2ASITARGERR_MASK 0x00000F00 +#define CPI2ASITARGMID_MASK 0x000000F0 +#define CPI2ASIMSTERR_MASK 0x0000000F + +/* + * XSRAM, External SRAM (DWord and any BE pattern accessible) + */ +#define XSRAM_REG_BASE_ADDR 0xB8100000 +#define XSRAM_SIZE 0x100000 + +/* + * NVRAM Registers, Address Range: (0x00000 - 0x3FFFF). + */ +#define NVRAM_REG_BASE_ADR 0xBF800000 +#define NVRAM_MAX_BASE_ADR 0x003FFFFF + +/* OCM base address */ +#define OCM_BASE_ADDR 0xA0000000 +#define OCM_MAX_SIZE 0x20000 + +/* + * Sequencers (Central and Link) Scratch RAM page definitions. + */ + +/* + * The Central Management Sequencer (CSEQ) Scratch Memory is a 1024 + * byte memory. It is dword accessible and has byte parity + * protection. The CSEQ accesses it in 32 byte windows, either as mode + * dependent or mode independent memory. Each mode has 96 bytes, + * (three 32 byte pages 0-2, not contiguous), leaving 128 bytes of + * Mode Independent memory (four 32 byte pages 3-7). 
+ * Note that mode dependent scratch memory, Mode 8, pages 0-3 overlap
+ * mode independent scratch memory, pages 0-3.
+ * - 896 bytes of mode dependent scratch, 96 bytes per Modes 0-7, and
+ * 128 bytes in mode 8,
+ * - 259 bytes of mode independent scratch, common to modes 0-15.
+ *
+ * Sequencer scratch RAM is 1024 bytes. This scratch memory is
+ * divided into mode dependent and mode independent scratch with this
+ * memory further subdivided into pages of size 32 bytes. There are 5
+ * pages (160 bytes) of mode independent scratch and 3 pages of
+ * dependent scratch memory for modes 0-7 (768 bytes). Mode 8 pages
+ * 0-2 dependent scratch overlap with pages 0-2 of mode independent
+ * scratch memory.
+ *
+ * The host accesses this scratch in a different manner from the
+ * central sequencer. The sequencer has to use CSEQ registers CSCRPAGE
+ * and CMnSCRPAGE to access the scratch memory. A flat mapping of the
+ * scratch memory is available for software convenience and to prevent
+ * corruption while the sequencer is running. This memory is mapped
+ * onto addresses 800h - BFFh, total of 400h bytes.
+ *
+ * These addresses are mapped as follows:
+ *
+ * 800h-83Fh   Mode Dependent Scratch Mode 0 Pages 0-1
+ * 840h-87Fh   Mode Dependent Scratch Mode 1 Pages 0-1
+ * 880h-8BFh   Mode Dependent Scratch Mode 2 Pages 0-1
+ * 8C0h-8FFh   Mode Dependent Scratch Mode 3 Pages 0-1
+ * 900h-93Fh   Mode Dependent Scratch Mode 4 Pages 0-1
+ * 940h-97Fh   Mode Dependent Scratch Mode 5 Pages 0-1
+ * 980h-9BFh   Mode Dependent Scratch Mode 6 Pages 0-1
+ * 9C0h-9FFh   Mode Dependent Scratch Mode 7 Pages 0-1
+ * A00h-A5Fh   Mode Dependent Scratch Mode 8 Pages 0-2
+ *             Mode Independent Scratch Pages 0-2
+ * A60h-A7Fh   Mode Dependent Scratch Mode 8 Page 3
+ *             Mode Independent Scratch Page 3
+ * A80h-AFFh   Mode Independent Scratch Pages 4-7
+ * B00h-B1Fh   Mode Dependent Scratch Mode 0 Page 2
+ * B20h-B3Fh   Mode Dependent Scratch Mode 1 Page 2
+ * B40h-B5Fh   Mode Dependent Scratch Mode 2 Page 2
+ * B60h-B7Fh   Mode Dependent Scratch Mode 3 Page 2
+ * B80h-B9Fh   Mode Dependent Scratch Mode 4 Page 2
+ * BA0h-BBFh   Mode Dependent Scratch Mode 5 Page 2
+ * BC0h-BDFh   Mode Dependent Scratch Mode 6 Page 2
+ * BE0h-BFFh   Mode Dependent Scratch Mode 7 Page 2
+ */
+
+/* General macros */
+#define CSEQ_PAGE_SIZE 32 /* Scratch page size (in bytes) */
+
+/* All macros start with offsets from base + 0x800 (CMAPPEDSCR).
+ * Mode dependent scratch page 0, mode 0.
+ * For modes 1-7 you have to do arithmetic (see the sketch below). */
+#define CSEQ_LRM_SAVE_SINDEX (CMAPPEDSCR + 0x0000)
+#define CSEQ_LRM_SAVE_SCBPTR (CMAPPEDSCR + 0x0002)
+#define CSEQ_Q_LINK_HEAD (CMAPPEDSCR + 0x0004)
+#define CSEQ_Q_LINK_TAIL (CMAPPEDSCR + 0x0006)
+#define CSEQ_LRM_SAVE_SCRPAGE (CMAPPEDSCR + 0x0008)
+
+/* Mode dependent scratch page 0 mode 8 macros. */
+#define CSEQ_RET_ADDR (CMAPPEDSCR + 0x0200)
+#define CSEQ_RET_SCBPTR (CMAPPEDSCR + 0x0202)
+#define CSEQ_SAVE_SCBPTR (CMAPPEDSCR + 0x0204)
+#define CSEQ_EMPTY_TRANS_CTX (CMAPPEDSCR + 0x0206)
+#define CSEQ_RESP_LEN (CMAPPEDSCR + 0x0208)
+#define CSEQ_TMF_SCBPTR (CMAPPEDSCR + 0x020A)
+#define CSEQ_GLOBAL_PREV_SCB (CMAPPEDSCR + 0x020C)
+#define CSEQ_GLOBAL_HEAD (CMAPPEDSCR + 0x020E)
+#define CSEQ_CLEAR_LU_HEAD (CMAPPEDSCR + 0x0210)
+#define CSEQ_TMF_OPCODE (CMAPPEDSCR + 0x0212)
+#define CSEQ_SCRATCH_FLAGS (CMAPPEDSCR + 0x0213)
+#define CSEQ_HSB_SITE (CMAPPEDSCR + 0x021A)
+#define CSEQ_FIRST_INV_SCB_SITE (CMAPPEDSCR + 0x021C)
+#define CSEQ_FIRST_INV_DDB_SITE (CMAPPEDSCR + 0x021E)
+
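+/*
+ * Illustrative sketch (not part of the original patch): the mode
+ * dependent macros above cover mode 0 only; for CSEQ modes 1-7 the
+ * host has to derive the address from the flat 800h-BFFh map itself.
+ * Following the layout documented above (pages 0-1 of mode m at
+ * CMAPPEDSCR + m * 40h, page 2 of mode m at CMAPPEDSCR + 300h + m *
+ * 20h), a helper might look like this:
+ */
+static inline u32 cseq_md_scratch_addr(u32 mode, u32 page, u32 offs)
+{
+	/* Pages 0-1 of modes 0-7 live at 800h + mode * 40h. */
+	if (page < 2)
+		return CMAPPEDSCR + mode * 0x40 + page * CSEQ_PAGE_SIZE + offs;
+	/* Page 2 of modes 0-7 lives at B00h + mode * 20h. */
+	return CMAPPEDSCR + 0x300 + mode * CSEQ_PAGE_SIZE + offs;
+}
+
+/* Mode dependent scratch page 1 mode 8 macros.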
*/ +#define CSEQ_LUN_TO_CLEAR (CMAPPEDSCR + 0x0220) +#define CSEQ_LUN_TO_CHECK (CMAPPEDSCR + 0x0228) + +/* Mode dependent scratch page 2 mode 8 macros */ +#define CSEQ_HQ_NEW_POINTER (CMAPPEDSCR + 0x0240) +#define CSEQ_HQ_DONE_BASE (CMAPPEDSCR + 0x0248) +#define CSEQ_HQ_DONE_POINTER (CMAPPEDSCR + 0x0250) +#define CSEQ_HQ_DONE_PASS (CMAPPEDSCR + 0x0254) + +/* Mode independent scratch page 4 macros. */ +#define CSEQ_Q_EXE_HEAD (CMAPPEDSCR + 0x0280) +#define CSEQ_Q_EXE_TAIL (CMAPPEDSCR + 0x0282) +#define CSEQ_Q_DONE_HEAD (CMAPPEDSCR + 0x0284) +#define CSEQ_Q_DONE_TAIL (CMAPPEDSCR + 0x0286) +#define CSEQ_Q_SEND_HEAD (CMAPPEDSCR + 0x0288) +#define CSEQ_Q_SEND_TAIL (CMAPPEDSCR + 0x028A) +#define CSEQ_Q_DMA2CHIM_HEAD (CMAPPEDSCR + 0x028C) +#define CSEQ_Q_DMA2CHIM_TAIL (CMAPPEDSCR + 0x028E) +#define CSEQ_Q_COPY_HEAD (CMAPPEDSCR + 0x0290) +#define CSEQ_Q_COPY_TAIL (CMAPPEDSCR + 0x0292) +#define CSEQ_REG0 (CMAPPEDSCR + 0x0294) +#define CSEQ_REG1 (CMAPPEDSCR + 0x0296) +#define CSEQ_REG2 (CMAPPEDSCR + 0x0298) +#define CSEQ_LINK_CTL_Q_MAP (CMAPPEDSCR + 0x029C) +#define CSEQ_MAX_CSEQ_MODE (CMAPPEDSCR + 0x029D) +#define CSEQ_FREE_LIST_HACK_COUNT (CMAPPEDSCR + 0x029E) + +/* Mode independent scratch page 5 macros. */ +#define CSEQ_EST_NEXUS_REQ_QUEUE (CMAPPEDSCR + 0x02A0) +#define CSEQ_EST_NEXUS_REQ_COUNT (CMAPPEDSCR + 0x02A8) +#define CSEQ_Q_EST_NEXUS_HEAD (CMAPPEDSCR + 0x02B0) +#define CSEQ_Q_EST_NEXUS_TAIL (CMAPPEDSCR + 0x02B2) +#define CSEQ_NEED_EST_NEXUS_SCB (CMAPPEDSCR + 0x02B4) +#define CSEQ_EST_NEXUS_REQ_HEAD (CMAPPEDSCR + 0x02B6) +#define CSEQ_EST_NEXUS_REQ_TAIL (CMAPPEDSCR + 0x02B7) +#define CSEQ_EST_NEXUS_SCB_OFFSET (CMAPPEDSCR + 0x02B8) + +/* Mode independent scratch page 6 macros. */ +#define CSEQ_INT_ROUT_RET_ADDR0 (CMAPPEDSCR + 0x02C0) +#define CSEQ_INT_ROUT_RET_ADDR1 (CMAPPEDSCR + 0x02C2) +#define CSEQ_INT_ROUT_SCBPTR (CMAPPEDSCR + 0x02C4) +#define CSEQ_INT_ROUT_MODE (CMAPPEDSCR + 0x02C6) +#define CSEQ_ISR_SCRATCH_FLAGS (CMAPPEDSCR + 0x02C7) +#define CSEQ_ISR_SAVE_SINDEX (CMAPPEDSCR + 0x02C8) +#define CSEQ_ISR_SAVE_DINDEX (CMAPPEDSCR + 0x02CA) +#define CSEQ_Q_MONIRTT_HEAD (CMAPPEDSCR + 0x02D0) +#define CSEQ_Q_MONIRTT_TAIL (CMAPPEDSCR + 0x02D2) +#define CSEQ_FREE_SCB_MASK (CMAPPEDSCR + 0x02D5) +#define CSEQ_BUILTIN_FREE_SCB_HEAD (CMAPPEDSCR + 0x02D6) +#define CSEQ_BUILTIN_FREE_SCB_TAIL (CMAPPEDSCR + 0x02D8) +#define CSEQ_EXTENDED_FREE_SCB_HEAD (CMAPPEDSCR + 0x02DA) +#define CSEQ_EXTENDED_FREE_SCB_TAIL (CMAPPEDSCR + 0x02DC) + +/* Mode independent scratch page 7 macros. */ +#define CSEQ_EMPTY_REQ_QUEUE (CMAPPEDSCR + 0x02E0) +#define CSEQ_EMPTY_REQ_COUNT (CMAPPEDSCR + 0x02E8) +#define CSEQ_Q_EMPTY_HEAD (CMAPPEDSCR + 0x02F0) +#define CSEQ_Q_EMPTY_TAIL (CMAPPEDSCR + 0x02F2) +#define CSEQ_NEED_EMPTY_SCB (CMAPPEDSCR + 0x02F4) +#define CSEQ_EMPTY_REQ_HEAD (CMAPPEDSCR + 0x02F6) +#define CSEQ_EMPTY_REQ_TAIL (CMAPPEDSCR + 0x02F7) +#define CSEQ_EMPTY_SCB_OFFSET (CMAPPEDSCR + 0x02F8) +#define CSEQ_PRIMITIVE_DATA (CMAPPEDSCR + 0x02FA) +#define CSEQ_TIMEOUT_CONST (CMAPPEDSCR + 0x02FC) + +/*************************************************************************** +* Link m Sequencer scratch RAM is 512 bytes. +* This scratch memory is divided into mode dependent and mode +* independent scratch with this memory further subdivided into +* pages of size 32 bytes. There are 4 pages (128 bytes) of +* mode independent scratch and 4 pages of dependent scratch +* memory for modes 0-2 (384 bytes). +* +* The host accesses this scratch in a different manner from the +* link sequencer. 
+* The sequencer has to use LSEQ registers
+* LmSCRPAGE and LmMnSCRPAGE to access the scratch memory. A flat
+* mapping of the scratch memory is available for software
+* convenience and to prevent corruption while the sequencer is
+* running. This memory is mapped onto addresses 800h - 9FFh.
+*
+* These addresses are mapped as follows:
+*
+* 800h-85Fh   Mode Dependent Scratch Mode 0 Pages 0-2
+* 860h-87Fh   Mode Dependent Scratch Mode 0 Page 3
+*             Mode Dependent Scratch Mode 5 Page 0
+* 880h-8DFh   Mode Dependent Scratch Mode 1 Pages 0-2
+* 8E0h-8FFh   Mode Dependent Scratch Mode 1 Page 3
+*             Mode Dependent Scratch Mode 5 Page 1
+* 900h-95Fh   Mode Dependent Scratch Mode 2 Pages 0-2
+* 960h-97Fh   Mode Dependent Scratch Mode 2 Page 3
+*             Mode Dependent Scratch Mode 5 Page 2
+* 980h-9DFh   Mode Independent Scratch Pages 0-3
+* 9E0h-9FFh   Mode Independent Scratch Page 3
+*             Mode Dependent Scratch Mode 5 Page 3
+*
+****************************************************************************/
+/* General macros */
+#define LSEQ_MODE_SCRATCH_SIZE 0x80 /* Size of scratch RAM per mode */
+#define LSEQ_PAGE_SIZE 0x20 /* Scratch page size (in bytes) */
+#define LSEQ_MODE5_PAGE0_OFFSET 0x60
+
+/* Common mode dependent scratch page 0 macros for modes 0,1,2, and 5 */
+/* Indexed using LSEQ_MODE_SCRATCH_SIZE * mode, for modes 0,1,2. */
+#define LmSEQ_RET_ADDR(LinkNum) (LmSCRATCH(LinkNum) + 0x0000)
+#define LmSEQ_REG0_MODE(LinkNum) (LmSCRATCH(LinkNum) + 0x0002)
+#define LmSEQ_MODE_FLAGS(LinkNum) (LmSCRATCH(LinkNum) + 0x0004)
+
+/* Mode flag macros (byte 0) */
+#define SAS_SAVECTX_OCCURRED 0x80
+#define SAS_OOBSVC_OCCURRED 0x40
+#define SAS_OOB_DEVICE_PRESENT 0x20
+#define SAS_CFGHDR_OCCURRED 0x10
+#define SAS_RCV_INTS_ARE_DISABLED 0x08
+#define SAS_OOB_HOT_PLUG_CNCT 0x04
+#define SAS_AWAIT_OPEN_CONNECTION 0x02
+#define SAS_CFGCMPLT_OCCURRED 0x01
+
+/* Mode flag macros (byte 1) */
+#define SAS_RLSSCB_OCCURRED 0x80
+#define SAS_FORCED_HEADER_MISS 0x40
+
+#define LmSEQ_RET_ADDR2(LinkNum) (LmSCRATCH(LinkNum) + 0x0006)
+#define LmSEQ_RET_ADDR1(LinkNum) (LmSCRATCH(LinkNum) + 0x0008)
+#define LmSEQ_OPCODE_TO_CSEQ(LinkNum) (LmSCRATCH(LinkNum) + 0x000B)
+#define LmSEQ_DATA_TO_CSEQ(LinkNum) (LmSCRATCH(LinkNum) + 0x000C)
+
+/* Mode dependent scratch page 0 macros for mode 0 (non-common) */
+/* Absolute offsets */
+#define LmSEQ_FIRST_INV_DDB_SITE(LinkNum) (LmSCRATCH(LinkNum) + 0x000E)
+#define LmSEQ_EMPTY_TRANS_CTX(LinkNum) (LmSCRATCH(LinkNum) + 0x0010)
+#define LmSEQ_RESP_LEN(LinkNum) (LmSCRATCH(LinkNum) + 0x0012)
+#define LmSEQ_FIRST_INV_SCB_SITE(LinkNum) (LmSCRATCH(LinkNum) + 0x0014)
+#define LmSEQ_INTEN_SAVE(LinkNum) (LmSCRATCH(LinkNum) + 0x0016)
+#define LmSEQ_LINK_RST_FRM_LEN(LinkNum) (LmSCRATCH(LinkNum) + 0x001A)
+#define LmSEQ_LINK_RST_PROTOCOL(LinkNum) (LmSCRATCH(LinkNum) + 0x001B)
+#define LmSEQ_RESP_STATUS(LinkNum) (LmSCRATCH(LinkNum) + 0x001C)
+#define LmSEQ_LAST_LOADED_SGE(LinkNum) (LmSCRATCH(LinkNum) + 0x001D)
+#define LmSEQ_SAVE_SCBPTR(LinkNum) (LmSCRATCH(LinkNum) + 0x001E)
+
+/* Mode dependent scratch page 0 macros for mode 1 (non-common) */
+/* Absolute offsets */
+#define LmSEQ_Q_XMIT_HEAD(LinkNum) (LmSCRATCH(LinkNum) + 0x008E)
+#define LmSEQ_M1_EMPTY_TRANS_CTX(LinkNum) (LmSCRATCH(LinkNum) + 0x0090)
+#define LmSEQ_INI_CONN_TAG(LinkNum) (LmSCRATCH(LinkNum) + 0x0092)
+#define LmSEQ_FAILED_OPEN_STATUS(LinkNum) (LmSCRATCH(LinkNum) + 0x009A)
+#define LmSEQ_XMIT_REQUEST_TYPE(LinkNum) (LmSCRATCH(LinkNum) + 0x009B)
+#define LmSEQ_M1_RESP_STATUS(LinkNum) (LmSCRATCH(LinkNum) + 0x009C)
+#define LmSEQ_M1_LAST_LOADED_SGE(LinkNum) (LmSCRATCH(LinkNum) + 0x009D)
+#define LmSEQ_M1_SAVE_SCBPTR(LinkNum) (LmSCRATCH(LinkNum) + 0x009E)
+
+/* Mode dependent scratch page 0 macros for mode 2 (non-common) */
+#define LmSEQ_PORT_COUNTER(LinkNum) (LmSCRATCH(LinkNum) + 0x010E)
+#define LmSEQ_PM_TABLE_PTR(LinkNum) (LmSCRATCH(LinkNum) + 0x0110)
+#define LmSEQ_SATA_INTERLOCK_TMR_SAVE(LinkNum) (LmSCRATCH(LinkNum) + 0x0112)
+#define LmSEQ_IP_BITL(LinkNum) (LmSCRATCH(LinkNum) + 0x0114)
+#define LmSEQ_COPY_SMP_CONN_TAG(LinkNum) (LmSCRATCH(LinkNum) + 0x0116)
+#define LmSEQ_P0M2_OFFS1AH(LinkNum) (LmSCRATCH(LinkNum) + 0x011A)
+
+/* Mode dependent scratch page 0 macros for modes 4/5 (non-common) */
+/* Absolute offsets */
+#define LmSEQ_SAVED_OOB_STATUS(LinkNum) (LmSCRATCH(LinkNum) + 0x006E)
+#define LmSEQ_SAVED_OOB_MODE(LinkNum) (LmSCRATCH(LinkNum) + 0x006F)
+#define LmSEQ_Q_LINK_HEAD(LinkNum) (LmSCRATCH(LinkNum) + 0x0070)
+#define LmSEQ_LINK_RST_ERR(LinkNum) (LmSCRATCH(LinkNum) + 0x0072)
+#define LmSEQ_SAVED_OOB_SIGNALS(LinkNum) (LmSCRATCH(LinkNum) + 0x0073)
+#define LmSEQ_SAS_RESET_MODE(LinkNum) (LmSCRATCH(LinkNum) + 0x0074)
+#define LmSEQ_LINK_RESET_RETRY_COUNT(LinkNum) (LmSCRATCH(LinkNum) + 0x0075)
+#define LmSEQ_NUM_LINK_RESET_RETRIES(LinkNum) (LmSCRATCH(LinkNum) + 0x0076)
+#define LmSEQ_OOB_INT_ENABLES(LinkNum) (LmSCRATCH(LinkNum) + 0x007A)
+#define LmSEQ_NOTIFY_TIMER_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x007C)
+#define LmSEQ_NOTIFY_TIMER_DOWN_COUNT(LinkNum) (LmSCRATCH(LinkNum) + 0x007E)
+
+/* Mode dependent scratch page 1, mode 0 and mode 1 */
+#define LmSEQ_SG_LIST_PTR_ADDR0(LinkNum) (LmSCRATCH(LinkNum) + 0x0020)
+#define LmSEQ_SG_LIST_PTR_ADDR1(LinkNum) (LmSCRATCH(LinkNum) + 0x0030)
+#define LmSEQ_M1_SG_LIST_PTR_ADDR0(LinkNum) (LmSCRATCH(LinkNum) + 0x00A0)
+#define LmSEQ_M1_SG_LIST_PTR_ADDR1(LinkNum) (LmSCRATCH(LinkNum) + 0x00B0)
+
+/* Mode dependent scratch page 1 macros for mode 2 */
+/* Absolute offsets */
+#define LmSEQ_INVALID_DWORD_COUNT(LinkNum) (LmSCRATCH(LinkNum) + 0x0120)
+#define LmSEQ_DISPARITY_ERROR_COUNT(LinkNum) (LmSCRATCH(LinkNum) + 0x0124)
+#define LmSEQ_LOSS_OF_SYNC_COUNT(LinkNum) (LmSCRATCH(LinkNum) + 0x0128)
+
+/* Mode dependent scratch page 1 macros for mode 4/5 */
+#define LmSEQ_FRAME_TYPE_MASK(LinkNum) (LmSCRATCH(LinkNum) + 0x00E0)
+#define LmSEQ_HASHED_DEST_ADDR_MASK(LinkNum) (LmSCRATCH(LinkNum) + 0x00E1)
+#define LmSEQ_HASHED_SRC_ADDR_MASK_PRINT(LinkNum) (LmSCRATCH(LinkNum) + 0x00E4)
+#define LmSEQ_HASHED_SRC_ADDR_MASK(LinkNum) (LmSCRATCH(LinkNum) + 0x00E5)
+#define LmSEQ_NUM_FILL_BYTES_MASK(LinkNum) (LmSCRATCH(LinkNum) + 0x00EB)
+#define LmSEQ_TAG_MASK(LinkNum) (LmSCRATCH(LinkNum) + 0x00F0)
+#define LmSEQ_TARGET_PORT_XFER_TAG(LinkNum) (LmSCRATCH(LinkNum) + 0x00F2)
+#define LmSEQ_DATA_OFFSET(LinkNum) (LmSCRATCH(LinkNum) + 0x00F4)
+
+/* Mode dependent scratch page 2 macros for mode 0 */
+/* Absolute offsets */
+#define LmSEQ_SMP_RCV_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0040)
+#define LmSEQ_DEVICE_BITS(LinkNum) (LmSCRATCH(LinkNum) + 0x005B)
+#define LmSEQ_SDB_DDB(LinkNum) (LmSCRATCH(LinkNum) + 0x005C)
+#define LmSEQ_SDB_NUM_TAGS(LinkNum) (LmSCRATCH(LinkNum) + 0x005E)
+#define LmSEQ_SDB_CURR_TAG(LinkNum) (LmSCRATCH(LinkNum) + 0x005F)
+
+/* Mode dependent scratch page 2 macros for mode 1 */
+/* Absolute offsets */
+/* byte 0 bits 1-0 are domain select. */
+#define LmSEQ_TX_ID_ADDR_FRAME(LinkNum) (LmSCRATCH(LinkNum) + 0x00C0)
+#define LmSEQ_OPEN_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x00C8)
+#define LmSEQ_SRST_AS_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x00CC)
+#define LmSEQ_LAST_LOADED_SG_EL(LinkNum) (LmSCRATCH(LinkNum) + 0x00D4)
+
+/* Mode dependent scratch page 2 macros for mode 2 */
+/* Absolute offsets */
+#define LmSEQ_STP_SHUTDOWN_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0140)
+#define LmSEQ_CLOSE_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0144)
+#define LmSEQ_BREAK_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0148)
+#define LmSEQ_DWS_RESET_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x014C)
+#define LmSEQ_SATA_INTERLOCK_TIMER_TERM_TS(LinkNum) \
+ (LmSCRATCH(LinkNum) + 0x0150)
+#define LmSEQ_MCTL_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0154)
+
+/* Mode dependent scratch page 2 macros for mode 5 */
+#define LmSEQ_COMINIT_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0160)
+#define LmSEQ_RCV_ID_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0164)
+#define LmSEQ_RCV_FIS_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x0168)
+#define LmSEQ_DEV_PRES_TIMER_TERM_TS(LinkNum) (LmSCRATCH(LinkNum) + 0x016C)
+
+/* Mode dependent scratch page 3 macros for modes 0 and 1 */
+/* None defined */
+
+/* Mode dependent scratch page 3 macros for modes 2 and 5 */
+/* None defined */
+
+/* Mode Independent Scratch page 0 macros. */
+#define LmSEQ_Q_TGTXFR_HEAD(LinkNum) (LmSCRATCH(LinkNum) + 0x0180)
+#define LmSEQ_Q_TGTXFR_TAIL(LinkNum) (LmSCRATCH(LinkNum) + 0x0182)
+#define LmSEQ_LINK_NUMBER(LinkNum) (LmSCRATCH(LinkNum) + 0x0186)
+#define LmSEQ_SCRATCH_FLAGS(LinkNum) (LmSCRATCH(LinkNum) + 0x0187)
+/*
+ * Currently only bit 0, SAS_DWSAQD, is used.
+ */
+#define SAS_DWSAQD 0x01 /*
+ * DWSSTATUS: DWSAQD
+ * bit last read in ISR.
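+ */
+
+/*
+ * Illustrative sketch (not part of the original patch): SAS_DWSAQD is
+ * bit 0 of the LmSEQ_SCRATCH_FLAGS byte above, mirroring the DWSSTATUS
+ * DWSAQD bit as last read in the ISR.  Assuming the driver's
+ * asd_read_reg_byte() helper and struct asd_ha_struct, a .c file could
+ * test it like this:
+ */
+static inline int asd_dws_acquired(struct asd_ha_struct *asd_ha, u8 phy_id)
+{
+	return asd_read_reg_byte(asd_ha, LmSEQ_SCRATCH_FLAGS(phy_id)) &
+		SAS_DWSAQD;
+}
+
+/*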
+ */ +#define LmSEQ_CONNECTION_STATE(LinkNum) (LmSCRATCH(LinkNum) + 0x0188) +/* Connection states (byte 0) */ +#define SAS_WE_OPENED_CS 0x01 +#define SAS_DEVICE_OPENED_CS 0x02 +#define SAS_WE_SENT_DONE_CS 0x04 +#define SAS_DEVICE_SENT_DONE_CS 0x08 +#define SAS_WE_SENT_CLOSE_CS 0x10 +#define SAS_DEVICE_SENT_CLOSE_CS 0x20 +#define SAS_WE_SENT_BREAK_CS 0x40 +#define SAS_DEVICE_SENT_BREAK_CS 0x80 +/* Connection states (byte 1) */ +#define SAS_OPN_TIMEOUT_OR_OPN_RJCT_CS 0x01 +#define SAS_AIP_RECEIVED_CS 0x02 +#define SAS_CREDIT_TIMEOUT_OCCURRED_CS 0x04 +#define SAS_ACKNAK_TIMEOUT_OCCURRED_CS 0x08 +#define SAS_SMPRSP_TIMEOUT_OCCURRED_CS 0x10 +#define SAS_DONE_TIMEOUT_OCCURRED_CS 0x20 +/* Connection states (byte 2) */ +#define SAS_SMP_RESPONSE_RECEIVED_CS 0x01 +#define SAS_INTLK_TIMEOUT_OCCURRED_CS 0x02 +#define SAS_DEVICE_SENT_DMAT_CS 0x04 +#define SAS_DEVICE_SENT_SYNCSRST_CS 0x08 +#define SAS_CLEARING_AFFILIATION_CS 0x20 +#define SAS_RXTASK_ACTIVE_CS 0x40 +#define SAS_TXTASK_ACTIVE_CS 0x80 +/* Connection states (byte 3) */ +#define SAS_PHY_LOSS_OF_SIGNAL_CS 0x01 +#define SAS_DWS_TIMER_EXPIRED_CS 0x02 +#define SAS_LINK_RESET_NOT_COMPLETE_CS 0x04 +#define SAS_PHY_DISABLED_CS 0x08 +#define SAS_LINK_CTL_TASK_ACTIVE_CS 0x10 +#define SAS_PHY_EVENT_TASK_ACTIVE_CS 0x20 +#define SAS_DEVICE_SENT_ID_FRAME_CS 0x40 +#define SAS_DEVICE_SENT_REG_FIS_CS 0x40 +#define SAS_DEVICE_SENT_HARD_RESET_CS 0x80 +#define SAS_PHY_IS_DOWN_FLAGS (SAS_PHY_LOSS_OF_SIGNAL_CS|\ + SAS_DWS_TIMER_EXPIRED_CS |\ + SAS_LINK_RESET_NOT_COMPLETE_CS|\ + SAS_PHY_DISABLED_CS) + +#define SAS_LINK_CTL_PHY_EVENT_FLAGS (SAS_LINK_CTL_TASK_ACTIVE_CS |\ + SAS_PHY_EVENT_TASK_ACTIVE_CS |\ + SAS_DEVICE_SENT_ID_FRAME_CS |\ + SAS_DEVICE_SENT_HARD_RESET_CS) + +#define LmSEQ_CONCTL(LinkNum) (LmSCRATCH(LinkNum) + 0x018C) +#define LmSEQ_CONSTAT(LinkNum) (LmSCRATCH(LinkNum) + 0x018E) +#define LmSEQ_CONNECTION_MODES(LinkNum) (LmSCRATCH(LinkNum) + 0x018F) +#define LmSEQ_REG1_ISR(LinkNum) (LmSCRATCH(LinkNum) + 0x0192) +#define LmSEQ_REG2_ISR(LinkNum) (LmSCRATCH(LinkNum) + 0x0194) +#define LmSEQ_REG3_ISR(LinkNum) (LmSCRATCH(LinkNum) + 0x0196) +#define LmSEQ_REG0_ISR(LinkNum) (LmSCRATCH(LinkNum) + 0x0198) + +/* Mode independent scratch page 1 macros. */ +#define LmSEQ_EST_NEXUS_SCBPTR0(LinkNum) (LmSCRATCH(LinkNum) + 0x01A0) +#define LmSEQ_EST_NEXUS_SCBPTR1(LinkNum) (LmSCRATCH(LinkNum) + 0x01A2) +#define LmSEQ_EST_NEXUS_SCBPTR2(LinkNum) (LmSCRATCH(LinkNum) + 0x01A4) +#define LmSEQ_EST_NEXUS_SCBPTR3(LinkNum) (LmSCRATCH(LinkNum) + 0x01A6) +#define LmSEQ_EST_NEXUS_SCB_OPCODE0(LinkNum) (LmSCRATCH(LinkNum) + 0x01A8) +#define LmSEQ_EST_NEXUS_SCB_OPCODE1(LinkNum) (LmSCRATCH(LinkNum) + 0x01A9) +#define LmSEQ_EST_NEXUS_SCB_OPCODE2(LinkNum) (LmSCRATCH(LinkNum) + 0x01AA) +#define LmSEQ_EST_NEXUS_SCB_OPCODE3(LinkNum) (LmSCRATCH(LinkNum) + 0x01AB) +#define LmSEQ_EST_NEXUS_SCB_HEAD(LinkNum) (LmSCRATCH(LinkNum) + 0x01AC) +#define LmSEQ_EST_NEXUS_SCB_TAIL(LinkNum) (LmSCRATCH(LinkNum) + 0x01AD) +#define LmSEQ_EST_NEXUS_BUF_AVAIL(LinkNum) (LmSCRATCH(LinkNum) + 0x01AE) +#define LmSEQ_TIMEOUT_CONST(LinkNum) (LmSCRATCH(LinkNum) + 0x01B8) +#define LmSEQ_ISR_SAVE_SINDEX(LinkNum) (LmSCRATCH(LinkNum) + 0x01BC) +#define LmSEQ_ISR_SAVE_DINDEX(LinkNum) (LmSCRATCH(LinkNum) + 0x01BE) + +/* Mode independent scratch page 2 macros. 
*/
+#define LmSEQ_EMPTY_SCB_PTR0(LinkNum) (LmSCRATCH(LinkNum) + 0x01C0)
+#define LmSEQ_EMPTY_SCB_PTR1(LinkNum) (LmSCRATCH(LinkNum) + 0x01C2)
+#define LmSEQ_EMPTY_SCB_PTR2(LinkNum) (LmSCRATCH(LinkNum) + 0x01C4)
+#define LmSEQ_EMPTY_SCB_PTR3(LinkNum) (LmSCRATCH(LinkNum) + 0x01C6)
+#define LmSEQ_EMPTY_SCB_OPCD0(LinkNum) (LmSCRATCH(LinkNum) + 0x01C8)
+#define LmSEQ_EMPTY_SCB_OPCD1(LinkNum) (LmSCRATCH(LinkNum) + 0x01C9)
+#define LmSEQ_EMPTY_SCB_OPCD2(LinkNum) (LmSCRATCH(LinkNum) + 0x01CA)
+#define LmSEQ_EMPTY_SCB_OPCD3(LinkNum) (LmSCRATCH(LinkNum) + 0x01CB)
+#define LmSEQ_EMPTY_SCB_HEAD(LinkNum) (LmSCRATCH(LinkNum) + 0x01CC)
+#define LmSEQ_EMPTY_SCB_TAIL(LinkNum) (LmSCRATCH(LinkNum) + 0x01CD)
+#define LmSEQ_EMPTY_BUFS_AVAIL(LinkNum) (LmSCRATCH(LinkNum) + 0x01CE)
+#define LmSEQ_ATA_SCR_REGS(LinkNum) (LmSCRATCH(LinkNum) + 0x01D4)
+
+/* Mode independent scratch page 3 macros. */
+#define LmSEQ_DEV_PRES_TMR_TOUT_CONST(LinkNum) (LmSCRATCH(LinkNum) + 0x01E0)
+#define LmSEQ_SATA_INTERLOCK_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01E4)
+#define LmSEQ_STP_SHUTDOWN_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01E8)
+#define LmSEQ_SRST_ASSERT_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01EC)
+#define LmSEQ_RCV_FIS_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01F0)
+#define LmSEQ_ONE_MILLISEC_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01F4)
+#define LmSEQ_TEN_MS_COMINIT_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01F8)
+#define LmSEQ_SMP_RCV_TIMEOUT(LinkNum) (LmSCRATCH(LinkNum) + 0x01FC)
+
+#endif
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_sas.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_sas.h
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_sas.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_sas.h 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,785 @@
+/*
+ * Aic94xx SAS/SATA driver SAS definitions and hardware interface header file.
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef _AIC94XX_SAS_H_
+#define _AIC94XX_SAS_H_
+
+#include <linux/types.h>
+
+/* ---------- DDBs ---------- */
+/* DDBs are device descriptor blocks which describe a device in the
+ * domain that this sequencer can maintain low-level connections for
+ * us. They are 64 bytes. A compile-time size check is sketched below.
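+ */
+
+/*
+ * Illustrative sketch (not part of the original patch): since the
+ * sequencer expects every DDB to be exactly 64 bytes, a .c file
+ * including this header (and linux/kernel.h for BUILD_BUG_ON()) could
+ * assert the packed layouts below at compile time:
+ */
+static inline void asd_check_ddb_sizes(void)
+{
+	BUILD_BUG_ON(sizeof(struct asd_ddb_ssp_smp_target_port) != 64);
+	BUILD_BUG_ON(sizeof(struct asd_ddb_stp_sata_target_port) != 64);
+}
+
+/*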
+ */ + +struct asd_ddb_ssp_smp_target_port { + u8 conn_type; /* byte 0 */ +#define DDB_TP_CONN_TYPE 0x81 /* Initiator port and addr frame type 0x01 */ + + u8 conn_rate; + __be16 init_conn_tag; + u8 dest_sas_addr[8]; /* bytes 4-11 */ + + __le16 send_queue_head; + u8 sq_suspended; + u8 ddb_type; /* DDB_TYPE_TARGET */ +#define DDB_TYPE_UNUSED 0xFF +#define DDB_TYPE_TARGET 0xFE +#define DDB_TYPE_INITIATOR 0xFD +#define DDB_TYPE_PM_PORT 0xFC + + __le16 _r_a; + __be16 awt_def; + + u8 compat_features; /* byte 20 */ + u8 pathway_blocked_count; + __be16 arb_wait_time; + __be32 more_compat_features; /* byte 24 */ + + u8 conn_mask; + u8 flags; /* concurrent conn:2,2 and open:0(1) */ +#define CONCURRENT_CONN_SUPP 0x04 +#define OPEN_REQUIRED 0x01 + + u16 _r_b; + __le16 exec_queue_tail; + __le16 send_queue_tail; + __le16 sister_ddb; + + __le16 _r_c; + + u8 max_concurrent_conn; + u8 num_concurrent_conn; + u8 num_contexts; + + u8 _r_d; + + __le16 active_task_count; + + u8 _r_e[9]; + + u8 itnl_reason; /* I_T nexus loss reason */ + + __le16 _r_f; + + __le16 itnl_timeout; +#define ITNL_TIMEOUT_CONST 0x7D0 /* 2 seconds */ + + __le32 itnl_timestamp; +} __attribute__ ((packed)); + +struct asd_ddb_stp_sata_target_port { + u8 conn_type; /* byte 0 */ + u8 conn_rate; + __be16 init_conn_tag; + u8 dest_sas_addr[8]; /* bytes 4-11 */ + + __le16 send_queue_head; + u8 sq_suspended; + u8 ddb_type; /* DDB_TYPE_TARGET */ + + __le16 _r_a; + + __be16 awt_def; + u8 compat_features; /* byte 20 */ + u8 pathway_blocked_count; + __be16 arb_wait_time; + __be32 more_compat_features; /* byte 24 */ + + u8 conn_mask; + u8 flags; /* concurrent conn:2,2 and open:0(1) */ +#define SATA_MULTIPORT 0x80 +#define SUPPORTS_AFFIL 0x40 +#define STP_AFFIL_POL 0x20 + + u8 _r_b; + u8 flags2; /* STP close policy:0 */ +#define STP_CL_POL_NO_TX 0x00 +#define STP_CL_POL_BTW_CMDS 0x01 + + __le16 exec_queue_tail; + __le16 send_queue_tail; + __le16 sister_ddb; + __le16 ata_cmd_scbptr; + __le32 sata_tag_alloc_mask; + __le16 active_task_count; + __le16 _r_c; + __le32 sata_sactive; + u8 num_sata_tags; + u8 sata_status; + u8 sata_ending_status; + u8 itnl_reason; /* I_T nexus loss reason */ + __le16 ncq_data_scb_ptr; + __le16 itnl_timeout; + __le32 itnl_timestamp; +} __attribute__ ((packed)); + +/* This struct asd_ddb_init_port, describes the device descriptor block + * of an initiator port (when the sequencer is operating in target mode). + * Bytes [0,11] and [20,27] are from the OPEN address frame. + * The sequencer allocates an initiator port DDB entry. 
+ */ +struct asd_ddb_init_port { + u8 conn_type; /* byte 0 */ + u8 conn_rate; + __be16 init_conn_tag; /* BE */ + u8 dest_sas_addr[8]; + __le16 send_queue_head; /* LE, byte 12 */ + u8 sq_suspended; + u8 ddb_type; /* DDB_TYPE_INITIATOR */ + __le16 _r_a; + __be16 awt_def; /* BE */ + u8 compat_features; + u8 pathway_blocked_count; + __be16 arb_wait_time; /* BE */ + __be32 more_compat_features; /* BE */ + u8 conn_mask; + u8 flags; /* == 5 */ + u16 _r_b; + __le16 exec_queue_tail; /* execution queue tail */ + __le16 send_queue_tail; + __le16 sister_ddb; + __le16 init_resp_timeout; /* initiator response timeout */ + __le32 _r_c; + __le16 active_tasks; /* active task count */ + __le16 init_list; /* initiator list link pointer */ + __le32 _r_d; + u8 max_conn_to[3]; /* from Conn-Disc mode page, in us, LE */ + u8 itnl_reason; /* I_T nexus loss reason */ + __le16 bus_inact_to; /* from Conn-Disc mode page, in 100 us, LE */ + __le16 itnl_to; /* from the Protocol Specific Port Ctrl MP */ + __le32 itnl_timestamp; +} __attribute__ ((packed)); + +/* This struct asd_ddb_sata_tag, describes a look-up table to be used + * by the sequencers. SATA II, IDENTIFY DEVICE data, word 76, bit 8: + * NCQ support. This table is used by the sequencers to find the + * corresponding SCB, given a SATA II tag value. + */ +struct asd_ddb_sata_tag { + __le16 scb_pointer[32]; +} __attribute__ ((packed)); + +/* This struct asd_ddb_sata_pm_table, describes a port number to + * connection handle look-up table. SATA targets attached to a port + * multiplier require a 4-bit port number value. There is one DDB + * entry of this type for each SATA port multiplier (sister DDB). + * Given a SATA PM port number, this table gives us the SATA PM Port + * DDB of the SATA port multiplier port (i.e. the SATA target + * discovered on the port). + */ +struct asd_ddb_sata_pm_table { + __le16 ddb_pointer[16]; + __le16 _r_a[16]; +} __attribute__ ((packed)); + +/* This struct asd_ddb_sata_pm_port, describes the SATA port multiplier + * port format DDB. + */ +struct asd_ddb_sata_pm_port { + u8 _r_a[15]; + u8 ddb_type; + u8 _r_b[13]; + u8 pm_port_flags; +#define PM_PORT_MASK 0xF0 +#define PM_PORT_SET 0x02 + u8 _r_c[6]; + __le16 sister_ddb; + __le16 ata_cmd_scbptr; + __le32 sata_tag_alloc_mask; + __le16 active_task_count; + __le16 parent_ddb; + __le32 sata_sactive; + u8 num_sata_tags; + u8 sata_status; + u8 sata_ending_status; + u8 _r_d[9]; +} __attribute__ ((packed)); + +/* This struct asd_ddb_seq_shared, describes a DDB shared by the + * central and link sequencers. port_map_by_links is indexed phy + * number [0,7]; each byte is a bit mask of all the phys that are in + * the same port as the indexed phy. + */ +struct asd_ddb_seq_shared { + __le16 q_free_ddb_head; + __le16 q_free_ddb_tail; + __le16 q_free_ddb_cnt; + __le16 q_used_ddb_head; + __le16 q_used_ddb_tail; + __le16 shared_mem_lock; + __le16 smp_conn_tag; + __le16 est_nexus_buf_cnt; + __le16 est_nexus_buf_thresh; + u32 _r_a; + u8 settable_max_contexts; + u8 _r_b[23]; + u8 conn_not_active; + u8 phy_is_up; + u8 _r_c[8]; + u8 port_map_by_links[8]; +} __attribute__ ((packed)); + +/* ---------- SG Element ---------- */ + +/* This struct sg_el, describes the hardware scatter gather buffer + * element. All entries are little endian. In an SCB, there are 2 of + * this, plus one more, called a link element of this indicating a + * sublist if needed. + * + * A link element has only the bus address set and the flags (DS) bit + * valid. The bus address points to the start of the sublist. 
+ * + * If a sublist is needed, then that sublist should also include the 2 + * sg_el embedded in the SCB, in which case next_sg_offset is 32, + * since sizeof(sg_el) = 16; EOS should be 1 and EOL 0 in this case. + */ +struct sg_el { + __le64 bus_addr; + __le32 size; + __le16 _r; + u8 next_sg_offs; + u8 flags; +#define ASD_SG_EL_DS_MASK 0x30 +#define ASD_SG_EL_DS_OCM 0x10 +#define ASD_SG_EL_DS_HM 0x00 +#define ASD_SG_EL_LIST_MASK 0xC0 +#define ASD_SG_EL_LIST_EOL 0x40 +#define ASD_SG_EL_LIST_EOS 0x80 +} __attribute__ ((packed)); + +/* ---------- SCBs ---------- */ + +/* An SCB (sequencer control block) is comprised of a common header + * and a task part, for a total of 128 bytes. All fields are in LE + * order, unless otherwise noted. + */ + +/* This struct scb_header, defines the SCB header format. + */ +struct scb_header { + __le64 next_scb; + __le16 index; /* transaction context */ + u8 opcode; +} __attribute__ ((packed)); + +/* SCB opcodes: Execution queue + */ +#define INITIATE_SSP_TASK 0x00 +#define INITIATE_LONG_SSP_TASK 0x01 +#define INITIATE_BIDIR_SSP_TASK 0x02 +#define ABORT_TASK 0x03 +#define INITIATE_SSP_TMF 0x04 +#define SSP_TARG_GET_DATA 0x05 +#define SSP_TARG_GET_DATA_GOOD 0x06 +#define SSP_TARG_SEND_RESP 0x07 +#define QUERY_SSP_TASK 0x08 +#define INITIATE_ATA_TASK 0x09 +#define INITIATE_ATAPI_TASK 0x0a +#define CONTROL_ATA_DEV 0x0b +#define INITIATE_SMP_TASK 0x0c +#define SMP_TARG_SEND_RESP 0x0f + +/* SCB opcodes: Send Queue + */ +#define SSP_TARG_SEND_DATA 0x40 +#define SSP_TARG_SEND_DATA_GOOD 0x41 + +/* SCB opcodes: Link Queue + */ +#define CONTROL_PHY 0x80 +#define SEND_PRIMITIVE 0x81 +#define INITIATE_LINK_ADM_TASK 0x82 + +/* SCB opcodes: other + */ +#define EMPTY_SCB 0xc0 +#define INITIATE_SEQ_ADM_TASK 0xc1 +#define EST_ICL_TARG_WINDOW 0xc2 +#define COPY_MEM 0xc3 +#define CLEAR_NEXUS 0xc4 +#define INITIATE_DDB_ADM_TASK 0xc6 +#define ESTABLISH_NEXUS_ESCB 0xd0 + +#define LUN_SIZE 8 + +/* See SAS spec, task IU + */ +struct ssp_task_iu { + u8 lun[LUN_SIZE]; /* BE */ + u16 _r_a; + u8 tmf; + u8 _r_b; + __be16 tag; /* BE */ + u8 _r_c[14]; +} __attribute__ ((packed)); + +/* See SAS spec, command IU + */ +struct ssp_command_iu { + u8 lun[LUN_SIZE]; + u8 _r_a; + u8 efb_prio_attr; /* enable first burst, task prio & attr */ +#define EFB_MASK 0x80 +#define TASK_PRIO_MASK 0x78 +#define TASK_ATTR_MASK 0x07 + + u8 _r_b; + u8 add_cdb_len; /* in dwords, since bit 0,1 are reserved */ + union { + u8 cdb[16]; + struct { + __le64 long_cdb_addr; /* bus address, LE */ + __le32 long_cdb_size; /* LE */ + u8 _r_c[3]; + u8 eol_ds; /* eol:6,6, ds:5,4 */ + } long_cdb; /* sequencer extension */ + }; +} __attribute__ ((packed)); + +struct xfer_rdy_iu { + __be32 requested_offset; /* BE */ + __be32 write_data_len; /* BE */ + __be32 _r_a; +} __attribute__ ((packed)); + +/* ---------- SCB tasks ---------- */ + +/* This is both ssp_task and long_ssp_task + */ +struct initiate_ssp_task { + u8 proto_conn_rate; /* proto:6,4, conn_rate:3,0 */ + __le32 total_xfer_len; + struct ssp_frame_hdr ssp_frame; + struct ssp_command_iu ssp_cmd; + __le16 sister_scb; /* 0xFFFF */ + __le16 conn_handle; /* index to DDB for the intended target */ + u8 data_dir; /* :1,0 */ +#define DATA_DIR_NONE 0x00 +#define DATA_DIR_IN 0x01 +#define DATA_DIR_OUT 0x02 +#define DATA_DIR_BYRECIPIENT 0x03 + + u8 _r_a; + u8 retry_count; + u8 _r_b[5]; + struct sg_el sg_element[3]; /* 2 real and 1 link */ +} __attribute__ ((packed)); + +/* This defines both ata_task and atapi_task. 
+ * ata: C bit of FIS should be 1, + * atapi: C bit of FIS should be 1, and command register should be 0xA0, + * to indicate a packet command. + */ +struct initiate_ata_task { + u8 proto_conn_rate; + __le32 total_xfer_len; + struct host_to_dev_fis fis; + __le32 data_offs; + u8 atapi_packet[16]; + u8 _r_a[12]; + __le16 sister_scb; + __le16 conn_handle; + u8 ata_flags; /* CSMI:6,6, DTM:4,4, QT:3,3, data dir:1,0 */ +#define CSMI_TASK 0x40 +#define DATA_XFER_MODE_DMA 0x10 +#define ATA_Q_TYPE_MASK 0x08 +#define ATA_Q_TYPE_UNTAGGED 0x00 +#define ATA_Q_TYPE_NCQ 0x08 + + u8 _r_b; + u8 retry_count; + u8 _r_c; + u8 flags; +#define STP_AFFIL_POLICY 0x20 +#define SET_AFFIL_POLICY 0x10 +#define RET_PARTIAL_SGLIST 0x02 + + u8 _r_d[3]; + struct sg_el sg_element[3]; +} __attribute__ ((packed)); + +struct initiate_smp_task { + u8 proto_conn_rate; + u8 _r_a[40]; + struct sg_el smp_req; + __le16 sister_scb; + __le16 conn_handle; + u8 _r_c[8]; + struct sg_el smp_resp; + u8 _r_d[32]; +} __attribute__ ((packed)); + +struct control_phy { + u8 phy_id; + u8 sub_func; +#define DISABLE_PHY 0x00 +#define ENABLE_PHY 0x01 +#define RELEASE_SPINUP_HOLD 0x02 +#define ENABLE_PHY_NO_SAS_OOB 0x03 +#define ENABLE_PHY_NO_SATA_OOB 0x04 +#define PHY_NO_OP 0x05 +#define EXECUTE_HARD_RESET 0x81 + + u8 func_mask; + u8 speed_mask; + u8 hot_plug_delay; + u8 port_type; + u8 flags; +#define DEV_PRES_TIMER_OVERRIDE_ENABLE 0x01 +#define DISABLE_PHY_IF_OOB_FAILS 0x02 + + __le32 timeout_override; + u8 link_reset_retries; + u8 _r_a[47]; + __le16 conn_handle; + u8 _r_b[56]; +} __attribute__ ((packed)); + +struct control_ata_dev { + u8 proto_conn_rate; + __le32 _r_a; + struct host_to_dev_fis fis; + u8 _r_b[32]; + __le16 sister_scb; + __le16 conn_handle; + u8 ata_flags; /* 0 */ + u8 _r_c[55]; +} __attribute__ ((packed)); + +struct empty_scb { + u8 num_valid; + __le32 _r_a; +#define ASD_EDBS_PER_SCB 7 +/* header+data+CRC+DMA suffix data */ +#define ASD_EDB_SIZE (24+1024+4+16) + struct sg_el eb[ASD_EDBS_PER_SCB]; +#define ELEMENT_NOT_VALID 0xC0 +} __attribute__ ((packed)); + +struct initiate_link_adm { + u8 phy_id; + u8 sub_func; +#define GET_LINK_ERROR_COUNT 0x00 +#define RESET_LINK_ERROR_COUNT 0x01 +#define ENABLE_NOTIFY_SPINUP_INTS 0x02 + + u8 _r_a[57]; + __le16 conn_handle; + u8 _r_b[56]; +} __attribute__ ((packed)); + +struct copy_memory { + u8 _r_a; + __le16 xfer_len; + __le16 _r_b; + __le64 src_busaddr; + u8 src_ds; /* See definition of sg_el */ + u8 _r_c[45]; + __le16 conn_handle; + __le64 _r_d; + __le64 dest_busaddr; + u8 dest_ds; /* See definition of sg_el */ + u8 _r_e[39]; +} __attribute__ ((packed)); + +struct abort_task { + u8 proto_conn_rate; + __le32 _r_a; + struct ssp_frame_hdr ssp_frame; + struct ssp_task_iu ssp_task; + __le16 sister_scb; + __le16 conn_handle; + u8 flags; /* ovrd_itnl_timer:3,3, suspend_data_trans:2,2 */ +#define SUSPEND_DATA_TRANS 0x04 + + u8 _r_b; + u8 retry_count; + u8 _r_c[5]; + __le16 index; /* Transaction context of task to be queried */ + __le16 itnl_to; + u8 _r_d[44]; +} __attribute__ ((packed)); + +struct clear_nexus { + u8 nexus; +#define NEXUS_ADAPTER 0x00 +#define NEXUS_PORT 0x01 +#define NEXUS_I_T 0x02 +#define NEXUS_I_T_L 0x03 +#define NEXUS_TAG 0x04 +#define NEXUS_TRANS_CX 0x05 +#define NEXUS_SATA_TAG 0x06 +#define NEXUS_T_L 0x07 +#define NEXUS_L 0x08 +#define NEXUS_T_TAG 0x09 + + __le32 _r_a; + u8 flags; +#define SUSPEND_TX 0x80 +#define RESUME_TX 0x40 +#define SEND_Q 0x04 +#define EXEC_Q 0x02 +#define NOTINQ 0x01 + + u8 _r_b[3]; + u8 conn_mask; + u8 _r_c[19]; + struct ssp_task_iu ssp_task; /* 
LUN and TAG */ + __le16 _r_d; + __le16 conn_handle; + __le64 _r_e; + __le16 index; /* Transaction context of task to be cleared */ + __le16 context; /* Clear nexus context */ + u8 _r_f[44]; +} __attribute__ ((packed)); + +struct initiate_ssp_tmf { + u8 proto_conn_rate; + __le32 _r_a; + struct ssp_frame_hdr ssp_frame; + struct ssp_task_iu ssp_task; + __le16 sister_scb; + __le16 conn_handle; + u8 flags; /* itnl override and suspend data tx */ +#define OVERRIDE_ITNL_TIMER 8 + + u8 _r_b; + u8 retry_count; + u8 _r_c[5]; + __le16 index; /* Transaction context of task to be queried */ + __le16 itnl_to; + u8 _r_d[44]; +} __attribute__ ((packed)); + +/* Transmits an arbitrary primitive on the link. + * Used for NOTIFY and BROADCAST. + */ +struct send_prim { + u8 phy_id; + u8 wait_transmit; /* :0,0 */ + u8 xmit_flags; +#define XMTPSIZE_MASK 0xF0 +#define XMTPSIZE_SINGLE 0x10 +#define XMTPSIZE_REPEATED 0x20 +#define XMTPSIZE_CONT 0x20 +#define XMTPSIZE_TRIPLE 0x30 +#define XMTPSIZE_REDUNDANT 0x60 +#define XMTPSIZE_INF 0 + +#define XMTCONTEN 0x04 +#define XMTPFRM 0x02 /* Transmit at the next frame boundary */ +#define XMTPIMM 0x01 /* Transmit immediately */ + + __le16 _r_a; + u8 prim[4]; /* K, D0, D1, D2 */ + u8 _r_b[50]; + __le16 conn_handle; + u8 _r_c[56]; +} __attribute__ ((packed)); + +/* This describes both SSP Target Get Data and SSP Target Get Data And + * Send Good Response SCBs. Used when the sequencer is operating in + * target mode... + */ +struct ssp_targ_get_data { + u8 proto_conn_rate; + __le32 total_xfer_len; + struct ssp_frame_hdr ssp_frame; + struct xfer_rdy_iu xfer_rdy; + u8 lun[LUN_SIZE]; + __le64 _r_a; + __le16 sister_scb; + __le16 conn_handle; + u8 data_dir; /* 01b */ + u8 _r_b; + u8 retry_count; + u8 _r_c[5]; + struct sg_el sg_element[3]; +} __attribute__ ((packed)); + +/* ---------- The actual SCB struct ---------- */ + +struct scb { + struct scb_header header; + union { + struct initiate_ssp_task ssp_task; + struct initiate_ata_task ata_task; + struct initiate_smp_task smp_task; + struct control_phy control_phy; + struct control_ata_dev control_ata_dev; + struct empty_scb escb; + struct initiate_link_adm link_adm; + struct copy_memory cp_mem; + struct abort_task abort_task; + struct clear_nexus clear_nexus; + struct initiate_ssp_tmf ssp_tmf; + }; +} __attribute__ ((packed)); + +/* ---------- Done List ---------- */ +/* The done list entry opcode field is defined below. + * The mnemonic encoding and meaning is as follows: + * TC - Task Complete, status was received and acknowledged + * TF - Task Failed, indicates an error prior to receiving acknowledgment + * for the command: + * - no conn, + * - NACK or R_ERR received in response to this command, + * - credit blocked or not available, or in the case of SMP request, + * - no SMP response was received. + * In these four cases it is known that the target didn't receive the + * command. + * TI - Task Interrupted, error after the command was acknowledged. It is + * known that the command was received by the target. + * TU - Task Unacked, command was transmitted but neither ACK (R_OK) nor NAK + * (R_ERR) was received due to loss of signal, broken connection, loss of + * dword sync or other reason. The application client should send the + * appropriate task query. + * TA - Task Aborted, see TF. + * _RESP - The completion includes an empty buffer containing status. + * TO - Timeout. 
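+ *
+ * Illustration only -- example_classify below is hypothetical, not
+ * driver code: a completion handler could use the opcode family to
+ * decide whether the target is known to have received the command.
+ */
+#if 0
+enum example_cmd_fate { CMD_NOT_RECEIVED, CMD_RECEIVED, CMD_UNKNOWN };
+
+static enum example_cmd_fate example_classify(u8 dl_opcode)
+{
+	switch (dl_opcode) {
+	case TF_OPEN_TO:	/* TF: the target never got the command */
+	case TF_OPEN_REJECT:
+	case TF_NAK_RECV:
+	case TA_I_T_NEXUS_LOSS:	/* TA: aborted, see TF */
+	case TA_ON_REQ:
+		return CMD_NOT_RECEIVED;
+	case TU_PHY_DOWN:	/* TU: fate unknown, send a task query */
+	case TU_BREAK:
+	case TU_ACK_NAK_TO:
+	case TU_SATA_TO:
+		return CMD_UNKNOWN;
+	default:		/* TC and TI: the target received it */
+		return CMD_RECEIVED;
+	}
+}
+#endif
+/*
+ * The individual opcode values follow.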
+ */ +#define TC_NO_ERROR 0x00 +#define TC_UNDERRUN 0x01 +#define TC_OVERRUN 0x02 +#define TF_OPEN_TO 0x03 +#define TF_OPEN_REJECT 0x04 +#define TI_BREAK 0x05 +#define TI_PROTO_ERR 0x06 +#define TC_SSP_RESP 0x07 +#define TI_PHY_DOWN 0x08 +#define TF_PHY_DOWN 0x09 +#define TC_LINK_ADM_RESP 0x0a +#define TC_CSMI 0x0b +#define TC_ATA_RESP 0x0c +#define TU_PHY_DOWN 0x0d +#define TU_BREAK 0x0e +#define TI_SATA_TO 0x0f +#define TI_NAK 0x10 +#define TC_CONTROL_PHY 0x11 +#define TF_BREAK 0x12 +#define TC_RESUME 0x13 +#define TI_ACK_NAK_TO 0x14 +#define TF_SMPRSP_TO 0x15 +#define TF_SMP_XMIT_RCV_ERR 0x16 +#define TC_PARTIAL_SG_LIST 0x17 +#define TU_ACK_NAK_TO 0x18 +#define TU_SATA_TO 0x19 +#define TF_NAK_RECV 0x1a +#define TA_I_T_NEXUS_LOSS 0x1b +#define TC_ATA_R_ERR_RECV 0x1c +#define TF_TMF_NO_CTX 0x1d +#define TA_ON_REQ 0x1e +#define TF_TMF_NO_TAG 0x1f +#define TF_TMF_TAG_FREE 0x20 +#define TF_TMF_TASK_DONE 0x21 +#define TF_TMF_NO_CONN_HANDLE 0x22 +#define TC_TASK_CLEARED 0x23 +#define TI_SYNCS_RECV 0x24 +#define TU_SYNCS_RECV 0x25 +#define TF_IRTT_TO 0x26 +#define TF_NO_SMP_CONN 0x27 +#define TF_IU_SHORT 0x28 +#define TF_DATA_OFFS_ERR 0x29 +#define TF_INV_CONN_HANDLE 0x2a +#define TF_REQUESTED_N_PENDING 0x2b + +/* 0xc1 - 0xc7: empty buffer received, + 0xd1 - 0xd7: establish nexus empty buffer received +*/ +/* This is the ESCB mask */ +#define ESCB_RECVD 0xC0 + + +/* This struct done_list_struct defines the done list entry. + * All fields are LE. + */ +struct done_list_struct { + __le16 index; /* aka transaction context */ + u8 opcode; + u8 status_block[4]; + u8 toggle; /* bit 0 */ +#define DL_TOGGLE_MASK 0x01 +} __attribute__ ((packed)); + +/* ---------- PHYS ---------- */ + +struct asd_phy { + struct asd_sas_phy sas_phy; + struct asd_phy_desc *phy_desc; /* hw profile */ + + struct sas_identify_frame *identify_frame; + struct asd_dma_tok *id_frm_tok; + + u8 frame_rcvd[ASD_EDB_SIZE]; +}; + + +#define ASD_SCB_SIZE sizeof(struct scb) +#define ASD_DDB_SIZE sizeof(struct asd_ddb_ssp_smp_target_port) + +/* Define this to 0 if you do not want NOTIFY (ENABLE SPINIP) sent. + * Default: 0x10 (it's a mask) + */ +#define ASD_NOTIFY_ENABLE_SPINUP 0x10 + +/* If enabled, set this to the interval between transmission + * of NOTIFY (ENABLE SPINUP). In units of 200 us. + */ +#define ASD_NOTIFY_TIMEOUT 2500 + +/* Initial delay after OOB, before we transmit NOTIFY (ENABLE SPINUP). + * If 0, transmit immediately. In milliseconds. + */ +#define ASD_NOTIFY_DOWN_COUNT 0 + +/* Device present timer timeout constant, 10 ms. */ +#define ASD_DEV_PRESENT_TIMEOUT 0x2710 + +#define ASD_SATA_INTERLOCK_TIMEOUT 0 + +/* How long to wait before shutting down an STP connection, unless + * an STP target sent frame(s). 50 usec. + * IGNORED by the sequencer (i.e. value 0 always). + */ +#define ASD_STP_SHUTDOWN_TIMEOUT 0x0 + +/* ATA soft reset timer timeout. 5 usec. */ +#define ASD_SRST_ASSERT_TIMEOUT 0x05 + +/* 31 sec */ +#define ASD_RCV_FIS_TIMEOUT 0x01D905C0 + +#define ASD_ONE_MILLISEC_TIMEOUT 0x03e8 + +/* COMINIT timer */ +#define ASD_TEN_MILLISEC_TIMEOUT 0x2710 +#define ASD_COMINIT_TIMEOUT ASD_TEN_MILLISEC_TIMEOUT + +/* 1 sec */ +#define ASD_SMP_RCV_TIMEOUT 0x000F4240 + +#endif diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_scb.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_scb.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_scb.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_scb.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,732 @@ +/* + * Aic94xx SAS/SATA driver SCB management. 
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/pci.h>
+
+#include "aic94xx.h"
+#include "aic94xx_reg.h"
+#include "aic94xx_hwi.h"
+#include "aic94xx_seq.h"
+
+#include "aic94xx_dump.h"
+
+/* ---------- EMPTY SCB ---------- */
+
+#define DL_PHY_MASK 7
+#define BYTES_DMAED 0
+#define PRIMITIVE_RECVD 0x08
+#define PHY_EVENT 0x10
+#define LINK_RESET_ERROR 0x18
+#define TIMER_EVENT 0x20
+#define REQ_TASK_ABORT 0xF0
+#define REQ_DEVICE_RESET 0xF1
+#define SIGNAL_NCQ_ERROR 0xF2
+#define CLEAR_NCQ_ERROR 0xF3
+
+#define PHY_EVENTS_STATUS (CURRENT_LOSS_OF_SIGNAL | CURRENT_OOB_DONE \
+			   | CURRENT_SPINUP_HOLD | CURRENT_GTO_TIMEOUT \
+			   | CURRENT_OOB_ERROR)
+
+static inline void get_lrate_mode(struct asd_phy *phy, u8 oob_mode)
+{
+	switch (oob_mode & 7) {
+	case PHY_SPEED_60:
+		/* FIXME: sas transport class doesn't have this */
+		phy->sas_phy.linkrate = PHY_LINKRATE_6;
+		phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_6_0_GBPS;
+		break;
+	case PHY_SPEED_30:
+		phy->sas_phy.linkrate = PHY_LINKRATE_3;
+		phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_3_0_GBPS;
+		break;
+	case PHY_SPEED_15:
+		phy->sas_phy.linkrate = PHY_LINKRATE_1_5;
+		phy->sas_phy.phy->negotiated_linkrate = SAS_LINK_RATE_1_5_GBPS;
+		break;
+	}
+	if (oob_mode & SAS_MODE)
+		phy->sas_phy.oob_mode = SAS_OOB_MODE;
+	else if (oob_mode & SATA_MODE)
+		phy->sas_phy.oob_mode = SATA_OOB_MODE;
+}
+
+static inline void asd_phy_event_tasklet(struct asd_ascb *ascb,
+					 struct done_list_struct *dl)
+{
+	struct asd_ha_struct *asd_ha = ascb->ha;
+	struct sas_ha_struct *sas_ha = &asd_ha->sas_ha;
+	int phy_id = dl->status_block[0] & DL_PHY_MASK;
+	struct asd_phy *phy = &asd_ha->phys[phy_id];
+
+	u8 oob_status = dl->status_block[1] & PHY_EVENTS_STATUS;
+	u8 oob_mode = dl->status_block[2];
+
+	switch (oob_status) {
+	case CURRENT_LOSS_OF_SIGNAL:
+		/* directly attached device was removed */
+		ASD_DPRINTK("phy%d: device unplugged\n", phy_id);
+		asd_turn_led(asd_ha, phy_id, 0);
+		sas_phy_disconnected(&phy->sas_phy);
+		sas_ha->notify_phy_event(&phy->sas_phy, PHYE_LOSS_OF_SIGNAL);
+		break;
+	case CURRENT_OOB_DONE:
+		/* hot plugged device */
+		asd_turn_led(asd_ha, phy_id, 1);
+		get_lrate_mode(phy, oob_mode);
+		ASD_DPRINTK("phy%d device plugged: lrate:0x%x, proto:0x%x\n",
+			    phy_id, phy->sas_phy.linkrate, phy->sas_phy.iproto);
+		sas_ha->notify_phy_event(&phy->sas_phy, PHYE_OOB_DONE);
+		break;
+	case CURRENT_SPINUP_HOLD:
+		/* hot plug SATA, no COMWAKE sent */
+		asd_turn_led(asd_ha, phy_id, 1);
+		sas_ha->notify_phy_event(&phy->sas_phy, PHYE_SPINUP_HOLD);
+		break;
+	case CURRENT_GTO_TIMEOUT:
+	case CURRENT_OOB_ERROR:
+		ASD_DPRINTK("phy%d error while OOB: oob status:0x%x\n", phy_id,
+			    dl->status_block[1]);
+
asd_turn_led(asd_ha, phy_id, 0); + sas_phy_disconnected(&phy->sas_phy); + sas_ha->notify_phy_event(&phy->sas_phy, PHYE_OOB_ERROR); + break; + } +} + +/* If phys are enabled sparsely, this will do the right thing. */ +static inline unsigned ord_phy(struct asd_ha_struct *asd_ha, + struct asd_phy *phy) +{ + u8 enabled_mask = asd_ha->hw_prof.enabled_phys; + int i, k = 0; + + for_each_phy(enabled_mask, enabled_mask, i) { + if (&asd_ha->phys[i] == phy) + return k; + k++; + } + return 0; +} + +/** + * asd_get_attached_sas_addr -- extract/generate attached SAS address + * phy: pointer to asd_phy + * sas_addr: pointer to buffer where the SAS address is to be written + * + * This function extracts the SAS address from an IDENTIFY frame + * received. If OOB is SATA, then a SAS address is generated from the + * HA tables. + * + * LOCKING: the frame_rcvd_lock needs to be held since this parses the frame + * buffer. + */ +static inline void asd_get_attached_sas_addr(struct asd_phy *phy, u8 *sas_addr) +{ + if (phy->sas_phy.frame_rcvd[0] == 0x34 + && phy->sas_phy.oob_mode == SATA_OOB_MODE) { + struct asd_ha_struct *asd_ha = phy->sas_phy.ha->lldd_ha; + /* FIS device-to-host */ + u64 addr = be64_to_cpu(*(__be64 *)phy->phy_desc->sas_addr); + + addr += asd_ha->hw_prof.sata_name_base + ord_phy(asd_ha, phy); + *(__be64 *)sas_addr = cpu_to_be64(addr); + } else { + struct sas_identify_frame *idframe = + (void *) phy->sas_phy.frame_rcvd; + memcpy(sas_addr, idframe->sas_addr, SAS_ADDR_SIZE); + } +} + +static inline void asd_bytes_dmaed_tasklet(struct asd_ascb *ascb, + struct done_list_struct *dl, + int edb_id, int phy_id) +{ + unsigned long flags; + int edb_el = edb_id + ascb->edb_index; + struct asd_dma_tok *edb = ascb->ha->seq.edb_arr[edb_el]; + struct asd_phy *phy = &ascb->ha->phys[phy_id]; + struct sas_ha_struct *sas_ha = phy->sas_phy.ha; + u16 size = ((dl->status_block[3] & 7) << 8) | dl->status_block[2]; + + size = min(size, (u16) sizeof(phy->frame_rcvd)); + + spin_lock_irqsave(&phy->sas_phy.frame_rcvd_lock, flags); + memcpy(phy->sas_phy.frame_rcvd, edb->vaddr, size); + phy->sas_phy.frame_rcvd_size = size; + asd_get_attached_sas_addr(phy, phy->sas_phy.attached_sas_addr); + spin_unlock_irqrestore(&phy->sas_phy.frame_rcvd_lock, flags); + asd_dump_frame_rcvd(phy, dl); + sas_ha->notify_port_event(&phy->sas_phy, PORTE_BYTES_DMAED); +} + +static inline void asd_link_reset_err_tasklet(struct asd_ascb *ascb, + struct done_list_struct *dl, + int phy_id) +{ + struct asd_ha_struct *asd_ha = ascb->ha; + struct sas_ha_struct *sas_ha = &asd_ha->sas_ha; + struct asd_sas_phy *sas_phy = sas_ha->sas_phy[phy_id]; + u8 lr_error = dl->status_block[1]; + u8 retries_left = dl->status_block[2]; + + switch (lr_error) { + case 0: + ASD_DPRINTK("phy%d: Receive ID timer expired\n", phy_id); + break; + case 1: + ASD_DPRINTK("phy%d: Loss of signal\n", phy_id); + break; + case 2: + ASD_DPRINTK("phy%d: Loss of dword sync\n", phy_id); + break; + case 3: + ASD_DPRINTK("phy%d: Receive FIS timeout\n", phy_id); + break; + default: + ASD_DPRINTK("phy%d: unknown link reset error code: 0x%x\n", + phy_id, lr_error); + break; + } + + asd_turn_led(asd_ha, phy_id, 0); + sas_phy_disconnected(sas_phy); + sas_ha->notify_port_event(sas_phy, PORTE_LINK_RESET_ERR); + + if (retries_left == 0) { + int num = 1; + struct asd_ascb *cp = asd_ascb_alloc_list(ascb->ha, &num, + GFP_ATOMIC); + if (!cp) { + asd_printk("%s: out of memory\n", __FUNCTION__); + goto out; + } + ASD_DPRINTK("phy%d: retries:0 performing link reset seq\n", + phy_id); + 
asd_build_control_phy(cp, phy_id, ENABLE_PHY); + if (asd_post_ascb_list(ascb->ha, cp, 1) != 0) + asd_ascb_free(cp); + } +out: + ; +} + +static inline void asd_primitive_rcvd_tasklet(struct asd_ascb *ascb, + struct done_list_struct *dl, + int phy_id) +{ + unsigned long flags; + struct sas_ha_struct *sas_ha = &ascb->ha->sas_ha; + struct asd_sas_phy *sas_phy = sas_ha->sas_phy[phy_id]; + u8 reg = dl->status_block[1]; + u32 cont = dl->status_block[2] << ((reg & 3)*8); + + reg &= ~3; + switch (reg) { + case LmPRMSTAT0BYTE0: + switch (cont) { + case LmBROADCH: + case LmBROADRVCH0: + case LmBROADRVCH1: + case LmBROADSES: + ASD_DPRINTK("phy%d: BROADCAST change received:%d\n", + phy_id, cont); + spin_lock_irqsave(&sas_phy->sas_prim_lock, flags); + sas_phy->sas_prim = ffs(cont); + spin_unlock_irqrestore(&sas_phy->sas_prim_lock, flags); + sas_ha->notify_port_event(sas_phy,PORTE_BROADCAST_RCVD); + break; + + case LmUNKNOWNP: + ASD_DPRINTK("phy%d: unknown BREAK\n", phy_id); + break; + + default: + ASD_DPRINTK("phy%d: primitive reg:0x%x, cont:0x%04x\n", + phy_id, reg, cont); + break; + } + break; + case LmPRMSTAT1BYTE0: + switch (cont) { + case LmHARDRST: + ASD_DPRINTK("phy%d: HARD_RESET primitive rcvd\n", + phy_id); + /* The sequencer disables all phys on that port. + * We have to re-enable the phys ourselves. */ + sas_ha->notify_port_event(sas_phy, PORTE_HARD_RESET); + break; + + default: + ASD_DPRINTK("phy%d: primitive reg:0x%x, cont:0x%04x\n", + phy_id, reg, cont); + break; + } + break; + default: + ASD_DPRINTK("unknown primitive register:0x%x\n", + dl->status_block[1]); + break; + } +} + +/** + * asd_invalidate_edb -- invalidate an EDB and if necessary post the ESCB + * @ascb: pointer to Empty SCB + * @edb_id: index [0,6] to the empty data buffer which is to be invalidated + * + * After an EDB has been invalidated, if all EDBs in this ESCB have been + * invalidated, the ESCB is posted back to the sequencer. + * Context is tasklet/IRQ. + */ +void asd_invalidate_edb(struct asd_ascb *ascb, int edb_id) +{ + struct asd_seq_data *seq = &ascb->ha->seq; + struct empty_scb *escb = &ascb->scb->escb; + struct sg_el *eb = &escb->eb[edb_id]; + struct asd_dma_tok *edb = seq->edb_arr[ascb->edb_index + edb_id]; + + memset(edb->vaddr, 0, ASD_EDB_SIZE); + eb->flags |= ELEMENT_NOT_VALID; + escb->num_valid--; + + if (escb->num_valid == 0) { + int i; + /* ASD_DPRINTK("reposting escb: vaddr: 0x%p, " + "dma_handle: 0x%08llx, next: 0x%08llx, " + "index:%d, opcode:0x%02x\n", + ascb->dma_scb.vaddr, + (u64)ascb->dma_scb.dma_handle, + le64_to_cpu(ascb->scb->header.next_scb), + le16_to_cpu(ascb->scb->header.index), + ascb->scb->header.opcode); + */ + escb->num_valid = ASD_EDBS_PER_SCB; + for (i = 0; i < ASD_EDBS_PER_SCB; i++) + escb->eb[i].flags = 0; + if (!list_empty(&ascb->list)) + list_del_init(&ascb->list); + i = asd_post_escb_list(ascb->ha, ascb, 1); + if (i) + asd_printk("couldn't post escb, err:%d\n", i); + } +} + +static void escb_tasklet_complete(struct asd_ascb *ascb, + struct done_list_struct *dl) +{ + struct asd_ha_struct *asd_ha = ascb->ha; + struct sas_ha_struct *sas_ha = &asd_ha->sas_ha; + int edb = (dl->opcode & DL_PHY_MASK) - 1; /* [0xc1,0xc7] -> [0,6] */ + u8 sb_opcode = dl->status_block[0]; + int phy_id = sb_opcode & DL_PHY_MASK; + struct asd_sas_phy *sas_phy = sas_ha->sas_phy[phy_id]; + + if (edb > 6 || edb < 0) { + ASD_DPRINTK("edb is 0x%x! 
dl->opcode is 0x%x\n",
+			    edb, dl->opcode);
+		ASD_DPRINTK("sb_opcode : 0x%x, phy_id: 0x%x\n",
+			    sb_opcode, phy_id);
+		ASD_DPRINTK("escb: vaddr: 0x%p, "
+			    "dma_handle: 0x%llx, next: 0x%llx, "
+			    "index:%d, opcode:0x%02x\n",
+			    ascb->dma_scb.vaddr,
+			    (unsigned long long)ascb->dma_scb.dma_handle,
+			    (unsigned long long)
+			    le64_to_cpu(ascb->scb->header.next_scb),
+			    le16_to_cpu(ascb->scb->header.index),
+			    ascb->scb->header.opcode);
+	}
+
+	sb_opcode &= ~DL_PHY_MASK;
+
+	switch (sb_opcode) {
+	case BYTES_DMAED:
+		ASD_DPRINTK("%s: phy%d: BYTES_DMAED\n", __FUNCTION__, phy_id);
+		asd_bytes_dmaed_tasklet(ascb, dl, edb, phy_id);
+		break;
+	case PRIMITIVE_RECVD:
+		ASD_DPRINTK("%s: phy%d: PRIMITIVE_RECVD\n", __FUNCTION__,
+			    phy_id);
+		asd_primitive_rcvd_tasklet(ascb, dl, phy_id);
+		break;
+	case PHY_EVENT:
+		ASD_DPRINTK("%s: phy%d: PHY_EVENT\n", __FUNCTION__, phy_id);
+		asd_phy_event_tasklet(ascb, dl);
+		break;
+	case LINK_RESET_ERROR:
+		ASD_DPRINTK("%s: phy%d: LINK_RESET_ERROR\n", __FUNCTION__,
+			    phy_id);
+		asd_link_reset_err_tasklet(ascb, dl, phy_id);
+		break;
+	case TIMER_EVENT:
+		ASD_DPRINTK("%s: phy%d: TIMER_EVENT, lost dw sync\n",
+			    __FUNCTION__, phy_id);
+		asd_turn_led(asd_ha, phy_id, 0);
+		/* the device is gone */
+		sas_phy_disconnected(sas_phy);
+		sas_ha->notify_port_event(sas_phy, PORTE_TIMER_EVENT);
+		break;
+	case REQ_TASK_ABORT:
+		ASD_DPRINTK("%s: phy%d: REQ_TASK_ABORT\n", __FUNCTION__,
+			    phy_id);
+		break;
+	case REQ_DEVICE_RESET:
+		ASD_DPRINTK("%s: phy%d: REQ_DEVICE_RESET\n", __FUNCTION__,
+			    phy_id);
+		break;
+	case SIGNAL_NCQ_ERROR:
+		ASD_DPRINTK("%s: phy%d: SIGNAL_NCQ_ERROR\n", __FUNCTION__,
+			    phy_id);
+		break;
+	case CLEAR_NCQ_ERROR:
+		ASD_DPRINTK("%s: phy%d: CLEAR_NCQ_ERROR\n", __FUNCTION__,
+			    phy_id);
+		break;
+	default:
+		ASD_DPRINTK("%s: phy%d: unknown event:0x%x\n", __FUNCTION__,
+			    phy_id, sb_opcode);
+		ASD_DPRINTK("edb is 0x%x! dl->opcode is 0x%x\n",
+			    edb, dl->opcode);
+		ASD_DPRINTK("sb_opcode : 0x%x, phy_id: 0x%x\n",
+			    sb_opcode, phy_id);
+		ASD_DPRINTK("escb: vaddr: 0x%p, "
+			    "dma_handle: 0x%llx, next: 0x%llx, "
+			    "index:%d, opcode:0x%02x\n",
+			    ascb->dma_scb.vaddr,
+			    (unsigned long long)ascb->dma_scb.dma_handle,
+			    (unsigned long long)
+			    le64_to_cpu(ascb->scb->header.next_scb),
+			    le16_to_cpu(ascb->scb->header.index),
+			    ascb->scb->header.opcode);
+
+		break;
+	}
+
+	asd_invalidate_edb(ascb, edb);
+}
+
+int asd_init_post_escbs(struct asd_ha_struct *asd_ha)
+{
+	struct asd_seq_data *seq = &asd_ha->seq;
+	int i;
+
+	for (i = 0; i < seq->num_escbs; i++)
+		seq->escb_arr[i]->tasklet_complete = escb_tasklet_complete;
+
+	ASD_DPRINTK("posting %d escbs\n", i);
+	return asd_post_escb_list(asd_ha, seq->escb_arr[0], seq->num_escbs);
+}
+
+/* ---------- CONTROL PHY ---------- */
+
+#define CONTROL_PHY_STATUS (CURRENT_DEVICE_PRESENT | CURRENT_OOB_DONE \
+			    | CURRENT_SPINUP_HOLD | CURRENT_GTO_TIMEOUT \
+			    | CURRENT_OOB_ERROR)
+
+/**
+ * control_phy_tasklet_complete -- tasklet complete for CONTROL PHY ascb
+ * @ascb: pointer to an ascb
+ * @dl: pointer to the done list entry
+ *
+ * This function completes a CONTROL PHY scb and frees the ascb.
+ * A note on LEDs:
+ * - an LED blinks if there is IO through it,
+ * - if a device is connected to the LED, it is lit,
+ * - if no device is connected to the LED, it is dimmed (off).
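+ *
+ * Illustration only -- example_update_led is hypothetical and merely
+ * restates the last two rules in code form:
+ */
+#if 0
+static void example_update_led(struct asd_ha_struct *asd_ha, int phy_id,
+			       int device_present)
+{
+	/* lit when a device is attached, dimmed (off) otherwise */
+	asd_turn_led(asd_ha, phy_id, device_present ? 1 : 0);
+}
+#endif
+/*
+ * The completion handler below applies these rules per sub-function.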
+ */ +static void control_phy_tasklet_complete(struct asd_ascb *ascb, + struct done_list_struct *dl) +{ + struct asd_ha_struct *asd_ha = ascb->ha; + struct scb *scb = ascb->scb; + struct control_phy *control_phy = &scb->control_phy; + u8 phy_id = control_phy->phy_id; + struct asd_phy *phy = &ascb->ha->phys[phy_id]; + + u8 status = dl->status_block[0]; + u8 oob_status = dl->status_block[1]; + u8 oob_mode = dl->status_block[2]; + /* u8 oob_signals= dl->status_block[3]; */ + + if (status != 0) { + ASD_DPRINTK("%s: phy%d status block opcode:0x%x\n", + __FUNCTION__, phy_id, status); + goto out; + } + + switch (control_phy->sub_func) { + case DISABLE_PHY: + asd_ha->hw_prof.enabled_phys &= ~(1 << phy_id); + asd_turn_led(asd_ha, phy_id, 0); + asd_control_led(asd_ha, phy_id, 0); + ASD_DPRINTK("%s: disable phy%d\n", __FUNCTION__, phy_id); + break; + + case ENABLE_PHY: + asd_control_led(asd_ha, phy_id, 1); + if (oob_status & CURRENT_OOB_DONE) { + asd_ha->hw_prof.enabled_phys |= (1 << phy_id); + get_lrate_mode(phy, oob_mode); + asd_turn_led(asd_ha, phy_id, 1); + ASD_DPRINTK("%s: phy%d, lrate:0x%x, proto:0x%x\n", + __FUNCTION__, phy_id,phy->sas_phy.linkrate, + phy->sas_phy.iproto); + } else if (oob_status & CURRENT_SPINUP_HOLD) { + asd_ha->hw_prof.enabled_phys |= (1 << phy_id); + asd_turn_led(asd_ha, phy_id, 1); + ASD_DPRINTK("%s: phy%d, spinup hold\n", __FUNCTION__, + phy_id); + } else if (oob_status & CURRENT_ERR_MASK) { + asd_turn_led(asd_ha, phy_id, 0); + ASD_DPRINTK("%s: phy%d: error: oob status:0x%02x\n", + __FUNCTION__, phy_id, oob_status); + } else if (oob_status & (CURRENT_HOT_PLUG_CNCT + | CURRENT_DEVICE_PRESENT)) { + asd_ha->hw_prof.enabled_phys |= (1 << phy_id); + asd_turn_led(asd_ha, phy_id, 1); + ASD_DPRINTK("%s: phy%d: hot plug or device present\n", + __FUNCTION__, phy_id); + } else { + asd_ha->hw_prof.enabled_phys |= (1 << phy_id); + asd_turn_led(asd_ha, phy_id, 0); + ASD_DPRINTK("%s: phy%d: no device present: " + "oob_status:0x%x\n", + __FUNCTION__, phy_id, oob_status); + } + break; + case RELEASE_SPINUP_HOLD: + case PHY_NO_OP: + case EXECUTE_HARD_RESET: + ASD_DPRINTK("%s: phy%d: sub_func:0x%x\n", __FUNCTION__, + phy_id, control_phy->sub_func); + /* XXX finish */ + break; + default: + ASD_DPRINTK("%s: phy%d: sub_func:0x%x?\n", __FUNCTION__, + phy_id, control_phy->sub_func); + break; + } +out: + asd_ascb_free(ascb); +} + +static inline void set_speed_mask(u8 *speed_mask, struct asd_phy_desc *pd) +{ + /* disable all speeds, then enable defaults */ + *speed_mask = SAS_SPEED_60_DIS | SAS_SPEED_30_DIS | SAS_SPEED_15_DIS + | SATA_SPEED_30_DIS | SATA_SPEED_15_DIS; + + switch (pd->max_sas_lrate) { + case PHY_LINKRATE_6: + *speed_mask &= ~SAS_SPEED_60_DIS; + default: + case PHY_LINKRATE_3: + *speed_mask &= ~SAS_SPEED_30_DIS; + case PHY_LINKRATE_1_5: + *speed_mask &= ~SAS_SPEED_15_DIS; + } + + switch (pd->min_sas_lrate) { + case PHY_LINKRATE_6: + *speed_mask |= SAS_SPEED_30_DIS; + case PHY_LINKRATE_3: + *speed_mask |= SAS_SPEED_15_DIS; + default: + case PHY_LINKRATE_1_5: + /* nothing to do */ + ; + } + + switch (pd->max_sata_lrate) { + case PHY_LINKRATE_3: + *speed_mask &= ~SATA_SPEED_30_DIS; + default: + case PHY_LINKRATE_1_5: + *speed_mask &= ~SATA_SPEED_15_DIS; + } + + switch (pd->min_sata_lrate) { + case PHY_LINKRATE_3: + *speed_mask |= SATA_SPEED_15_DIS; + default: + case PHY_LINKRATE_1_5: + /* nothing to do */ + ; + } +} + +/** + * asd_build_control_phy -- build a CONTROL PHY SCB + * @ascb: pointer to an ascb + * @phy_id: phy id to control, integer + * @subfunc: subfunction, what to 
actually do to the phy
+ *
+ * This function builds a CONTROL PHY scb. No allocation of any kind
+ * is performed. @ascb is allocated with the list function.
+ * The caller can override the ascb->tasklet_complete to point
+ * to its own callback function. It must call asd_ascb_free()
+ * at its tasklet complete function.
+ * See the default implementation.
+ */
+void asd_build_control_phy(struct asd_ascb *ascb, int phy_id, u8 subfunc)
+{
+	struct asd_phy *phy = &ascb->ha->phys[phy_id];
+	struct scb *scb = ascb->scb;
+	struct control_phy *control_phy = &scb->control_phy;
+
+	scb->header.opcode = CONTROL_PHY;
+	control_phy->phy_id = (u8) phy_id;
+	control_phy->sub_func = subfunc;
+
+	switch (subfunc) {
+	case EXECUTE_HARD_RESET:  /* 0x81 */
+	case ENABLE_PHY:          /* 0x01 */
+		/* decide hot plug delay */
+		control_phy->hot_plug_delay = HOTPLUG_DELAY_TIMEOUT;
+
+		/* decide speed mask */
+		set_speed_mask(&control_phy->speed_mask, phy->phy_desc);
+
+		/* initiator port settings are in the hi nibble */
+		if (phy->sas_phy.role == PHY_ROLE_INITIATOR)
+			control_phy->port_type = SAS_PROTO_ALL << 4;
+		else if (phy->sas_phy.role == PHY_ROLE_TARGET)
+			control_phy->port_type = SAS_PROTO_ALL;
+		else
+			control_phy->port_type =
+				(SAS_PROTO_ALL << 4) | SAS_PROTO_ALL;
+
+		/* link reset retries, this should be nominal */
+		control_phy->link_reset_retries = 10;
+
+	case RELEASE_SPINUP_HOLD: /* 0x02 */
+		/* decide the func_mask */
+		control_phy->func_mask = FUNCTION_MASK_DEFAULT;
+		if (phy->phy_desc->flags & ASD_SATA_SPINUP_HOLD)
+			control_phy->func_mask &= ~SPINUP_HOLD_DIS;
+		else
+			control_phy->func_mask |= SPINUP_HOLD_DIS;
+	}
+
+	control_phy->conn_handle = cpu_to_le16(0xFFFF);
+
+	ascb->tasklet_complete = control_phy_tasklet_complete;
+}
+
+/* ---------- INITIATE LINK ADM TASK ---------- */
+
+static void link_adm_tasklet_complete(struct asd_ascb *ascb,
+				      struct done_list_struct *dl)
+{
+	u8 opcode = dl->opcode;
+	struct initiate_link_adm *link_adm = &ascb->scb->link_adm;
+	u8 phy_id = link_adm->phy_id;
+
+	if (opcode != TC_NO_ERROR) {
+		asd_printk("phy%d: link adm task 0x%x completed with error "
+			   "0x%x\n", phy_id, link_adm->sub_func, opcode);
+	}
+	ASD_DPRINTK("phy%d: link adm task 0x%x: 0x%x\n",
+		    phy_id, link_adm->sub_func, opcode);
+
+	asd_ascb_free(ascb);
+}
+
+void asd_build_initiate_link_adm_task(struct asd_ascb *ascb, int phy_id,
+				      u8 subfunc)
+{
+	struct scb *scb = ascb->scb;
+	struct initiate_link_adm *link_adm = &scb->link_adm;
+
+	scb->header.opcode = INITIATE_LINK_ADM_TASK;
+
+	link_adm->phy_id = phy_id;
+	link_adm->sub_func = subfunc;
+	link_adm->conn_handle = cpu_to_le16(0xFFFF);
+
+	ascb->tasklet_complete = link_adm_tasklet_complete;
+}
+
+/* ---------- SCB timer ---------- */
+
+/**
+ * asd_ascb_timedout -- called when a pending SCB's timer has expired
+ * @data: unsigned long, a pointer to the ascb in question
+ *
+ * This is the default timeout function, which does the minimum
+ * necessary. Upper layers can implement their own timeout function,
+ * say, to free resources they have with this SCB, and then call this
+ * one at the end of their timeout function. To do this, one should
+ * initialize the ascb->timer.{function, data, expires} prior to
+ * calling the post function. The timer is started by the post function.
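+ *
+ * A minimal usage sketch (illustration only; my_timedout and the ten
+ * second expiry are made up):
+ */
+#if 0
+static void my_timedout(unsigned long data)
+{
+	struct asd_ascb *ascb = (void *) data;
+
+	/* ... release resources held for this SCB here ... */
+	asd_ascb_timedout(data);	/* then run the default handler */
+}
+
+	/* before posting the ascb: */
+	ascb->timer.function = my_timedout;
+	ascb->timer.data = (unsigned long) ascb;
+	ascb->timer.expires = jiffies + 10*HZ;
+#endif
+/*
+ * The default implementation follows.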
+ */ +void asd_ascb_timedout(unsigned long data) +{ + struct asd_ascb *ascb = (void *) data; + struct asd_seq_data *seq = &ascb->ha->seq; + unsigned long flags; + + ASD_DPRINTK("scb:0x%x timed out\n", ascb->scb->header.opcode); + + spin_lock_irqsave(&seq->pend_q_lock, flags); + seq->pending--; + list_del_init(&ascb->list); + spin_unlock_irqrestore(&seq->pend_q_lock, flags); + + asd_ascb_free(ascb); +} + +/* ---------- CONTROL PHY ---------- */ + +/* Given the spec value, return a driver value. */ +static const int phy_func_table[] = { + [PHY_FUNC_NOP] = PHY_NO_OP, + [PHY_FUNC_LINK_RESET] = ENABLE_PHY, + [PHY_FUNC_HARD_RESET] = EXECUTE_HARD_RESET, + [PHY_FUNC_DISABLE] = DISABLE_PHY, + [PHY_FUNC_RELEASE_SPINUP_HOLD] = RELEASE_SPINUP_HOLD, +}; + +int asd_control_phy(struct asd_sas_phy *phy, enum phy_func func) +{ + struct asd_ha_struct *asd_ha = phy->ha->lldd_ha; + struct asd_ascb *ascb; + int res = 1; + + if (func == PHY_FUNC_CLEAR_ERROR_LOG) + return -ENOSYS; + + ascb = asd_ascb_alloc_list(asd_ha, &res, GFP_KERNEL); + if (!ascb) + return -ENOMEM; + + asd_build_control_phy(ascb, phy->id, phy_func_table[func]); + res = asd_post_ascb_list(asd_ha, ascb , 1); + if (res) + asd_ascb_free(ascb); + + return res; +} diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_sds.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_sds.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_sds.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_sds.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1154 @@ +/* + * Aic94xx SAS/SATA driver access to shared data structures and memory + * maps. + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "aic94xx.h"
+#include "aic94xx_reg.h"
+
+/* ---------- OCM stuff ---------- */
+
+struct asd_ocm_dir_ent {
+	u8 type;
+	u8 offs[3];
+	u8 _r1;
+	u8 size[3];
+} __attribute__ ((packed));
+
+struct asd_ocm_dir {
+	char sig[2];
+	u8 _r1[2];
+	u8 major;          /* 0 */
+	u8 minor;          /* 0 */
+	u8 _r2;
+	u8 num_de;
+	struct asd_ocm_dir_ent entry[15];
+} __attribute__ ((packed));
+
+#define OCM_DE_OCM_DIR 0x00
+#define OCM_DE_WIN_DRVR 0x01
+#define OCM_DE_BIOS_CHIM 0x02
+#define OCM_DE_RAID_ENGN 0x03
+#define OCM_DE_BIOS_INTL 0x04
+#define OCM_DE_BIOS_CHIM_OSM 0x05
+#define OCM_DE_BIOS_CHIM_DYNAMIC 0x06
+#define OCM_DE_ADDC2C_RES0 0x07
+#define OCM_DE_ADDC2C_RES1 0x08
+#define OCM_DE_ADDC2C_RES2 0x09
+#define OCM_DE_ADDC2C_RES3 0x0A
+
+#define OCM_INIT_DIR_ENTRIES 5
+/***************************************************************************
+* OCM directory default
+***************************************************************************/
+static struct asd_ocm_dir OCMDirInit =
+{
+	.sig = {0x4D, 0x4F},	/* signature */
+	.num_de = OCM_INIT_DIR_ENTRIES,	/* no. of directory entries */
+};
+
+/***************************************************************************
+* OCM directory Entries default
+***************************************************************************/
+static struct asd_ocm_dir_ent OCMDirEntriesInit[OCM_INIT_DIR_ENTRIES] =
+{
+	{
+		.type = (OCM_DE_ADDC2C_RES0),	/* Entry type */
+		.offs = {128},			/* Offset */
+		.size = {0, 4},			/* size */
+	},
+	{
+		.type = (OCM_DE_ADDC2C_RES1),	/* Entry type */
+		.offs = {128, 4},		/* Offset */
+		.size = {0, 4},			/* size */
+	},
+	{
+		.type = (OCM_DE_ADDC2C_RES2),	/* Entry type */
+		.offs = {128, 8},		/* Offset */
+		.size = {0, 4},			/* size */
+	},
+	{
+		.type = (OCM_DE_ADDC2C_RES3),	/* Entry type */
+		.offs = {128, 12},		/* Offset */
+		.size = {0, 4},			/* size */
+	},
+	{
+		.type = (OCM_DE_WIN_DRVR),	/* Entry type */
+		.offs = {128, 16},		/* Offset */
+		.size = {128, 235, 1},		/* size */
+	},
+};
+
+struct asd_bios_chim_struct {
+	char sig[4];
+	u8 major;          /* 1 */
+	u8 minor;          /* 0 */
+	u8 bios_major;
+	u8 bios_minor;
+	__le32 bios_build;
+	u8 flags;
+	u8 pci_slot;
+	__le16 ue_num;
+	__le16 ue_size;
+	u8 _r[14];
+	/* The unit element array is right here.
+	 */
+} __attribute__ ((packed));
+
+/**
+ * asd_read_ocm_seg - read an on chip memory (OCM) segment
+ * @asd_ha: pointer to the host adapter structure
+ * @buffer: where to write the read data
+ * @offs: offset into OCM where to read from
+ * @size: how many bytes to read
+ *
+ * Return the number of bytes not read. Return 0 on success.
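+ *
+ * A usage sketch (illustration only; buf, offs and size are made up,
+ * and the check follows the return convention documented above):
+ */
+#if 0
+	if (asd_read_ocm_seg(asd_ha, buf, offs, size) != 0)
+		return -EIO;	/* short read */
+#endif
+/*
+ * When the chip is mapped in I/O space the segment is fetched with a
+ * string read; otherwise it is read out of OCM one byte at a time.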
+ */ +static int asd_read_ocm_seg(struct asd_ha_struct *asd_ha, void *buffer, + u32 offs, int size) +{ + u8 *p = buffer; + if (unlikely(asd_ha->iospace)) + asd_read_reg_string(asd_ha, buffer, offs+OCM_BASE_ADDR, size); + else { + for ( ; size > 0; size--, offs++, p++) + *p = asd_read_ocm_byte(asd_ha, offs); + } + return size; +} + +static int asd_read_ocm_dir(struct asd_ha_struct *asd_ha, + struct asd_ocm_dir *dir, u32 offs) +{ + int err = asd_read_ocm_seg(asd_ha, dir, offs, sizeof(*dir)); + if (err) { + ASD_DPRINTK("couldn't read ocm segment\n"); + return err; + } + + if (dir->sig[0] != 'M' || dir->sig[1] != 'O') { + ASD_DPRINTK("no valid dir signature(%c%c) at start of OCM\n", + dir->sig[0], dir->sig[1]); + return -ENOENT; + } + if (dir->major != 0) { + asd_printk("unsupported major version of ocm dir:0x%x\n", + dir->major); + return -ENOENT; + } + dir->num_de &= 0xf; + return 0; +} + +/** + * asd_write_ocm_seg - write an on chip memory (OCM) segment + * @asd_ha: pointer to the host adapter structure + * @buffer: where to read the write data + * @offs: offset into OCM to write to + * @size: how many bytes to write + * + * Return the number of bytes not written. Return 0 on success. + */ +static void asd_write_ocm_seg(struct asd_ha_struct *asd_ha, void *buffer, + u32 offs, int size) +{ + u8 *p = buffer; + if (unlikely(asd_ha->iospace)) + asd_write_reg_string(asd_ha, buffer, offs+OCM_BASE_ADDR, size); + else { + for ( ; size > 0; size--, offs++, p++) + asd_write_ocm_byte(asd_ha, offs, *p); + } + return; +} + +#define THREE_TO_NUM(X) ((X)[0] | ((X)[1] << 8) | ((X)[2] << 16)) + +static int asd_find_dir_entry(struct asd_ocm_dir *dir, u8 type, + u32 *offs, u32 *size) +{ + int i; + struct asd_ocm_dir_ent *ent; + + for (i = 0; i < dir->num_de; i++) { + if (dir->entry[i].type == type) + break; + } + if (i >= dir->num_de) + return -ENOENT; + ent = &dir->entry[i]; + *offs = (u32) THREE_TO_NUM(ent->offs); + *size = (u32) THREE_TO_NUM(ent->size); + return 0; +} + +#define OCM_BIOS_CHIM_DE 2 +#define BC_BIOS_PRESENT 1 + +static int asd_get_bios_chim(struct asd_ha_struct *asd_ha, + struct asd_ocm_dir *dir) +{ + int err; + struct asd_bios_chim_struct *bc_struct; + u32 offs, size; + + err = asd_find_dir_entry(dir, OCM_BIOS_CHIM_DE, &offs, &size); + if (err) { + ASD_DPRINTK("couldn't find BIOS_CHIM dir ent\n"); + goto out; + } + err = -ENOMEM; + bc_struct = kmalloc(sizeof(*bc_struct), GFP_KERNEL); + if (!bc_struct) { + asd_printk("no memory for bios_chim struct\n"); + goto out; + } + err = asd_read_ocm_seg(asd_ha, (void *)bc_struct, offs, + sizeof(*bc_struct)); + if (err) { + ASD_DPRINTK("couldn't read ocm segment\n"); + goto out2; + } + if (strncmp(bc_struct->sig, "SOIB", 4) + && strncmp(bc_struct->sig, "IPSA", 4)) { + ASD_DPRINTK("BIOS_CHIM entry has no valid sig(%c%c%c%c)\n", + bc_struct->sig[0], bc_struct->sig[1], + bc_struct->sig[2], bc_struct->sig[3]); + err = -ENOENT; + goto out2; + } + if (bc_struct->major != 1) { + asd_printk("BIOS_CHIM unsupported major version:0x%x\n", + bc_struct->major); + err = -ENOENT; + goto out2; + } + if (bc_struct->flags & BC_BIOS_PRESENT) { + asd_ha->hw_prof.bios.present = 1; + asd_ha->hw_prof.bios.maj = bc_struct->bios_major; + asd_ha->hw_prof.bios.min = bc_struct->bios_minor; + asd_ha->hw_prof.bios.bld = le32_to_cpu(bc_struct->bios_build); + ASD_DPRINTK("BIOS present (%d,%d), %d\n", + asd_ha->hw_prof.bios.maj, + asd_ha->hw_prof.bios.min, + asd_ha->hw_prof.bios.bld); + } + asd_ha->hw_prof.ue.num = le16_to_cpu(bc_struct->ue_num); + asd_ha->hw_prof.ue.size= 
le16_to_cpu(bc_struct->ue_size); + ASD_DPRINTK("ue num:%d, ue size:%d\n", asd_ha->hw_prof.ue.num, + asd_ha->hw_prof.ue.size); + size = asd_ha->hw_prof.ue.num * asd_ha->hw_prof.ue.size; + if (size > 0) { + err = -ENOMEM; + asd_ha->hw_prof.ue.area = kmalloc(size, GFP_KERNEL); + if (!asd_ha->hw_prof.ue.area) + goto out2; + err = asd_read_ocm_seg(asd_ha, (void *)asd_ha->hw_prof.ue.area, + offs + sizeof(*bc_struct), size); + if (err) { + kfree(asd_ha->hw_prof.ue.area); + asd_ha->hw_prof.ue.area = NULL; + asd_ha->hw_prof.ue.num = 0; + asd_ha->hw_prof.ue.size = 0; + ASD_DPRINTK("couldn't read ue entries(%d)\n", err); + } + } +out2: + kfree(bc_struct); +out: + return err; +} + +static void +asd_hwi_initialize_ocm_dir (struct asd_ha_struct *asd_ha) +{ + int i; + + /* Zero OCM */ + for (i = 0; i < OCM_MAX_SIZE; i += 4) + asd_write_ocm_dword(asd_ha, i, 0); + + /* Write Dir */ + asd_write_ocm_seg(asd_ha, &OCMDirInit, 0, + sizeof(struct asd_ocm_dir)); + + /* Write Dir Entries */ + for (i = 0; i < OCM_INIT_DIR_ENTRIES; i++) + asd_write_ocm_seg(asd_ha, &OCMDirEntriesInit[i], + sizeof(struct asd_ocm_dir) + + (i * sizeof(struct asd_ocm_dir_ent)) + , sizeof(struct asd_ocm_dir_ent)); + +} + +static int +asd_hwi_check_ocm_access (struct asd_ha_struct *asd_ha) +{ + struct pci_dev *pcidev = asd_ha->pcidev; + u32 reg; + int err = 0; + u32 v; + + /* check if OCM has been initialized by BIOS */ + reg = asd_read_reg_dword(asd_ha, EXSICNFGR); + + if (!(reg & OCMINITIALIZED)) { + err = pci_read_config_dword(pcidev, PCIC_INTRPT_STAT, &v); + if (err) { + asd_printk("couldn't access PCIC_INTRPT_STAT of %s\n", + pci_name(pcidev)); + goto out; + } + + printk(KERN_INFO "OCM is not initialized by BIOS," + "reinitialize it and ignore it, current IntrptStatus" + "is 0x%x\n", v); + + if (v) + err = pci_write_config_dword(pcidev, + PCIC_INTRPT_STAT, v); + if (err) { + asd_printk("couldn't write PCIC_INTRPT_STAT of %s\n", + pci_name(pcidev)); + goto out; + } + + asd_hwi_initialize_ocm_dir(asd_ha); + + } +out: + return err; +} + +/** + * asd_read_ocm - read on chip memory (OCM) + * @asd_ha: pointer to the host adapter structure + */ +int asd_read_ocm(struct asd_ha_struct *asd_ha) +{ + int err; + struct asd_ocm_dir *dir; + + if (asd_hwi_check_ocm_access(asd_ha)) + return -1; + + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) { + asd_printk("no memory for ocm dir\n"); + return -ENOMEM; + } + + err = asd_read_ocm_dir(asd_ha, dir, 0); + if (err) + goto out; + + err = asd_get_bios_chim(asd_ha, dir); +out: + kfree(dir); + return err; +} + +/* ---------- FLASH stuff ---------- */ + +#define FLASH_RESET 0xF0 +#define FLASH_MANUF_AMD 0x01 +#define FLASH_MANUF_ST 0x20 +#define FLASH_MANUF_FUJITSU 0x04 +#define FLASH_MANUF_MACRONIX 0xC2 + +#define FLASH_SIZE 0x200000 +#define FLASH_DIR_COOKIE "*** ADAPTEC FLASH DIRECTORY *** " +#define FLASH_NEXT_ENTRY_OFFS 0x2000 +#define FLASH_MAX_DIR_ENTRIES 32 + +#define FLASH_DE_TYPE_MASK 0x3FFFFFFF +#define FLASH_DE_MS 0x120 +#define FLASH_DE_CTRL_A_USER 0xE0 + +struct asd_flash_de { + __le32 type; + __le32 offs; + __le32 pad_size; + __le32 image_size; + __le32 chksum; + u8 _r[12]; + u8 version[32]; +} __attribute__ ((packed)); + +struct asd_flash_dir { + u8 cookie[32]; + __le32 rev; /* 2 */ + __le32 chksum; + __le32 chksum_antidote; + __le32 bld; + u8 bld_id[32]; /* build id data */ + u8 ver_data[32]; /* date and time of build */ + __le32 ae_mask; + __le32 v_mask; + __le32 oc_mask; + u8 _r[20]; + struct asd_flash_de dir_entry[FLASH_MAX_DIR_ENTRIES]; +} __attribute__ ((packed)); + +struct 
asd_manuf_sec { + char sig[2]; /* 'S', 'M' */ + u16 offs_next; + u8 maj; /* 0 */ + u8 min; /* 0 */ + u16 chksum; + u16 size; + u8 _r[6]; + u8 sas_addr[SAS_ADDR_SIZE]; + u8 pcba_sn[ASD_PCBA_SN_SIZE]; + /* Here start the other segments */ + u8 linked_list[0]; +} __attribute__ ((packed)); + +struct asd_manuf_phy_desc { + u8 state; /* low 4 bits */ +#define MS_PHY_STATE_ENABLEABLE 0 +#define MS_PHY_STATE_REPORTED 1 +#define MS_PHY_STATE_HIDDEN 2 + u8 phy_id; + u16 _r; + u8 phy_control_0; /* mode 5 reg 0x160 */ + u8 phy_control_1; /* mode 5 reg 0x161 */ + u8 phy_control_2; /* mode 5 reg 0x162 */ + u8 phy_control_3; /* mode 5 reg 0x163 */ +} __attribute__ ((packed)); + +struct asd_manuf_phy_param { + char sig[2]; /* 'P', 'M' */ + u16 next; + u8 maj; /* 0 */ + u8 min; /* 2 */ + u8 num_phy_desc; /* 8 */ + u8 phy_desc_size; /* 8 */ + u8 _r[3]; + u8 usage_model_id; + u32 _r2; + struct asd_manuf_phy_desc phy_desc[ASD_MAX_PHYS]; +} __attribute__ ((packed)); + +#if 0 +static const char *asd_sb_type[] = { + "unknown", + "SGPIO", + [2 ... 0x7F] = "unknown", + [0x80] = "ADPT_I2C", + [0x81 ... 0xFF] = "VENDOR_UNIQUExx" +}; +#endif + +struct asd_ms_sb_desc { + u8 type; + u8 node_desc_index; + u8 conn_desc_index; + u8 _recvd[0]; +} __attribute__ ((packed)); + +#if 0 +static const char *asd_conn_type[] = { + [0 ... 7] = "unknown", + "SFF8470", + "SFF8482", + "SFF8484", + [0x80] = "PCIX_DAUGHTER0", + [0x81] = "SAS_DAUGHTER0", + [0x82 ... 0xFF] = "VENDOR_UNIQUExx" +}; + +static const char *asd_conn_location[] = { + "unknown", + "internal", + "external", + "board_to_board", +}; +#endif + +struct asd_ms_conn_desc { + u8 type; + u8 location; + u8 num_sideband_desc; + u8 size_sideband_desc; + u32 _resvd; + u8 name[16]; + struct asd_ms_sb_desc sb_desc[0]; +} __attribute__ ((packed)); + +struct asd_nd_phy_desc { + u8 vp_attch_type; + u8 attch_specific[0]; +} __attribute__ ((packed)); + +#if 0 +static const char *asd_node_type[] = { + "IOP", + "IO_CONTROLLER", + "EXPANDER", + "PORT_MULTIPLIER", + "PORT_MULTIPLEXER", + "MULTI_DROP_I2C_BUS", +}; +#endif + +struct asd_ms_node_desc { + u8 type; + u8 num_phy_desc; + u8 size_phy_desc; + u8 _resvd; + u8 name[16]; + struct asd_nd_phy_desc phy_desc[0]; +} __attribute__ ((packed)); + +struct asd_ms_conn_map { + char sig[2]; /* 'M', 'C' */ + __le16 next; + u8 maj; /* 0 */ + u8 min; /* 0 */ + __le16 cm_size; /* size of this struct */ + u8 num_conn; + u8 conn_size; + u8 num_nodes; + u8 usage_model_id; + u32 _resvd; + struct asd_ms_conn_desc conn_desc[0]; + struct asd_ms_node_desc node_desc[0]; +} __attribute__ ((packed)); + +struct asd_ctrla_phy_entry { + u8 sas_addr[SAS_ADDR_SIZE]; + u8 sas_link_rates; /* max in hi bits, min in low bits */ + u8 flags; + u8 sata_link_rates; + u8 _r[5]; +} __attribute__ ((packed)); + +struct asd_ctrla_phy_settings { + u8 id0; /* P'h'y */ + u8 _r; + u16 next; + u8 num_phys; /* number of PHYs in the PCI function */ + u8 _r2[3]; + struct asd_ctrla_phy_entry phy_ent[ASD_MAX_PHYS]; +} __attribute__ ((packed)); + +struct asd_ll_el { + u8 id0; + u8 id1; + __le16 next; + u8 something_here[0]; +} __attribute__ ((packed)); + +static int asd_poll_flash(struct asd_ha_struct *asd_ha) +{ + int c; + u8 d; + + for (c = 5000; c > 0; c--) { + d = asd_read_reg_byte(asd_ha, asd_ha->hw_prof.flash.bar); + d ^= asd_read_reg_byte(asd_ha, asd_ha->hw_prof.flash.bar); + if (!d) + return 0; + udelay(5); + } + return -ENOENT; +} + +static int asd_reset_flash(struct asd_ha_struct *asd_ha) +{ + int err; + + err = asd_poll_flash(asd_ha); + if (err) + return err; + 
asd_write_reg_byte(asd_ha, asd_ha->hw_prof.flash.bar, FLASH_RESET); + err = asd_poll_flash(asd_ha); + + return err; +} + +static inline int asd_read_flash_seg(struct asd_ha_struct *asd_ha, + void *buffer, u32 offs, int size) +{ + asd_read_reg_string(asd_ha, buffer, asd_ha->hw_prof.flash.bar+offs, + size); + return 0; +} + +/** + * asd_find_flash_dir - finds and reads the flash directory + * @asd_ha: pointer to the host adapter structure + * @flash_dir: pointer to flash directory structure + * + * If found, the flash directory segment will be copied to + * @flash_dir. Return 1 if found, 0 if not. + */ +static int asd_find_flash_dir(struct asd_ha_struct *asd_ha, + struct asd_flash_dir *flash_dir) +{ + u32 v; + for (v = 0; v < FLASH_SIZE; v += FLASH_NEXT_ENTRY_OFFS) { + asd_read_flash_seg(asd_ha, flash_dir, v, + sizeof(FLASH_DIR_COOKIE)-1); + if (memcmp(flash_dir->cookie, FLASH_DIR_COOKIE, + sizeof(FLASH_DIR_COOKIE)-1) == 0) { + asd_ha->hw_prof.flash.dir_offs = v; + asd_read_flash_seg(asd_ha, flash_dir, v, + sizeof(*flash_dir)); + return 1; + } + } + return 0; +} + +static int asd_flash_getid(struct asd_ha_struct *asd_ha) +{ + int err = 0; + u32 reg, inc; + + reg = asd_read_reg_dword(asd_ha, EXSICNFGR); + + if (!(reg & FLASHEX)) { + ASD_DPRINTK("flash doesn't exist\n"); + return -ENOENT; + } + if (pci_read_config_dword(asd_ha->pcidev, PCI_CONF_FLSH_BAR, + &asd_ha->hw_prof.flash.bar)) { + asd_printk("couldn't read PCI_CONF_FLSH_BAR of %s\n", + pci_name(asd_ha->pcidev)); + return -ENOENT; + } + asd_ha->hw_prof.flash.present = 1; + asd_ha->hw_prof.flash.wide = reg & FLASHW ? 1 : 0; + err = asd_reset_flash(asd_ha); + if (err) { + ASD_DPRINTK("couldn't reset flash(%d)\n", err); + return err; + } + /* Get flash info. This would most likely be AMD Am29LV family flash. + * First try the sequence for word mode. It is the same as for + * 008B (byte mode only), 160B (word mode) and 800D (word mode). + */ + reg = asd_ha->hw_prof.flash.bar; + inc = asd_ha->hw_prof.flash.wide ? 2 : 1; + asd_write_reg_byte(asd_ha, reg + 0x555, 0xAA); + asd_write_reg_byte(asd_ha, reg + 0x2AA, 0x55); + asd_write_reg_byte(asd_ha, reg + 0x555, 0x90); + asd_ha->hw_prof.flash.manuf = asd_read_reg_byte(asd_ha, reg); + asd_ha->hw_prof.flash.dev_id= asd_read_reg_byte(asd_ha,reg+inc); + asd_ha->hw_prof.flash.sec_prot = asd_read_reg_byte(asd_ha,reg+inc+inc); + /* Get out of autoselect mode. */ + err = asd_reset_flash(asd_ha); + + switch(asd_ha->hw_prof.flash.manuf) { + case FLASH_MANUF_AMD: + case FLASH_MANUF_ST: + case FLASH_MANUF_FUJITSU: + case FLASH_MANUF_MACRONIX: + ASD_DPRINTK("0Found FLASH(%d) manuf:%d, dev_id:0x%x, " + "sec_prot:%d\n", + asd_ha->hw_prof.flash.wide ? 16 : 8, + asd_ha->hw_prof.flash.manuf, + asd_ha->hw_prof.flash.dev_id, + asd_ha->hw_prof.flash.sec_prot); + return 0; + default: + break; + + } + + /* Ok, try the sequence for byte mode of 160B and 800D. + * We may actually never need this. 
+ */ + asd_write_reg_byte(asd_ha, reg + 0xAAA, 0xAA); + asd_write_reg_byte(asd_ha, reg + 0x555, 0x55); + asd_write_reg_byte(asd_ha, reg + 0xAAA, 0x90); + asd_ha->hw_prof.flash.manuf = asd_read_reg_byte(asd_ha, reg); + asd_ha->hw_prof.flash.dev_id = asd_read_reg_byte(asd_ha, reg + 2); + asd_ha->hw_prof.flash.sec_prot = asd_read_reg_byte(asd_ha, reg + 4); + err = asd_reset_flash(asd_ha); + + switch(asd_ha->hw_prof.flash.manuf) { + case FLASH_MANUF_AMD: + case FLASH_MANUF_ST: + case FLASH_MANUF_FUJITSU: + case FLASH_MANUF_MACRONIX: + ASD_DPRINTK("1Found FLASH(%d) manuf:%d, dev_id:0x%x, " + "sec_prot:%d\n", + asd_ha->hw_prof.flash.wide ? 16 : 8, + asd_ha->hw_prof.flash.manuf, + asd_ha->hw_prof.flash.dev_id, + asd_ha->hw_prof.flash.sec_prot); + return 0; + default: + break; + + } + + + return -ENOENT; +} + +static u16 asd_calc_flash_chksum(u16 *p, int size) +{ + u16 chksum = 0; + + while (size-- > 0) + chksum += *p++; + + return chksum; +} + + +static int asd_find_flash_de(struct asd_flash_dir *flash_dir, u32 entry_type, + u32 *offs, u32 *size) +{ + int i; + struct asd_flash_de *de; + + for (i = 0; i < FLASH_MAX_DIR_ENTRIES; i++) { + u32 type = le32_to_cpu(flash_dir->dir_entry[i].type); + + type &= FLASH_DE_TYPE_MASK; + if (type == entry_type) + break; + } + if (i >= FLASH_MAX_DIR_ENTRIES) + return -ENOENT; + de = &flash_dir->dir_entry[i]; + *offs = le32_to_cpu(de->offs); + *size = le32_to_cpu(de->pad_size); + return 0; +} + +static int asd_validate_ms(struct asd_manuf_sec *ms) +{ + if (ms->sig[0] != 'S' || ms->sig[1] != 'M') { + ASD_DPRINTK("manuf sec: no valid sig(%c%c)\n", + ms->sig[0], ms->sig[1]); + return -ENOENT; + } + if (ms->maj != 0) { + asd_printk("unsupported manuf. sector. major version:%x\n", + ms->maj); + return -ENOENT; + } + ms->offs_next = le16_to_cpu((__force __le16) ms->offs_next); + ms->chksum = le16_to_cpu((__force __le16) ms->chksum); + ms->size = le16_to_cpu((__force __le16) ms->size); + + if (asd_calc_flash_chksum((u16 *)ms, ms->size/2)) { + asd_printk("failed manuf sector checksum\n"); + } + + return 0; +} + +static int asd_ms_get_sas_addr(struct asd_ha_struct *asd_ha, + struct asd_manuf_sec *ms) +{ + memcpy(asd_ha->hw_prof.sas_addr, ms->sas_addr, SAS_ADDR_SIZE); + return 0; +} + +static int asd_ms_get_pcba_sn(struct asd_ha_struct *asd_ha, + struct asd_manuf_sec *ms) +{ + memcpy(asd_ha->hw_prof.pcba_sn, ms->pcba_sn, ASD_PCBA_SN_SIZE); + asd_ha->hw_prof.pcba_sn[ASD_PCBA_SN_SIZE] = '\0'; + return 0; +} + +/** + * asd_find_ll_by_id - find a linked list entry by its id + * @start: void pointer to the first element in the linked list + * @id0: the first byte of the id (offs 0) + * @id1: the second byte of the id (offs 1) + * + * @start has to be the _base_ element start, since the + * linked list entries's offset is from this pointer. + * Some linked list entries use only the first id, in which case + * you can pass 0xFF for the second. 
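+ *
+ * Usage sketch (illustration only): looking up the 'P', 'M' phy
+ * parameter sub-segment, as asd_ms_get_phy_params() below does.
+ */
+#if 0
+	struct asd_manuf_phy_param *phy_param;
+
+	phy_param = asd_find_ll_by_id(manuf_sec, 'P', 'M');
+	if (!phy_param)
+		/* no such sub-segment in this manufacturing sector */;
+#endif
+/*
+ * NULL is returned when the list wraps back around to @start without
+ * a match.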
+ */
+static void *asd_find_ll_by_id(void * const start, const u8 id0, const u8 id1)
+{
+	struct asd_ll_el *el = start;
+
+	do {
+		switch (id1) {
+		default:
+			if (el->id1 == id1)
+		case 0xFF:
+			if (el->id0 == id0)
+				return el;
+		}
+		el = start + le16_to_cpu(el->next);
+	} while (el != start);
+
+	return NULL;
+}
+
+/**
+ * asd_ms_get_phy_params - get phy parameters from the manufacturing sector
+ * @asd_ha: pointer to the host adapter structure
+ * @manuf_sec: pointer to the manufacturing sector
+ *
+ * The manufacturing sector also contains the linked list of sub-segments,
+ * since when it was read, its size was taken from the flash directory,
+ * not from the structure size.
+ *
+ * HIDDEN phys do not count in the total count.  REPORTED phys cannot
+ * be enabled but are reported and counted towards the total.
+ * ENABLEABLE phys are enabled by default and count towards the total.
+ * The absolute total phy number is ASD_MAX_PHYS.  hw_prof->num_phys
+ * merely specifies the number of phys the host adapter decided to
+ * report.  E.g., it is possible for phys 0, 1 and 2 to be HIDDEN,
+ * phys 3, 4 and 5 to be REPORTED and phys 6 and 7 to be ENABLEABLE.
+ * In this case ASD_MAX_PHYS is 8, hw_prof->num_phys is 5, and only 2
+ * are actually enabled (enabled by default, max number of phys
+ * enableable in this case).
+ */
+static int asd_ms_get_phy_params(struct asd_ha_struct *asd_ha,
+				 struct asd_manuf_sec *manuf_sec)
+{
+	int i;
+	int en_phys = 0;
+	int rep_phys = 0;
+	struct asd_manuf_phy_param *phy_param;
+	struct asd_manuf_phy_param dflt_phy_param;
+
+	phy_param = asd_find_ll_by_id(manuf_sec, 'P', 'M');
+	if (!phy_param) {
+		ASD_DPRINTK("ms: no phy parameters found\n");
+		ASD_DPRINTK("ms: Creating default phy parameters\n");
+		dflt_phy_param.sig[0] = 'P';
+		dflt_phy_param.sig[1] = 'M';
+		dflt_phy_param.maj = 0;
+		dflt_phy_param.min = 2;
+		dflt_phy_param.num_phy_desc = 8;
+		dflt_phy_param.phy_desc_size = sizeof(struct asd_manuf_phy_desc);
+		for (i = 0; i < ASD_MAX_PHYS; i++) {
+			dflt_phy_param.phy_desc[i].state = 0;
+			dflt_phy_param.phy_desc[i].phy_id = i;
+			dflt_phy_param.phy_desc[i].phy_control_0 = 0xf6;
+			dflt_phy_param.phy_desc[i].phy_control_1 = 0x10;
+			dflt_phy_param.phy_desc[i].phy_control_2 = 0x43;
+			dflt_phy_param.phy_desc[i].phy_control_3 = 0xeb;
+		}
+
+		phy_param = &dflt_phy_param;
+	}
+
+	if (phy_param->maj != 0) {
+		asd_printk("unsupported manuf. phy param major version:0x%x\n",
+			   phy_param->maj);
+		return -ENOENT;
+	}
+
+	ASD_DPRINTK("ms: num_phy_desc: %d\n", phy_param->num_phy_desc);
+	asd_ha->hw_prof.enabled_phys = 0;
+	for (i = 0; i < phy_param->num_phy_desc; i++) {
+		struct asd_manuf_phy_desc *pd = &phy_param->phy_desc[i];
+		switch (pd->state & 0xF) {
+		case MS_PHY_STATE_HIDDEN:
+			ASD_DPRINTK("ms: phy%d: HIDDEN\n", i);
+			continue;
+		case MS_PHY_STATE_REPORTED:
+			ASD_DPRINTK("ms: phy%d: REPORTED\n", i);
+			asd_ha->hw_prof.enabled_phys &= ~(1 << i);
+			rep_phys++;
+			continue;
+		case MS_PHY_STATE_ENABLEABLE:
+			ASD_DPRINTK("ms: phy%d: ENABLEABLE\n", i);
+			asd_ha->hw_prof.enabled_phys |= (1 << i);
+			en_phys++;
+			break;
+		}
+		asd_ha->hw_prof.phy_desc[i].phy_control_0 = pd->phy_control_0;
+		asd_ha->hw_prof.phy_desc[i].phy_control_1 = pd->phy_control_1;
+		asd_ha->hw_prof.phy_desc[i].phy_control_2 = pd->phy_control_2;
+		asd_ha->hw_prof.phy_desc[i].phy_control_3 = pd->phy_control_3;
+	}
+	asd_ha->hw_prof.max_phys = rep_phys + en_phys;
+	asd_ha->hw_prof.num_phys = en_phys;
+	ASD_DPRINTK("ms: max_phys:0x%x, num_phys:0x%x\n",
+		    asd_ha->hw_prof.max_phys, asd_ha->hw_prof.num_phys);
+	ASD_DPRINTK("ms: enabled_phys:0x%x\n", asd_ha->hw_prof.enabled_phys);
+	return 0;
+}
+
+static int asd_ms_get_connector_map(struct asd_ha_struct *asd_ha,
+				    struct asd_manuf_sec *manuf_sec)
+{
+	struct asd_ms_conn_map *cm;
+
+	cm = asd_find_ll_by_id(manuf_sec, 'M', 'C');
+	if (!cm) {
+		ASD_DPRINTK("ms: no connector map found\n");
+		return 0;
+	}
+
+	if (cm->maj != 0) {
+		ASD_DPRINTK("ms: unsupported: connector map major version 0x%x"
+			    "\n", cm->maj);
+		return -ENOENT;
+	}
+
+	/* XXX */
+
+	return 0;
+}
+
+/**
+ * asd_process_ms - find and extract information from the manufacturing sector
+ * @asd_ha: pointer to the host adapter structure
+ * @flash_dir: pointer to the flash directory
+ */
+static int asd_process_ms(struct asd_ha_struct *asd_ha,
+			  struct asd_flash_dir *flash_dir)
+{
+	int err;
+	struct asd_manuf_sec *manuf_sec;
+	u32 offs, size;
+
+	err = asd_find_flash_de(flash_dir, FLASH_DE_MS, &offs, &size);
+	if (err) {
+		ASD_DPRINTK("Couldn't find the manuf. sector\n");
+		goto out;
+	}
+
+	if (size == 0)
+		goto out;
+
+	err = -ENOMEM;
+	manuf_sec = kmalloc(size, GFP_KERNEL);
+	if (!manuf_sec) {
+		ASD_DPRINTK("no mem for manuf sector\n");
+		goto out;
+	}
+
+	err = asd_read_flash_seg(asd_ha, (void *)manuf_sec, offs, size);
+	if (err) {
+		ASD_DPRINTK("couldn't read manuf sector at 0x%x, size 0x%x\n",
+			    offs, size);
+		goto out2;
+	}
+
+	err = asd_validate_ms(manuf_sec);
+	if (err) {
+		ASD_DPRINTK("couldn't validate manuf sector\n");
+		goto out2;
+	}
+
+	err = asd_ms_get_sas_addr(asd_ha, manuf_sec);
+	if (err) {
+		ASD_DPRINTK("couldn't read the SAS_ADDR\n");
+		goto out2;
+	}
+	ASD_DPRINTK("manuf sect SAS_ADDR %llx\n",
+		    SAS_ADDR(asd_ha->hw_prof.sas_addr));
+
+	err = asd_ms_get_pcba_sn(asd_ha, manuf_sec);
+	if (err) {
+		ASD_DPRINTK("couldn't read the PCBA SN\n");
+		goto out2;
+	}
+	ASD_DPRINTK("manuf sect PCBA SN %s\n", asd_ha->hw_prof.pcba_sn);
+
+	err = asd_ms_get_phy_params(asd_ha, manuf_sec);
+	if (err) {
+		ASD_DPRINTK("ms: couldn't get phy parameters\n");
+		goto out2;
+	}
+
+	err = asd_ms_get_connector_map(asd_ha, manuf_sec);
+	if (err) {
+		ASD_DPRINTK("ms: couldn't get connector map\n");
+		goto out2;
+	}
+
+out2:
+	kfree(manuf_sec);
+out:
+	return err;
+}
+
+static int asd_process_ctrla_phy_settings(struct asd_ha_struct *asd_ha,
+					  struct asd_ctrla_phy_settings *ps)
+{
+	int i;
+	for (i = 0; i < ps->num_phys; i++) {
+		struct asd_ctrla_phy_entry *pe = &ps->phy_ent[i];
+
+		if (!PHY_ENABLED(asd_ha, i))
+			continue;
+		if (*(u64 *)pe->sas_addr == 0) {
+			asd_ha->hw_prof.enabled_phys &= ~(1 << i);
+			continue;
+		}
+		/* This is the SAS address which should be sent in IDENTIFY. */
+		memcpy(asd_ha->hw_prof.phy_desc[i].sas_addr, pe->sas_addr,
+		       SAS_ADDR_SIZE);
+		asd_ha->hw_prof.phy_desc[i].max_sas_lrate =
+			(pe->sas_link_rates & 0xF0) >> 4;
+		asd_ha->hw_prof.phy_desc[i].min_sas_lrate =
+			(pe->sas_link_rates & 0x0F);
+		asd_ha->hw_prof.phy_desc[i].max_sata_lrate =
+			(pe->sata_link_rates & 0xF0) >> 4;
+		asd_ha->hw_prof.phy_desc[i].min_sata_lrate =
+			(pe->sata_link_rates & 0x0F);
+		asd_ha->hw_prof.phy_desc[i].flags = pe->flags;
+		ASD_DPRINTK("ctrla: phy%d: sas_addr: %llx, sas rate:0x%x-0x%x,"
+			    " sata rate:0x%x-0x%x, flags:0x%x\n",
+			    i,
+			    SAS_ADDR(asd_ha->hw_prof.phy_desc[i].sas_addr),
+			    asd_ha->hw_prof.phy_desc[i].max_sas_lrate,
+			    asd_ha->hw_prof.phy_desc[i].min_sas_lrate,
+			    asd_ha->hw_prof.phy_desc[i].max_sata_lrate,
+			    asd_ha->hw_prof.phy_desc[i].min_sata_lrate,
+			    asd_ha->hw_prof.phy_desc[i].flags);
+	}
+
+	return 0;
+}
+
+/**
+ * asd_process_ctrl_a_user - process CTRL-A user settings
+ * @asd_ha: pointer to the host adapter structure
+ * @flash_dir: pointer to the flash directory
+ */
+static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
+				   struct asd_flash_dir *flash_dir)
+{
+	int err, i;
+	u32 offs, size;
+	struct asd_ll_el *el;
+	struct asd_ctrla_phy_settings *ps;
+	struct asd_ctrla_phy_settings dflt_ps;
+
+	err = asd_find_flash_de(flash_dir, FLASH_DE_CTRL_A_USER, &offs, &size);
+	if (err) {
+		ASD_DPRINTK("couldn't find CTRL-A user settings section\n");
+		ASD_DPRINTK("Creating default CTRL-A user settings section\n");
+
+		dflt_ps.id0 = 'h';
+		dflt_ps.num_phys = 8;
+		for (i = 0; i < ASD_MAX_PHYS; i++) {
+			memcpy(dflt_ps.phy_ent[i].sas_addr,
+			       asd_ha->hw_prof.sas_addr, SAS_ADDR_SIZE);
+			dflt_ps.phy_ent[i].sas_link_rates = 0x98;
+			dflt_ps.phy_ent[i].flags = 0x0;
+			dflt_ps.phy_ent[i].sata_link_rates = 0x0;
+		}
+
+		size = sizeof(struct asd_ctrla_phy_settings);
+		ps = &dflt_ps;
+		/* There is no flash section to read in this case;
+		 * process the defaults directly. */
+		err = asd_process_ctrla_phy_settings(asd_ha, ps);
+		goto out;
+	}
+
+	if (size == 0)
+		goto out;
+
+	err = -ENOMEM;
+	el = kmalloc(size, GFP_KERNEL);
+	if (!el) {
+		ASD_DPRINTK("no mem for ctrla user settings section\n");
+		goto out;
+	}
+
+	err = asd_read_flash_seg(asd_ha, (void *)el, offs, size);
+	if (err) {
+		ASD_DPRINTK("couldn't read ctrla phy settings section\n");
+		goto out2;
+	}
+
+	err = -ENOENT;
+	ps = asd_find_ll_by_id(el, 'h', 0xFF);
+	if (!ps) {
+		ASD_DPRINTK("couldn't find ctrla phy settings struct\n");
+		goto out2;
+	}
+
+	err = asd_process_ctrla_phy_settings(asd_ha, ps);
+	if (err) {
+		ASD_DPRINTK("couldn't process ctrla phy settings\n");
+		goto out2;
+	}
+out2:
+	kfree(el);
+out:
+	return err;
+}
+
+/**
+ * asd_read_flash - read flash memory
+ * @asd_ha: pointer to the host adapter structure
+ */
+int asd_read_flash(struct asd_ha_struct *asd_ha)
+{
+	int err;
+	struct asd_flash_dir *flash_dir;
+
+	err = asd_flash_getid(asd_ha);
+	if (err)
+		return err;
+
+	flash_dir = kmalloc(sizeof(*flash_dir), GFP_KERNEL);
+	if (!flash_dir)
+		return -ENOMEM;
+
+	err = -ENOENT;
+	if (!asd_find_flash_dir(asd_ha, flash_dir)) {
+		ASD_DPRINTK("couldn't find flash directory\n");
+		goto out;
+	}
+
+	if (le32_to_cpu(flash_dir->rev) != 2) {
+		asd_printk("unsupported flash dir version:0x%x\n",
+			   le32_to_cpu(flash_dir->rev));
+		goto out;
+	}
+
+	err = asd_process_ms(asd_ha, flash_dir);
+	if (err) {
+		ASD_DPRINTK("couldn't process manuf sector settings\n");
+		goto out;
+	}
+
+	err = asd_process_ctrl_a_user(asd_ha, flash_dir);
+	if (err) {
+		ASD_DPRINTK("couldn't process CTRL-A user settings\n");
+		goto out;
+	}
+
+out:
+	kfree(flash_dir);
+	return err;
+}
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq.c
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,1318 @@
+/*
+ * Aic94xx SAS/SATA driver sequencer interface.
+ *
+ * Copyright (C) 2005 Adaptec, Inc.  All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * Parts of this code adapted from David Chaw's adp94xx_seq.c.
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include "aic94xx_reg.h"
+#include "aic94xx_hwi.h"
+
+#include "aic94xx_seq.h"
+#include "aic94xx_dump.h"
+
+#include "aic94xx_seq_microcode.c"
+
+/* It takes no more than 0.05 us for an instruction
+ * to complete. So waiting for 1 us should be more than
+ * plenty.
+ */
+#define PAUSE_DELAY 1
+#define PAUSE_TRIES 1000
+
+static u16 first_scb_site_no = 0xFFFF;
+static u16 last_scb_site_no;
+
+/* ---------- Pause/Unpause CSEQ/LSEQ ---------- */
+
+/**
+ * asd_pause_cseq - pause the central sequencer
+ * @asd_ha: pointer to host adapter structure
+ *
+ * Return 0 on success, negative on failure.
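+ *
+ * A typical (illustrative) use is to quiesce the sequencers around
+ * accesses to their RAM, as asd_download_seq() does:
+ *
+ *	asd_pause_cseq(asd_ha);
+ *	asd_pause_lseq(asd_ha, 0xFF);
+ *	... overlay or read back sequencer RAM ...
+ *	asd_unpause_cseq(asd_ha);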
+ */
+int asd_pause_cseq(struct asd_ha_struct *asd_ha)
+{
+	int count = PAUSE_TRIES;
+	u32 arp2ctl;
+
+	arp2ctl = asd_read_reg_dword(asd_ha, CARP2CTL);
+	if (arp2ctl & PAUSED)
+		return 0;
+
+	asd_write_reg_dword(asd_ha, CARP2CTL, arp2ctl | EPAUSE);
+	do {
+		arp2ctl = asd_read_reg_dword(asd_ha, CARP2CTL);
+		if (arp2ctl & PAUSED)
+			return 0;
+		udelay(PAUSE_DELAY);
+	} while (--count > 0);
+
+	ASD_DPRINTK("couldn't pause CSEQ\n");
+	return -1;
+}
+
+/**
+ * asd_unpause_cseq - unpause the central sequencer.
+ * @asd_ha: pointer to host adapter structure.
+ *
+ * Return 0 on success, negative on error.
+ */
+int asd_unpause_cseq(struct asd_ha_struct *asd_ha)
+{
+	u32 arp2ctl;
+	int count = PAUSE_TRIES;
+
+	arp2ctl = asd_read_reg_dword(asd_ha, CARP2CTL);
+	if (!(arp2ctl & PAUSED))
+		return 0;
+
+	asd_write_reg_dword(asd_ha, CARP2CTL, arp2ctl & ~EPAUSE);
+	do {
+		arp2ctl = asd_read_reg_dword(asd_ha, CARP2CTL);
+		if (!(arp2ctl & PAUSED))
+			return 0;
+		udelay(PAUSE_DELAY);
+	} while (--count > 0);
+
+	ASD_DPRINTK("couldn't unpause the CSEQ\n");
+	return -1;
+}
+
+/**
+ * asd_seq_pause_lseq - pause a link sequencer
+ * @asd_ha: pointer to a host adapter structure
+ * @lseq: link sequencer of interest
+ *
+ * Return 0 on success, negative on error.
+ */
+static inline int asd_seq_pause_lseq(struct asd_ha_struct *asd_ha, int lseq)
+{
+	u32 arp2ctl;
+	int count = PAUSE_TRIES;
+
+	arp2ctl = asd_read_reg_dword(asd_ha, LmARP2CTL(lseq));
+	if (arp2ctl & PAUSED)
+		return 0;
+
+	asd_write_reg_dword(asd_ha, LmARP2CTL(lseq), arp2ctl | EPAUSE);
+	do {
+		arp2ctl = asd_read_reg_dword(asd_ha, LmARP2CTL(lseq));
+		if (arp2ctl & PAUSED)
+			return 0;
+		udelay(PAUSE_DELAY);
+	} while (--count > 0);
+
+	ASD_DPRINTK("couldn't pause LSEQ %d\n", lseq);
+	return -1;
+}
+
+/**
+ * asd_pause_lseq - pause the link sequencer(s)
+ * @asd_ha: pointer to host adapter structure
+ * @lseq_mask: mask of link sequencers of interest
+ *
+ * Return 0 on success, negative on failure.
+ */
+int asd_pause_lseq(struct asd_ha_struct *asd_ha, u8 lseq_mask)
+{
+	int lseq;
+	int err = 0;
+
+	for_each_sequencer(lseq_mask, lseq_mask, lseq) {
+		err = asd_seq_pause_lseq(asd_ha, lseq);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+/**
+ * asd_seq_unpause_lseq - unpause a link sequencer
+ * @asd_ha: pointer to host adapter structure
+ * @lseq: link sequencer of interest
+ *
+ * Return 0 on success, negative on error.
+ */
+static inline int asd_seq_unpause_lseq(struct asd_ha_struct *asd_ha, int lseq)
+{
+	u32 arp2ctl;
+	int count = PAUSE_TRIES;
+
+	arp2ctl = asd_read_reg_dword(asd_ha, LmARP2CTL(lseq));
+	if (!(arp2ctl & PAUSED))
+		return 0;
+
+	asd_write_reg_dword(asd_ha, LmARP2CTL(lseq), arp2ctl & ~EPAUSE);
+	do {
+		arp2ctl = asd_read_reg_dword(asd_ha, LmARP2CTL(lseq));
+		if (!(arp2ctl & PAUSED))
+			return 0;
+		udelay(PAUSE_DELAY);
+	} while (--count > 0);
+
+	ASD_DPRINTK("couldn't unpause LSEQ %d\n", lseq);
+	return -1;
+}
+
+
+/**
+ * asd_unpause_lseq - unpause the link sequencer(s)
+ * @asd_ha: pointer to host adapter structure
+ * @lseq_mask: mask of link sequencers of interest
+ *
+ * Return 0 on success, negative on failure.
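+ *
+ * @lseq_mask has one bit per link sequencer, so e.g. (illustrative)
+ *
+ *	asd_unpause_lseq(asd_ha, asd_ha->hw_prof.enabled_phys);
+ *
+ * unpauses the LSEQ of every enabled phy, while a mask of 1 << 3
+ * would unpause LSEQ 3 only.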
+ */ +int asd_unpause_lseq(struct asd_ha_struct *asd_ha, u8 lseq_mask) +{ + int lseq; + int err = 0; + + for_each_sequencer(lseq_mask, lseq_mask, lseq) { + err = asd_seq_unpause_lseq(asd_ha, lseq); + if (err) + return err; + } + + return err; +} + +/* ---------- Downloading CSEQ/LSEQ microcode ---------- */ + +static int asd_verify_cseq(struct asd_ha_struct *asd_ha, const u8 *_prog, + u32 size) +{ + u32 addr = CSEQ_RAM_REG_BASE_ADR; + const u32 *prog = (u32 *) _prog; + u32 i; + + for (i = 0; i < size; i += 4, prog++, addr += 4) { + u32 val = asd_read_reg_dword(asd_ha, addr); + + if (le32_to_cpu(*prog) != val) { + asd_printk("%s: cseq verify failed at %u " + "read:0x%x, wanted:0x%x\n", + pci_name(asd_ha->pcidev), + i, val, le32_to_cpu(*prog)); + return -1; + } + } + ASD_DPRINTK("verified %d bytes, passed\n", size); + return 0; +} + +/** + * asd_verify_lseq - verify the microcode of a link sequencer + * @asd_ha: pointer to host adapter structure + * @_prog: pointer to the microcode + * @size: size of the microcode in bytes + * @lseq: link sequencer of interest + * + * The link sequencer code is accessed in 4 KB pages, which are selected + * by setting LmRAMPAGE (bits 8 and 9) of the LmBISTCTL1 register. + * The 10 KB LSEQm instruction code is mapped, page at a time, at + * LmSEQRAM address. + */ +static int asd_verify_lseq(struct asd_ha_struct *asd_ha, const u8 *_prog, + u32 size, int lseq) +{ +#define LSEQ_CODEPAGE_SIZE 4096 + int pages = (size + LSEQ_CODEPAGE_SIZE - 1) / LSEQ_CODEPAGE_SIZE; + u32 page; + const u32 *prog = (u32 *) _prog; + + for (page = 0; page < pages; page++) { + u32 i; + + asd_write_reg_dword(asd_ha, LmBISTCTL1(lseq), + page << LmRAMPAGE_LSHIFT); + for (i = 0; size > 0 && i < LSEQ_CODEPAGE_SIZE; + i += 4, prog++, size-=4) { + + u32 val = asd_read_reg_dword(asd_ha, LmSEQRAM(lseq)+i); + + if (le32_to_cpu(*prog) != val) { + asd_printk("%s: LSEQ%d verify failed " + "page:%d, offs:%d\n", + pci_name(asd_ha->pcidev), + lseq, page, i); + return -1; + } + } + } + ASD_DPRINTK("LSEQ%d verified %d bytes, passed\n", lseq, + (int)((u8 *)prog-_prog)); + return 0; +} + +/** + * asd_verify_seq -- verify CSEQ/LSEQ microcode + * @asd_ha: pointer to host adapter structure + * @prog: pointer to microcode + * @size: size of the microcode + * @lseq_mask: if 0, verify CSEQ microcode, else mask of LSEQs of interest + * + * Return 0 if microcode is correct, negative on mismatch. 
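+ *
+ * For an LSEQ the code RAM is read back through a 4 KB window,
+ * roughly (illustrative):
+ *
+ *	for each 4 KB page of the image:
+ *		LmBISTCTL1(lseq) <- page << LmRAMPAGE_LSHIFT
+ *		compare the page against LmSEQRAM(lseq)+0..4095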
+ */ +static int asd_verify_seq(struct asd_ha_struct *asd_ha, const u8 *prog, + u32 size, u8 lseq_mask) +{ + if (lseq_mask == 0) + return asd_verify_cseq(asd_ha, prog, size); + else { + int lseq, err; + + for_each_sequencer(lseq_mask, lseq_mask, lseq) { + err = asd_verify_lseq(asd_ha, prog, size, lseq); + if (err) + return err; + } + } + + return 0; +} +#define ASD_DMA_MODE_DOWNLOAD +#ifdef ASD_DMA_MODE_DOWNLOAD +/* This is the size of the CSEQ Mapped instruction page */ +#define MAX_DMA_OVLY_COUNT ((1U << 14)-1) +static int asd_download_seq(struct asd_ha_struct *asd_ha, + const u8 * const prog, u32 size, u8 lseq_mask) +{ + u32 comstaten; + u32 reg; + int page; + const int pages = (size + MAX_DMA_OVLY_COUNT - 1) / MAX_DMA_OVLY_COUNT; + struct asd_dma_tok *token; + int err = 0; + + if (size % 4) { + asd_printk("sequencer program not multiple of 4\n"); + return -1; + } + + asd_pause_cseq(asd_ha); + asd_pause_lseq(asd_ha, 0xFF); + + /* save, disable and clear interrupts */ + comstaten = asd_read_reg_dword(asd_ha, COMSTATEN); + asd_write_reg_dword(asd_ha, COMSTATEN, 0); + asd_write_reg_dword(asd_ha, COMSTAT, COMSTAT_MASK); + + asd_write_reg_dword(asd_ha, CHIMINTEN, RST_CHIMINTEN); + asd_write_reg_dword(asd_ha, CHIMINT, CHIMINT_MASK); + + token = asd_alloc_coherent(asd_ha, MAX_DMA_OVLY_COUNT, GFP_KERNEL); + if (!token) { + asd_printk("out of memory for dma SEQ download\n"); + err = -ENOMEM; + goto out; + } + ASD_DPRINTK("dma-ing %d bytes\n", size); + + for (page = 0; page < pages; page++) { + int i; + u32 left = min(size-page*MAX_DMA_OVLY_COUNT, + (u32)MAX_DMA_OVLY_COUNT); + + memcpy(token->vaddr, prog + page*MAX_DMA_OVLY_COUNT, left); + asd_write_reg_addr(asd_ha, OVLYDMAADR, token->dma_handle); + asd_write_reg_dword(asd_ha, OVLYDMACNT, left); + reg = !page ? RESETOVLYDMA : 0; + reg |= (STARTOVLYDMA | OVLYHALTERR); + reg |= (lseq_mask ? (((u32)lseq_mask) << 8) : OVLYCSEQ); + /* Start DMA. */ + asd_write_reg_dword(asd_ha, OVLYDMACTL, reg); + + for (i = PAUSE_TRIES*100; i > 0; i--) { + u32 dmadone = asd_read_reg_dword(asd_ha, OVLYDMACTL); + if (!(dmadone & OVLYDMAACT)) + break; + udelay(PAUSE_DELAY); + } + } + + reg = asd_read_reg_dword(asd_ha, COMSTAT); + if (!(reg & OVLYDMADONE) || (reg & OVLYERR) + || (asd_read_reg_dword(asd_ha, CHIMINT) & DEVEXCEPT_MASK)){ + asd_printk("%s: error DMA-ing sequencer code\n", + pci_name(asd_ha->pcidev)); + err = -ENODEV; + } + + asd_free_coherent(asd_ha, token); + out: + asd_write_reg_dword(asd_ha, COMSTATEN, comstaten); + + return err ? : asd_verify_seq(asd_ha, prog, size, lseq_mask); +} +#else /* ASD_DMA_MODE_DOWNLOAD */ +static int asd_download_seq(struct asd_ha_struct *asd_ha, const u8 *_prog, + u32 size, u8 lseq_mask) +{ + int i; + u32 reg = 0; + const u32 *prog = (u32 *) _prog; + + if (size % 4) { + asd_printk("sequencer program not multiple of 4\n"); + return -1; + } + + asd_pause_cseq(asd_ha); + asd_pause_lseq(asd_ha, 0xFF); + + reg |= (lseq_mask ? (((u32)lseq_mask) << 8) : OVLYCSEQ); + reg |= PIOCMODE; + + asd_write_reg_dword(asd_ha, OVLYDMACNT, size); + asd_write_reg_dword(asd_ha, OVLYDMACTL, reg); + + ASD_DPRINTK("downloading %s sequencer%s in PIO mode...\n", + lseq_mask ? "LSEQ" : "CSEQ", lseq_mask ? 
"s" : ""); + + for (i = 0; i < size; i += 4, prog++) + asd_write_reg_dword(asd_ha, SPIODATA, *prog); + + reg = (reg & ~PIOCMODE) | OVLYHALTERR; + asd_write_reg_dword(asd_ha, OVLYDMACTL, reg); + + return asd_verify_seq(asd_ha, _prog, size, lseq_mask); +} +#endif /* ASD_DMA_MODE_DOWNLOAD */ + +/** + * asd_seq_download_seqs - download the sequencer microcode + * @asd_ha: pointer to host adapter structure + * + * Download the central and link sequencer microcode. + */ +static int asd_seq_download_seqs(struct asd_ha_struct *asd_ha) +{ + int err; + + if (!asd_ha->hw_prof.enabled_phys) { + asd_printk("%s: no enabled phys!\n", pci_name(asd_ha->pcidev)); + return -ENODEV; + } + + /* Download the CSEQ */ + ASD_DPRINTK("downloading CSEQ...\n"); + err = asd_download_seq(asd_ha, Cseq, sizeof(Cseq), 0); + if (err) { + asd_printk("CSEQ download failed:%d\n", err); + return err; + } + + /* Download the Link Sequencers code. All of the Link Sequencers + * microcode can be downloaded at the same time. + */ + ASD_DPRINTK("downloading LSEQs...\n"); + err = asd_download_seq(asd_ha, Lseq, sizeof(Lseq), + asd_ha->hw_prof.enabled_phys); + if (err) { + /* Try it one at a time */ + u8 lseq; + u8 lseq_mask = asd_ha->hw_prof.enabled_phys; + + for_each_sequencer(lseq_mask, lseq_mask, lseq) { + err = asd_download_seq(asd_ha, Lseq, sizeof(Lseq), + 1<> 8; + asd_write_reg_byte(asd_ha, CSEQ_FREE_SCB_MASK, (u8)cmdctx); + } + asd_write_reg_word(asd_ha, CSEQ_BUILTIN_FREE_SCB_HEAD, + first_scb_site_no); + asd_write_reg_word(asd_ha, CSEQ_BUILTIN_FREE_SCB_TAIL, + last_scb_site_no); + asd_write_reg_word(asd_ha, CSEQ_EXTENDED_FREE_SCB_HEAD, 0xFFFF); + asd_write_reg_word(asd_ha, CSEQ_EXTENDED_FREE_SCB_TAIL, 0xFFFF); + + /* CSEQ Mode independent, page 7 setup. */ + asd_write_reg_dword(asd_ha, CSEQ_EMPTY_REQ_QUEUE, 0); + asd_write_reg_dword(asd_ha, CSEQ_EMPTY_REQ_QUEUE+4, 0); + asd_write_reg_dword(asd_ha, CSEQ_EMPTY_REQ_COUNT, 0); + asd_write_reg_dword(asd_ha, CSEQ_EMPTY_REQ_COUNT+4, 0); + asd_write_reg_word(asd_ha, CSEQ_Q_EMPTY_HEAD, 0xFFFF); + asd_write_reg_word(asd_ha, CSEQ_Q_EMPTY_TAIL, 0xFFFF); + asd_write_reg_word(asd_ha, CSEQ_NEED_EMPTY_SCB, 0); + asd_write_reg_byte(asd_ha, CSEQ_EMPTY_REQ_HEAD, 0); + asd_write_reg_byte(asd_ha, CSEQ_EMPTY_REQ_TAIL, 0); + asd_write_reg_byte(asd_ha, CSEQ_EMPTY_SCB_OFFSET, 0); + asd_write_reg_word(asd_ha, CSEQ_PRIMITIVE_DATA, 0); + asd_write_reg_dword(asd_ha, CSEQ_TIMEOUT_CONST, 0); +} + +/** + * asd_init_cseq_mdp - initialize CSEQ Mode dependent pages + * @asd_ha: pointer to host adapter structure + */ +static void asd_init_cseq_mdp(struct asd_ha_struct *asd_ha) +{ + int i; + int moffs; + + moffs = CSEQ_PAGE_SIZE * 2; + + /* CSEQ Mode dependent, modes 0-7, page 0 setup. */ + for (i = 0; i < 8; i++) { + asd_write_reg_word(asd_ha, i*moffs+CSEQ_LRM_SAVE_SINDEX, 0); + asd_write_reg_word(asd_ha, i*moffs+CSEQ_LRM_SAVE_SCBPTR, 0); + asd_write_reg_word(asd_ha, i*moffs+CSEQ_Q_LINK_HEAD, 0xFFFF); + asd_write_reg_word(asd_ha, i*moffs+CSEQ_Q_LINK_TAIL, 0xFFFF); + asd_write_reg_byte(asd_ha, i*moffs+CSEQ_LRM_SAVE_SCRPAGE, 0); + } + + /* CSEQ Mode dependent, mode 0-7, page 1 and 2 shall be ignored. */ + + /* CSEQ Mode dependent, mode 8, page 0 setup. 
+	asd_write_reg_word(asd_ha, CSEQ_RET_ADDR, 0xFFFF);
+	asd_write_reg_word(asd_ha, CSEQ_RET_SCBPTR, 0);
+	asd_write_reg_word(asd_ha, CSEQ_SAVE_SCBPTR, 0);
+	asd_write_reg_word(asd_ha, CSEQ_EMPTY_TRANS_CTX, 0);
+	asd_write_reg_word(asd_ha, CSEQ_RESP_LEN, 0);
+	asd_write_reg_word(asd_ha, CSEQ_TMF_SCBPTR, 0);
+	asd_write_reg_word(asd_ha, CSEQ_GLOBAL_PREV_SCB, 0);
+	asd_write_reg_word(asd_ha, CSEQ_GLOBAL_HEAD, 0);
+	asd_write_reg_word(asd_ha, CSEQ_CLEAR_LU_HEAD, 0);
+	asd_write_reg_byte(asd_ha, CSEQ_TMF_OPCODE, 0);
+	asd_write_reg_byte(asd_ha, CSEQ_SCRATCH_FLAGS, 0);
+	asd_write_reg_word(asd_ha, CSEQ_HSB_SITE, 0);
+	asd_write_reg_word(asd_ha, CSEQ_FIRST_INV_SCB_SITE,
+			   (u16)last_scb_site_no+1);
+	asd_write_reg_word(asd_ha, CSEQ_FIRST_INV_DDB_SITE,
+			   (u16)asd_ha->hw_prof.max_ddbs);
+
+	/* CSEQ Mode dependent, mode 8, page 1 setup. */
+	asd_write_reg_dword(asd_ha, CSEQ_LUN_TO_CLEAR, 0);
+	asd_write_reg_dword(asd_ha, CSEQ_LUN_TO_CLEAR + 4, 0);
+	asd_write_reg_dword(asd_ha, CSEQ_LUN_TO_CHECK, 0);
+	asd_write_reg_dword(asd_ha, CSEQ_LUN_TO_CHECK + 4, 0);
+
+	/* CSEQ Mode dependent, mode 8, page 2 setup. */
+	/* Tell the sequencer the bus address of the first SCB. */
+	asd_write_reg_addr(asd_ha, CSEQ_HQ_NEW_POINTER,
+			   asd_ha->seq.next_scb.dma_handle);
+	ASD_DPRINTK("First SCB dma_handle: 0x%llx\n",
+		    (unsigned long long)asd_ha->seq.next_scb.dma_handle);
+
+	/* Tell the sequencer the first Done List entry address. */
+	asd_write_reg_addr(asd_ha, CSEQ_HQ_DONE_BASE,
+			   asd_ha->seq.actual_dl->dma_handle);
+
+	/* Initialize the Q_DONE_POINTER with the least significant
+	 * 4 bytes of the first Done List address. */
+	asd_write_reg_dword(asd_ha, CSEQ_HQ_DONE_POINTER,
+			    ASD_BUSADDR_LO(asd_ha->seq.actual_dl->dma_handle));
+
+	asd_write_reg_byte(asd_ha, CSEQ_HQ_DONE_PASS, ASD_DEF_DL_TOGGLE);
+
+	/* CSEQ Mode dependent, mode 8, page 3 shall be ignored. */
+}
+
+/**
+ * asd_init_cseq_scratch -- setup and init CSEQ
+ * @asd_ha: pointer to host adapter structure
+ *
+ * Set up and initialize the central sequencer.  Initialize the mode
+ * independent and mode dependent scratch pages to the default
+ * settings.
+ */
+static void asd_init_cseq_scratch(struct asd_ha_struct *asd_ha)
+{
+	asd_init_cseq_mip(asd_ha);
+	asd_init_cseq_mdp(asd_ha);
+}
+
+/**
+ * asd_init_lseq_mip -- initialize LSEQ Mode independent pages 0-3
+ * @asd_ha: pointer to host adapter structure
+ * @lseq: link sequencer of interest
+ */
+static void asd_init_lseq_mip(struct asd_ha_struct *asd_ha, u8 lseq)
+{
+	int i;
+
+	/* LSEQ Mode independent page 0 setup. */
+	asd_write_reg_word(asd_ha, LmSEQ_Q_TGTXFR_HEAD(lseq), 0xFFFF);
+	asd_write_reg_word(asd_ha, LmSEQ_Q_TGTXFR_TAIL(lseq), 0xFFFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_LINK_NUMBER(lseq), lseq);
+	asd_write_reg_byte(asd_ha, LmSEQ_SCRATCH_FLAGS(lseq),
+			   ASD_NOTIFY_ENABLE_SPINUP);
+	asd_write_reg_dword(asd_ha, LmSEQ_CONNECTION_STATE(lseq), 0x08000000);
+	asd_write_reg_word(asd_ha, LmSEQ_CONCTL(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_CONSTAT(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_CONNECTION_MODES(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_REG1_ISR(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_REG2_ISR(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_REG3_ISR(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_REG0_ISR(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_REG0_ISR(lseq)+4, 0);
+
+	/* LSEQ Mode independent page 1 setup.
*/ + asd_write_reg_word(asd_ha, LmSEQ_EST_NEXUS_SCBPTR0(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EST_NEXUS_SCBPTR1(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EST_NEXUS_SCBPTR2(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EST_NEXUS_SCBPTR3(lseq), 0xFFFF); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_OPCODE0(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_OPCODE1(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_OPCODE2(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_OPCODE3(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_HEAD(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_SCB_TAIL(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EST_NEXUS_BUF_AVAIL(lseq), 0); + asd_write_reg_dword(asd_ha, LmSEQ_TIMEOUT_CONST(lseq), 0); + asd_write_reg_word(asd_ha, LmSEQ_ISR_SAVE_SINDEX(lseq), 0); + asd_write_reg_word(asd_ha, LmSEQ_ISR_SAVE_DINDEX(lseq), 0); + + /* LSEQ Mode Independent page 2 setup. */ + asd_write_reg_word(asd_ha, LmSEQ_EMPTY_SCB_PTR0(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EMPTY_SCB_PTR1(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EMPTY_SCB_PTR2(lseq), 0xFFFF); + asd_write_reg_word(asd_ha, LmSEQ_EMPTY_SCB_PTR3(lseq), 0xFFFF); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_OPCD0(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_OPCD1(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_OPCD2(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_OPCD3(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_HEAD(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_SCB_TAIL(lseq), 0); + asd_write_reg_byte(asd_ha, LmSEQ_EMPTY_BUFS_AVAIL(lseq), 0); + for (i = 0; i < 12; i += 4) + asd_write_reg_dword(asd_ha, LmSEQ_ATA_SCR_REGS(lseq) + i, 0); + + /* LSEQ Mode Independent page 3 setup. */ + + /* Device present timer timeout */ + asd_write_reg_dword(asd_ha, LmSEQ_DEV_PRES_TMR_TOUT_CONST(lseq), + ASD_DEV_PRESENT_TIMEOUT); + + /* SATA interlock timer disabled */ + asd_write_reg_dword(asd_ha, LmSEQ_SATA_INTERLOCK_TIMEOUT(lseq), + ASD_SATA_INTERLOCK_TIMEOUT); + + /* STP shutdown timer timeout constant, IGNORED by the sequencer, + * always 0. */ + asd_write_reg_dword(asd_ha, LmSEQ_STP_SHUTDOWN_TIMEOUT(lseq), + ASD_STP_SHUTDOWN_TIMEOUT); + + asd_write_reg_dword(asd_ha, LmSEQ_SRST_ASSERT_TIMEOUT(lseq), + ASD_SRST_ASSERT_TIMEOUT); + + asd_write_reg_dword(asd_ha, LmSEQ_RCV_FIS_TIMEOUT(lseq), + ASD_RCV_FIS_TIMEOUT); + + asd_write_reg_dword(asd_ha, LmSEQ_ONE_MILLISEC_TIMEOUT(lseq), + ASD_ONE_MILLISEC_TIMEOUT); + + /* COM_INIT timer */ + asd_write_reg_dword(asd_ha, LmSEQ_TEN_MS_COMINIT_TIMEOUT(lseq), + ASD_TEN_MILLISEC_TIMEOUT); + + asd_write_reg_dword(asd_ha, LmSEQ_SMP_RCV_TIMEOUT(lseq), + ASD_SMP_RCV_TIMEOUT); +} + +/** + * asd_init_lseq_mdp -- initialize LSEQ mode dependent pages. + * @asd_ha: pointer to host adapter structure + */ +static void asd_init_lseq_mdp(struct asd_ha_struct *asd_ha, int lseq) +{ + int i; + u32 moffs; + static const u16 ret_addr[] = { + 0xFFFF, /* mode 0 */ + 0xFFFF, /* mode 1 */ + MODE2_TASK, /* mode 2 */ + 0, + 0xFFFF, /* mode 4/5 */ + 0xFFFF, /* mode 4/5 */ + }; + + /* + * Mode 0,1,2 and 4/5 have common field on page 0 for the first + * 14 bytes. 
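+ * A given mode's copy of these common fields lives at a fixed
+ * stride from the mode 0 copy, so the address of field F in mode n
+ * is (illustrative):
+ *
+ *	LmSEQ_F(lseq) + n * LSEQ_MODE_SCRATCH_SIZE
+ *
+ * which is what the moffs arithmetic below computes.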
+ */
+	for (i = 0; i < 3; i++) {
+		moffs = i * LSEQ_MODE_SCRATCH_SIZE;
+		asd_write_reg_word(asd_ha, LmSEQ_RET_ADDR(lseq)+moffs,
+				   ret_addr[i]);
+		asd_write_reg_word(asd_ha, LmSEQ_REG0_MODE(lseq)+moffs, 0);
+		asd_write_reg_word(asd_ha, LmSEQ_MODE_FLAGS(lseq)+moffs, 0);
+		asd_write_reg_word(asd_ha, LmSEQ_RET_ADDR2(lseq)+moffs, 0xFFFF);
+		asd_write_reg_word(asd_ha, LmSEQ_RET_ADDR1(lseq)+moffs, 0xFFFF);
+		asd_write_reg_byte(asd_ha, LmSEQ_OPCODE_TO_CSEQ(lseq)+moffs, 0);
+		asd_write_reg_word(asd_ha, LmSEQ_DATA_TO_CSEQ(lseq)+moffs, 0);
+	}
+	/*
+	 * Mode 5 page 0 overlaps the same scratch page with Mode 0 page 3.
+	 */
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_RET_ADDR(lseq)+LSEQ_MODE5_PAGE0_OFFSET,
+			   ret_addr[5]);
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_REG0_MODE(lseq)+LSEQ_MODE5_PAGE0_OFFSET, 0);
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_MODE_FLAGS(lseq)+LSEQ_MODE5_PAGE0_OFFSET, 0);
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_RET_ADDR2(lseq)+LSEQ_MODE5_PAGE0_OFFSET,
+			   0xFFFF);
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_RET_ADDR1(lseq)+LSEQ_MODE5_PAGE0_OFFSET,
+			   0xFFFF);
+	asd_write_reg_byte(asd_ha,
+			   LmSEQ_OPCODE_TO_CSEQ(lseq)+LSEQ_MODE5_PAGE0_OFFSET,
+			   0);
+	asd_write_reg_word(asd_ha,
+			   LmSEQ_DATA_TO_CSEQ(lseq)+LSEQ_MODE5_PAGE0_OFFSET,
+			   0);
+
+	/* LSEQ Mode dependent 0, page 0 setup. */
+	asd_write_reg_word(asd_ha, LmSEQ_FIRST_INV_DDB_SITE(lseq),
+			   (u16)asd_ha->hw_prof.max_ddbs);
+	asd_write_reg_word(asd_ha, LmSEQ_EMPTY_TRANS_CTX(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_RESP_LEN(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_FIRST_INV_SCB_SITE(lseq),
+			   (u16)last_scb_site_no+1);
+	asd_write_reg_word(asd_ha, LmSEQ_INTEN_SAVE(lseq),
+			   (u16) ((LmM0INTEN_MASK & 0xFFFF0000) >> 16));
+	asd_write_reg_word(asd_ha, LmSEQ_INTEN_SAVE(lseq) + 2,
+			   (u16) LmM0INTEN_MASK & 0xFFFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_LINK_RST_FRM_LEN(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_LINK_RST_PROTOCOL(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_RESP_STATUS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_LAST_LOADED_SGE(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_SAVE_SCBPTR(lseq), 0);
+
+	/* LSEQ mode dependent, mode 1, page 0 setup. */
+	asd_write_reg_word(asd_ha, LmSEQ_Q_XMIT_HEAD(lseq), 0xFFFF);
+	asd_write_reg_word(asd_ha, LmSEQ_M1_EMPTY_TRANS_CTX(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_INI_CONN_TAG(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_FAILED_OPEN_STATUS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_XMIT_REQUEST_TYPE(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_M1_RESP_STATUS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_M1_LAST_LOADED_SGE(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_M1_SAVE_SCBPTR(lseq), 0);
+
+	/* LSEQ Mode dependent mode 2, page 0 setup */
+	asd_write_reg_word(asd_ha, LmSEQ_PORT_COUNTER(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_PM_TABLE_PTR(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_SATA_INTERLOCK_TMR_SAVE(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_IP_BITL(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_COPY_SMP_CONN_TAG(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_P0M2_OFFS1AH(lseq), 0);
+
+	/* LSEQ Mode dependent, mode 4/5, page 0 setup.
+ */
+	asd_write_reg_byte(asd_ha, LmSEQ_SAVED_OOB_STATUS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_SAVED_OOB_MODE(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_Q_LINK_HEAD(lseq), 0xFFFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_LINK_RST_ERR(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_SAVED_OOB_SIGNALS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_SAS_RESET_MODE(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_LINK_RESET_RETRY_COUNT(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_NUM_LINK_RESET_RETRIES(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_OOB_INT_ENABLES(lseq), 0);
+	/*
+	 * Set the desired interval between transmissions of the NOTIFY
+	 * (ENABLE SPINUP) primitive.  Must be initialized to val - 1.
+	 */
+	asd_write_reg_word(asd_ha, LmSEQ_NOTIFY_TIMER_TIMEOUT(lseq),
+			   ASD_NOTIFY_TIMEOUT - 1);
+	/* No delay for the first NOTIFY to be sent to the attached target. */
+	asd_write_reg_word(asd_ha, LmSEQ_NOTIFY_TIMER_DOWN_COUNT(lseq),
+			   ASD_NOTIFY_DOWN_COUNT);
+
+	/* LSEQ Mode dependent, mode 0 and 1, page 1 setup. */
+	for (i = 0; i < 2; i++) {
+		int j;
+		/* Start from Page 1 of Mode 0 and 1. */
+		moffs = LSEQ_PAGE_SIZE + i*LSEQ_MODE_SCRATCH_SIZE;
+		/* All the fields of page 1 can be initialized to 0. */
+		for (j = 0; j < LSEQ_PAGE_SIZE; j += 4)
+			asd_write_reg_dword(asd_ha, LmSCRATCH(lseq)+moffs+j, 0);
+	}
+
+	/* LSEQ Mode dependent, mode 2, page 1 setup. */
+	asd_write_reg_dword(asd_ha, LmSEQ_INVALID_DWORD_COUNT(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_DISPARITY_ERROR_COUNT(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_LOSS_OF_SYNC_COUNT(lseq), 0);
+
+	/* LSEQ Mode dependent, mode 4/5, page 1. */
+	for (i = 0; i < LSEQ_PAGE_SIZE; i += 4)
+		asd_write_reg_dword(asd_ha, LmSEQ_FRAME_TYPE_MASK(lseq)+i, 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_FRAME_TYPE_MASK(lseq), 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_DEST_ADDR_MASK(lseq), 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_DEST_ADDR_MASK(lseq)+1, 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_DEST_ADDR_MASK(lseq)+2, 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_SRC_ADDR_MASK(lseq), 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_SRC_ADDR_MASK(lseq)+1, 0xFF);
+	asd_write_reg_byte(asd_ha, LmSEQ_HASHED_SRC_ADDR_MASK(lseq)+2, 0xFF);
+	asd_write_reg_dword(asd_ha, LmSEQ_DATA_OFFSET(lseq), 0xFFFFFFFF);
+
+	/* LSEQ Mode dependent, mode 0, page 2 setup. */
+	asd_write_reg_dword(asd_ha, LmSEQ_SMP_RCV_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_DEVICE_BITS(lseq), 0);
+	asd_write_reg_word(asd_ha, LmSEQ_SDB_DDB(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_SDB_NUM_TAGS(lseq), 0);
+	asd_write_reg_byte(asd_ha, LmSEQ_SDB_CURR_TAG(lseq), 0);
+
+	/* LSEQ Mode Dependent 1, page 2 setup. */
+	asd_write_reg_dword(asd_ha, LmSEQ_TX_ID_ADDR_FRAME(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_TX_ID_ADDR_FRAME(lseq)+4, 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_OPEN_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_SRST_AS_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_LAST_LOADED_SG_EL(lseq), 0);
+
+	/* LSEQ Mode Dependent 2, page 2 setup. */
+	/* The LmSEQ_STP_SHUTDOWN_TIMER_TERM_TS is IGNORED by the sequencer,
+	 * i.e. always 0.
+ */
+	asd_write_reg_dword(asd_ha, LmSEQ_STP_SHUTDOWN_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_CLOSE_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_BREAK_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_DWS_RESET_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha,
+			    LmSEQ_SATA_INTERLOCK_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_MCTL_TIMER_TERM_TS(lseq), 0);
+
+	/* LSEQ Mode Dependent 4/5, page 2 setup. */
+	asd_write_reg_dword(asd_ha, LmSEQ_COMINIT_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_RCV_ID_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_RCV_FIS_TIMER_TERM_TS(lseq), 0);
+	asd_write_reg_dword(asd_ha, LmSEQ_DEV_PRES_TIMER_TERM_TS(lseq), 0);
+}
+
+/**
+ * asd_init_lseq_scratch -- setup and init link sequencers
+ * @asd_ha: pointer to host adapter struct
+ */
+static void asd_init_lseq_scratch(struct asd_ha_struct *asd_ha)
+{
+	u8 lseq;
+	u8 lseq_mask;
+
+	lseq_mask = asd_ha->hw_prof.enabled_phys;
+	for_each_sequencer(lseq_mask, lseq_mask, lseq) {
+		asd_init_lseq_mip(asd_ha, lseq);
+		asd_init_lseq_mdp(asd_ha, lseq);
+	}
+}
+
+/**
+ * asd_init_scb_sites -- initialize sequencer SCB sites (memory).
+ * @asd_ha: pointer to host adapter structure
+ *
+ * This should be done before initializing common CSEQ and LSEQ
+ * scratch since those areas depend on some computed values here,
+ * last_scb_site_no, etc.
+ */
+static void asd_init_scb_sites(struct asd_ha_struct *asd_ha)
+{
+	u16 site_no;
+	u16 max_scbs = 0;
+
+	for (site_no = asd_ha->hw_prof.max_scbs-1;
+	     site_no != (u16) -1;
+	     site_no--) {
+		u16 i;
+
+		/* Initialize all fields in the SCB site to 0. */
+		for (i = 0; i < ASD_SCB_SIZE; i += 4)
+			asd_scbsite_write_dword(asd_ha, site_no, i, 0);
+
+		/* A workaround needed by the SEQ to fix a SATA issue is to
+		 * exclude certain SCB sites from the free list. */
+		if (!SCB_SITE_VALID(site_no))
+			continue;
+
+		if (last_scb_site_no == 0)
+			last_scb_site_no = site_no;
+
+		/* For every SCB site, we need to initialize the
+		 * following fields: Q_NEXT, SCB_OPCODE, SCB_FLAGS,
+		 * and SG Element Flag. */
+
+		/* Q_NEXT field of the last SCB is invalidated. */
+		asd_scbsite_write_word(asd_ha, site_no, 0, first_scb_site_no);
+
+		/* Initialize SCB Site Opcode field to invalid. */
+		asd_scbsite_write_byte(asd_ha, site_no,
+				       offsetof(struct scb_header, opcode),
+				       0xFF);
+
+		/* Initialize SCB Site Flags field to mean a response
+		 * frame has been received.  This causes inadvertently
+		 * received frames to be dropped. */
+		asd_scbsite_write_byte(asd_ha, site_no, 0x49, 0x01);
+
+		first_scb_site_no = site_no;
+		max_scbs++;
+	}
+	asd_ha->hw_prof.max_scbs = max_scbs;
+	ASD_DPRINTK("max_scbs:%d\n", asd_ha->hw_prof.max_scbs);
+	ASD_DPRINTK("first_scb_site_no:0x%x\n", first_scb_site_no);
+	ASD_DPRINTK("last_scb_site_no:0x%x\n", last_scb_site_no);
+}
+
+/**
+ * asd_init_cseq_cio - initialize CSEQ CIO registers
+ * @asd_ha: pointer to host adapter structure
+ */
+static void asd_init_cseq_cio(struct asd_ha_struct *asd_ha)
+{
+	int i;
+
+	asd_write_reg_byte(asd_ha, CSEQCOMINTEN, 0);
+	asd_write_reg_byte(asd_ha, CSEQDLCTL, ASD_DL_SIZE_BITS);
+	asd_write_reg_byte(asd_ha, CSEQDLOFFS, 0);
+	asd_write_reg_byte(asd_ha, CSEQDLOFFS+1, 0);
+	asd_ha->seq.scbpro = 0;
+	asd_write_reg_dword(asd_ha, SCBPRO, 0);
+	asd_write_reg_dword(asd_ha, CSEQCON, 0);
+
+	/* Initialize CSEQ Mode 11 Interrupt Vectors.
+	 * The addresses are 16 bit wide and in dword units.
+	 * The values of their macros are in byte units.
+	 * Thus we have to divide by 4.
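+	 * E.g. CSEQ_INT_VEC0 is defined in aic94xx_seq_microcode.c as
+	 * (0x0848/4), so the values written to CM11INTVEC0-2 below are
+	 * already in dword units.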
+	 */
+	asd_write_reg_word(asd_ha, CM11INTVEC0, CSEQ_INT_VEC0);
+	asd_write_reg_word(asd_ha, CM11INTVEC1, CSEQ_INT_VEC1);
+	asd_write_reg_word(asd_ha, CM11INTVEC2, CSEQ_INT_VEC2);
+
+	/* Enable ARP2HALTC (ARP2 Halted from Halt Code Write). */
+	asd_write_reg_byte(asd_ha, CARP2INTEN, EN_ARP2HALTC);
+
+	/* Initialize CSEQ Scratch Page to 0x04. */
+	asd_write_reg_byte(asd_ha, CSCRATCHPAGE, 0x04);
+
+	/* Initialize CSEQ Mode[0-8] Dependent registers. */
+	/* Initialize Scratch Page to 0. */
+	for (i = 0; i < 9; i++)
+		asd_write_reg_byte(asd_ha, CMnSCRATCHPAGE(i), 0);
+
+	/* Reset the ARP2 Program Count. */
+	asd_write_reg_word(asd_ha, CPRGMCNT, CSEQ_IDLE_LOOP_ENTRY);
+
+	for (i = 0; i < 8; i++) {
+		/* Initialize Mode n Link m Interrupt Enable. */
+		asd_write_reg_dword(asd_ha, CMnINTEN(i), EN_CMnRSPMBXF);
+		/* Initialize Mode n Request Mailbox. */
+		asd_write_reg_dword(asd_ha, CMnREQMBX(i), 0);
+	}
+}
+
+/**
+ * asd_init_lseq_cio -- initialize LmSEQ CIO registers
+ * @asd_ha: pointer to host adapter structure
+ * @lseq: link sequencer of interest
+ */
+static void asd_init_lseq_cio(struct asd_ha_struct *asd_ha, int lseq)
+{
+	u8 *sas_addr;
+	int i;
+
+	/* Enable ARP2HALTC (ARP2 Halted from Halt Code Write). */
+	asd_write_reg_dword(asd_ha, LmARP2INTEN(lseq), EN_ARP2HALTC);
+
+	asd_write_reg_byte(asd_ha, LmSCRATCHPAGE(lseq), 0);
+
+	/* Initialize Mode 0,1, and 2 SCRATCHPAGE to 0. */
+	for (i = 0; i < 3; i++)
+		asd_write_reg_byte(asd_ha, LmMnSCRATCHPAGE(lseq, i), 0);
+
+	/* Initialize Mode 5 SCRATCHPAGE to 0. */
+	asd_write_reg_byte(asd_ha, LmMnSCRATCHPAGE(lseq, 5), 0);
+
+	asd_write_reg_dword(asd_ha, LmRSPMBX(lseq), 0);
+	/* Initialize Mode 0,1,2 and 5 Interrupt Enable and
+	 * Interrupt registers. */
+	asd_write_reg_dword(asd_ha, LmMnINTEN(lseq, 0), LmM0INTEN_MASK);
+	asd_write_reg_dword(asd_ha, LmMnINT(lseq, 0), 0xFFFFFFFF);
+	/* Mode 1 */
+	asd_write_reg_dword(asd_ha, LmMnINTEN(lseq, 1), LmM1INTEN_MASK);
+	asd_write_reg_dword(asd_ha, LmMnINT(lseq, 1), 0xFFFFFFFF);
+	/* Mode 2 */
+	asd_write_reg_dword(asd_ha, LmMnINTEN(lseq, 2), LmM2INTEN_MASK);
+	asd_write_reg_dword(asd_ha, LmMnINT(lseq, 2), 0xFFFFFFFF);
+	/* Mode 5 */
+	asd_write_reg_dword(asd_ha, LmMnINTEN(lseq, 5), LmM5INTEN_MASK);
+	asd_write_reg_dword(asd_ha, LmMnINT(lseq, 5), 0xFFFFFFFF);
+
+	/* Enable HW Timer status. */
+	asd_write_reg_byte(asd_ha, LmHWTSTATEN(lseq), LmHWTSTATEN_MASK);
+
+	/* Enable Primitive Status 0 and 1. */
+	asd_write_reg_dword(asd_ha, LmPRIMSTAT0EN(lseq), LmPRIMSTAT0EN_MASK);
+	asd_write_reg_dword(asd_ha, LmPRIMSTAT1EN(lseq), LmPRIMSTAT1EN_MASK);
+
+	/* Enable Frame Error. */
+	asd_write_reg_dword(asd_ha, LmFRMERREN(lseq), LmFRMERREN_MASK);
+	asd_write_reg_byte(asd_ha, LmMnHOLDLVL(lseq, 0), 0x50);
+
+	/* Initialize Mode 0 Transfer Level to 512. */
+	asd_write_reg_byte(asd_ha, LmMnXFRLVL(lseq, 0), LmMnXFRLVL_512);
+	/* Initialize Mode 1 Transfer Level to 256. */
+	asd_write_reg_byte(asd_ha, LmMnXFRLVL(lseq, 1), LmMnXFRLVL_256);
+
+	/* Initialize Program Count. */
+	asd_write_reg_word(asd_ha, LmPRGMCNT(lseq), LSEQ_IDLE_LOOP_ENTRY);
+
+	/* Enable Blind SG Move. */
+	asd_write_reg_dword(asd_ha, LmMODECTL(lseq), LmBLIND48);
+	asd_write_reg_word(asd_ha, LmM3SATATIMER(lseq),
+			   ASD_SATA_INTERLOCK_TIMEOUT);
+
+	(void) asd_read_reg_dword(asd_ha, LmREQMBX(lseq));
+
+	/* Clear Primitive Status 0 and 1. */
+	asd_write_reg_dword(asd_ha, LmPRMSTAT0(lseq), 0xFFFFFFFF);
+	asd_write_reg_dword(asd_ha, LmPRMSTAT1(lseq), 0xFFFFFFFF);
+
+	/* Clear HW Timer status. */
+	asd_write_reg_byte(asd_ha, LmHWTSTAT(lseq), 0xFF);
+
+	/* Clear DMA Errors for Mode 0 and 1.
*/ + asd_write_reg_byte(asd_ha, LmMnDMAERRS(lseq, 0), 0xFF); + asd_write_reg_byte(asd_ha, LmMnDMAERRS(lseq, 1), 0xFF); + + /* Clear SG DMA Errors for Mode 0 and 1. */ + asd_write_reg_byte(asd_ha, LmMnSGDMAERRS(lseq, 0), 0xFF); + asd_write_reg_byte(asd_ha, LmMnSGDMAERRS(lseq, 1), 0xFF); + + /* Clear Mode 0 Buffer Parity Error. */ + asd_write_reg_byte(asd_ha, LmMnBUFSTAT(lseq, 0), LmMnBUFPERR); + + /* Clear Mode 0 Frame Error register. */ + asd_write_reg_dword(asd_ha, LmMnFRMERR(lseq, 0), 0xFFFFFFFF); + + /* Reset LSEQ external interrupt arbiter. */ + asd_write_reg_byte(asd_ha, LmARP2INTCTL(lseq), RSTINTCTL); + + /* Set the Phy SAS for the LmSEQ WWN. */ + sas_addr = asd_ha->phys[lseq].phy_desc->sas_addr; + for (i = 0; i < SAS_ADDR_SIZE; i++) + asd_write_reg_byte(asd_ha, LmWWN(lseq) + i, sas_addr[i]); + + /* Set the Transmit Size to 1024 bytes, 0 = 256 Dwords. */ + asd_write_reg_byte(asd_ha, LmMnXMTSIZE(lseq, 1), 0); + + /* Set the Bus Inactivity Time Limit Timer. */ + asd_write_reg_word(asd_ha, LmBITL_TIMER(lseq), 9); + + /* Enable SATA Port Multiplier. */ + asd_write_reg_byte(asd_ha, LmMnSATAFS(lseq, 1), 0x80); + + /* Initialize Interrupt Vector[0-10] address in Mode 3. + * See the comment on CSEQ_INT_* */ + asd_write_reg_word(asd_ha, LmM3INTVEC0(lseq), LSEQ_INT_VEC0); + asd_write_reg_word(asd_ha, LmM3INTVEC1(lseq), LSEQ_INT_VEC1); + asd_write_reg_word(asd_ha, LmM3INTVEC2(lseq), LSEQ_INT_VEC2); + asd_write_reg_word(asd_ha, LmM3INTVEC3(lseq), LSEQ_INT_VEC3); + asd_write_reg_word(asd_ha, LmM3INTVEC4(lseq), LSEQ_INT_VEC4); + asd_write_reg_word(asd_ha, LmM3INTVEC5(lseq), LSEQ_INT_VEC5); + asd_write_reg_word(asd_ha, LmM3INTVEC6(lseq), LSEQ_INT_VEC6); + asd_write_reg_word(asd_ha, LmM3INTVEC7(lseq), LSEQ_INT_VEC7); + asd_write_reg_word(asd_ha, LmM3INTVEC8(lseq), LSEQ_INT_VEC8); + asd_write_reg_word(asd_ha, LmM3INTVEC9(lseq), LSEQ_INT_VEC9); + asd_write_reg_word(asd_ha, LmM3INTVEC10(lseq), LSEQ_INT_VEC10); + /* + * Program the Link LED control, applicable only for + * Chip Rev. B or later. + */ + asd_write_reg_dword(asd_ha, LmCONTROL(lseq), + (LEDTIMER | LEDMODE_TXRX | LEDTIMERS_100ms)); + + /* Set the Align Rate for SAS and STP mode. */ + asd_write_reg_byte(asd_ha, LmM1SASALIGN(lseq), SAS_ALIGN_DEFAULT); + asd_write_reg_byte(asd_ha, LmM1STPALIGN(lseq), STP_ALIGN_DEFAULT); +} + + +/** + * asd_post_init_cseq -- clear CSEQ Mode n Int. status and Response mailbox + * @asd_ha: pointer to host adapter struct + */ +static void asd_post_init_cseq(struct asd_ha_struct *asd_ha) +{ + int i; + + for (i = 0; i < 8; i++) + asd_write_reg_dword(asd_ha, CMnINT(i), 0xFFFFFFFF); + for (i = 0; i < 8; i++) + asd_read_reg_dword(asd_ha, CMnRSPMBX(i)); + /* Reset the external interrupt arbiter. */ + asd_write_reg_byte(asd_ha, CARP2INTCTL, RSTINTCTL); +} + +/** + * asd_init_ddb_0 -- initialize DDB 0 + * @asd_ha: pointer to host adapter structure + * + * Initialize DDB site 0 which is used internally by the sequencer. 
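+ *
+ * Fields inside DDB site 0 are laid out by struct asd_ddb_seq_shared
+ * and addressed via offsetof(), e.g. (illustrative):
+ *
+ *	asd_ddbsite_write_word(asd_ha, 0,
+ *		offsetof(struct asd_ddb_seq_shared, q_free_ddb_head), 0);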
+ */ +static void asd_init_ddb_0(struct asd_ha_struct *asd_ha) +{ + int i; + + /* Zero out the DDB explicitly */ + for (i = 0; i < sizeof(struct asd_ddb_seq_shared); i+=4) + asd_ddbsite_write_dword(asd_ha, 0, i, 0); + + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, q_free_ddb_head), 0); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, q_free_ddb_tail), + asd_ha->hw_prof.max_ddbs-1); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, q_free_ddb_cnt), 0); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, q_used_ddb_head), 0xFFFF); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, q_used_ddb_tail), 0xFFFF); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, shared_mem_lock), 0); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, smp_conn_tag), 0); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, est_nexus_buf_cnt), 0); + asd_ddbsite_write_word(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, est_nexus_buf_thresh), + asd_ha->hw_prof.num_phys * 2); + asd_ddbsite_write_byte(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, settable_max_contexts),0); + asd_ddbsite_write_byte(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, conn_not_active), 0xFF); + asd_ddbsite_write_byte(asd_ha, 0, + offsetof(struct asd_ddb_seq_shared, phy_is_up), 0x00); + /* DDB 0 is reserved */ + set_bit(0, asd_ha->hw_prof.ddb_bitmap); +} + +/** + * asd_seq_setup_seqs -- setup and initialize central and link sequencers + * @asd_ha: pointer to host adapter structure + */ +static void asd_seq_setup_seqs(struct asd_ha_struct *asd_ha) +{ + int lseq; + u8 lseq_mask; + + /* Initialize SCB sites. Done first to compute some values which + * the rest of the init code depends on. */ + asd_init_scb_sites(asd_ha); + + /* Initialize CSEQ Scratch RAM registers. */ + asd_init_cseq_scratch(asd_ha); + + /* Initialize LmSEQ Scratch RAM registers. */ + asd_init_lseq_scratch(asd_ha); + + /* Initialize CSEQ CIO registers. */ + asd_init_cseq_cio(asd_ha); + + asd_init_ddb_0(asd_ha); + + /* Initialize LmSEQ CIO registers. */ + lseq_mask = asd_ha->hw_prof.enabled_phys; + for_each_sequencer(lseq_mask, lseq_mask, lseq) + asd_init_lseq_cio(asd_ha, lseq); + asd_post_init_cseq(asd_ha); +} + + +/** + * asd_seq_start_cseq -- start the central sequencer, CSEQ + * @asd_ha: pointer to host adapter structure + */ +static int asd_seq_start_cseq(struct asd_ha_struct *asd_ha) +{ + /* Reset the ARP2 instruction to location zero. */ + asd_write_reg_word(asd_ha, CPRGMCNT, CSEQ_IDLE_LOOP_ENTRY); + + /* Unpause the CSEQ */ + return asd_unpause_cseq(asd_ha); +} + +/** + * asd_seq_start_lseq -- start a link sequencer + * @asd_ha: pointer to host adapter structure + * @lseq: the link sequencer of interest + */ +static int asd_seq_start_lseq(struct asd_ha_struct *asd_ha, int lseq) +{ + /* Reset the ARP2 instruction to location zero. 
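+	 * LSEQ_IDLE_LOOP_ENTRY is (0x0000/4), the idle loop at the
+	 * start of the link sequencer image, so the write below
+	 * restarts the LSEQ at its entry point before unpausing it.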
+	 */
+	asd_write_reg_word(asd_ha, LmPRGMCNT(lseq), LSEQ_IDLE_LOOP_ENTRY);
+
+	/* Unpause the LmSEQ */
+	return asd_seq_unpause_lseq(asd_ha, lseq);
+}
+
+int asd_init_seqs(struct asd_ha_struct *asd_ha)
+{
+	int err;
+
+	asd_printk("using sequencer %s\n", SAS_RAZOR_SEQUENCER_VERSION);
+	err = asd_seq_download_seqs(asd_ha);
+	if (err) {
+		asd_printk("couldn't download sequencers for %s\n",
+			   pci_name(asd_ha->pcidev));
+		return err;
+	}
+
+	asd_seq_setup_seqs(asd_ha);
+
+	return 0;
+}
+
+int asd_start_seqs(struct asd_ha_struct *asd_ha)
+{
+	int err;
+	u8 lseq_mask;
+	int lseq;
+
+	err = asd_seq_start_cseq(asd_ha);
+	if (err) {
+		asd_printk("couldn't start CSEQ for %s\n",
+			   pci_name(asd_ha->pcidev));
+		return err;
+	}
+
+	lseq_mask = asd_ha->hw_prof.enabled_phys;
+	for_each_sequencer(lseq_mask, lseq_mask, lseq) {
+		err = asd_seq_start_lseq(asd_ha, lseq);
+		if (err) {
+			asd_printk("couldn't start LSEQ %d for %s\n", lseq,
+				   pci_name(asd_ha->pcidev));
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * asd_update_port_links -- update port_map_by_links and phy_is_up
+ * @sas_phy: pointer to the phy which has been added to a port
+ *
+ * 1) When a link reset has completed and we got BYTES DMAED with a
+ * valid frame we call this function for that phy, to indicate that
+ * the phy is up, i.e. we update the phy_is_up in DDB 0.  The
+ * sequencer checks phy_is_up when pending SCBs are to be sent, and
+ * when an open address frame has been received.
+ *
+ * 2) When we know of ports, we call this function to update the map
+ * of phys participating in that port, i.e. we update the
+ * port_map_by_links in DDB 0.  When a HARD_RESET primitive has been
+ * received, the sequencer disables all phys in that port.
+ * port_map_by_links is also used as the conn_mask byte in the
+ * initiator/target port DDB.
+ */
+void asd_update_port_links(struct asd_sas_phy *sas_phy)
+{
+	struct asd_ha_struct *asd_ha = sas_phy->ha->lldd_ha;
+	const u8 phy_mask = (u8) sas_phy->port->phy_mask;
+	u8 phy_is_up;
+	u8 mask;
+	int i, err;
+
+	for_each_phy(phy_mask, mask, i)
+		asd_ddbsite_write_byte(asd_ha, 0,
+				       offsetof(struct asd_ddb_seq_shared,
+						port_map_by_links)+i, phy_mask);
+
+	for (i = 0; i < 12; i++) {
+		phy_is_up = asd_ddbsite_read_byte(asd_ha, 0,
+			  offsetof(struct asd_ddb_seq_shared, phy_is_up));
+		err = asd_ddbsite_update_byte(asd_ha, 0,
+				offsetof(struct asd_ddb_seq_shared, phy_is_up),
+				phy_is_up,
+				phy_is_up | phy_mask);
+		if (!err)
+			break;
+		else if (err == -EFAULT) {
+			asd_printk("phy_is_up: parity error in DDB 0\n");
+			break;
+		}
+	}
+
+	if (err)
+		asd_printk("couldn't update DDB 0:error:%d\n", err);
+}
diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq.h linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq.h
--- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq.h	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,39 @@
+/*
+ * Aic94xx SAS/SATA driver sequencer interface header file.
+ *
+ * Copyright (C) 2005 Adaptec, Inc.  All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _AIC94XX_SEQ_H_ +#define _AIC94XX_SEQ_H_ + +int asd_pause_cseq(struct asd_ha_struct *asd_ha); +int asd_unpause_cseq(struct asd_ha_struct *asd_ha); +int asd_pause_lseq(struct asd_ha_struct *asd_ha, u8 lseq_mask); +int asd_unpause_lseq(struct asd_ha_struct *asd_ha, u8 lseq_mask); +int asd_init_seqs(struct asd_ha_struct *asd_ha); +int asd_start_seqs(struct asd_ha_struct *asd_ha); + +void asd_update_port_links(struct asd_sas_phy *phy); + +#endif diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq_microcode.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq_microcode.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_seq_microcode.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_seq_microcode.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,1467 @@ +/* + * Aic94xx SAS/SATA driver central and link sequencer code for AIC-94xx + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Compile options: RAZOR BYPASS_OOB SATA_II_NCQ TARGET_MODE CONCURR_CONNECTION + */ + +/* + * Those are offsets in byte units. Addresses in sequencer are in + * dword units, so those values will have to be divided by 4 before + * being written out to the sequencer. 
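+ *
+ * E.g. the mode 2 task handler lives at byte offset 0x1000 and is
+ * therefore recorded below as MODE2_TASK (0x1000/4), ready to be
+ * written as-is into a 16-bit vector register.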
+ */ +#define CSEQ_IDLE_LOOP_ENTRY (0x0000/4) +#define CSEQ_INT_VEC0 (0x0848/4) +#define CSEQ_INT_VEC1 (0x00E8/4) +#define CSEQ_INT_VEC2 (0x00EC/4) + +#define LSEQ_IDLE_LOOP_ENTRY (0x0000/4) +#define LSEQ_INT_VEC0 (0x0120/4) +#define LSEQ_INT_VEC1 (0x1688/4) +#define LSEQ_INT_VEC2 (0x0360/4) +#define LSEQ_INT_VEC3 (0x0390/4) +#define LSEQ_INT_VEC4 (0x1670/4) +#define LSEQ_INT_VEC5 (0x27B0/4) +#define LSEQ_INT_VEC6 (0x28C0/4) +#define LSEQ_INT_VEC7 (0x0118/4) +#define LSEQ_INT_VEC8 (0x15E0/4) +#define LSEQ_INT_VEC9 (0x18D4/4) +#define LSEQ_INT_VEC10 (0x0050/4) + +#define MODE2_TASK (0x1000/4) + +#define SAS_RAZOR_SEQUENCER_VERSION "V17/10c6" + +static const u8 Cseq[] = { +0x04,0x10,0x88,0xB3,0x88,0x11,0x00,0x80,0x06,0x11,0x48,0x80,0x01,0xC7,0x5F,0x68, +0xFD,0x05,0x0A,0x88,0x07,0x11,0x48,0x00,0x00,0x00,0x14,0x38,0x02,0x05,0x0A,0x00, +0x8A,0xD4,0x17,0x28,0xFF,0xD1,0x17,0x70,0x00,0x0C,0xD6,0x5A,0xFD,0x05,0x0A,0x88, +0x05,0x11,0x48,0x80,0x00,0x00,0x14,0x38,0x02,0x05,0x0A,0x00,0x8A,0xD4,0x25,0xA8, +0xFF,0xD1,0x25,0xF0,0x00,0x0C,0xD6,0x5A,0xFD,0x05,0x0A,0x88,0x04,0x11,0x48,0x00, +0xFF,0xC1,0x2D,0xF0,0x00,0x0C,0x20,0xDB,0xFF,0xC9,0x31,0xF0,0x00,0x0C,0x28,0x5B, +0x06,0x11,0x48,0x80,0xFF,0xD1,0x37,0xF0,0x00,0x0C,0x78,0x5E,0x04,0x11,0x48,0x00, +0xFF,0xDC,0x3D,0xF8,0x00,0x0C,0xE6,0xDB,0x02,0x05,0x0A,0x00,0x80,0xE1,0x45,0x68, +0x02,0xE2,0x41,0x30,0x02,0xE0,0x0D,0xB4,0x01,0x35,0xAA,0x69,0xFF,0xCD,0xF9,0x60, +0xFF,0xC5,0x7B,0xE0,0xFF,0xD1,0x63,0x61,0x00,0x0C,0x02,0xC0,0x06,0x11,0x48,0x80, +0x01,0x00,0x8C,0xB3,0x02,0x20,0x88,0xB3,0x04,0x06,0x80,0xB3,0x01,0xC7,0x8F,0x03, +0x04,0x11,0x48,0x00,0x88,0x11,0x00,0x80,0x00,0x0C,0x3E,0xC0,0xFE,0xC7,0x8F,0x8B, +0x01,0xC6,0x01,0xB0,0x02,0xC4,0x41,0xB0,0x02,0xC2,0x0D,0x30,0x02,0xC0,0x0D,0xB0, +0x07,0x11,0x48,0x84,0x02,0x20,0xC4,0x33,0x02,0x06,0xC0,0x33,0x02,0xE0,0x0D,0xB4, +0x80,0xE1,0xC3,0x03,0x00,0x0C,0x02,0xC0,0x01,0x11,0x2A,0x80,0x02,0x11,0x2A,0x80, +0x01,0x05,0x0A,0x84,0x80,0x00,0x1C,0x38,0x02,0xC4,0x41,0xB0,0x02,0x08,0x27,0xB0, +0x01,0x0A,0x15,0xB0,0xC0,0x0A,0x86,0xF0,0xD0,0x0A,0xA6,0x60,0x00,0x04,0x27,0x00, +0x44,0x04,0x19,0xA8,0x01,0x11,0x1A,0x80,0x01,0x12,0x08,0x32,0x01,0x0C,0x18,0x18, +0x04,0x12,0x26,0xB0,0x01,0x0C,0x19,0x1A,0x01,0x0C,0x15,0xB0,0x00,0x0B,0xA1,0xE0, +0x5A,0x00,0x0C,0x38,0xD0,0x0A,0x9F,0xF0,0x07,0x11,0x8C,0xC4,0x05,0x11,0x8C,0x44, +0xFF,0x04,0xBF,0x68,0x02,0x00,0x41,0x30,0x00,0x0C,0xBE,0x40,0x01,0x4D,0x15,0xB0, +0x01,0x0A,0x26,0xB0,0x04,0x3C,0xB1,0x33,0xFF,0x0A,0xB2,0x68,0x02,0x30,0xB0,0xB3, +0x00,0x00,0xB4,0x3B,0x04,0xD8,0x27,0x30,0x02,0x00,0x0D,0xB0,0x02,0x0E,0x0C,0xB0, +0xB1,0x00,0xA2,0x28,0x02,0x06,0x1C,0xB0,0x02,0x06,0x40,0xB0,0x02,0x11,0x4A,0x80, +0x01,0xF4,0x27,0xB0,0x00,0x11,0x4A,0x88,0x01,0x4E,0x22,0x30,0xFF,0x21,0xCC,0x70, +0xFF,0x0E,0xCC,0x78,0x10,0x4D,0x7E,0x78,0x02,0x20,0x88,0xB3,0x02,0x11,0x4A,0x80, +0x04,0xF0,0x81,0x30,0x04,0xEC,0x89,0x30,0x03,0xE8,0x15,0x88,0x44,0x0A,0x14,0xA8, +0x00,0x11,0x4A,0x88,0x80,0x0E,0x90,0x98,0xFF,0x0A,0x96,0x08,0x1C,0x11,0x6E,0x80, +0x00,0x0C,0x6A,0x58,0x40,0x35,0x02,0xF8,0x40,0x11,0x6A,0x00,0x04,0x11,0x68,0x80, +0x02,0x11,0x4A,0x80,0x04,0x40,0xE0,0x33,0x10,0x4D,0xF4,0xF8,0x04,0xE8,0xE1,0x33, +0xFC,0xF0,0xE1,0x0B,0x01,0xF4,0xE9,0x93,0x00,0x11,0x4A,0x88,0x00,0x0C,0x70,0xC0, +0x07,0x11,0x48,0x00,0xFF,0xD1,0x03,0x70,0xB1,0x00,0x7C,0x28,0xFF,0x0E,0xFA,0xE8, +0x04,0x11,0x48,0x00,0x02,0x20,0xC8,0x33,0x80,0x00,0x1C,0x38,0x02,0xCC,0x41,0x30, +0x17,0x4D,0x11,0xF1,0x0C,0x4D,0x15,0x71,0x0B,0x4D,0x15,0xF1,0x0A,0x4D,0x27,0xF1, +0x30,0x50,0x27,0xB0,0x00,0x0C,0x28,0xC1,0x04,0x0C,0x27,0x30,0x00,0x00,0x26,0xB8, 
+0x14,0x00,0x26,0xB8,0x08,0x11,0x26,0xB0,0x14,0x10,0x27,0x30,0x0C,0x28,0x27,0xB0, +0x02,0x46,0x45,0x30,0x04,0xB0,0x27,0xB0,0x00,0x0C,0x28,0xC1,0x10,0x10,0x27,0xB0, +0x02,0xE4,0x41,0x30,0x44,0x0C,0x18,0xA8,0x01,0x11,0x1A,0x80,0x01,0x0C,0xD2,0x33, +0x80,0xFF,0x14,0xB8,0x83,0x0E,0x14,0xA8,0xB1,0x00,0x48,0x29,0x00,0x11,0xD0,0x8B, +0x80,0xE8,0xD1,0x2B,0x08,0x12,0x80,0xB0,0x01,0x0A,0x90,0x30,0x07,0x0C,0x18,0x18, +0x30,0x12,0x14,0x08,0xFF,0x0A,0x96,0x08,0x1C,0x11,0x6E,0x80,0x02,0x08,0xCD,0x33, +0x00,0x0C,0x82,0xDC,0x02,0xCC,0x41,0x30,0x02,0xE6,0x79,0xB2,0x02,0xE8,0x7D,0xB2, +0x00,0x0C,0x6A,0x58,0x40,0x35,0x02,0xF8,0x40,0x11,0x6A,0x00,0xFF,0xFF,0xB0,0xBB, +0x01,0x11,0x1A,0x80,0xCC,0x11,0x38,0x5F,0x00,0x0C,0x3C,0xDF,0xFF,0xCD,0xF9,0x60, +0x00,0x0C,0x70,0xC0,0x02,0xD0,0x41,0xB0,0x02,0x0C,0x19,0xB0,0x80,0x00,0x14,0xB8, +0xB1,0x00,0xB4,0xAE,0x80,0x0F,0x70,0xE9,0x80,0x00,0xFC,0x3A,0x00,0x0C,0x72,0xC1, +0x02,0x0C,0xFD,0x32,0x08,0x10,0x81,0xB0,0x08,0x18,0x97,0x80,0x01,0x7E,0x91,0xB0, +0x1C,0x11,0x6E,0x80,0x00,0x0C,0x6A,0x58,0x40,0x35,0x02,0xF8,0x40,0x11,0x6A,0x00, +0x08,0x40,0x20,0xB2,0x08,0x50,0x81,0x30,0xFF,0x58,0x97,0x08,0x01,0x7E,0x91,0xB0, +0x1C,0x11,0x6E,0x80,0x00,0x0C,0x6A,0x58,0x40,0x35,0x02,0xF8,0x40,0x11,0x6A,0x00, +0x08,0x40,0xA0,0x32,0x02,0x7E,0x15,0xB0,0x82,0x10,0x14,0x28,0x01,0x10,0x22,0x98, +0x84,0x11,0x14,0xA8,0x83,0x0C,0x19,0x2A,0x02,0x0C,0x15,0xB0,0x89,0x10,0x64,0xA9, +0xFF,0xFF,0xB0,0xBB,0x01,0x11,0x1A,0x80,0xD0,0x11,0x38,0xDF,0x00,0x11,0x3E,0x5F, +0x00,0x0C,0x70,0xC0,0x00,0x0C,0xD2,0xD9,0x01,0x11,0x6A,0x00,0x02,0x11,0x4A,0x80, +0xFC,0xE0,0x81,0x88,0x07,0xE1,0x83,0xB0,0x03,0xE0,0x15,0x08,0x44,0x0A,0x14,0xA8, +0x00,0x11,0x4A,0x88,0x80,0x11,0x90,0x00,0x08,0x0A,0x96,0x00,0x1C,0x11,0x6E,0x80, +0x01,0x00,0x14,0xB8,0x83,0x30,0x60,0x28,0x00,0x0C,0x6A,0x58,0x40,0x35,0x02,0xF8, +0x40,0x11,0x6A,0x00,0x80,0x80,0x00,0x32,0x00,0x0C,0xE6,0x59,0x88,0x11,0x00,0x80, +0x00,0x0C,0x70,0xC0,0x06,0x11,0x48,0x80,0xD6,0x01,0x18,0x38,0xFF,0xD7,0xDD,0x61, +0xDA,0x01,0x18,0x38,0xFF,0xDB,0xE5,0x71,0x02,0x0C,0x1C,0xB0,0x02,0x12,0x40,0xB0, +0x02,0x00,0x27,0x30,0x04,0x11,0x48,0x9C,0x14,0x11,0x2A,0x00,0x02,0x11,0x4A,0x80, +0x08,0x00,0xC1,0xB3,0x00,0x11,0x4A,0x88,0xC0,0x0A,0x15,0x88,0xFF,0x0A,0x04,0xFA, +0x40,0x0A,0x42,0x72,0x80,0x0A,0x70,0xF2,0x01,0x0A,0x15,0xB0,0xC0,0x0A,0x8C,0x72, +0xC3,0x0A,0xBE,0xF2,0xC4,0x0A,0x02,0xF2,0xC6,0x0A,0xC0,0xF2,0xD0,0x0A,0x8E,0x72, +0x15,0x11,0x2A,0x80,0xA1,0x00,0x08,0xAB,0x08,0x0A,0x0B,0xF2,0x03,0x0A,0x0B,0x72, +0x04,0x0A,0x29,0x62,0x08,0x48,0x17,0xFA,0x02,0x46,0x45,0x30,0x08,0x11,0x20,0xDC, +0x02,0x52,0x21,0xB3,0x80,0xBF,0x17,0x7A,0x00,0x0C,0xF4,0xDE,0x03,0x0A,0x1B,0x62, +0xA1,0x00,0xCC,0xAA,0x08,0x0A,0x29,0xF2,0x04,0x48,0x29,0x7A,0xB1,0x00,0xD2,0xAE, +0x00,0x0C,0x3E,0xCF,0x02,0x20,0x0C,0xB0,0x00,0x0C,0xB0,0x5F,0x02,0x06,0x40,0xB0, +0xB1,0x00,0xB0,0xA9,0xB1,0x00,0xB6,0x28,0x00,0x0C,0x3C,0x4F,0x02,0x46,0x45,0x30, +0x01,0x00,0x14,0xB8,0x83,0xAC,0x59,0x2B,0x0F,0x0A,0x9F,0x76,0x02,0xA0,0xAD,0xB3, +0x02,0x20,0x40,0x33,0xFF,0xFF,0x04,0x3A,0xFF,0xD7,0x41,0xF2,0xA1,0x00,0x80,0x2F, +0xC1,0x11,0x36,0x47,0xB1,0x00,0xB0,0xA9,0xB1,0x00,0xB6,0x28,0x80,0x0B,0x55,0x7A, +0x00,0x11,0x18,0x08,0xB0,0x00,0x44,0xAF,0x04,0x0C,0x81,0xB2,0x02,0x46,0x45,0x30, +0x01,0x00,0x14,0xB8,0x83,0xAC,0x59,0x2B,0x00,0x04,0x18,0xB8,0xB1,0x00,0x68,0xA9, +0xFF,0xFF,0x08,0x3A,0x02,0x46,0x45,0x30,0x02,0x20,0x0C,0xB0,0xC8,0x11,0x18,0x5C, +0x02,0x06,0x40,0xB0,0x02,0xA2,0xAD,0x33,0x02,0x20,0x44,0xB3,0x00,0x0C,0x6A,0x52, +0xA1,0x00,0x88,0xAF,0xFF,0x8E,0x6F,0xEA,0xC9,0x11,0x36,0xC7,0x02,0x20,0x18,0x37, 
+0x02,0x20,0x0C,0xB0,0x44,0x0B,0x15,0xA8,0x00,0x0B,0x01,0x80,0x02,0x06,0x40,0xB0, +0xFF,0xFF,0x00,0xBA,0xFF,0xE5,0x85,0x62,0xFF,0xE7,0x85,0xE2,0x02,0x20,0xC8,0x33, +0x02,0x20,0xCC,0xB3,0xA1,0x00,0x9C,0x28,0x02,0xE6,0xA9,0xB3,0x02,0x20,0xCC,0xB3, +0x02,0xD4,0x41,0x30,0x02,0xE6,0x01,0x36,0x07,0x11,0x98,0xC2,0x00,0x00,0x44,0x38, +0x00,0x11,0x16,0x88,0x01,0x0B,0x15,0x30,0x83,0x8E,0x1D,0x2B,0x05,0x11,0x98,0x42, +0x01,0x0C,0x48,0x30,0x00,0x11,0x08,0x0A,0x00,0x11,0x18,0x8A,0xFF,0xFF,0x04,0x3A, +0xFF,0xD1,0xAB,0xE2,0x02,0x20,0xA0,0xB3,0x02,0x20,0xA4,0x33,0x01,0x11,0xB0,0x83, +0x00,0x0C,0xB2,0xC2,0x02,0x20,0x14,0xB0,0x02,0xD2,0x41,0x30,0x02,0x0A,0x04,0x32, +0x02,0x0A,0xA4,0xB3,0x06,0x11,0x48,0x80,0x01,0xC7,0x15,0x30,0x04,0x11,0x48,0x00, +0x01,0x0A,0x0C,0xED,0x01,0x0C,0x48,0x30,0x00,0x0C,0x14,0x43,0xD1,0x11,0x36,0xC7, +0x02,0x46,0x45,0x30,0xB1,0x00,0xD2,0xAE,0x00,0x0C,0x3E,0xCF,0xFD,0x0B,0xD3,0xE2, +0xFD,0x8F,0xD3,0xE2,0x04,0x9F,0xD5,0xEA,0x04,0x11,0x20,0xDC,0x02,0x20,0x60,0xB3, +0xA1,0x00,0xE2,0x2E,0x2A,0x11,0x3E,0xC7,0x2B,0x11,0x3E,0x47,0xC0,0x01,0x18,0xB8, +0x01,0xD6,0x15,0x30,0x00,0x0C,0x18,0x98,0x01,0x12,0x00,0x30,0xC8,0x01,0x18,0x38, +0x0F,0x00,0x14,0x08,0x00,0x0C,0x18,0x98,0x02,0x0C,0x1C,0xB0,0x02,0x0C,0x0C,0x30, +0x02,0x0E,0x0C,0xB0,0xB1,0x00,0x7C,0x28,0xFF,0x0E,0xF4,0x7A,0xFF,0xD1,0xEB,0x62, +0x04,0x06,0x22,0x30,0x00,0x0C,0x1C,0xC3,0x01,0x0C,0xC0,0x33,0x02,0x06,0x1C,0xB0, +0x02,0x06,0x18,0x30,0xFF,0xFF,0x14,0x38,0x83,0xD4,0xA9,0x2B,0xFF,0x12,0x26,0x18, +0x01,0xD6,0xAD,0x1B,0x07,0xD6,0xAD,0x8B,0xFF,0x0C,0x18,0x98,0xFF,0x12,0x12,0xFB, +0xC0,0x01,0x1C,0x38,0x01,0xD7,0x15,0xB0,0x00,0x0E,0x1C,0x98,0x01,0x00,0x26,0xB0, +0x07,0x0E,0xAE,0x8B,0x00,0xE0,0x3B,0xDF,0x02,0x05,0x0A,0x00,0x00,0x00,0x14,0x38, +0x8A,0xD4,0x1D,0x2B,0xFF,0xD1,0xD7,0x62,0x04,0x11,0x48,0x00,0x88,0x11,0x00,0x04, +0xC0,0x11,0x30,0x5B,0x00,0x0C,0x42,0x4F,0x00,0x11,0x3A,0xDF,0x88,0x11,0x00,0x04, +0xC8,0x11,0x30,0xDB,0x00,0x0C,0x42,0x4F,0x01,0x11,0x3A,0x5F,0x88,0x11,0x00,0x04, +0x01,0x11,0x1A,0x80,0x02,0x05,0x0A,0x00,0xFF,0xFF,0xB0,0xBB,0x02,0x12,0x40,0xB0, +0xFE,0x0C,0x18,0x18,0x02,0x0C,0x0C,0x30,0x02,0x46,0x45,0x30,0x01,0x9D,0x4F,0x7B, +0x20,0x0B,0x51,0x7B,0x02,0x9E,0x51,0x7B,0x04,0x4C,0x51,0xEB,0x04,0x49,0x51,0xEB, +0x20,0x9D,0xD1,0x6B,0xB1,0x00,0xA8,0x2E,0x00,0x0C,0xD0,0x4B,0x40,0x9E,0xD1,0x6B, +0x01,0x9C,0x15,0xB0,0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0x00,0xAF,0x15,0x88, +0x02,0x06,0x44,0x30,0xFF,0x0A,0x64,0xEB,0x80,0xBF,0xD1,0x6B,0x09,0x11,0x6E,0x03, +0x00,0x0C,0xF4,0xDE,0x00,0x0C,0xD4,0x43,0x80,0xBF,0x73,0x7B,0x09,0xB7,0x6B,0x63, +0x00,0x11,0x7E,0x0B,0x02,0x0A,0x0C,0x30,0x00,0x0C,0x1C,0x5F,0x02,0x06,0x14,0x30, +0x00,0x0C,0xDE,0xCB,0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0x00,0xAE,0xC1,0x08, +0x02,0x06,0x44,0x30,0x02,0x62,0x14,0xB0,0x01,0x0B,0xD4,0x6B,0xFF,0x0A,0xD4,0x73, +0x04,0x9D,0x97,0x7B,0x02,0x0A,0x0C,0x30,0x00,0x11,0x06,0xDC,0x04,0x11,0x14,0x30, +0x01,0xA8,0x19,0x30,0x01,0xA9,0x15,0xB0,0xB1,0x00,0xB4,0xAE,0x02,0x06,0x14,0x30, +0x80,0x0F,0x96,0x7B,0x01,0x11,0x06,0x5C,0x00,0x0C,0xD4,0x43,0x02,0x06,0x18,0x30, +0x02,0x20,0x0C,0xB0,0x02,0x22,0x0C,0x30,0x01,0x0A,0x00,0x30,0x02,0x06,0x44,0x30, +0x02,0x06,0x40,0xB0,0x00,0x0C,0x38,0x5F,0x04,0x4C,0xB5,0x7B,0xFF,0x03,0xB5,0xF3, +0x02,0x20,0x0C,0xB0,0x02,0x02,0x41,0xB0,0x01,0x0C,0x18,0x18,0x00,0x0C,0x36,0xDF, +0x02,0x06,0x40,0xB0,0xFF,0xFF,0x04,0x3A,0x04,0x9D,0xCF,0xFB,0xC0,0x0C,0xCE,0xF3, +0xFF,0xAA,0xCF,0xFB,0x02,0x20,0x0C,0xB0,0x01,0x11,0x14,0x00,0x00,0xAA,0xC5,0xE3, +0xB1,0x00,0x3C,0x2F,0x00,0x0C,0xCC,0x43,0xB1,0x00,0x36,0x2F,0xFF,0x21,0xCC,0x73, 
+0x01,0x0A,0x14,0x18,0x00,0x0C,0xBE,0x43,0x02,0x06,0x40,0xB0,0x01,0x11,0x22,0x9C, +0x00,0x0C,0x1C,0x5F,0x00,0x0C,0xDE,0xCB,0x02,0x06,0x18,0x30,0xFF,0x01,0xE5,0x73, +0x02,0x20,0xB0,0x33,0x02,0x00,0x41,0x30,0x00,0x0C,0x3A,0x43,0x02,0x06,0x18,0x30, +0xB1,0x00,0xCC,0x29,0xFF,0x21,0x3A,0xE3,0x01,0x10,0x22,0x1C,0x00,0x11,0x00,0x08, +0x02,0x05,0x0A,0x00,0xFF,0xE5,0xFD,0x73,0x02,0xE4,0x41,0x30,0xFF,0xFF,0xC8,0xBB, +0x01,0x00,0x0E,0xB0,0xFF,0x07,0x14,0x90,0x00,0xDC,0xB9,0x0B,0x0A,0x11,0x3A,0xDF, +0x02,0x05,0x0A,0x00,0xFF,0xDC,0x03,0xFC,0x11,0x00,0x00,0x98,0x01,0x00,0x14,0x30, +0x00,0xDD,0xEB,0xE3,0xFD,0x05,0x0A,0x88,0x88,0x11,0x00,0x04,0x02,0xA8,0x15,0x30, +0x01,0x0A,0x04,0xB0,0x01,0x0B,0x06,0x98,0xFF,0x0C,0x10,0xFC,0xFF,0x0B,0x06,0x18, +0xE0,0xA8,0x15,0xAC,0xFF,0x11,0x22,0x8C,0x02,0xA8,0x51,0x33,0x00,0x0C,0x06,0xC4, +0x04,0x9D,0xAF,0x7F,0x00,0x0C,0x40,0x5F,0xFF,0x21,0xAE,0x77,0x01,0x10,0x22,0x1C, +0xA1,0x00,0x90,0xAF,0xA1,0x00,0xA0,0xAF,0x3C,0x00,0x0C,0x38,0x02,0x34,0xA8,0x33, +0x02,0x36,0x40,0xB0,0xF8,0xD4,0x15,0x08,0xC0,0x0A,0x6A,0xF4,0xD0,0x0A,0x6A,0x74, +0x01,0xD4,0x15,0xB0,0x00,0x11,0x16,0x88,0x1C,0x02,0x18,0x38,0x83,0x0C,0x0C,0xAC, +0x00,0x0C,0x66,0x45,0x00,0x0C,0x70,0xC5,0x00,0x0C,0x7C,0xC5,0x00,0x0C,0x88,0x45, +0x00,0x0C,0x9E,0x44,0x00,0x0C,0x12,0x45,0x00,0x0C,0x96,0xC4,0x00,0x0C,0x0E,0xC5, +0x00,0x0C,0xA2,0xC5,0x00,0x0C,0xB6,0xC5,0x00,0x0C,0xCC,0x45,0x00,0x0C,0xCA,0xC4, +0x00,0x0C,0x3C,0x45,0x00,0x0C,0x76,0xC5,0x00,0x0C,0x38,0xC5,0x00,0x0C,0x90,0x45, +0x00,0x0C,0x2E,0x45,0x00,0x0C,0xBE,0x45,0x00,0x0C,0xC8,0xC5,0x00,0x0C,0xD0,0xC5, +0x00,0x0C,0x3C,0xC7,0x00,0x0C,0xAC,0x46,0x00,0x0C,0x44,0x46,0x00,0x0C,0xD2,0x45, +0x00,0x0C,0x9A,0x46,0x07,0xD4,0x83,0x7C,0x01,0x05,0xAD,0xB3,0x07,0xD4,0x0B,0x8A, +0x44,0x05,0x1D,0xA8,0x01,0x11,0x1E,0x00,0x00,0x11,0x26,0x88,0xFF,0x04,0x7F,0x7C, +0x44,0xD6,0x1D,0x28,0x01,0x11,0x1E,0x00,0x01,0x05,0x27,0xB4,0x01,0x05,0x09,0xB2, +0xC5,0x11,0x36,0xC7,0x01,0x0C,0x19,0x1A,0x01,0x0C,0x15,0xB0,0x00,0x0B,0x0D,0xE5, +0x07,0x11,0x8C,0xDC,0xA1,0x00,0xA2,0xA8,0x01,0x0C,0x48,0x30,0x02,0xD0,0x15,0x30, +0x88,0x20,0x94,0x2C,0xB1,0x00,0x98,0x28,0x04,0x11,0x48,0x84,0x80,0x0B,0x9B,0xFC, +0x00,0x0C,0x0E,0xDE,0x00,0x0C,0x2E,0x5D,0xA1,0x00,0x7C,0x2F,0x80,0x0B,0xA3,0x7C, +0x00,0x0C,0x0E,0xDE,0x00,0x0C,0x2E,0x5D,0x02,0x20,0x0C,0xB0,0x02,0x46,0x45,0x30, +0x02,0x02,0x41,0xB0,0xFF,0xFF,0x40,0xBB,0xFF,0x21,0xC8,0x74,0x02,0x02,0xB1,0xB3, +0x40,0x49,0xB7,0x7C,0x00,0x0C,0x94,0xDD,0x00,0x0C,0xC4,0x44,0x02,0xA0,0xAD,0xB3, +0x02,0x20,0x40,0x33,0xFF,0xFF,0x04,0x3A,0xFF,0xD7,0xC3,0xF4,0xB1,0x00,0x80,0xAF, +0x00,0x0C,0xC4,0x44,0xC1,0x11,0x36,0x5F,0x02,0xD8,0x41,0x30,0xFF,0x21,0xAE,0xE4, +0xA1,0x00,0x7A,0x2F,0x02,0x46,0x45,0x30,0x02,0x06,0xB1,0x33,0xFF,0xFF,0x0C,0xBA, +0x40,0x49,0x07,0xED,0x04,0x4C,0xFD,0xFC,0xB1,0x00,0x7C,0xAF,0xFF,0xA1,0xDD,0xE4, +0xC1,0x11,0x36,0x5F,0x00,0x0C,0x08,0xC5,0x02,0x20,0xA8,0x33,0xC0,0x11,0x40,0x5F, +0x02,0x20,0x14,0xB0,0xFF,0xD7,0xEB,0xF4,0x02,0xD6,0x41,0xB0,0x02,0xD4,0x01,0x32, +0x00,0x0C,0xEC,0x44,0x02,0xD4,0x81,0x33,0x02,0x0A,0x40,0xB0,0x02,0x00,0xAD,0xB3, +0x02,0xD4,0x41,0x30,0x02,0x0A,0x04,0x32,0x02,0xD6,0x01,0xB2,0xFF,0xD7,0x09,0xE5, +0x02,0x20,0x84,0xB3,0x00,0x0C,0x08,0xC5,0x80,0x49,0x03,0x6D,0x00,0x0C,0x54,0x5A, +0x00,0x0C,0x08,0xC5,0xB1,0x00,0xBE,0x29,0x00,0x0C,0x08,0xC5,0x00,0x0C,0x94,0xDD, +0x02,0xD8,0x41,0x30,0xFF,0x21,0xCA,0x64,0xFF,0x11,0x22,0x8C,0x00,0x0C,0x20,0xDD, +0xA1,0x00,0x7C,0x2F,0x02,0x04,0x0D,0x30,0x00,0x0C,0x20,0xDD,0x02,0x06,0x08,0x32, +0x02,0x20,0x0C,0xB0,0x02,0x46,0x45,0x30,0x02,0x04,0x41,0xB0,0xA1,0x00,0x54,0x2F, 
+0x80,0x49,0x27,0xFD,0x80,0x0B,0x2F,0xED,0xA1,0x00,0xBE,0xA9,0xFF,0x4D,0x2B,0x6D, +0x04,0x49,0x59,0xEA,0x80,0x0B,0x2F,0x7D,0x00,0x0C,0x24,0x5E,0xB1,0x00,0xBE,0x29, +0x40,0x49,0x9D,0xED,0x80,0x49,0x0D,0xED,0x01,0x49,0x0D,0x7D,0xA1,0x00,0x00,0x28, +0xB1,0x00,0x90,0x29,0x00,0x0C,0x2E,0x45,0x07,0x11,0x48,0x00,0x02,0x20,0xB4,0xB3, +0x84,0x80,0x14,0xB8,0x88,0xDA,0x4B,0x2D,0x04,0x11,0x48,0x00,0xB1,0x00,0x18,0x2F, +0x07,0x11,0x48,0x00,0xFF,0xD1,0x51,0x65,0x00,0x0C,0x4E,0x58,0xFF,0xD1,0x4D,0x75, +0xB1,0x00,0x7C,0x28,0xFF,0x0E,0x4A,0x6D,0x44,0x0C,0x1C,0x28,0x02,0x0E,0x1C,0x18, +0x01,0x11,0x1E,0xA0,0x0F,0x00,0x14,0x08,0x08,0x0A,0x26,0x80,0x02,0xDA,0x27,0xB0, +0x04,0x11,0x48,0x00,0x01,0x0C,0xA8,0xB3,0x00,0x0C,0x6C,0xC4,0x04,0x4C,0xCB,0x6C, +0x08,0x4C,0xAD,0xED,0x02,0x46,0x45,0x30,0x02,0x20,0x0C,0xB0,0x00,0x0C,0xAA,0xC4, +0x02,0x46,0x45,0x30,0x02,0x20,0x0C,0xB0,0xA1,0x00,0x54,0x2F,0xB1,0x00,0xBC,0xAA, +0x00,0x0C,0x0C,0xCD,0xA1,0x00,0x00,0x28,0x02,0x20,0x0C,0xB0,0x02,0x46,0x45,0x30, +0x80,0x0B,0x85,0x7D,0x04,0x0C,0x79,0x32,0x00,0x4D,0x95,0x5D,0x00,0x0C,0xA8,0x44, +0x02,0x20,0x0C,0xB0,0x02,0x46,0x45,0x30,0x00,0x4D,0x95,0x5D,0x00,0x0C,0x1C,0xC5, +0x02,0x46,0x45,0x30,0x01,0x4D,0x19,0xB0,0xB1,0x00,0xBE,0x29,0x40,0x49,0x9D,0xED, +0x80,0x49,0x0D,0xED,0xA1,0x00,0x30,0xAA,0xFF,0x45,0xA1,0xE5,0x1B,0x11,0x9A,0x45, +0x23,0x11,0x9A,0xC5,0x02,0x20,0x0C,0xB0,0x00,0x0C,0xB2,0x5E,0x00,0x0C,0x80,0x4D, +0x08,0x4C,0xAB,0xFC,0x02,0x06,0x22,0x30,0xF7,0x11,0x18,0x00,0xB1,0x00,0xA2,0xA9, +0x04,0x11,0x18,0x00,0xB1,0x00,0x74,0x29,0x00,0x0C,0xCA,0xC4,0x02,0x20,0x0C,0xB0, +0x00,0x0C,0xB2,0x5E,0x00,0x0C,0x8C,0x4D,0xA1,0x00,0x54,0x2F,0x02,0x00,0xC9,0x33, +0x00,0x0C,0xC8,0xDD,0xFF,0xE5,0xC7,0xE5,0xFF,0xFF,0xCC,0xBF,0xA1,0x00,0x9C,0x28, +0x0A,0x4D,0x3D,0xE7,0xCD,0x11,0x36,0x47,0x02,0x20,0xC8,0x33,0xA1,0x00,0x9C,0x28, +0x07,0x11,0xD4,0xC5,0x05,0x11,0xD4,0x45,0x01,0x0C,0x48,0x30,0x02,0x38,0xE2,0xFD, +0xFF,0xD1,0xE3,0xF5,0xB1,0x00,0x7C,0x28,0xFF,0x0E,0xD8,0xED,0x01,0x0C,0x60,0x30, +0x02,0x20,0x64,0xB4,0x01,0x00,0x14,0xB8,0x83,0xD4,0xA9,0x2B,0xC8,0x01,0x18,0x38, +0x0F,0x00,0x14,0x08,0x00,0x0C,0x18,0x98,0x02,0x0C,0x1C,0xB0,0x01,0x12,0x14,0x30, +0xFF,0x0A,0xFE,0xED,0x01,0x11,0x26,0x80,0xC0,0x01,0x1C,0x38,0x01,0xD7,0x15,0xB0, +0x00,0x0E,0x1C,0x98,0x01,0x00,0x26,0xB0,0x07,0x0E,0xAE,0x0F,0x01,0x0A,0x14,0x18, +0x00,0x0C,0x04,0x4E,0x01,0x0A,0x26,0x34,0x24,0x11,0x2A,0x00,0x00,0x40,0x18,0xB8, +0xA1,0x00,0xFA,0x2E,0xFF,0xBF,0x18,0xB8,0xA1,0x00,0x0A,0xAF,0xC0,0x49,0x0D,0x6D, +0x04,0x4E,0x15,0x6E,0x03,0x4E,0x21,0x6E,0xFB,0x4E,0x9D,0x8A,0x00,0x0C,0x44,0x5E, +0x00,0x0C,0x0C,0xD5,0x06,0x11,0x48,0x80,0x00,0x0C,0x4C,0xDE,0x04,0x11,0x48,0x84, +0xFC,0x4E,0x9D,0x0A,0xA1,0x00,0xBC,0x2A,0xC0,0x49,0x0D,0x6D,0xFF,0x4D,0x37,0xEE, +0x00,0x0C,0x2C,0xC6,0xC0,0x49,0x0D,0x6D,0x20,0x48,0x33,0xEE,0x01,0x48,0x0D,0xFD, +0x00,0x0C,0x44,0x46,0x20,0x49,0x37,0xEE,0x01,0x4E,0x39,0xEE,0xA1,0x00,0xBC,0x2A, +0x00,0x11,0x66,0x8A,0x01,0x4E,0x9D,0x8A,0xFC,0x49,0x93,0x0A,0xB1,0x00,0xB0,0xA9, +0x02,0x46,0x45,0x30,0x00,0x0C,0x36,0xC2,0x02,0x46,0x45,0x30,0xA6,0x01,0x18,0xB8, +0x30,0x01,0x1C,0x38,0x00,0x0C,0xFC,0x46,0xFF,0xFF,0x0C,0xBA,0xFF,0xD1,0x55,0xE6, +0x02,0x20,0xA0,0xB3,0x02,0x20,0xA4,0xB7,0x02,0x20,0x14,0xB0,0x02,0xD2,0x41,0x30, +0x02,0x0A,0x0C,0xB2,0x02,0x0A,0xA4,0xB3,0x02,0x0A,0x40,0x34,0xFF,0xD9,0x67,0xE6, +0x06,0x11,0x48,0x80,0x02,0x06,0xA1,0xB3,0x04,0x11,0x48,0x84,0x02,0x20,0xA8,0x33, +0x02,0x06,0xAD,0xB3,0x02,0xD8,0x41,0x30,0x02,0xD6,0x0D,0xB2,0x02,0xD4,0x41,0x30, +0xFF,0x07,0x0D,0xE5,0x06,0x11,0x48,0x80,0x02,0xD8,0xA5,0x33,0x04,0x11,0x48,0x84, 
+0x02,0x05,0x0A,0x00,0x02,0xD0,0x41,0xB0,0x04,0x11,0x48,0x00,0xFF,0xFF,0xB0,0xBB, +0x01,0x49,0x91,0xEE,0x80,0x33,0x91,0xFE,0x02,0x46,0x45,0x30,0xA6,0x01,0x1C,0x38, +0x33,0x01,0x18,0xB8,0x00,0x0C,0x20,0x5F,0x00,0x0C,0x92,0x56,0x26,0x11,0x3E,0xDF, +0x00,0x0C,0x5E,0xDE,0xFF,0x07,0x43,0xF7,0x02,0x20,0xB0,0x33,0x02,0x06,0x41,0x30, +0x00,0x0C,0x80,0xC6,0x00,0x0C,0x2A,0xDE,0x00,0x0C,0x2E,0x45,0x44,0x43,0x15,0xA8, +0x00,0x43,0x15,0x80,0x02,0x20,0x0C,0xB0,0x01,0x0A,0x00,0x30,0x02,0x06,0x40,0xB0, +0x15,0x11,0x3A,0x5F,0x02,0x05,0x0A,0x84,0x02,0x46,0x45,0x30,0xB1,0x00,0xBE,0x29, +0x00,0x4D,0x3F,0xC7,0x02,0x46,0x45,0x30,0x03,0xD5,0xED,0xF6,0x09,0xD5,0xED,0xF6, +0x12,0xD5,0xED,0xF6,0xF0,0xD5,0x15,0x08,0x80,0x0A,0xCE,0x76,0x08,0xD5,0xC5,0x6E, +0x44,0xD5,0x0F,0x28,0x06,0x07,0xC6,0x7E,0x00,0x11,0x7E,0x0B,0x0F,0xD5,0x7D,0x0A, +0x02,0x11,0x7A,0x02,0x04,0x11,0x44,0x5F,0x01,0x10,0x22,0x1C,0x44,0xD5,0x15,0xA8, +0x80,0x0A,0xDA,0x6E,0x01,0x0A,0x0E,0xB0,0x0C,0x07,0xDA,0x6E,0xC2,0x07,0xE2,0x6E, +0x31,0x07,0xDE,0x6E,0x00,0x11,0x7E,0x0B,0x01,0x11,0x22,0x9C,0x80,0xBF,0xE3,0xEE, +0x00,0x0C,0xF4,0xDE,0x04,0x11,0x6E,0x83,0x0F,0x11,0x22,0xDC,0x44,0xD5,0x19,0xA8, +0xB0,0x00,0x20,0x2C,0x01,0x11,0x22,0x9C,0x80,0xBF,0xF1,0xEE,0x00,0x0C,0xF4,0xDE, +0x01,0xD5,0x6F,0xB3,0x01,0x11,0x22,0x9C,0xBA,0x01,0x18,0x38,0xBC,0x01,0x1C,0xB8, +0x08,0x9F,0xFD,0xFE,0x90,0x01,0x18,0xB8,0x02,0x12,0x14,0x30,0x8B,0x10,0xAE,0x2F, +0x00,0x0C,0x06,0xDF,0x04,0x64,0x26,0x30,0x01,0x10,0x22,0x1C,0x07,0x11,0x48,0x00, +0x04,0x11,0xB8,0x33,0x02,0x0A,0xBA,0xB3,0x00,0x0C,0x14,0xDF,0x00,0x0C,0x14,0xDF, +0x04,0xDC,0xC9,0xB0,0x04,0x11,0x48,0x84,0x02,0xDC,0x15,0x30,0x83,0xDC,0xB9,0x2B, +0x02,0xDE,0x15,0xB0,0x84,0xDE,0xBD,0x2F,0xBA,0x01,0x1C,0xB8,0xBF,0x01,0x18,0x38, +0x11,0x12,0x22,0xA8,0x00,0x0C,0x0C,0xD5,0x02,0x0C,0x0C,0x30,0x02,0x0E,0x18,0xB0, +0x02,0x12,0x14,0x30,0x00,0x0C,0x06,0xDF,0x02,0x06,0x18,0x30,0xFC,0x0C,0x18,0x98, +0x04,0x12,0xC8,0xB0,0xFF,0x11,0x22,0x20,0x11,0x67,0x22,0xAC,0xA1,0x00,0x32,0xA8, +0xA1,0x00,0x4C,0xA8,0xA1,0x00,0x62,0xA8,0xA1,0x00,0x62,0x2A,0xA1,0x00,0x5C,0xAA, +0xA1,0x00,0x8E,0xAA,0xFD,0x05,0x0A,0x0C,0xA1,0x00,0x82,0xA9,0x00,0x0C,0x06,0x5E, +0x02,0x86,0x45,0x30,0xFF,0x23,0x0A,0xF6,0xB1,0x00,0xC6,0xAD,0x00,0x0C,0x52,0xD7, +0xB1,0x00,0xEC,0x2D,0x02,0xAE,0x45,0x30,0x00,0x0C,0x4A,0x47,0x00,0x00,0x40,0xB8, +0x00,0x0C,0x8E,0xDF,0x00,0x0C,0x68,0xD7,0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0, +0x02,0x34,0x15,0x30,0x02,0x06,0x40,0xB0,0x88,0x20,0x69,0xAF,0x01,0x10,0x22,0x1C, +0xB1,0x00,0x8A,0x2E,0x00,0x0C,0x58,0xD7,0x1F,0x11,0x18,0x9C,0x00,0x00,0x40,0xB8, +0x00,0x0C,0x8E,0xDF,0x00,0x0C,0x88,0x57,0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0, +0x09,0x0B,0x83,0x67,0x02,0x34,0x15,0x30,0x02,0x06,0x40,0xB0,0x88,0x20,0x89,0x2F, +0x00,0x0C,0x84,0xC7,0x02,0x06,0x40,0xB0,0x23,0x11,0x18,0x00,0xB1,0x00,0x12,0x2C, +0xB1,0x00,0x8A,0x2E,0x00,0x0C,0x70,0xD7,0xFF,0x11,0x22,0x8C,0xB1,0x00,0xC8,0x2E, +0x00,0x0C,0x0C,0xD5,0x01,0x11,0x4A,0x80,0x28,0x01,0x18,0xB8,0x80,0x0B,0x9B,0xFF, +0x34,0x01,0x18,0x38,0x08,0x12,0xD0,0xB3,0xE0,0x01,0x18,0x38,0x01,0x12,0x14,0x30, +0x07,0x0C,0x18,0x18,0x00,0x12,0xAC,0x67,0xF8,0x0C,0x18,0x18,0xE8,0x0C,0x9E,0xE7, +0x00,0x11,0x4A,0x88,0x01,0x10,0x22,0x1C,0x00,0x11,0x4A,0x88,0x01,0x11,0x22,0x9C, +0x01,0x8E,0x1D,0x1B,0x01,0x8E,0x0D,0xE5,0xC8,0x11,0x40,0xDF,0xFF,0x21,0xBC,0x77, +0x02,0xD6,0xB1,0xB3,0x00,0x0C,0x38,0x5F,0x02,0x20,0x18,0x37,0xFF,0x8E,0x1D,0x9B, +0xFF,0x8E,0x0D,0xED,0xFF,0x8D,0x0D,0xF5,0x02,0x8C,0x41,0xB0,0xFF,0xFF,0x18,0x3B, +0xC9,0x11,0x36,0xC7,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, 
+0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0xFF,0x4B,0x13,0xED,0xFF,0x4D,0x25,0xE8,0x03,0x48,0x25,0x78,0x20,0x48,0x1D,0xE8, +0x80,0x49,0x0F,0xF8,0xFF,0xFE,0x18,0xB8,0x00,0x0C,0x96,0xC1,0x02,0x4C,0x19,0x78, +0x00,0x00,0x14,0x38,0x88,0x0C,0x19,0x28,0x88,0x0E,0x19,0xA8,0x17,0x11,0x2E,0x40, +0x04,0x0C,0x79,0x32,0x01,0x11,0x22,0xC0,0x20,0x49,0x25,0x78,0x04,0x11,0x78,0xB2, +0x02,0x11,0x22,0xC0,0x00,0x0C,0x82,0xD9,0x80,0x4C,0x63,0x7A,0x21,0x1F,0x2D,0xE8, +0x40,0x48,0x63,0xFA,0x0B,0x11,0x2E,0xC0,0x0C,0x11,0x2E,0x40,0x00,0x0C,0x82,0xD9, +0xCD,0x11,0x32,0x40,0x01,0x11,0x1A,0x80,0xFF,0xFF,0x00,0xBA,0xFF,0x12,0x40,0x60, +0x01,0x11,0x1E,0x00,0xFE,0x0C,0x1C,0x98,0x02,0x20,0x26,0x30,0x02,0x20,0x26,0xB4, +0x02,0x0C,0x1C,0xB0,0x02,0x20,0x14,0xB0,0x02,0x12,0x40,0xB0,0x02,0x0A,0x00,0xB2, +0x02,0x0A,0x26,0xB0,0x02,0x0A,0x40,0x34,0x02,0x0C,0x1C,0xB0,0xFF,0xD9,0x53,0x60, +0x02,0x00,0x27,0xB4,0x02,0x20,0xA8,0x33,0x02,0x00,0xAD,0xB3,0x02,0xD8,0x41,0x30, +0x02,0xD6,0x01,0xB2,0x02,0xD4,0x41,0x30,0xFF,0x01,0x13,0xE5,0x02,0x0C,0x1C,0x98, +0x02,0xD8,0x27,0xB4,0x02,0x0C,0xC0,0x33,0x02,0x20,0xC4,0x33,0x01,0x24,0xD0,0xB3, +0x04,0x11,0x48,0x00,0xFD,0x05,0x0A,0x88,0x02,0x05,0x0A,0x00,0x02,0x38,0x74,0x68, +0xFD,0x05,0x0A,0x88,0x00,0x0C,0x6C,0x40,0x01,0xE0,0x61,0x30,0x02,0xE2,0x65,0x30, +0x01,0xE8,0x49,0xB0,0xFD,0x05,0x0A,0x0C,0x02,0xD0,0x41,0xB0,0x44,0xD8,0x19,0x28, +0x0F,0x0C,0x18,0x98,0x01,0x11,0x1A,0x20,0xC0,0x12,0x1C,0x88,0x05,0x24,0x8A,0xF0, +0xC0,0xD8,0x95,0x40,0xFF,0x0E,0x92,0x68,0x00,0x00,0x44,0x38,0xFF,0xFF,0x14,0x38, +0x83,0x8E,0x1D,0x2B,0xD0,0xD8,0x95,0xC0,0x01,0xD8,0xB1,0x1B,0x07,0xD8,0x13,0x6D, +0x02,0x02,0xA1,0x33,0x01,0x11,0xB0,0x07,0x01,0x00,0x0E,0xB0,0x01,0x07,0x14,0xB0, +0x00,0xDC,0xB9,0x07,0xFF,0x11,0x14,0x02,0xFF,0xFF,0x10,0x3A,0xFF,0xFF,0x8C,0x3A, +0x06,0x11,0x48,0x80,0x01,0xD5,0x15,0x30,0x00,0x21,0xB0,0xE8,0xD7,0x11,0xB2,0x40, +0xDB,0x11,0xB2,0x40,0x00,0x0C,0x32,0xD8,0x04,0x11,0x48,0x84,0xC0,0x0A,0x15,0x88, +0x40,0x0A,0xF4,0x70,0x01,0x0A,0x15,0xB0,0x00,0x11,0x16,0x88,0x61,0x04,0x18,0x38, +0x83,0x0C,0x0C,0xAC,0x00,0x0C,0xEE,0x40,0x00,0x0C,0xEE,0x40,0x00,0x0C,0xEE,0x40, +0x00,0x0C,0xE2,0x40,0x00,0x0C,0xEE,0x40,0x00,0x0C,0xF4,0xC0,0x00,0x0C,0xF4,0xC0, +0x00,0x0C,0xF4,0xC0,0x00,0x0C,0xE2,0x40,0x00,0x0C,0xFA,0x40,0x00,0x0C,0x36,0xC1, +0x00,0x0C,0x36,0xC1,0x00,0x0C,0x36,0xC1,0xA0,0x00,0x00,0x2A,0xA0,0x00,0x00,0x2A, +0x00,0x0C,0x36,0xC1,0xFF,0xFF,0x14,0x38,0x88,0x34,0xEB,0x28,0x00,0x0C,0x6C,0x5E, +0x00,0x0C,0x12,0xCD,0x00,0x0C,0x3C,0x5E,0x00,0x0C,0x12,0xCD,0x01,0x21,0x40,0x32, +0x01,0x20,0x42,0x32,0x01,0x11,0x22,0x9C,0x01,0x21,0x44,0xB2,0x01,0x20,0x46,0xB2, +0x01,0x11,0x22,0x9C,0x08,0x48,0x37,0x79,0x02,0x46,0x45,0x30,0x04,0x11,0xA8,0xB3, +0x01,0xB4,0xAD,0xB3,0xA8,0x01,0x18,0x38,0x01,0x12,0xB0,0xB3,0xFF,0xD8,0x0D,0x69, +0x08,0xD4,0xA9,0x1B,0x00,0x0C,0x14,0xC1,0x44,0xD4,0x0F,0xA8,0x01,0x07,0x14,0xB0, +0x00,0xD8,0x19,0x69,0x01,0xD4,0xA9,0x1B,0x07,0xD4,0x0D,0xE9,0x00,0x0C,0x04,0x41, +0x02,0x0C,0x1C,0xB0,0xFF,0x0E,0x1C,0x98,0x00,0xD8,0x27,0x10,0x01,0xD4,0x15,0xB0, +0x00,0x0A,0x14,0x98,0x80,0x01,0x1C,0xB8,0x00,0x0E,0x1C,0x98,0x02,0xA4,0x45,0x30, 
+0x02,0x20,0x26,0x30,0x33,0xD4,0x15,0x28,0x00,0x1C,0x39,0x82,0x01,0x11,0x22,0x9C, +0x00,0x00,0x14,0x38,0x88,0x12,0x92,0x2E,0x88,0x12,0x92,0x2E,0x01,0x11,0x22,0x9C, +0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0xB8,0x01,0x18,0xB8,0x0F,0x00,0x14,0x08, +0x00,0x0C,0x18,0x98,0x01,0x12,0x14,0x30,0x02,0x06,0x44,0x30,0xFF,0x11,0x22,0x8C, +0x02,0x0C,0x0C,0x30,0x08,0x0C,0x18,0x18,0x02,0x12,0x1C,0xB0,0x02,0x0A,0x0C,0x30, +0x82,0x10,0x14,0x28,0x01,0x10,0x22,0x98,0x84,0x11,0x14,0xA8,0x83,0x0E,0x1C,0x28, +0x02,0x06,0x14,0x30,0x80,0x0F,0x60,0xF9,0xFE,0x0C,0x18,0x18,0x02,0x12,0x14,0x30, +0x02,0x06,0x18,0x30,0xFF,0x11,0x22,0x8C,0x01,0x0C,0x1A,0xB0,0x00,0x11,0x68,0x41, +0x02,0x48,0x15,0xB0,0x80,0x0C,0x04,0xA8,0xE1,0x48,0x71,0x29,0xFF,0x11,0x22,0x8C, +0x02,0x48,0x91,0x32,0x00,0x0C,0x68,0x41,0x00,0x11,0x1A,0x88,0x02,0x4C,0x15,0x30, +0x80,0x0C,0x04,0xA8,0xE1,0x4C,0x7F,0x29,0xFF,0x11,0x22,0x8C,0x02,0x4C,0x99,0x32, +0x00,0x0C,0x76,0x41,0x02,0x4C,0x15,0x30,0x01,0x0A,0x04,0x80,0x01,0x0C,0x06,0x30, +0xE1,0x4C,0x8D,0xA9,0xFF,0x11,0x22,0x8C,0x02,0x4C,0x99,0x32,0x00,0x0C,0x82,0xC1, +0x7F,0x11,0x92,0xC1,0x01,0x0C,0x1A,0xB0,0xFF,0x11,0x96,0xC1,0x02,0x48,0x15,0xB0, +0x81,0x0C,0x04,0x28,0xE1,0x48,0x9F,0x29,0xFF,0x11,0x22,0x8C,0x02,0x48,0x91,0x32, +0x00,0x0C,0x96,0xC1,0xFF,0x11,0x1A,0x00,0x02,0x4C,0x15,0x30,0x81,0x0C,0x04,0x28, +0xE1,0x4C,0xAD,0x29,0xFF,0x11,0x22,0x8C,0x02,0x4C,0x99,0x32,0x00,0x0C,0xA4,0x41, +0x02,0x4A,0x15,0x30,0x01,0x0A,0x04,0xB0,0x01,0x0B,0x06,0x98,0xE1,0x4A,0xBB,0xA9, +0xFF,0x11,0x22,0x8C,0x02,0x4A,0x95,0x32,0x00,0x0C,0xB0,0x41,0x02,0x4A,0x15,0x30, +0x01,0x0A,0x04,0xB0,0xFF,0x0B,0x06,0x18,0xE1,0x4A,0xC9,0xA9,0xFF,0x11,0x22,0x8C, +0x02,0x4A,0x95,0x32,0x00,0x0C,0xBE,0xC1,0x02,0x0C,0x0C,0x30,0x02,0xD8,0x0D,0xB0, +0x00,0x0C,0x4C,0xD8,0x1B,0x11,0x14,0x80,0x00,0x0C,0x10,0x5A,0xC0,0x0C,0xDA,0x71, +0xC0,0x11,0xE6,0x41,0xFF,0x8E,0xE5,0xF9,0x02,0x8C,0x41,0xB0,0xFF,0x21,0xF2,0x71, +0xFF,0xFF,0x18,0x3B,0xC8,0x11,0xEE,0x41,0xC8,0x11,0xE6,0xC1,0x00,0x0C,0x8E,0xDA, +0xFF,0x21,0xF2,0x71,0x02,0xD6,0xB1,0xB3,0x00,0x0C,0x4C,0xD8,0x1B,0x11,0x14,0x80, +0x00,0x0C,0x10,0x5A,0x00,0x00,0x40,0xB8,0xFF,0xFF,0xD4,0x3B,0x00,0x0C,0xC8,0xDE, +0x00,0x0C,0xFC,0x51,0x1B,0x11,0x12,0xDC,0x00,0x0C,0x8A,0xDE,0x00,0x0C,0xF6,0x51, +0x00,0x0C,0xF0,0x5D,0x02,0x06,0xB0,0xB3,0x02,0x06,0x18,0x30,0xFF,0xD9,0x0D,0xF2, +0x02,0xD8,0x41,0x30,0x02,0x00,0x41,0xB4,0x02,0x12,0x40,0xB0,0xFE,0x0C,0x18,0x9C, +0x02,0x0C,0x0C,0x30,0x00,0x0C,0x26,0x5A,0x02,0x0A,0x0C,0x30,0x00,0x0A,0x30,0xDA, +0x02,0x06,0x14,0x30,0x02,0xD4,0x19,0xB0,0x02,0x12,0x40,0xB0,0xFF,0x21,0x14,0x62, +0x02,0x06,0x18,0x30,0x02,0xDA,0x1D,0xB0,0xFF,0xFF,0x26,0x3C,0xC0,0x0C,0x2C,0xE2, +0x02,0x01,0xA8,0xBB,0xA0,0x01,0xB4,0x3F,0x04,0x01,0xA8,0xBB,0xA2,0x01,0xB4,0xBF, +0x1B,0x0C,0x42,0x62,0x01,0xB7,0x79,0x32,0x04,0xB7,0x3D,0x62,0x01,0x11,0x7A,0x02, +0xF0,0x9F,0x7D,0x8A,0x00,0x0C,0x5C,0xC2,0x03,0xB7,0x5D,0xE2,0x01,0x9E,0x7B,0x8A, +0x00,0x0C,0x5C,0xC2,0x23,0x0C,0x46,0xF2,0x13,0x0C,0x5C,0xE2,0x02,0x20,0x78,0x32, +0x80,0x0B,0x4F,0xFA,0x01,0x21,0x79,0x32,0x01,0x20,0x7B,0x32,0x88,0x11,0x02,0x00, +0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0,0x02,0x52,0x15,0x30,0x02,0x06,0x40,0xB0, +0x01,0x01,0x22,0xB0,0x02,0x0A,0x7C,0x32,0x00,0x0C,0xBC,0x5A,0x00,0x0C,0x12,0xCD, +0x00,0x0C,0x82,0xD9,0x00,0x0C,0xD0,0xDD,0x00,0x0C,0x8C,0xD2,0x02,0x46,0x45,0x30, +0xFF,0xFF,0x14,0x38,0x83,0xAC,0x59,0x2B,0x04,0x9F,0x71,0xFA,0x00,0x0C,0xE2,0x5E, +0x40,0x49,0x8B,0x7A,0xFF,0x45,0x8B,0x72,0x02,0x20,0x0C,0xB0,0x02,0x44,0x0D,0xB0, +0x00,0x0C,0xA2,0x5A,0xC5,0x11,0x32,0xD8,0x88,0x11,0x02,0x00,0x02,0x06,0x40,0xB0, 
+0xFF,0xFF,0x14,0x38,0x83,0x4A,0x95,0x2A,0x00,0x0C,0x46,0x5C,0x01,0x01,0x22,0xB0, +0x00,0x0C,0x10,0xC5,0x00,0x0C,0xA2,0x5A,0xC5,0x11,0x32,0xC0,0xFF,0xFF,0xAC,0x3B, +0x01,0x11,0x1A,0x80,0x02,0x12,0x40,0xB0,0xFE,0x0C,0x18,0x18,0xFF,0x21,0x12,0x75, +0x00,0x0C,0xCC,0x5E,0x00,0x0C,0x12,0xCD,0x02,0x20,0xAC,0xB3,0x02,0x00,0x41,0x30, +0x00,0x0C,0x96,0xC2,0x09,0x0A,0x13,0x65,0x08,0x48,0x13,0xFD,0x3D,0x1C,0x17,0xA8, +0x02,0x22,0x0C,0x30,0x02,0x46,0x45,0x30,0xA8,0x01,0x18,0x38,0x3D,0x0B,0x14,0xA8, +0x00,0x0C,0x18,0x98,0x02,0x0C,0x1C,0xB0,0x44,0x0B,0x0E,0xA8,0x01,0x07,0x14,0xB0, +0x00,0x12,0x26,0x00,0x00,0x0C,0x44,0xC1,0x02,0x48,0x15,0xB0,0x01,0x0B,0x92,0xEE, +0x01,0x0A,0x04,0xB0,0x01,0x0B,0x06,0x80,0xE1,0x48,0xC9,0x2A,0x01,0x11,0x22,0x9C, +0x02,0x48,0x91,0x32,0x00,0x0C,0xBC,0x42,0x02,0x20,0xD4,0xB3,0x01,0x0A,0xE5,0xB3, +0x10,0x0B,0xDB,0x7A,0xFF,0xFF,0x14,0x38,0x8A,0x34,0xDB,0x2A,0x00,0x0C,0x18,0x5D, +0x00,0x0C,0x62,0xCA,0x02,0x46,0x45,0x30,0xC0,0x11,0x26,0xDD,0x00,0x0C,0xE6,0xD2, +0x1E,0x11,0x5C,0xDA,0x02,0xEA,0x41,0xB0,0x00,0x11,0x5C,0xC2,0xFF,0x8E,0xF1,0xFA, +0x02,0x8C,0x41,0xB0,0xFF,0x21,0xF4,0x72,0x00,0x0C,0x8C,0xDD,0x00,0x0C,0xF2,0x42, +0xC8,0x11,0x26,0x5D,0x00,0x0C,0xFC,0x4A,0x02,0xEA,0x41,0xB0,0x00,0x0C,0x6C,0x5E, +0x00,0x0C,0x62,0xCA,0x02,0x0A,0x40,0xB0,0x10,0x0B,0x05,0xFB,0x80,0x11,0x64,0xD9, +0x02,0xEA,0x41,0xB0,0xA0,0x00,0x28,0x2A,0x02,0xEA,0x41,0xB0,0x1D,0x11,0x5C,0xC2, +0x02,0x20,0xD4,0xB3,0x01,0x0A,0xE5,0xB3,0xFF,0x0B,0x21,0x7B,0x01,0x0B,0x23,0xF3, +0x02,0x0B,0x39,0x73,0x03,0x0B,0x69,0xF3,0x04,0x0B,0x8F,0xF3,0x05,0x0B,0x99,0xF3, +0x06,0x0B,0x95,0xF3,0x07,0x0B,0xDF,0xF3,0x08,0x0B,0xDD,0x73,0x09,0x0B,0xFB,0x73, +0xFF,0x11,0x28,0x02,0xC0,0x11,0x6C,0xDC,0xC8,0x11,0x6C,0x5C,0x00,0x00,0x40,0xB8, +0x00,0x0C,0xCC,0x5D,0x00,0x0C,0x32,0x53,0x00,0x0C,0xBC,0xDD,0x00,0x0C,0x32,0x53, +0x23,0x11,0x12,0x5C,0x00,0x0C,0x8A,0xDE,0x00,0x0C,0x28,0xD3,0x00,0x0C,0x42,0xC4, +0x00,0x0C,0xD2,0x5E,0x00,0x0C,0x5C,0x4A,0x02,0x10,0x43,0x7B,0xC0,0x11,0x96,0xDC, +0x02,0xEA,0x41,0xB0,0x04,0x10,0x49,0x7B,0x00,0x0C,0x88,0xDC,0x02,0xEA,0x41,0xB0, +0x01,0x10,0x59,0xFB,0x00,0x00,0x40,0xB8,0x00,0x0C,0xC8,0xDE,0x00,0x0C,0x52,0x53, +0x23,0x11,0x12,0x5C,0x00,0x0C,0x8A,0xDE,0x00,0x0C,0x4C,0x53,0x02,0xEA,0x41,0xB0, +0xFD,0x8F,0x67,0xF3,0x80,0x10,0x61,0x7B,0xB0,0x00,0xB0,0x2F,0x02,0xEA,0x41,0xB0, +0x40,0x10,0x67,0x7B,0xB0,0x00,0xBE,0xAF,0x02,0xEA,0x41,0xB0,0x00,0x0C,0x42,0xC4, +0x02,0x46,0x45,0x30,0x00,0x0C,0xFE,0x5C,0x02,0x10,0x73,0x7B,0xC0,0x11,0xAC,0xDC, +0x02,0xEA,0x41,0xB0,0x04,0x10,0x79,0x7B,0x00,0x0C,0xA2,0x5C,0x02,0xEA,0x41,0xB0, +0x01,0x10,0x7F,0x7B,0xB0,0x00,0x6E,0x2F,0x02,0xEA,0x41,0xB0,0xFD,0x8F,0x8D,0x73, +0x80,0x10,0x87,0xFB,0xB0,0x00,0xB0,0x2F,0x02,0xEA,0x41,0xB0,0x40,0x10,0x8D,0xFB, +0xB0,0x00,0xBE,0xAF,0x02,0xEA,0x41,0xB0,0x00,0x0C,0x42,0xC4,0x00,0x0C,0x18,0x5D, +0x00,0x0C,0x62,0xCA,0x00,0x0C,0x98,0xC3,0x00,0x0C,0x14,0x5D,0x00,0x0C,0x62,0xCA, +0x02,0x46,0x45,0x30,0xC0,0x11,0x26,0xDD,0x00,0x0C,0xD2,0xCB,0xFF,0x8E,0xA9,0xFB, +0x02,0x8C,0x41,0xB0,0xFF,0x21,0xAC,0x73,0x00,0x0C,0x8C,0xDD,0x00,0x0C,0xAA,0x43, +0xC8,0x11,0x26,0x5D,0x00,0x0C,0xB4,0xCB,0x02,0xEA,0x41,0xB0,0x00,0x0C,0x6C,0x5E, +0x00,0x0C,0x62,0xCA,0x02,0x0A,0x40,0xB0,0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0, +0x01,0x10,0x15,0x30,0x02,0x06,0x40,0xB0,0x40,0x0A,0xD8,0x7B,0xFF,0x4D,0xCD,0xEB, +0x00,0x0C,0x90,0xD9,0x40,0x01,0x18,0x38,0x00,0x0C,0x30,0xD9,0x00,0x0C,0xCE,0x53, +0xB0,0x00,0x42,0xAA,0x00,0x0C,0xCE,0xC3,0x00,0x4D,0x5D,0x5A,0x02,0xEA,0x41,0xB0, +0x13,0x11,0x30,0x42,0x23,0x11,0x30,0x5A,0x02,0xEA,0x41,0xB0,0x00,0x11,0x5C,0xC2, 
+0x23,0x11,0x12,0x5C,0x00,0x0C,0x42,0xC4,0xFF,0x11,0x28,0x02,0x00,0x0C,0xFE,0x5C, +0xB0,0x00,0x06,0x2E,0x02,0x86,0x45,0x30,0xFF,0x23,0xF6,0xF3,0x00,0x0C,0xC6,0x5D, +0x00,0x0C,0xF2,0x53,0xC0,0x11,0xAC,0xDC,0xC8,0x11,0xAC,0x5C,0xB0,0x00,0x6E,0x2F, +0x02,0xEA,0x41,0xB0,0x02,0xAE,0x45,0x30,0x00,0x0C,0xE4,0x43,0xB0,0x00,0x0A,0x2E, +0x00,0x0C,0x42,0xC4,0x00,0x0C,0xD2,0x5E,0x00,0x0C,0x5C,0x4A,0xFD,0x8F,0x03,0xF4, +0x22,0x11,0x5C,0xC2,0x00,0x0C,0xFE,0x5C,0xB0,0x00,0x56,0xAF,0x02,0xEA,0x41,0xB0, +0x00,0x0C,0x5C,0x52,0xC0,0x11,0xAC,0xDC,0xC8,0x11,0xAC,0x5C,0xB0,0x00,0x6E,0x2F, +0x00,0x0C,0x42,0xC4,0x52,0x07,0x0C,0x38,0x00,0x0C,0x94,0xDE,0x02,0x0C,0x0C,0x30, +0x80,0x11,0x64,0xD9,0x02,0x06,0x18,0x30,0xFF,0x4B,0x31,0xFA,0x40,0x49,0x33,0x7C, +0xFF,0x45,0x33,0x74,0x02,0x20,0x0C,0xB0,0x02,0x44,0x41,0x30,0xFF,0xFF,0x14,0x38, +0x83,0x4A,0x95,0x2A,0x02,0x4A,0x15,0x30,0x89,0x10,0x30,0x2C,0x00,0x11,0x5C,0xDA, +0x02,0x06,0x40,0xB0,0x02,0xEA,0x89,0xB2,0x40,0x11,0x64,0xD9,0xFF,0xEB,0x13,0xF5, +0x02,0x20,0xA8,0x33,0x02,0xEA,0x41,0xB0,0x01,0x00,0x14,0xB8,0x83,0x4A,0x95,0x2A, +0x02,0xD4,0x41,0xB4,0x02,0xEA,0x41,0xB0,0x40,0x10,0x67,0x74,0x02,0x4A,0x15,0x30, +0x89,0x10,0x12,0xAD,0xFF,0x0B,0x5F,0xFC,0x01,0x0B,0x5F,0xF4,0x02,0x0B,0x63,0xF4, +0x03,0x0B,0x63,0x74,0x04,0x0B,0x63,0xF4,0x05,0x0B,0x63,0x74,0x06,0x0B,0x63,0x74, +0x07,0x0B,0x69,0xF4,0x08,0x0B,0x69,0xF4,0x09,0x0B,0x63,0x74,0x00,0x0C,0xD8,0x5D, +0x00,0x11,0x5C,0xC2,0x00,0x0C,0xEC,0xDD,0x00,0x11,0x5C,0xC2,0x13,0x11,0x30,0x42, +0xB0,0x00,0x46,0x2F,0x00,0x11,0x5C,0xC2,0x01,0x11,0x1A,0x80,0xFF,0xFF,0xB0,0xBB, +0x02,0x12,0x40,0xB0,0xFE,0x0C,0x18,0x18,0xFF,0x21,0x12,0x75,0x00,0x0C,0xBC,0xDD, +0x00,0x0C,0x84,0x54,0x00,0x0C,0x4C,0xD8,0x23,0x11,0x14,0x00,0x00,0x0C,0x10,0x5A, +0xFF,0xD9,0x71,0x74,0x02,0xD8,0x41,0x30,0x02,0x00,0x41,0x30,0x00,0x0C,0x74,0xC4, +0xFF,0x8E,0x95,0x7C,0x02,0x8C,0x41,0xB0,0xFF,0x21,0x12,0x75,0xFF,0xFF,0x18,0x3B, +0xFF,0xFF,0x44,0x3B,0xC8,0x11,0x9E,0xC4,0xC8,0x11,0x96,0x44,0x00,0x0C,0x8E,0xDA, +0xFF,0x21,0x12,0x75,0x02,0xD6,0xB1,0xB3,0x00,0x0C,0x4C,0xD8,0x23,0x11,0x14,0x00, +0x00,0x0C,0x10,0x42,0xFF,0x8E,0xAB,0xFC,0x02,0x8C,0x41,0xB0,0xFF,0x21,0x12,0x75, +0xC8,0x11,0xB4,0x44,0xC8,0x11,0xAC,0x44,0x00,0x0C,0x8E,0xDA,0xFF,0x21,0x12,0x75, +0x02,0xD6,0xB1,0xB3,0x00,0x0C,0x4C,0xD8,0x02,0x0C,0x0C,0x30,0x02,0x20,0xE0,0x33, +0x00,0x0C,0x26,0x5A,0xFF,0xFF,0xB0,0xBB,0xB0,0x00,0x92,0x2F,0x00,0x0C,0xE6,0xD4, +0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0,0x09,0x0B,0xCF,0xE4,0x02,0x34,0x15,0x30, +0x02,0x06,0x40,0xB0,0x88,0x20,0xE7,0xAC,0x00,0x0C,0xD0,0x44,0x02,0x06,0x40,0xB0, +0xFF,0xD9,0xDB,0xE4,0x02,0xD4,0x19,0xB0,0x02,0x12,0xE0,0xB3,0xFF,0xF1,0xDF,0xF4, +0x00,0x0C,0xE2,0xC4,0x00,0x0C,0x04,0xDD,0xFF,0xD7,0xE3,0xE4,0x02,0xDA,0x1D,0xB0, +0x02,0xD8,0x27,0x30,0x23,0x11,0x30,0x5A,0x00,0x0C,0xE8,0xC4,0x02,0x20,0xB0,0x33, +0x02,0xD4,0x19,0xB0,0x02,0x12,0x40,0xB0,0xFF,0x21,0xBC,0xE4,0x02,0x06,0x18,0x30, +0xC8,0x0C,0xF4,0x64,0xFF,0x8E,0xFD,0x6C,0xFF,0xF1,0x13,0x75,0x02,0xF0,0x41,0x30, +0x01,0x0C,0x18,0x18,0x00,0x0C,0x32,0xC0,0x02,0xF0,0x19,0x37,0x01,0x11,0x4A,0x80, +0x08,0x28,0xC1,0xB3,0x00,0x11,0x4A,0x0C,0x02,0x20,0x0C,0xB0,0x02,0xD4,0x19,0xB0, +0x02,0x12,0xAC,0x33,0x02,0xD8,0x41,0x30,0x02,0xD4,0x1D,0x30,0x02,0xD6,0x27,0xB0, +0x02,0x06,0x40,0xB0,0xFF,0x11,0x22,0x8C,0x00,0x0C,0x28,0x5E,0x00,0x0C,0x1A,0xC5, +0x00,0x0C,0x3C,0x5E,0x00,0x0C,0x12,0xCD,0x00,0x0C,0x66,0x5E,0x02,0x08,0x15,0x30, +0x02,0xEA,0x41,0xB0,0x02,0x0A,0xA0,0xB2,0x01,0x11,0x22,0x9C,0x00,0x0C,0x8E,0xDA, +0x02,0x0C,0xDC,0xB3,0x02,0xD6,0xD9,0x33,0x00,0x0C,0x26,0x5A,0xFF,0xFF,0xB0,0xBB, 
+0xFF,0x21,0x36,0xF1,0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0,0x02,0x50,0x15,0xB0, +0x02,0x06,0x40,0xB0,0x8A,0x08,0x45,0x2D,0x02,0x20,0xB0,0x33,0x02,0xD4,0x19,0xB0, +0x02,0x12,0x40,0xB0,0x00,0x0C,0x30,0x45,0xC4,0xF2,0x49,0xF5,0x04,0x4C,0x37,0x69, +0xFF,0xD9,0x55,0xF5,0x00,0x0C,0x04,0xDD,0xFF,0xD7,0x87,0xE5,0x02,0xDA,0x1D,0xB0, +0x02,0xD8,0x27,0x30,0x00,0x0C,0x86,0xC5,0x02,0xD4,0x19,0xB0,0x02,0x12,0x14,0x30, +0xFF,0x0B,0x7E,0xF5,0xFF,0xED,0x67,0xF5,0x02,0x20,0x0C,0xB0,0x02,0xEC,0x41,0xB0, +0x02,0x0A,0x00,0xB2,0x02,0x06,0x40,0xB0,0x00,0x0C,0x6A,0x45,0x02,0xEE,0x1D,0x30, +0x02,0x0A,0x26,0xB0,0xFF,0x01,0x73,0xE5,0x02,0xEE,0x1D,0x30,0x02,0x0E,0x1C,0x18, +0x02,0x0A,0x26,0xB0,0x02,0x20,0x0C,0xB0,0x02,0x00,0xAD,0xB3,0x02,0x0A,0x40,0xB0, +0x02,0xD6,0x01,0xB2,0x02,0x06,0x40,0xB0,0x00,0x0C,0x86,0xC5,0x02,0xDA,0x1D,0xB0, +0xFF,0xFF,0x26,0xB8,0x02,0xEC,0xB1,0xB3,0x00,0xEE,0x4D,0x58,0xC8,0xEE,0x8B,0x65, +0x00,0x0C,0xBE,0xD9,0x01,0x10,0x22,0x1C,0xFF,0xFF,0xB0,0xBB,0xFF,0x21,0x36,0xF1, +0x02,0x20,0x0C,0xB0,0x02,0xEA,0x41,0xB0,0x02,0x50,0x15,0xB0,0x02,0x06,0x40,0xB0, +0x8A,0x08,0xA1,0x2D,0x02,0x20,0xB0,0x33,0x02,0x04,0x41,0xB0,0x00,0x0C,0x8E,0x45, +0xFF,0xD9,0xB3,0x75,0x02,0x20,0x0C,0xB0,0x02,0x04,0xAD,0x33,0x02,0xD8,0x41,0x30, +0x02,0xD6,0x09,0x32,0x02,0x06,0x40,0xB0,0xFF,0xD7,0xB9,0x65,0x02,0xD8,0x45,0xB3, +0x00,0x0C,0xB8,0x45,0x02,0x04,0x19,0x33,0xFF,0x05,0xB9,0x65,0xFF,0xFF,0x44,0x3B, +0x00,0x0C,0xBE,0xD9,0x01,0x10,0x22,0x1C,0x02,0x46,0x45,0x30,0x02,0x20,0x0C,0xB0, +0x02,0xEA,0x41,0xB0,0x00,0x0C,0xC6,0x5D,0x00,0x0C,0x10,0xC5,0x01,0x14,0x15,0xB0, +0x00,0x9C,0x37,0xF9,0x01,0x10,0x22,0x1C,0x01,0x49,0x37,0x69,0xFF,0x0A,0x37,0x71, +0xC0,0x0A,0x15,0x88,0x80,0x0A,0x36,0x71,0xC0,0x0A,0x36,0xF1,0x01,0x10,0x22,0x1C, +0x01,0x00,0x44,0xB8,0xFE,0x8F,0xE3,0x65,0x00,0x0C,0xC6,0x5D,0x00,0x0C,0xE2,0xD5, +0x00,0x0C,0xF0,0x5D,0x01,0x00,0x14,0xB8,0x83,0x22,0x44,0x28,0x02,0xFE,0x15,0x30, +0x88,0x22,0xDA,0x2D,0xFF,0x11,0x22,0x8C,0x02,0xAC,0x15,0xB0,0x89,0x10,0x12,0xAD, +0x00,0x00,0x3C,0x3B,0x04,0x10,0x40,0x33,0x00,0x11,0x7E,0x0B,0x04,0x9D,0x13,0xED, +0xFF,0xFF,0x4C,0xBB,0x04,0x11,0x60,0x33,0x00,0x11,0x6A,0x0B,0x00,0x11,0x6C,0x0B, +0xFF,0xB4,0x13,0x7D,0x04,0x11,0x50,0x33,0x3D,0xB4,0x15,0xA8,0xA8,0x01,0x1C,0xB8, +0x00,0x11,0x16,0x88,0xFF,0x0A,0x14,0xFE,0xFF,0x11,0x26,0x00,0x08,0x0B,0x16,0x18, +0xFF,0x0A,0x14,0x98,0x00,0x0C,0x0A,0x46,0xFF,0x0B,0x14,0x90,0x01,0x0A,0x14,0x18, +0x00,0xB4,0x15,0x18,0xFF,0x0A,0x20,0x7E,0x44,0x0A,0x0E,0x28,0xFF,0x07,0x26,0x98, +0x02,0x22,0x14,0x30,0x01,0xA4,0x45,0x30,0x40,0x10,0x00,0xB3,0x02,0x0A,0x44,0xB4, +0x02,0x20,0x0C,0xB0,0x02,0x46,0x45,0x30,0x02,0xA4,0x45,0x30,0x01,0x35,0x15,0xB0, +0x00,0x0A,0x14,0x98,0x80,0x01,0x18,0x38,0x00,0x0C,0x18,0x98,0x02,0x12,0x40,0xB0, +0xFF,0x21,0x5A,0x76,0x00,0x0C,0x4C,0xC6,0x02,0x20,0x0C,0xB0,0x00,0x0C,0x66,0x5E, +0x02,0x20,0x18,0xB0,0x02,0xFC,0x15,0xB0,0x00,0x0C,0xB4,0x5E,0x80,0x0F,0x5A,0x7E, +0xFF,0x20,0x4C,0xE6,0xF0,0x21,0x5A,0xFE,0xFF,0x0A,0x5D,0xF6,0x01,0x49,0x5F,0x6E, +0x02,0x20,0x14,0xB0,0x02,0x06,0x40,0xB0,0x01,0x0A,0x6A,0xB2,0x01,0x0B,0x68,0xB2, +0x01,0x11,0x22,0x9C,0x1F,0x11,0x60,0xC6,0x20,0x11,0x60,0xC6,0x21,0x11,0x60,0x46, +0x02,0x06,0x40,0xB0,0x00,0x0C,0x82,0xD9,0x01,0x10,0x22,0x1C,0x01,0x35,0x15,0xB0, +0x01,0x34,0x17,0xB0,0x02,0x0A,0x40,0x34,0x02,0x50,0xA9,0x33,0x02,0x20,0x0C,0xB0, +0x00,0x00,0x40,0xB8,0x02,0x08,0x15,0x30,0x8A,0xD4,0x81,0x2E,0x00,0x0C,0x8A,0xDE, +0x00,0x0C,0x72,0xD6,0x02,0x06,0x40,0xB0,0x1D,0x11,0x82,0xD9,0x01,0x10,0x22,0x1C, +0x02,0x20,0x14,0xB0,0x02,0x06,0x40,0xB0,0x01,0x0B,0x68,0xB2,0x01,0x0A,0x6A,0xB2, 
+0x01,0x11,0x22,0x9C,0x01,0x00,0x14,0xB8,0x83,0x20,0x40,0x28,0x02,0xFC,0x15,0xB0, +0x88,0x20,0x36,0xA9,0x01,0x10,0x22,0x1C,0x02,0x48,0x15,0xB0,0x02,0x0B,0x94,0xEE, +0x01,0x0A,0x04,0xB0,0x02,0x0B,0x06,0x80,0xE1,0x48,0xA1,0x2E,0xFF,0x11,0x22,0x8C, +0x02,0x48,0x91,0x32,0x00,0x0C,0x94,0xC6,0xFF,0xFD,0x18,0xB8,0x00,0x0C,0x96,0xC1, +0x01,0x9E,0x1D,0xB0,0x08,0x0E,0xB0,0x7E,0xB0,0x01,0x18,0x38,0x00,0x0C,0x30,0xC1, +0xFF,0xA7,0x93,0x66,0x01,0x11,0x22,0x9C,0x82,0x10,0x14,0x28,0x01,0x10,0x22,0x98, +0x84,0x11,0x14,0xA8,0x02,0x0A,0x0C,0x30,0xFF,0xFF,0x14,0x38,0x84,0x11,0x1C,0x28, +0x02,0x06,0x14,0x30,0x83,0x0C,0x18,0x28,0x00,0x00,0x14,0x38,0x84,0x0E,0x1C,0x2C, +0x00,0x0C,0xCC,0x5D,0x00,0x0C,0x12,0xD5,0x02,0x46,0x15,0x30,0x88,0x22,0x36,0x29, +0x01,0x10,0x22,0x1C,0x02,0x46,0x45,0x30,0x00,0x00,0x14,0x38,0x8A,0x22,0xE0,0xAE, +0x02,0x22,0x18,0x30,0x02,0xFE,0x15,0x30,0xB1,0x00,0xB4,0xAE,0x80,0x0F,0x36,0x69, +0x22,0x11,0x92,0xC6,0x02,0xAC,0x15,0xB0,0x89,0x10,0x12,0xAD,0x02,0x20,0x0C,0xB0, +0x02,0xB0,0x41,0xB0,0x02,0x12,0x75,0xB3,0x02,0x14,0x4D,0x33,0x02,0x16,0x71,0xB3, +0x03,0x18,0x69,0xB3,0xFB,0x11,0x18,0x00,0xB1,0x00,0xA0,0x2F,0x00,0x11,0x5C,0xDA, +0x00,0x0C,0x10,0xC5,0x00,0x00,0x44,0x38,0x02,0x8A,0x15,0x30,0x89,0x0C,0xFA,0x2E, +0x80,0x0C,0x04,0xA8,0xE1,0x8A,0x07,0x2F,0xFF,0x11,0x22,0x8C,0x02,0x8A,0x15,0x33, +0x00,0x0C,0xFA,0x46,0x00,0x00,0x44,0x38,0x02,0x8A,0x15,0x30,0x81,0x0C,0x04,0x28, +0xE1,0x8A,0x15,0x2F,0xFF,0x11,0x22,0x8C,0x02,0x8A,0x15,0x33,0x00,0x0C,0x0A,0xC7, +0x00,0x0C,0x38,0x59,0x01,0x0A,0xB0,0xB3,0x0F,0x00,0x18,0x08,0x00,0x11,0x1A,0x88, +0x00,0x11,0x02,0x88,0x01,0xD8,0x15,0xB0,0x01,0x00,0x0E,0xB0,0x00,0x07,0x2E,0xFF, +0x02,0x38,0x28,0x7F,0x02,0x00,0x60,0xB8,0x02,0x0C,0x64,0xB0,0x11,0x00,0x00,0x98, +0x01,0x00,0x14,0x30,0x00,0xDD,0x23,0xE7,0x01,0x01,0x22,0x34,0xC0,0x0C,0x3A,0x67, +0x02,0x02,0x41,0x34,0x02,0x04,0x41,0x34,0xC0,0x0C,0x48,0x67,0x02,0x02,0x0D,0x30, +0xFF,0xFF,0x04,0x3A,0x02,0x06,0x40,0xB0,0xFF,0x21,0x12,0x75,0xC1,0x11,0x32,0x40, +0x02,0x04,0x0D,0x30,0xFF,0xFF,0x08,0x3A,0x02,0x06,0x40,0xB0,0xFF,0x21,0x12,0x75, +0xC9,0x11,0x32,0xC0,0xA0,0x00,0x18,0x2C,0x02,0x20,0x0C,0xB0,0xC8,0x11,0x52,0xDF, +0x02,0x06,0x40,0xB0,0x00,0x0C,0x5E,0xCF,0xFF,0xFF,0x44,0x3B,0xFF,0x21,0x7A,0x77, +0x02,0x04,0xB1,0xB3,0x40,0x49,0x69,0x7F,0xB0,0x00,0x94,0xAD,0x00,0x0C,0x76,0x47, +0x02,0xA2,0xAD,0x33,0x02,0x20,0x44,0xB3,0xFF,0xFF,0x08,0x3A,0xFF,0xD7,0x75,0x77, +0x00,0x0C,0x88,0xDF,0x00,0x0C,0x76,0x47,0xC9,0x11,0x32,0xD8,0x02,0xD8,0x41,0x30, +0xFF,0x21,0x60,0x67,0x02,0x06,0x40,0xB0,0x10,0x49,0x13,0x7D,0xEF,0x11,0x92,0xC1, +0x02,0x20,0xA8,0x33,0x02,0xD6,0x41,0xB0,0x02,0xD4,0x05,0xB2,0x02,0xD4,0x41,0xB4, +0x02,0x20,0xA8,0x33,0x02,0xD6,0x41,0xB0,0x02,0xD4,0x09,0xB2,0x02,0xD4,0x41,0xB4, +0x01,0x0C,0x1A,0xB0,0x00,0x11,0x94,0x47,0x02,0x9E,0x15,0x30,0x80,0x0C,0x04,0xA8, +0xE1,0x9E,0x9D,0x2F,0xFF,0x11,0x22,0x8C,0x02,0x9E,0x3D,0x33,0x00,0x0C,0x94,0x47, +0x01,0x0C,0x1A,0xB0,0x00,0x11,0xA4,0x47,0x02,0x9E,0x15,0x30,0x81,0x0C,0x04,0x28, +0xE1,0x9E,0xAD,0x2F,0xFF,0x11,0x22,0x8C,0x02,0x9E,0x3D,0x33,0x00,0x0C,0xA4,0x47, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, 
+0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +}; + +static const u8 Lseq[] = { +0x33,0x11,0x00,0x80,0x00,0x00,0x44,0x38,0x00,0x11,0x00,0x08,0xB3,0x00,0xEC,0xAB, +0xFD,0x05,0x0A,0x88,0x44,0x00,0x14,0x70,0x11,0x00,0x00,0x98,0x33,0x00,0x16,0x60, +0x44,0x11,0x00,0x80,0x00,0x0C,0x16,0xC0,0x00,0x11,0x00,0x08,0x80,0xE1,0x0B,0x68, +0x00,0x0C,0x20,0xC0,0x02,0x06,0xC0,0x33,0x02,0x06,0xD0,0xB3,0x02,0x06,0xCC,0x33, +0x04,0xE6,0x0D,0x30,0x02,0xE0,0x0D,0xB4,0x80,0xE1,0xC3,0x03,0x00,0x0C,0x08,0xC0, +0x47,0x00,0x0C,0x38,0x02,0x3C,0x14,0x30,0x81,0x38,0xB0,0xAB,0x02,0x3E,0x14,0xB0, +0x81,0x3A,0xB4,0xAB,0x22,0x00,0x36,0xE0,0x08,0xDA,0x83,0xE8,0x01,0xDA,0x5B,0x78, +0x10,0x83,0x14,0x08,0x00,0xE3,0x14,0x88,0xFF,0x0A,0x4C,0x78,0x40,0xD9,0x81,0xF8, +0x40,0x11,0x72,0x00,0xB1,0x00,0x90,0x2A,0x40,0xC8,0x49,0x78,0x80,0xC8,0x91,0x03, +0x10,0x11,0x06,0x81,0x01,0x11,0x74,0x84,0xFF,0x80,0x14,0x88,0x00,0xE0,0x14,0x88, +0xFF,0x0A,0x80,0x78,0x40,0xD9,0x81,0xF8,0xC0,0xD9,0x14,0x88,0xC0,0x0A,0x80,0xF0, +0x00,0x0C,0x0E,0xC6,0x55,0x11,0x02,0x00,0x01,0x3B,0x14,0xB0,0x00,0x3F,0x14,0x08, +0x01,0x0A,0x84,0xE8,0x01,0x01,0x22,0xB0,0x40,0xD9,0x0F,0x6E,0x80,0xD9,0x6F,0xEA, +0xFF,0x00,0x70,0x78,0x11,0x00,0x76,0xF0,0x22,0x00,0x7C,0xF0,0x40,0x05,0x0A,0x84, +0x20,0xD9,0x1B,0xE9,0x02,0xDA,0xC3,0x7F,0xA3,0x00,0xA8,0xAA,0x02,0xDB,0x8B,0xE8, +0x02,0xDA,0xC3,0x7F,0xA3,0x00,0xA8,0xAA,0x80,0xDA,0xC3,0x7F,0xA1,0x00,0x6C,0xAF, +0xA1,0x00,0xFC,0xAC,0xA1,0x00,0x76,0xAE,0x01,0x01,0x22,0xB0,0x55,0x11,0x00,0x80, +0xA2,0x00,0x00,0x28,0x02,0x11,0x76,0x04,0x07,0x11,0x2A,0x80,0x01,0x05,0x0A,0x84, +0x47,0x00,0x0C,0x38,0x10,0x45,0xCA,0x68,0x10,0xCE,0xCB,0xE8,0x20,0xCE,0xD7,0x68, +0x02,0x54,0x40,0x30,0xB2,0x00,0x98,0xAF,0x00,0x0C,0x14,0x49,0x00,0x00,0x90,0x38, +0x18,0x10,0xA1,0xB0,0x55,0x11,0x02,0x00,0x01,0x25,0x14,0xB0,0x01,0x11,0x4A,0x80, +0x05,0x11,0x00,0x80,0x18,0xE0,0xA3,0x30,0x55,0x11,0x00,0x80,0x01,0x0A,0x4A,0xB0, +0x01,0x01,0x22,0xB0,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0,0x01,0x0A,0xC0,0xF0, +0x00,0x11,0x92,0x88,0x00,0x11,0xA2,0x88,0x07,0x0A,0xC0,0x60,0xFD,0x11,0xC2,0x40, +0xFF,0x11,0xC2,0xC0,0x0A,0x11,0x92,0x00,0x01,0x0C,0xA2,0xB0,0x02,0x8A,0xC4,0x30, +0x00,0x0C,0x18,0xC1,0x01,0xCA,0xD7,0x78,0x00,0x00,0x90,0x38,0x34,0x00,0xA0,0xB8, +0xFF,0x00,0xA2,0xB8,0xB2,0x00,0x5E,0x2D,0x00,0x0C,0x18,0xC1,0x40,0xC8,0x15,0x69, +0x40,0x9E,0x15,0x69,0xB2,0x00,0x6C,0xAD,0x00,0x0C,0x14,0x49,0x00,0x11,0x94,0x88, +0xA1,0x52,0x16,0x71,0xB1,0x00,0xBA,0xAC,0x00,0x0C,0x14,0x49,0xB2,0x00,0x4E,0xAD, +0x03,0x3A,0x19,0xE1,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0,0x46,0x0A,0xF6,0xF0, +0x39,0x0A,0x04,0x71,0x34,0x0A,0x0E,0xF1,0x00,0x0C,0x18,0xC1,0x1B,0x48,0x15,0x88, +0x13,0x0A,0x18,0xE1,0x02,0x11,0xA0,0x5F,0xB2,0x00,0x04,0x2D,0x04,0x0C,0x81,0xB2, +0xB2,0x00,0x1A,0x2E,0x00,0x0C,0x10,0x41,0x1B,0x48,0x15,0x88,0x13,0x0A,0x18,0xE1, +0x01,0x11,0xA0,0x5F,0xB2,0x00,0x0A,0xAD,0x00,0x0C,0x10,0x41,0xB2,0x00,0xDA,0xAC, +0xB2,0x00,0x4E,0xAD,0x00,0x0C,0x18,0xC1,0x02,0xE4,0xC9,0x03,0xB2,0x00,0x3E,0xAF, +0x80,0x11,0x90,0x84,0x20,0x11,0x72,0x00,0x02,0xE4,0x3B,0xE9,0xFF,0xC0,0x3A,0xE9, +0xFF,0xC1,0x3A,0x69,0x0F,0xC2,0x3A,0x69,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0, +0x01,0x0A,0x30,0x71,0x05,0x0A,0x38,0xE1,0xB1,0x00,0x52,0xAC,0x00,0x0C,0x38,0x41, 
+0x9E,0x00,0x0C,0xB8,0x20,0x49,0x37,0x69,0xA1,0x00,0x70,0xAB,0xA1,0x00,0x5A,0x2B, +0x80,0x11,0x9E,0xDF,0x18,0x11,0xB8,0x80,0xFD,0xE4,0xC9,0x8B,0xFF,0xFF,0xC4,0x3C, +0x80,0x0B,0x45,0xF9,0xB3,0x00,0x66,0xA9,0x00,0x0C,0x1A,0xD8,0x02,0x05,0x0A,0x00, +0x0F,0xCB,0x75,0xE9,0x80,0x48,0x9B,0xE9,0x20,0xE4,0x9B,0xE9,0x40,0x49,0x8D,0xE9, +0xB2,0x00,0x92,0xAF,0x10,0xE5,0x83,0x69,0xB2,0x00,0x82,0x2F,0x80,0xC8,0x83,0x69, +0x80,0xCE,0x61,0xF9,0x20,0xE5,0x99,0xE9,0x08,0xC8,0x8D,0xE9,0x00,0x0C,0x6E,0x41, +0xB2,0x00,0x8A,0xAF,0x20,0xC8,0x8D,0xE9,0x02,0xCA,0x85,0xE9,0x02,0x86,0x86,0x69, +0x40,0x4C,0x6F,0x79,0x02,0x93,0x6E,0x79,0x02,0x7F,0x8C,0xF9,0x80,0xE5,0x8D,0xE9, +0x80,0xE4,0x91,0xE9,0x00,0x0C,0x08,0xC0,0xC0,0x06,0x14,0x38,0x8B,0xB0,0x7C,0x29, +0xC0,0x06,0x60,0xB9,0x80,0x11,0x72,0x00,0x08,0x11,0xE2,0xDD,0x08,0x11,0xB8,0x00, +0x00,0x0C,0xA6,0xC1,0x05,0x11,0x8A,0x41,0x0F,0x11,0x8A,0x41,0x40,0xE5,0xCB,0x03, +0x24,0x11,0x8A,0x41,0x00,0x0C,0xE2,0x5D,0xFD,0x05,0x0A,0x88,0x00,0x0C,0xC4,0x5F, +0x80,0x4C,0xA7,0xF9,0xB1,0x00,0xB8,0x29,0x00,0x0C,0x66,0x5D,0x00,0x0C,0x64,0x42, +0x00,0x0C,0xC4,0x5F,0x06,0x11,0x18,0x80,0x80,0x0B,0xA5,0x79,0x28,0x11,0x18,0x80, +0x20,0xE5,0xA5,0xE9,0x29,0x11,0x18,0x00,0x00,0x0C,0xE2,0x5D,0xB1,0x00,0xB8,0x29, +0x00,0x0C,0x64,0x42,0x06,0x11,0xF8,0x03,0x00,0x0C,0xF2,0xDD,0x00,0x0C,0x64,0x42, +0x47,0x00,0x0C,0x38,0x02,0x48,0xC1,0x79,0xB1,0x00,0x42,0x2C,0x80,0x48,0xC3,0x69, +0x12,0x11,0x94,0x00,0x02,0x52,0x44,0x32,0xB1,0x00,0x54,0xAC,0x00,0x0C,0xC4,0x41, +0x80,0x11,0x9E,0xDF,0xB1,0x00,0x52,0xAC,0x18,0x11,0xB8,0x80,0xFF,0xFF,0xC4,0x3C, +0x47,0x00,0x0C,0x38,0xF7,0x00,0x0C,0xB8,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0, +0x40,0xCE,0xDF,0x69,0x80,0xCC,0xD9,0x79,0x07,0x0A,0xF2,0xF1,0x00,0x0C,0xEC,0x41, +0x06,0x0A,0xDC,0x71,0x16,0x0A,0xEC,0x61,0xA3,0x00,0xBC,0x28,0x80,0xCC,0xE5,0x79, +0x41,0x0A,0xE8,0x61,0xA1,0x00,0x4E,0x2E,0x40,0x0A,0xE8,0xE1,0xA3,0x00,0xA2,0x28, +0x20,0xE4,0xC9,0x03,0xB1,0x00,0x8A,0xAA,0x18,0x11,0xB8,0x04,0x40,0xCE,0xC3,0xEF, +0xFF,0xFF,0xC4,0x3C,0x02,0xE4,0x3B,0xE9,0xFF,0xC0,0xEC,0x69,0xFF,0xC1,0xEC,0xE9, +0x0F,0xC2,0xEC,0xE9,0x80,0x7F,0xFE,0x69,0x07,0x11,0x02,0xC2,0x09,0x01,0xC0,0xBB, +0x00,0x0C,0x0A,0xC2,0x01,0x0C,0xF8,0xB3,0xB3,0x00,0x64,0xAB,0x00,0x0C,0xEC,0xC9, +0x0C,0x01,0xC0,0xBB,0xB1,0x00,0x92,0xAC,0x40,0xCA,0x95,0x03,0x02,0x06,0x22,0x30, +0xFF,0x11,0x22,0x8C,0x00,0x0C,0x66,0x5D,0x18,0x11,0xB8,0x80,0x00,0x0C,0x62,0x42, +0xB1,0x00,0xD2,0x2F,0x00,0x0C,0x5E,0xCA,0x04,0x0C,0x49,0x31,0x03,0x0A,0x23,0xE2, +0x02,0x34,0x15,0x30,0x02,0x0A,0x48,0xB1,0xFC,0xFF,0x14,0x38,0x83,0x90,0x48,0xA9, +0x08,0x11,0x48,0xB1,0x30,0xCE,0x35,0xFA,0x02,0x11,0x48,0x00,0x0C,0xD4,0x4D,0x31, +0x04,0xB0,0x4D,0x31,0x00,0x11,0x48,0x08,0x02,0x20,0xFC,0xB3,0xB1,0x00,0xCE,0xA9, +0x44,0x0A,0xE6,0x2B,0x0C,0x00,0x14,0x38,0x30,0xCE,0x41,0xFA,0x14,0x0A,0x14,0x98, +0x83,0x90,0x14,0x28,0xB1,0x00,0x06,0x2A,0x00,0x11,0xE4,0x0B,0x80,0xF2,0xE5,0x2B, +0x20,0x11,0xB8,0x00,0xB1,0x00,0x22,0x2A,0x02,0x08,0xE1,0xB3,0xC0,0x11,0xAA,0xDF, +0x02,0xFE,0x41,0xB0,0x00,0xFC,0xA7,0x5F,0x02,0xF0,0x79,0x32,0x02,0xF2,0x7D,0x32, +0xB1,0x00,0x82,0x2C,0x0E,0x11,0xAA,0x5F,0x00,0x0C,0x62,0x42,0x18,0x11,0xB8,0x80, +0xB3,0x00,0x82,0x2B,0xFF,0xFF,0xC4,0xB8,0xB1,0x00,0x9C,0x2C,0xBF,0xCA,0x95,0x8B, +0x01,0xE4,0x4D,0x7D,0xB3,0x00,0x20,0xAB,0x00,0x0C,0x4C,0xC5,0x00,0x11,0x00,0x08, +0x02,0xB4,0x14,0x30,0x81,0xB0,0x14,0x28,0x80,0x0A,0x8C,0xEA,0x40,0x0A,0xB0,0xEA, +0x20,0x0A,0xC2,0xEA,0x10,0x0A,0xC6,0x6A,0x04,0x0A,0xC8,0xEA,0x02,0x0A,0xCA,0x6A, +0x01,0x0A,0xCC,0x6A,0x08,0x0B,0xCE,0x6A,0x04,0x0B,0xD0,0x6A,0x02,0x0B,0xD2,0xEA, 
+0x01,0x0B,0xD6,0x6A,0x48,0xF0,0x60,0x3D,0x23,0x11,0x02,0x80,0x02,0x48,0xE4,0x33, +0x33,0x11,0x00,0x80,0x00,0x00,0x90,0x38,0x22,0x11,0x00,0x80,0x03,0x11,0x48,0x80, +0xC4,0x01,0x18,0x38,0xB3,0x00,0x40,0x2A,0x00,0x11,0x48,0x08,0x00,0x0C,0xA4,0xCA, +0xB2,0x00,0x2E,0xAE,0x00,0x0C,0xAC,0xC2,0x18,0x11,0x18,0x80,0xB1,0x00,0xB0,0xAA, +0xC4,0x01,0x1C,0xB8,0xF0,0x11,0xDA,0x5F,0x01,0x01,0x22,0xB0,0x80,0x11,0x60,0x05, +0x40,0xCE,0xBD,0xEA,0x80,0xCE,0xB7,0x7A,0x02,0x54,0x40,0x30,0xB1,0x00,0x74,0xAB, +0x00,0x0C,0xBE,0x4A,0x40,0x11,0x60,0x05,0xB1,0x00,0x8A,0xAA,0x20,0xE4,0xC9,0x03, +0x40,0x11,0x60,0x05,0x00,0x0C,0xF4,0x5A,0x20,0x11,0x60,0x05,0x10,0x11,0x60,0x05, +0x04,0x11,0x60,0x05,0x02,0x11,0x60,0x05,0x01,0x11,0x60,0x05,0x08,0x11,0x62,0x85, +0x04,0x11,0x62,0x85,0x00,0x0C,0xF4,0x5A,0x02,0x11,0x62,0x85,0x02,0x90,0x14,0x30, +0x8B,0x10,0xE6,0x2A,0x03,0x0A,0xE0,0x39,0xFF,0x11,0x22,0x20,0x10,0x00,0xD0,0xB9, +0x00,0x00,0xD4,0xB9,0x01,0x11,0x62,0x01,0x00,0x00,0xE0,0xBD,0x02,0x54,0x40,0x30, +0xB2,0x00,0x98,0xAF,0x00,0x0C,0xF2,0xCA,0xC0,0x11,0xD2,0xDF,0x20,0xE5,0xCB,0x03, +0xB1,0x00,0x76,0x2B,0x01,0x11,0x62,0x85,0x80,0xCE,0x05,0x6B,0x40,0xCE,0xFF,0xEA, +0xB1,0x00,0x72,0xAB,0x00,0x0C,0xC2,0xD7,0x20,0xE4,0xC9,0x87,0x18,0x11,0xB8,0x80, +0x20,0xE4,0xC9,0x03,0xA1,0x00,0x8A,0x2A,0x10,0x11,0xB8,0x84,0x9B,0x01,0x0C,0x38, +0x08,0x11,0xB8,0x00,0x00,0x00,0x90,0xB9,0x80,0x0B,0x15,0x7B,0x3F,0x4E,0x15,0x88, +0x01,0x0A,0x14,0x63,0xA3,0x00,0xBC,0xA9,0x01,0x0A,0x15,0xB0,0x00,0x11,0x16,0x88, +0x8E,0x01,0x18,0xB8,0x83,0x0C,0x0C,0xAC,0x00,0x0C,0x44,0x43,0x00,0x0C,0x4C,0xC3, +0x00,0x0C,0x42,0x43,0x00,0x0C,0x5C,0x43,0x00,0x0C,0x5C,0x43,0x00,0x0C,0x68,0xC3, +0x00,0x0C,0x68,0xC3,0xA3,0x00,0xAA,0x29,0x00,0x0C,0x5C,0x43,0xA2,0x00,0xAA,0x2B, +0xA2,0x00,0x92,0xAB,0xA2,0x00,0xAA,0x2B,0x00,0x0C,0x62,0xC3,0x02,0x05,0x0A,0x00, +0x01,0x4C,0x3D,0xEB,0x00,0x11,0xA6,0xDF,0xB3,0x00,0xA2,0xAB,0x60,0x13,0x08,0xB9, +0x05,0x11,0xB8,0x04,0x15,0x11,0x2A,0x80,0x06,0x11,0x94,0x81,0x33,0x11,0x95,0x31, +0x34,0x11,0xA2,0xDF,0x10,0x01,0xBC,0x3C,0x06,0x11,0x94,0x81,0x23,0x11,0x95,0xB1, +0x24,0x00,0x14,0x38,0x83,0x3C,0x99,0x28,0x04,0x11,0xA4,0xDF,0x00,0x11,0xA4,0x5F, +0x10,0x34,0xC1,0x30,0x10,0x01,0xBC,0x3C,0x34,0x10,0x95,0x31,0x34,0x11,0xA2,0xDF, +0x10,0x01,0xBC,0x3C,0x10,0x00,0xBC,0x38,0x04,0x3C,0x99,0x30,0x10,0x34,0xC1,0xB4, +0x05,0x11,0x94,0x81,0x13,0x11,0x95,0xB1,0x04,0x11,0x94,0x31,0x04,0x24,0x95,0xB1, +0x08,0x2C,0x95,0x31,0x24,0x11,0xA2,0x5F,0x10,0x01,0xBC,0x3C,0x00,0x0C,0x06,0x5B, +0x02,0xE4,0x29,0xEF,0x04,0x5D,0x8C,0x7B,0x30,0xCE,0x8B,0x7B,0x00,0x0C,0x1A,0xD8, +0xB2,0x00,0x30,0xAE,0x00,0x0C,0x94,0x4B,0x00,0x0C,0xEA,0x5F,0xB1,0x00,0xEA,0xAC, +0x00,0x0C,0x8C,0xC3,0xB1,0x00,0xDC,0xAC,0x00,0x0C,0x1A,0xD8,0x40,0xCE,0x17,0xEC, +0x80,0xCE,0x95,0xFB,0x08,0x5D,0xB2,0x6B,0x0F,0xCB,0x29,0xEC,0x80,0xC8,0x2B,0xEC, +0x30,0xCE,0xBD,0xEB,0x04,0xC9,0x27,0x6C,0x08,0xC9,0x2D,0x6C,0x01,0x85,0x44,0xEC, +0xB2,0x00,0x92,0xAF,0x10,0xE5,0x45,0x6C,0x04,0x85,0x08,0x78,0xFF,0xC6,0x08,0xE8, +0x02,0x11,0xCA,0xDD,0x00,0x0C,0xD6,0xDD,0x02,0x85,0xB2,0x6B,0xB1,0x00,0xB4,0x29, +0x00,0x0C,0x26,0x44,0x00,0x0C,0x92,0x5F,0x08,0xC9,0xBB,0xEB,0x02,0x11,0x0A,0x81, +0x00,0x0C,0xEE,0x43,0x04,0x11,0x60,0x44,0xB2,0x00,0x8A,0xAF,0x20,0xC8,0xD5,0xEB, +0x02,0xCA,0x31,0xEC,0x02,0x86,0x32,0x6C,0x04,0xC4,0xD4,0x6B,0x08,0x5D,0x08,0x78, +0x20,0x84,0x44,0x6C,0x10,0x85,0x08,0x78,0x10,0x11,0x0A,0x81,0x0B,0x0A,0xEF,0xE3, +0x01,0x3A,0xEF,0xE3,0xA2,0x00,0xD2,0x2D,0x04,0x5D,0x14,0x08,0x00,0xE4,0xC9,0x83, +0x02,0x11,0xCA,0xDD,0x00,0x0C,0xD6,0xDD,0x10,0x85,0xEA,0xEB,0x20,0x84,0x44,0x6C, 
+0x08,0x11,0xCA,0xDD,0x04,0xE4,0xE9,0x6B,0x08,0x4C,0xE9,0xEB,0xB2,0x00,0x40,0x2E, +0x00,0x11,0x68,0x44,0x10,0x11,0x0A,0x81,0x00,0x0C,0x20,0x44,0x80,0xCC,0xF3,0xEB, +0xC0,0xC9,0x21,0xEC,0xFF,0x03,0x21,0xF4,0x08,0x48,0xC7,0x0B,0x79,0x0B,0xC5,0x0B, +0x02,0x02,0x41,0xB0,0x08,0x48,0x17,0x88,0x79,0x0B,0x15,0x88,0x02,0xEE,0x41,0x30, +0x00,0xE2,0x21,0x64,0x30,0xCE,0x0F,0xFC,0x08,0xCE,0x21,0x7C,0x01,0x0B,0x14,0xB0, +0x00,0xE3,0x21,0xE4,0xB2,0x00,0x8A,0xAF,0x20,0xC8,0x21,0xEC,0x02,0x02,0xDD,0xB3, +0x06,0x11,0x58,0xDD,0x02,0xEE,0x41,0x30,0x00,0x0C,0x76,0xC3,0x08,0x5D,0x08,0x78, +0x00,0x11,0x02,0x88,0xDC,0x01,0x1C,0xB8,0xE0,0x11,0xDA,0xDF,0x01,0x01,0x22,0xB0, +0x08,0x4C,0x25,0x6C,0x04,0x11,0x68,0xC4,0x06,0x11,0x68,0x44,0x00,0x11,0x60,0xC4, +0x0D,0x11,0x36,0x44,0x0E,0x11,0x36,0x44,0x80,0xE5,0xB3,0xEB,0x18,0x11,0x36,0xC4, +0x19,0x11,0x36,0x44,0x40,0xE5,0xCB,0x03,0x25,0x11,0x36,0x44,0x80,0x0B,0x5D,0x6C, +0x80,0xCE,0x3D,0xFC,0x16,0x10,0x5D,0x74,0x00,0x0C,0xE2,0x5D,0x30,0xCE,0x43,0x7C, +0xB2,0x00,0x40,0x2E,0x02,0x11,0x60,0x44,0x60,0x01,0x08,0xB9,0xEF,0xE5,0xCB,0x8B, +0x1A,0x11,0x4A,0xC4,0x30,0xCE,0x51,0x7C,0xB2,0x00,0x40,0x2E,0x00,0x0C,0x58,0x44, +0x80,0x0B,0x5D,0xFC,0x3F,0x4E,0x15,0x88,0x01,0x0A,0x58,0xF4,0x02,0x0A,0x5C,0xE4, +0xB3,0x00,0x92,0xAB,0xFF,0x4A,0x77,0xEB,0x00,0x0C,0xA6,0xDF,0x02,0x11,0x60,0x44, +0x01,0x0C,0xD6,0xB3,0x08,0x11,0xCA,0xDD,0x01,0xEB,0x19,0xB0,0x20,0xE4,0xC9,0x03, +0x00,0x0C,0x58,0xDD,0xDF,0x5F,0xBE,0x88,0x04,0x11,0x88,0x81,0x00,0x0C,0x30,0x45, +0x02,0x20,0x14,0x31,0x02,0x20,0xA4,0xB1,0x00,0x0C,0x1A,0xD8,0x10,0x39,0x08,0x68, +0x00,0x0C,0xEA,0x5F,0x80,0x48,0x97,0xEC,0xC0,0x49,0x97,0xEC,0x00,0x11,0x94,0x88, +0x10,0x0B,0x89,0xFC,0x18,0x10,0xA5,0x30,0x18,0x11,0x24,0x01,0x00,0x0C,0x8C,0x44, +0x52,0x11,0xE0,0xDF,0x04,0x11,0x24,0x81,0xB1,0x00,0x82,0x2C,0x08,0x11,0xB8,0x00, +0x01,0xC0,0x23,0xB0,0xFF,0x11,0x22,0x20,0xB1,0x00,0x98,0x2B,0x02,0xE4,0x29,0xEF, +0xB1,0x00,0xDC,0xAC,0x00,0x0C,0x1A,0xD8,0x02,0x05,0x0A,0x00,0x0F,0xCB,0x11,0xED, +0xB2,0x00,0x82,0x2F,0x80,0xC8,0x13,0xED,0x80,0x48,0x0F,0xED,0xC0,0x49,0x25,0x6D, +0x30,0xCE,0xCF,0x6C,0xFF,0x58,0xAE,0xEC,0xC0,0xC9,0x05,0x6D,0x04,0xC9,0x05,0xED, +0x08,0xC9,0x15,0x6D,0x01,0x85,0x1E,0x6D,0x04,0x85,0xC8,0xFC,0xFF,0xC6,0xC8,0x6C, +0x80,0xE4,0xE5,0xEC,0x02,0x11,0xCA,0xDD,0x04,0x7F,0xC2,0x6C,0xB1,0x00,0xB4,0x29, +0x00,0x0C,0x04,0xC5,0x01,0x11,0xB8,0x00,0x00,0x0C,0x1A,0xD8,0x00,0x0C,0xD8,0xC4, +0xFF,0x58,0x08,0x68,0x02,0x11,0x0A,0x81,0x00,0x0C,0xD8,0xC4,0xB2,0x00,0xFA,0xAD, +0xB2,0x00,0x8A,0xAF,0x20,0xC8,0xDF,0x6C,0x02,0xCA,0x19,0x6D,0x02,0x86,0x1A,0xED, +0x80,0xE5,0xE3,0x6C,0x80,0xE4,0xE5,0xEC,0x00,0x0C,0x08,0xC0,0x00,0x0C,0xAC,0xDF, +0x00,0x0C,0x02,0xC5,0x00,0x0C,0xC4,0x5F,0x30,0xCE,0xEB,0xFC,0xB2,0x00,0xFA,0xAD, +0x00,0x0C,0x02,0xC5,0xFF,0x05,0x03,0x75,0x79,0x0B,0x15,0x88,0x02,0x04,0x41,0xB0, +0x79,0x0B,0xC5,0x0B,0x02,0xEE,0x41,0x30,0x00,0xE2,0x03,0xE5,0x02,0x04,0xDD,0xB3, +0x07,0x11,0x58,0x5D,0x00,0x00,0xC8,0xBB,0x02,0xEE,0x41,0x30,0x00,0x0C,0x70,0x44, +0x20,0xE4,0xC9,0x03,0x05,0x11,0x28,0x45,0x00,0x0C,0xAC,0xDF,0x80,0xC8,0x0D,0x7D, +0x05,0x11,0xE2,0x5D,0x03,0x11,0x28,0x45,0x01,0x11,0x28,0xC5,0x06,0x11,0x22,0x45, +0x08,0x11,0x22,0xC5,0x05,0x11,0x22,0x45,0x80,0xE5,0x01,0xED,0x14,0x11,0x22,0x45, +0x0F,0x11,0x22,0x45,0x40,0xE5,0xCB,0x03,0x24,0x11,0x22,0x45,0x03,0x11,0x0A,0x01, +0x10,0x11,0x22,0xC5,0x00,0x0C,0xE2,0x5D,0x00,0x0C,0xAC,0xDF,0x03,0x11,0x28,0x45, +0x30,0xCE,0x2F,0xFD,0x40,0x9E,0x2F,0x7D,0xB2,0x00,0x40,0x2E,0x00,0x0C,0x58,0xDD, +0xB1,0x00,0x82,0x2C,0xFF,0xFF,0xDC,0xBB,0x20,0xE4,0x41,0xED,0xFF,0x8E,0x41,0x6D, 
+0xFF,0xC1,0x41,0xF5,0x00,0x0C,0x72,0x5D,0xFF,0xEF,0x41,0xF5,0x02,0xE0,0x0D,0xB4, +0x02,0x05,0x0A,0x00,0x7F,0xCA,0x95,0x8B,0x01,0xE4,0x49,0xFD,0xB3,0x00,0x20,0xAB, +0x02,0x86,0x4C,0x7D,0x00,0x11,0xB0,0x88,0x40,0xE5,0x53,0xFD,0x02,0x11,0x0C,0x81, +0x02,0xE6,0xCC,0x01,0x00,0x00,0xC8,0xBB,0xFF,0xFF,0x40,0xB8,0x00,0x0C,0x24,0x40, +0x02,0x0C,0x0C,0x30,0x10,0x11,0x9C,0x5F,0x02,0x06,0x18,0x30,0x00,0x0C,0xAA,0xDF, +0x00,0x0C,0x1A,0xD8,0x10,0x49,0x09,0x68,0xFF,0xFF,0xC4,0x3C,0xB3,0x00,0x64,0xAB, +0x00,0x0C,0xC2,0xCF,0xB1,0x00,0xD2,0x2F,0x00,0x0C,0x70,0x4D,0x0E,0x11,0xAA,0x47, +0xA3,0x00,0x82,0xAB,0x11,0x11,0x02,0x00,0x02,0x05,0x0A,0x00,0x02,0xC0,0x41,0x30, +0x02,0x20,0xDC,0x33,0x04,0x4C,0xA9,0x6D,0x80,0x49,0x8F,0x7D,0xB3,0x00,0x82,0x2B, +0x02,0xEE,0x15,0xB0,0x88,0x20,0x86,0xAD,0xFF,0xFF,0xDC,0xBB,0x02,0x06,0x41,0x30, +0xFF,0x21,0x9E,0xF5,0xFF,0xEF,0x79,0x75,0x00,0x0C,0x7C,0xC5,0x02,0x06,0x09,0xB2, +0xFF,0xFF,0x0C,0xBA,0x04,0x11,0x9C,0x5F,0x02,0x04,0x41,0xB0,0xFF,0x21,0x9E,0xF5, +0x04,0x4C,0x7D,0xFD,0x02,0x20,0x80,0x33,0x00,0x0C,0xA0,0x45,0x00,0x0C,0xC2,0xDD, +0xFF,0xEF,0xBD,0xF5,0x01,0x11,0xF6,0x03,0x38,0x02,0xC0,0x3B,0x00,0x0C,0xB4,0x45, +0x00,0x11,0xF6,0x0B,0xBB,0x01,0xC0,0xBB,0x02,0x06,0x81,0x33,0xFF,0xFF,0x0C,0xBA, +0xFF,0xC1,0xB5,0xE5,0xFF,0xFF,0x84,0x3B,0x02,0xEE,0x41,0x30,0x02,0x46,0x45,0x30, +0x00,0x00,0xC8,0xBB,0x80,0xCA,0x95,0x03,0x01,0x01,0x22,0xB0,0xFD,0x05,0x0A,0x0C, +0x02,0xC0,0x41,0x30,0x04,0x10,0x80,0xB7,0xFF,0xC1,0xC3,0x77,0x00,0x0C,0xC0,0x5D, +0x0B,0x11,0xAA,0x47,0x01,0x0C,0xD8,0x33,0x01,0xEC,0xB9,0x30,0xB0,0x00,0x1A,0xA8, +0x01,0xEC,0xB9,0x30,0x20,0x5D,0x08,0xE8,0xFF,0x11,0x22,0x8C,0x00,0x0C,0x1A,0xD8, +0x05,0x11,0xF6,0x80,0x02,0x78,0x14,0x30,0x00,0x7A,0x14,0x00,0x89,0x10,0x08,0x28, +0xFF,0x11,0x22,0x8C,0x80,0xCE,0xE7,0xFD,0x80,0x49,0xC3,0xEF,0x00,0x0C,0xA6,0xDF, +0x80,0x0B,0xC3,0xEF,0x02,0x08,0xE1,0xB3,0x01,0x4D,0xF9,0x33,0x80,0x11,0x9C,0x5F, +0x80,0xCE,0xF7,0xED,0x40,0x9E,0xC3,0xEF,0x40,0x11,0xEE,0x5F,0x02,0x20,0xFC,0xB3, +0xB1,0x00,0xCE,0xA9,0x02,0x0C,0x1C,0x98,0x80,0xCE,0x05,0xFE,0xF0,0x11,0x26,0x00, +0x02,0xF0,0x27,0x30,0x00,0x0C,0x08,0xC6,0xF1,0x11,0x26,0x80,0x02,0x22,0x26,0xB0, +0x01,0xFC,0x27,0x30,0x00,0x0A,0xAA,0xDF,0x02,0xFE,0x41,0x34,0xC0,0xD9,0x14,0x88, +0x40,0x0A,0x1E,0xF6,0xC0,0x0A,0x1E,0x76,0xFF,0x0A,0x18,0xFE,0x40,0x11,0x72,0x84, +0x30,0xCB,0x17,0x7E,0x40,0xCB,0x97,0x03,0xBF,0x3D,0x7A,0x0C,0x0B,0x03,0x0C,0xB8, +0x01,0xCD,0xC3,0xEF,0x30,0xCB,0xC3,0x6F,0x20,0xD9,0x14,0xEF,0x00,0x11,0xB2,0x89, +0xF0,0xD8,0x14,0x08,0x10,0x0A,0x3C,0xF6,0x20,0x0A,0x40,0x76,0x90,0x0A,0x34,0xF6, +0x80,0x0A,0x38,0x76,0x00,0x0C,0x12,0xC7,0x08,0xCF,0x43,0x6E,0x00,0x0C,0x12,0xC7, +0x02,0xCF,0x43,0x6E,0x00,0x0C,0x12,0xC7,0x80,0xCF,0x43,0x6E,0x00,0x0C,0x12,0xC7, +0x40,0xCF,0x13,0x7F,0x0F,0xD8,0x14,0x08,0x08,0x0A,0x4C,0x76,0x09,0x0A,0x16,0xE7, +0x55,0x11,0x00,0x80,0x01,0x43,0x17,0xEF,0x00,0x11,0x00,0x08,0x44,0xC6,0x0F,0xA8, +0x01,0x07,0x14,0xB0,0x00,0x00,0x44,0x38,0x00,0xAF,0x11,0x7F,0x02,0xC8,0x91,0x03, +0xFE,0x11,0xAC,0x03,0x01,0xC8,0x9D,0x7E,0x11,0x11,0x00,0x80,0x08,0xE4,0x79,0x6E, +0x01,0x96,0x17,0x30,0x01,0x97,0x15,0x30,0x16,0x11,0xB2,0x81,0x01,0xD8,0x1A,0xB0, +0x01,0xD8,0x18,0x30,0x8A,0x0C,0x72,0x2E,0xB3,0x00,0x68,0x2A,0x80,0x0F,0x26,0xEF, +0x00,0x0C,0x78,0x46,0xA8,0x00,0x18,0xB8,0xB3,0x00,0xA6,0x2B,0xFF,0x0A,0x26,0xFF, +0x00,0x0C,0xE6,0x5F,0x84,0x01,0x18,0xB8,0xB3,0x00,0xA6,0x2B,0x01,0x0A,0xAC,0x33, +0x01,0x0A,0x9C,0x66,0x00,0x11,0xB2,0x89,0xF0,0xD8,0xB0,0x8B,0x80,0xD8,0xA5,0x8B, +0x70,0xD8,0xB1,0x8B,0x70,0x0B,0x15,0x88,0x00,0xD8,0x9D,0x66,0x80,0x80,0x95,0xFE, 
+0xFF,0xD2,0x9D,0xEE,0x00,0x0C,0x96,0x46,0x80,0xD2,0x9D,0xE6,0x20,0xD8,0x9D,0x76, +0x11,0x11,0x00,0x80,0xB1,0x00,0xDA,0xAC,0x00,0x11,0xB2,0x89,0xF0,0xD8,0x14,0x08, +0x80,0x0A,0xB0,0x0B,0x30,0x0A,0x14,0x08,0x80,0xD8,0xA9,0xEE,0x80,0x0A,0x14,0x00, +0x01,0xD8,0xB0,0x6E,0x55,0x11,0x00,0x80,0x01,0x43,0xB1,0x6E,0x08,0x0A,0x14,0x00, +0x00,0x11,0x00,0x08,0x01,0x0A,0xA4,0xB3,0x80,0x0A,0xB8,0xEE,0xA3,0x00,0x00,0xA8, +0x01,0xD8,0x46,0xB0,0x01,0xD8,0x44,0x30,0x00,0x00,0x14,0x38,0x8A,0x22,0x12,0xAF, +0x02,0x22,0x18,0x30,0x02,0xEE,0x15,0xB0,0xB3,0x00,0x68,0x2A,0x80,0x0F,0x12,0xFF, +0xFE,0x8F,0x13,0xE7,0x84,0x01,0x18,0xB8,0xB3,0x00,0xA6,0x2B,0x01,0x0A,0x12,0xE7, +0x20,0xD2,0xDD,0x7E,0x20,0x9E,0x11,0xEF,0x40,0x9D,0xD9,0x7E,0x02,0x9E,0x13,0xFF, +0xB2,0x00,0x1A,0xAF,0x00,0x0C,0xF0,0x46,0x02,0xC9,0xF1,0x6E,0x04,0x9D,0xF1,0x7E, +0xB3,0x00,0xCA,0x2B,0x04,0x11,0x14,0x30,0x01,0xA8,0x19,0x30,0x01,0xA9,0x15,0xB0, +0xB3,0x00,0x68,0x2A,0x80,0x0F,0xF0,0x7E,0xB3,0x00,0xDA,0xAB,0x00,0x0C,0x10,0x47, +0x08,0xD2,0x41,0x09,0x01,0xD2,0x15,0xB0,0xB1,0x00,0xFA,0xA8,0x34,0x11,0xDE,0x5F, +0xB1,0x00,0xE0,0xAA,0xB1,0x00,0x20,0xA9,0xB0,0xCC,0x15,0x08,0x10,0x0A,0x02,0x67, +0xB3,0x00,0x6C,0xA9,0xFE,0xC8,0x91,0x8B,0xB3,0x00,0x20,0xAB,0x80,0xCA,0x0B,0xEF, +0xA3,0x00,0x5E,0xAA,0xB1,0x00,0x92,0xAC,0xC1,0x03,0xC0,0xBB,0x40,0xCA,0x95,0x87, +0x4F,0x11,0x18,0x47,0x44,0x11,0x18,0xC7,0x51,0x11,0x18,0x47,0x41,0x11,0x18,0xC7, +0x00,0x11,0x00,0x08,0xFD,0xC9,0x93,0x8B,0x00,0x0C,0xDE,0xDF,0x80,0xCA,0x27,0xFF, +0xBF,0x3D,0x7A,0x88,0xC5,0x03,0xC0,0x3B,0x40,0xCA,0x95,0x87,0xFD,0xC8,0x91,0x0F, +0x00,0x0C,0x1A,0xD8,0x02,0x02,0x14,0x38,0x81,0xC8,0x15,0xA8,0x02,0x0B,0x3E,0x6F, +0x02,0x0A,0x44,0x6F,0x10,0xE4,0x6D,0xEF,0x0F,0xCB,0x4B,0x6F,0x80,0xC8,0x4D,0xEF, +0xEB,0x11,0xA8,0x5F,0x00,0x0C,0x08,0x50,0x03,0x11,0x4E,0xC7,0xFD,0xE4,0xC9,0x8B, +0xFF,0xFB,0x7B,0x7B,0x00,0x0C,0x98,0x44,0xB3,0x00,0xDA,0xAB,0xFB,0x11,0xEC,0xDF, +0x00,0xFB,0x61,0xC4,0x09,0x11,0x4E,0xC7,0x12,0x11,0x4E,0xC7,0x01,0x0C,0xF4,0xB3, +0x00,0x0C,0xE6,0x5F,0xB1,0x00,0x8A,0xAA,0x22,0x11,0x02,0x00,0xE8,0x11,0xDC,0x5F, +0x01,0x01,0x22,0xB0,0xB1,0x00,0xE0,0xAA,0x00,0x0C,0x6C,0x4F,0x00,0x0C,0x1A,0xD8, +0x80,0xC8,0x6D,0x6F,0x22,0x11,0x02,0x00,0xEB,0x11,0xA8,0x5F,0x01,0x01,0x22,0xB0, +0x00,0x0C,0x08,0x50,0x01,0x11,0xEE,0x5F,0xB3,0x00,0xDA,0xAB,0xB2,0x00,0x68,0xAF, +0xFD,0x05,0x0A,0x88,0x08,0xFB,0x61,0x44,0x10,0xC7,0xC3,0x7F,0x30,0xCE,0xC3,0x6F, +0x02,0xFA,0x15,0xB0,0x83,0x10,0xF4,0x2B,0x89,0x10,0xC2,0xAF,0x02,0xFC,0xF5,0x33, +0x30,0x11,0xDE,0xC7,0x00,0x0C,0x1A,0xD8,0x80,0xCA,0x09,0xE8,0xB3,0x00,0x5E,0x2A, +0x00,0x0C,0x64,0x42,0x00,0x0C,0x1A,0xD8,0x80,0xCA,0x09,0xE8,0xB2,0x00,0x68,0xAF, +0x00,0x0C,0x64,0x42,0x10,0x48,0xC3,0x6F,0x10,0x11,0x9E,0xDF,0xFF,0x0A,0x9B,0xFF, +0x01,0x0A,0xC3,0xE7,0x04,0x11,0x80,0xB6,0xA3,0x00,0xC8,0xAA,0xA3,0x00,0xCC,0x2A, +0xA3,0x00,0xEE,0x2A,0xA3,0x00,0x4E,0x2A,0xA3,0x00,0x48,0x2A,0xA3,0x00,0xB2,0x2A, +0xA1,0x00,0x48,0xAA,0xA3,0x00,0x2E,0x2A,0x20,0xE4,0xC9,0x03,0x80,0xE4,0xC3,0x6F, +0x08,0x5D,0xC2,0x6F,0x02,0x11,0xCA,0xDD,0x05,0x11,0xB8,0x80,0x80,0x11,0xB8,0x00, +0x00,0x0C,0x1A,0xD8,0x80,0xE4,0xC3,0x6F,0x04,0xC9,0x09,0x78,0x80,0xC8,0x09,0xF8, +0x08,0x11,0xB8,0x00,0xFF,0x11,0x22,0x8C,0x80,0xE4,0xC3,0x6F,0x08,0x5D,0xC2,0x6F, +0xA0,0xE5,0xCD,0xEF,0xC0,0x11,0xD2,0xDF,0xB0,0x00,0x1A,0xA8,0x80,0xE4,0x09,0x78, +0xFF,0x11,0x22,0x8C,0xFF,0x00,0xD8,0xEF,0x00,0x11,0x86,0x09,0x40,0x11,0x90,0x00, +0x01,0x0C,0xB8,0xB4,0xA1,0x00,0x34,0x2A,0xA1,0x00,0x32,0x2A,0xA1,0x00,0xA4,0x2A, +0xA2,0x00,0xCE,0xAB,0xFE,0xBA,0x74,0x89,0x02,0x92,0x71,0x31,0xFE,0xBA,0x74,0x89, 
+0x02,0xB8,0x2C,0x37,0x02,0x20,0xC4,0xB4,0xA2,0x00,0xC0,0xAF,0xA3,0x00,0x1E,0xAB, +0x02,0x06,0x44,0x30,0xFF,0x11,0x22,0x8C,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x02,0x05,0x0A,0x00,0x01,0xCB,0x2C,0xE8,0xEF,0x11,0x48,0x5A,0x00,0x0C,0x2C,0x50, +0xEF,0x11,0x6E,0xDA,0x02,0xCB,0x97,0x03,0xB3,0x00,0x20,0xAB,0x55,0x11,0x02,0x00, +0x01,0x48,0x15,0xB0,0x01,0x01,0x22,0xB0,0xFF,0x0A,0x2C,0xE8,0x30,0xCB,0x2D,0xE8, +0x44,0x11,0x02,0x00,0xB2,0x07,0xC0,0xBB,0x01,0x01,0x22,0xB0,0xFD,0x05,0x0A,0x88, +0x00,0x0C,0xCE,0x59,0x02,0x0C,0x1C,0x98,0x20,0xC6,0x27,0x00,0x00,0x11,0x26,0x88, +0x00,0x0A,0xCC,0xD9,0x00,0x04,0xC0,0xBB,0xFD,0x05,0x0A,0x88,0x04,0x00,0x0C,0xB8, +0x01,0xCD,0x91,0xFC,0x80,0xCA,0x91,0xEC,0x0F,0xCB,0x9F,0xE8,0x10,0xCE,0xEB,0x68, +0x40,0xC8,0x8D,0x68,0xB2,0x00,0x82,0x2F,0x80,0xC8,0x81,0x68,0x10,0xC8,0x77,0x68, +0x40,0xCE,0xA9,0x68,0x20,0xCE,0xB5,0xE8,0x04,0xC8,0x61,0xE8,0x02,0x05,0x0A,0x00, +0x80,0xC9,0x91,0xEC,0xFF,0x8E,0x59,0x68,0xFF,0xC1,0xF3,0x60,0x40,0x83,0x90,0xEC, +0x01,0xC8,0x59,0x68,0x08,0xC8,0x59,0x68,0x80,0xCC,0x59,0x78,0x40,0xC9,0x91,0x7C, +0x5A,0x11,0xA2,0x5A,0xB1,0x00,0xCE,0xAF,0x01,0x11,0xE4,0x81,0xFD,0x05,0x0A,0x88, +0x08,0xC8,0x67,0xE8,0x20,0xC9,0x91,0x7C,0x00,0x0C,0x84,0x40,0x02,0x11,0xE4,0x81, +0xB1,0x00,0xCE,0xAF,0x00,0x0C,0x30,0xD9,0x00,0x0C,0x7A,0x5A,0xFD,0x05,0x0A,0x88, +0xB2,0x00,0x8A,0xAF,0x20,0xC8,0xA1,0xE8,0xE4,0x11,0x32,0x42,0xB2,0x00,0x8A,0xAF, +0x20,0xC8,0xA1,0xE8,0xE7,0x11,0x48,0xDA,0x00,0x0C,0x90,0x54,0x00,0x0C,0x84,0x40, +0x40,0x39,0x84,0x78,0x40,0x11,0x72,0x00,0x00,0x0C,0x30,0xD9,0x00,0x0C,0x8A,0x5A, +0x80,0xC8,0x95,0x68,0xE8,0x11,0x32,0x42,0x80,0xC8,0x95,0x68,0xEB,0x11,0x48,0xDA, +0x00,0x0C,0x90,0x54,0x01,0x11,0xEA,0xDF,0x03,0xC8,0x91,0x0B,0xF0,0x08,0x04,0xB9, +0x03,0x11,0x0E,0x81,0x3F,0xC9,0x93,0x0B,0x00,0x0C,0xA0,0x40,0x00,0x0C,0x30,0xD9, +0x40,0xCA,0x91,0xEC,0xB0,0x00,0xC4,0xAD,0x00,0x04,0xC0,0xBB,0x00,0x0C,0x56,0xC1, +0x80,0xCC,0xAF,0xE8,0x10,0xC9,0x91,0x7C,0x00,0x0C,0x6A,0x40,0x40,0xCA,0x91,0xEC, +0x10,0xC9,0x85,0x68,0x00,0x0C,0x6A,0x40,0x02,0x05,0x0A,0x00,0x80,0xC9,0xE5,0x68, +0x80,0x83,0xE4,0x68,0x10,0xCA,0xE5,0x68,0x02,0xCA,0xE7,0xE8,0xB2,0x00,0x8A,0xAF, +0x20,0xC8,0x6B,0xE8,0xFF,0xC1,0xF3,0x60,0x02,0xA1,0x42,0x01,0xB2,0x00,0x82,0x2F, +0xB2,0x00,0x8A,0xAF,0xA0,0xC8,0xE5,0x68,0x80,0x83,0xE4,0x68,0x13,0x00,0xE0,0xB9, +0xFF,0x11,0x22,0x20,0xFF,0xE8,0xE4,0x68,0x04,0x86,0xE4,0x68,0x04,0xA1,0xC6,0xF8, +0x80,0xC4,0xE1,0x68,0x04,0xC4,0xE1,0x70,0x01,0xC4,0x89,0x1B,0xFD,0xA1,0x42,0x0D, +0x00,0x11,0x88,0x0B,0x00,0x0C,0x6A,0x40,0xFD,0xA1,0x42,0x0D,0xB2,0x00,0xE2,0xAD, +0x00,0x0C,0x84,0x40,0xB2,0x00,0x00,0xAE,0x00,0x0C,0x9E,0x50,0x02,0xCA,0xF5,0xE8, +0xFF,0xC1,0x91,0xF4,0xA0,0x00,0x72,0xAD,0xB2,0x00,0xE2,0xAD,0x00,0x0C,0x9E,0xC0, +0xA0,0x00,0x08,0x28,0x30,0x0A,0x18,0x08,0xFF,0x0C,0x0A,0x79,0x20,0x0C,0x0C,0x71, +0x30,0x0C,0x16,0x71,0x20,0x11,0x18,0x00,0x80,0x0A,0x1A,0xE9,0x01,0xC8,0x1B,0x79, +0xA3,0x00,0x18,0x29,0x10,0x11,0x1A,0xC1,0x08,0x11,0x2A,0xD9,0x02,0x0A,0x0C,0x30, +0x20,0x11,0xEA,0xDF,0x02,0x06,0x14,0x30,0x00,0x0C,0x18,0xC1,0x00,0x11,0x2A,0x59, +0x40,0x11,0x1A,0xC1,0xB2,0x00,0x62,0xAF,0x01,0x0A,0x40,0x31,0x01,0x0A,0x90,0xFC, +0x10,0x45,0x26,0x69,0x00,0x11,0x7E,0x0B,0xF7,0x11,0xEC,0xDF,0x02,0xA0,0x98,0xB3, +0x01,0x45,0x9C,0x37,0x11,0x11,0x02,0x00,0x01,0x0C,0x96,0x30,0x01,0x01,0x22,0x34, +0x01,0xA1,0x90,0x7C,0xF5,0x45,0x8A,0x08,0x10,0x45,0x90,0xEC,0x20,0x45,0x44,0xE9, +0x80,0xA0,0x40,0xE9,0xF7,0x11,0x6E,0xDA,0x80,0x45,0x40,0xF9,0x02,0xF4,0x45,0xB1, 
+0x00,0x0C,0x48,0xD9,0x01,0x11,0x2A,0xC1,0x0A,0x11,0x2A,0x59,0x1F,0x11,0x50,0x59, +0x08,0xA0,0x14,0x88,0x02,0x0A,0x14,0x00,0x00,0x11,0x16,0x88,0x02,0x0A,0x40,0xB5, +0x11,0x11,0x02,0x00,0x01,0x0C,0x90,0x30,0x01,0x01,0x22,0x34,0x02,0x05,0x0A,0x00, +0x04,0x9E,0x5D,0xF9,0xF9,0x11,0xEE,0xDF,0xDF,0x11,0xEE,0x5F,0xB3,0x00,0xDA,0xAB, +0x03,0x11,0x90,0xB3,0x10,0x45,0x6A,0xE9,0x00,0x00,0x98,0xBB,0x00,0x11,0x9C,0x0B, +0x00,0x0C,0x6E,0x41,0xFE,0xCD,0x9B,0x8B,0xF5,0xCE,0x9D,0x0B,0x00,0x11,0x02,0x88, +0xF7,0x5E,0xBC,0x88,0x40,0x11,0x90,0x00,0x00,0x11,0x86,0x09,0x40,0xCB,0x7B,0xE9, +0x08,0x11,0xB8,0x00,0x01,0x01,0x22,0xB0,0x40,0x3D,0x7A,0x00,0xFF,0xE0,0xC0,0x81, +0xB3,0x00,0x8E,0xAA,0xB3,0x00,0x20,0xAB,0xFD,0x05,0x0A,0x0C,0xFD,0x4D,0x9B,0x8A, +0xB0,0x00,0x74,0xAF,0xB3,0x00,0x92,0x29,0x30,0x45,0x90,0xF9,0x00,0x0C,0x9A,0xD9, +0x55,0x11,0x02,0x00,0x1D,0x11,0xAC,0x02,0x02,0x4D,0x9B,0x02,0x02,0x5B,0xB7,0x02, +0x01,0x01,0x22,0x34,0x22,0x11,0x02,0x00,0x02,0x11,0x4A,0x80,0xF3,0x11,0x68,0x5A, +0x00,0x0C,0xB0,0xD1,0xC4,0x01,0x1C,0xB8,0xF3,0x11,0x4A,0x5A,0x00,0x0C,0xAC,0x51, +0xB2,0x00,0x2E,0xAE,0x00,0x0C,0xAE,0x41,0x08,0x44,0xB0,0xE9,0xF3,0x11,0x6E,0x5A, +0x00,0x11,0x4A,0x88,0x01,0x01,0x22,0x34,0x04,0x11,0x0A,0x81,0x00,0x0C,0xB8,0xC7, +0xB0,0x00,0x1A,0xA8,0x80,0x5D,0xF8,0x78,0xFF,0x4D,0xC5,0xE9,0x80,0x0B,0xC3,0x79, +0x18,0x11,0xCC,0xC1,0x10,0x11,0xCC,0x41,0x0F,0x11,0xCC,0xD9,0x30,0xCE,0x91,0x7C, +0x40,0x9E,0x91,0xFC,0xA2,0x00,0x40,0xAE,0xA3,0x00,0x2E,0x2A,0x13,0x11,0xD6,0x83, +0x02,0x11,0xD6,0xC1,0x17,0x11,0xD6,0x03,0x01,0x11,0xD6,0xC1,0x01,0x0C,0xD8,0x33, +0xB0,0x00,0x1A,0xA8,0x02,0x05,0x0A,0x00,0x04,0x3A,0xF8,0xF8,0x01,0xEB,0x69,0x30, +0x03,0x11,0x6A,0xB0,0xFD,0x05,0x0A,0x88,0xB0,0x00,0x1A,0xA8,0x01,0xEC,0x49,0x30, +0xFF,0xCE,0xEF,0xE9,0x00,0x11,0x48,0x08,0x00,0x0C,0xF8,0xC0,0xC8,0xCC,0x19,0x98, +0x01,0x11,0x1A,0x80,0x01,0x12,0x16,0xB0,0x11,0xCC,0x15,0x28,0x03,0x0C,0x98,0x8B, +0xFF,0xCE,0x9D,0x9B,0xC0,0x0A,0x18,0x98,0x02,0x12,0x40,0xB0,0x00,0x11,0x48,0x08, +0x01,0x0B,0x14,0xB0,0x44,0x0A,0x18,0xA8,0x02,0x0C,0x1C,0x34,0x02,0x0C,0x0C,0x30, +0x08,0x0C,0x18,0x18,0x02,0x12,0x1C,0xB0,0x02,0x0A,0x0C,0x30,0x82,0x10,0x14,0x28, +0x01,0x10,0x22,0x98,0x84,0x11,0x14,0xA8,0x83,0x0E,0x1C,0x28,0x02,0x06,0x14,0x30, +0x80,0x0F,0x1E,0xFA,0xFE,0x0C,0x18,0x18,0x02,0x12,0x14,0x30,0x02,0x06,0x18,0x30, +0xFF,0x11,0x22,0x8C,0x08,0x0C,0x1C,0x98,0x01,0x0D,0x1E,0xB0,0x02,0x0A,0x26,0xB0, +0x00,0x00,0x26,0xB8,0x10,0x12,0xC0,0x30,0xF0,0x0C,0x18,0x98,0x20,0x00,0xBC,0x38, +0x49,0x11,0xB8,0x84,0xD4,0x01,0x1C,0x38,0x01,0x0E,0x1A,0x30,0x01,0x0C,0x1C,0xB0, +0x01,0x0D,0x18,0xB0,0x03,0x11,0x48,0x80,0x00,0x0C,0x76,0x5A,0xB3,0x00,0xC2,0x2A, +0x04,0x12,0x50,0x30,0x04,0x28,0x26,0xB0,0x00,0x00,0x48,0x38,0x01,0xC5,0x0B,0x34, +0xD4,0x01,0x1C,0x38,0x03,0x11,0x48,0x80,0x00,0x0C,0x76,0x5A,0x11,0x12,0x22,0xA8, +0x00,0x0C,0x66,0x52,0x02,0x0C,0x0C,0x30,0x02,0x0E,0x18,0xB0,0x02,0x05,0x0A,0x00, +0x04,0x12,0x50,0x30,0x02,0x06,0x18,0x30,0xFC,0x0C,0x18,0x98,0x04,0x12,0x50,0x30, +0xFF,0x11,0x22,0x20,0x11,0x2B,0x22,0xA8,0xFD,0x05,0x0A,0x88,0x00,0x00,0x48,0xBC, +0x00,0x0C,0x76,0x5A,0x11,0x12,0x22,0xA8,0x00,0x00,0x48,0xBC,0x00,0x0C,0x76,0x5A, +0x02,0x0C,0x1C,0xB0,0x00,0x11,0x26,0x88,0x00,0x00,0x48,0xBC,0x03,0x02,0x48,0xB8, +0x01,0x11,0x1A,0x04,0x10,0x30,0x14,0xB8,0x00,0x0C,0xAA,0xDC,0x00,0x0C,0x98,0x5A, +0x00,0x0C,0x84,0x4A,0x28,0x11,0x86,0x42,0x29,0x11,0x86,0xC2,0x00,0x0C,0xA8,0x5A, +0x00,0x0C,0x92,0x42,0x40,0xC0,0x14,0xB8,0x00,0x0C,0xAA,0xDC,0x00,0x0C,0x98,0x5A, +0x1C,0x11,0xAC,0x5A,0xFF,0x11,0x50,0xD9,0xF7,0xA0,0x40,0x89,0x01,0x11,0x2A,0xC1, 
+0x20,0xCE,0x93,0xFB,0x20,0x9D,0x93,0xFB,0xB2,0x00,0x00,0xAE,0x00,0x0C,0x90,0x4C, +0x04,0x9E,0x3D,0x87,0x04,0xC8,0x91,0x03,0x01,0x0C,0x14,0x30,0x10,0x11,0xB8,0x42, +0x01,0x0C,0x14,0x30,0x30,0x11,0xB8,0xC2,0x01,0x0C,0x14,0x30,0x60,0x11,0xB8,0xC2, +0x01,0x0C,0x14,0x30,0x20,0x11,0xB8,0x42,0x01,0x0C,0x14,0x30,0x24,0x11,0xB8,0xC2, +0x01,0x05,0x16,0xB0,0x02,0x05,0x0A,0x00,0x02,0xCA,0xC3,0x6A,0xF0,0x44,0xC4,0x6A, +0x08,0x44,0xC8,0x7A,0x08,0x11,0x88,0x00,0x00,0x0C,0xE0,0x5A,0x00,0x0C,0xDE,0x4A, +0xFF,0x21,0xD2,0xF2,0x81,0x0A,0xD3,0xE2,0x04,0x10,0x81,0xB0,0x01,0x0D,0x89,0x30, +0x00,0x0C,0xDE,0xC2,0x01,0x0A,0xB6,0x31,0x04,0xDC,0x80,0x30,0x30,0xCE,0xDB,0xFA, +0x02,0x0C,0xDC,0xC2,0x01,0x0C,0x18,0x00,0x01,0x0C,0x88,0x30,0x01,0x0B,0x0A,0x34, +0x08,0x44,0xCC,0xFC,0x0B,0xCB,0xED,0xEA,0x55,0x11,0x02,0x00,0x40,0x4E,0x1B,0x88, +0x01,0x01,0x22,0xB0,0xFF,0x0D,0xE0,0x7A,0x08,0x11,0x88,0x00,0x01,0x10,0x22,0x1C, +0x47,0x00,0x0C,0x38,0x01,0x39,0x1E,0xEB,0x02,0x39,0x90,0xFC,0x04,0x67,0x14,0x08, +0x80,0x5B,0x00,0x6B,0x08,0x68,0xB0,0x33,0x30,0x67,0xA4,0x0B,0x00,0x0C,0x0A,0x43, +0x08,0x53,0xB0,0xB3,0x07,0x53,0x22,0x30,0x30,0x53,0xA4,0x8B,0x08,0xD8,0xD1,0x30, +0x00,0xD2,0xCF,0x80,0xFF,0x67,0xCE,0x08,0x80,0x11,0xCC,0x00,0x05,0x11,0xB4,0x80, +0x01,0x11,0x4A,0x80,0x22,0x0A,0x1C,0x28,0xE0,0x0E,0x1C,0x18,0x01,0x11,0x1E,0xA0, +0x08,0xD8,0x27,0x30,0x07,0x11,0x26,0xB0,0x01,0xD2,0x27,0xB4,0xFF,0x65,0x28,0xFB, +0xBF,0xE4,0xC9,0x8B,0x0F,0x53,0xC0,0xB0,0x01,0x53,0xFA,0x33,0x00,0x0C,0x30,0x43, +0x80,0x5B,0x34,0x7B,0x40,0xE4,0x35,0xEB,0x40,0xE4,0xC9,0x03,0x0F,0x11,0xC0,0xB0, +0x01,0xFD,0xC1,0x30,0x01,0x11,0x72,0x84,0x10,0x53,0xC0,0x30,0x01,0x11,0x72,0x84, +0x47,0x00,0x0C,0x38,0x20,0x11,0xD8,0x5C,0x00,0x0C,0xD6,0xDB,0xFF,0x00,0x90,0xEC, +0x30,0xCE,0x91,0x7C,0x40,0x11,0x90,0x84,0x47,0x00,0x0C,0x38,0x02,0x54,0x40,0x30, +0x01,0x48,0x71,0x7B,0x88,0x49,0x5B,0x6B,0x20,0x48,0x59,0xEB,0xB3,0x00,0x64,0xAB, +0x00,0x0C,0x5A,0xCB,0x00,0x0C,0x98,0xDB,0xA0,0x00,0xC0,0x3B,0x40,0xCA,0x95,0x87, +0x20,0x11,0xD6,0xDC,0x00,0x0C,0x94,0xDB,0x30,0xCE,0x91,0x7C,0x1B,0x48,0x15,0x88, +0x01,0x0A,0x90,0xE4,0x04,0x11,0x80,0x32,0xB2,0x00,0x12,0xAD,0x80,0x4C,0x91,0xFC, +0xB3,0x00,0x64,0xAB,0x00,0x0C,0x90,0x4C,0xC9,0x00,0xC0,0x3B,0x40,0xCA,0x95,0x87, +0x80,0x11,0xD8,0x5C,0x00,0x0C,0x94,0xDB,0x80,0xCE,0x7B,0xFB,0x80,0x49,0xCD,0x6C, +0x00,0x0C,0x7C,0xC3,0x40,0x9E,0xCD,0x6C,0x00,0x0C,0x92,0x5C,0x80,0xE1,0x93,0x7B, +0xFF,0x21,0x8E,0x73,0xB3,0x00,0x64,0xAB,0x00,0x0C,0x8A,0x53,0x00,0x0C,0x9C,0xDC, +0x01,0x11,0x22,0x9C,0x00,0x0C,0x54,0xDB,0x01,0x10,0x22,0x1C,0xD5,0x00,0xC0,0xBB, +0x40,0xCA,0x95,0x03,0x01,0x10,0x22,0x1C,0x00,0x0C,0xD0,0x5C,0x18,0x11,0xB8,0x04, +0x04,0x40,0x99,0xB0,0x04,0x0C,0xE1,0x30,0x04,0x24,0x31,0x31,0x20,0x10,0xBC,0xB8, +0x11,0x00,0xAA,0x73,0x20,0x0B,0xAF,0xFB,0xF0,0x04,0x14,0x88,0x60,0x0A,0xAE,0xF3, +0x20,0x11,0xB0,0x43,0x01,0xCD,0xAF,0x6B,0x05,0x11,0xB0,0xC3,0x00,0x11,0xB0,0xC3, +0x01,0x05,0x14,0x30,0x82,0x05,0x0A,0x80,0x30,0x50,0xC1,0x30,0x01,0x0C,0xB8,0x30, +0x01,0x0A,0x0A,0x30,0x40,0xE4,0xC9,0x03,0x20,0x0C,0xC0,0x63,0x02,0xC3,0x86,0x01, +0x40,0x61,0x90,0xEC,0x5F,0x01,0x18,0xB8,0x80,0x5F,0xC9,0xEB,0x6F,0x01,0x18,0xB8, +0x04,0x12,0x1C,0x08,0x22,0x0E,0x1C,0xA8,0xE0,0x0E,0x1C,0x18,0x01,0x11,0x1E,0xA0, +0x01,0x11,0x4A,0x80,0x10,0x12,0x26,0xB0,0x00,0x11,0x4A,0x0C,0x40,0x61,0xDC,0x7B, +0x30,0x60,0xA0,0x32,0x00,0x0C,0xEE,0x43,0x20,0x60,0xA0,0xB2,0x80,0x6F,0xDF,0x02, +0x04,0x6F,0x19,0x08,0x22,0x0C,0x18,0xA8,0xE0,0x0C,0x18,0x18,0x01,0x11,0x1A,0x20, +0x01,0x11,0x4A,0x80,0x10,0x12,0xE0,0x32,0x00,0x11,0x4A,0x88,0x11,0x00,0x08,0x74, 
+0x30,0xCE,0x09,0xFC,0x0C,0x3F,0xFF,0xFB,0x01,0x7F,0xFE,0x7B,0x0C,0x3F,0xB1,0x0B, +0x2E,0xD8,0xB1,0xAB,0x03,0x11,0xB2,0xB3,0x04,0xD8,0xE1,0x30,0x03,0x3F,0x09,0x7C, +0x01,0x93,0x08,0x7C,0x03,0x3F,0xB1,0x0B,0x03,0x11,0xB2,0xB3,0x04,0xD8,0x99,0x30, +0x04,0x70,0x18,0x32,0x04,0x98,0x48,0x32,0x04,0x4C,0x80,0xB2,0x11,0x00,0x2C,0x74, +0x01,0x7F,0x16,0xFC,0x20,0x11,0xD6,0xDC,0x00,0x0C,0xD0,0x5C,0xFF,0xFF,0xC4,0xB8, +0x00,0x0C,0x92,0x5C,0x01,0x93,0x14,0x30,0x48,0x11,0xB8,0x80,0x30,0xCE,0x41,0xFC, +0x02,0x0A,0x3E,0xEC,0x20,0x49,0x3F,0xEC,0x01,0x0A,0x40,0x7C,0x80,0x11,0xD8,0x5C, +0x00,0x0C,0xD0,0x5C,0x00,0x0C,0x3E,0x44,0x01,0x7F,0x14,0xB0,0x08,0x11,0xB8,0x00, +0x04,0x0A,0x38,0xEC,0x20,0x48,0x41,0xFC,0x80,0x11,0xD8,0x5C,0x00,0x0C,0x40,0x44, +0xFF,0xFB,0x18,0xB8,0xB3,0x00,0xE0,0x2A,0x30,0xCE,0x41,0xFC,0xB2,0x00,0x12,0xAD, +0x80,0xE4,0xC9,0x87,0x40,0x01,0x18,0x38,0xB3,0x00,0x40,0x2A,0x00,0x0C,0x50,0x4C, +0x94,0x00,0x18,0xB8,0xB3,0x00,0x40,0x2A,0x00,0x0C,0x50,0x54,0x04,0x94,0x80,0x36, +0x80,0x11,0xD8,0x44,0x04,0x10,0x80,0xB2,0xB3,0x00,0x64,0xAB,0x00,0x0C,0x90,0x4C, +0xFF,0xFF,0x0C,0xBA,0xFF,0xC1,0x61,0x64,0x02,0x20,0x80,0x33,0x02,0x20,0x84,0x37, +0x02,0x20,0x14,0xB0,0x02,0xC2,0x41,0xB0,0x02,0x0A,0x0C,0xB2,0x02,0x0A,0x84,0x33, +0x02,0x0A,0x40,0x34,0x47,0x00,0x0C,0x38,0x08,0x39,0x78,0x6C,0x04,0x39,0x72,0x6C, +0x10,0x3A,0x90,0xFC,0x00,0x0C,0xD6,0xDB,0x04,0x11,0x72,0x00,0x10,0x11,0x74,0x84, +0xFF,0x63,0x80,0x74,0xC0,0x11,0x18,0x80,0xB0,0x00,0xD2,0xAF,0x80,0xE5,0xCB,0x03, +0x08,0x11,0x72,0x84,0xB0,0x00,0x1A,0xA8,0x0F,0x00,0x14,0x08,0x01,0x0A,0x14,0x18, +0x00,0x7C,0x90,0x6C,0x01,0xE4,0xF9,0x78,0x08,0x11,0xB8,0x00,0x00,0x0C,0xF8,0xC0, +0xFF,0x11,0x22,0x8C,0x08,0xE4,0x91,0xEC,0x04,0x3C,0xEC,0xB3,0x00,0x11,0x78,0x08, +0x5F,0x3D,0x7A,0x08,0x08,0xE4,0xC9,0x87,0x02,0x05,0x0A,0x00,0x08,0xE4,0x91,0x7C, +0x40,0x3D,0x14,0x08,0xBF,0xF7,0xEF,0x8B,0x00,0xF7,0xEF,0x83,0x04,0xF6,0x79,0xB0, +0xF7,0xE4,0xC9,0x0F,0x02,0x05,0x0A,0x00,0x00,0xC8,0x91,0x83,0x01,0x0B,0x14,0xB0, +0x00,0xC8,0x17,0x88,0x00,0x0B,0xB8,0xE4,0xBF,0x3D,0x7A,0x88,0x00,0x11,0xC0,0x89, +0xA3,0x00,0xA2,0x2B,0x02,0x20,0x18,0xB0,0x02,0xF4,0x15,0x30,0xB3,0x00,0x68,0x2A, +0x80,0x0F,0x92,0xFB,0xFF,0x0A,0x93,0x73,0xC0,0x0A,0x15,0x88,0x80,0x0A,0x92,0x73, +0xC0,0x0A,0x92,0xF3,0x41,0x49,0x93,0xEB,0x01,0x11,0x22,0x9C,0xA3,0x00,0xB2,0x2A, +0x08,0x5E,0xBC,0x00,0xFF,0x21,0x90,0xF4,0x08,0x11,0xD6,0xC4,0xA3,0x00,0xC8,0xAA, +0xA3,0x00,0xCC,0x2A,0x02,0xC9,0x93,0x03,0x04,0x5D,0x90,0x7C,0xFF,0xFB,0xF9,0x6C, +0x30,0xCE,0xE7,0x6C,0xB0,0x00,0xEA,0x2F,0x00,0x0C,0xF6,0xC4,0xB2,0x00,0x30,0xAE, +0x00,0x0C,0x90,0x4C,0x0B,0x0A,0xF3,0xE4,0x04,0x1F,0xF7,0xFC,0x04,0x11,0x60,0x33, +0x00,0x0C,0xF6,0xC4,0x08,0x48,0xF7,0x7C,0xB2,0x00,0x20,0xAF,0x49,0x11,0xB8,0x84, +0x01,0x11,0xB8,0x84,0xA3,0x00,0x4E,0x2A,0x91,0x06,0x0C,0xB8,0x02,0xE0,0x14,0xB0, +0x81,0x80,0xB0,0xAB,0x02,0xE2,0x14,0x30,0x81,0x82,0xB4,0xAB,0x02,0xE4,0x14,0x30, +0x40,0x0A,0x14,0x00,0x81,0x84,0xB8,0xAB,0x02,0xE6,0x14,0xB0,0x81,0x86,0xBC,0xAB, +0x40,0xC8,0x17,0xFD,0x10,0xDB,0xBF,0x6D,0x08,0xD8,0x01,0xB5,0x01,0x88,0x24,0xED, +0x10,0x88,0xC8,0x6D,0x08,0x88,0xD6,0x6D,0x04,0x88,0xFA,0xED,0x02,0x88,0x06,0xEE, +0x02,0x06,0x22,0x30,0x01,0x11,0x74,0x84,0x04,0xDB,0x33,0xED,0x08,0xDB,0xAB,0x6D, +0xF0,0xDA,0xB1,0xED,0xFF,0xD8,0x9F,0x6D,0x03,0xDB,0x6D,0xED,0x10,0xDB,0xBF,0x6D, +0x00,0x0C,0x18,0x45,0xB5,0x06,0x0C,0xB8,0x01,0xCD,0x91,0x6C,0x11,0x11,0x00,0x80, +0x01,0xC8,0x3D,0xED,0x00,0x11,0x8C,0x8D,0x04,0x9E,0x43,0x7D,0xFB,0x11,0xEE,0x5F, +0x02,0x11,0xEA,0xDF,0x70,0x0B,0x15,0x88,0x80,0x80,0xB1,0x0B,0x00,0xD8,0x15,0x00, 
+0x08,0xA0,0xB0,0x0B,0x00,0xD8,0x15,0x00,0x01,0x0A,0x14,0x00,0x00,0x0C,0xFA,0x58, +0x00,0x0C,0xDA,0x5C,0xB0,0x00,0xE2,0xAF,0x00,0x11,0x2A,0x8B,0xB3,0x00,0x56,0xAA, +0x40,0xCE,0x67,0x7D,0x01,0x11,0x00,0x00,0x02,0x20,0x40,0x30,0x00,0x11,0x00,0x08, +0xE3,0x11,0x6E,0xDA,0x0C,0x07,0xC0,0xBB,0x40,0xCA,0x95,0x87,0x80,0xCC,0x91,0xEC, +0xA3,0x00,0x6C,0x29,0x04,0x11,0x06,0x05,0x01,0xCD,0x9D,0xED,0xF7,0xA0,0x40,0x89, +0x11,0x11,0x00,0x80,0xFE,0xC8,0x91,0x8B,0x10,0xE4,0xC9,0x03,0x01,0x83,0x80,0xED, +0x0F,0x8C,0x14,0x88,0x90,0x0A,0xF4,0x03,0xB0,0x00,0xE2,0xAF,0x00,0x0C,0x98,0xC5, +0x4C,0x8C,0x14,0xA8,0x80,0x0A,0xF4,0x83,0x80,0x8C,0x90,0x6D,0xF0,0x8C,0x0E,0x08, +0x0C,0x07,0x90,0xED,0xB0,0x00,0xE6,0x2F,0xC2,0x07,0x94,0xED,0x31,0x07,0x98,0xED, +0xB0,0x00,0xE2,0xAF,0x00,0x0C,0x98,0xC5,0xFF,0x95,0x99,0x75,0x01,0x95,0x2B,0x1B, +0xBF,0x3D,0x7A,0x88,0x00,0x11,0xC0,0x89,0x03,0x11,0x06,0x85,0x01,0xCD,0xA9,0x6D, +0x11,0x11,0x00,0x80,0x02,0xC8,0xA7,0xED,0x08,0xE4,0xC9,0x03,0xE8,0x11,0x32,0x5A, +0xFF,0x11,0x00,0x85,0x80,0xCE,0xAF,0xFD,0x08,0xC8,0x91,0x03,0x08,0x11,0x06,0x05, +0x60,0xCE,0xB9,0x6D,0x80,0xCE,0xBD,0xFD,0x0C,0xC8,0xB1,0x8B,0x0C,0xD8,0xBD,0xE5, +0x20,0x30,0x14,0xB8,0x00,0x0C,0xAA,0xDC,0xF0,0x11,0x04,0x05,0x10,0x45,0xC6,0x6D, +0x03,0xC8,0xC7,0xFD,0x80,0xC0,0x14,0xB8,0x00,0x0C,0xAA,0xDC,0x10,0x11,0x06,0x05, +0x1E,0xDC,0x1B,0x7D,0x18,0xDC,0xD5,0xFD,0x87,0x11,0xB4,0xDA,0x22,0x11,0x00,0x80, +0x80,0xE4,0xC9,0x03,0xE7,0xE4,0xC8,0x09,0x1E,0x11,0x08,0x05,0x80,0xDC,0xDF,0xED, +0x40,0xDC,0xE9,0xED,0x02,0xDE,0xF1,0x6D,0x00,0x0C,0x1C,0xC5,0x01,0xCD,0xE7,0x6D, +0x30,0xCB,0xE7,0x7D,0x80,0xCB,0xE7,0x6D,0x80,0xCB,0x97,0x03,0x80,0x11,0x08,0x85, +0x00,0x11,0x02,0x88,0x40,0x11,0xB8,0x00,0x01,0x01,0x22,0xB0,0x40,0x11,0x08,0x85, +0x01,0xA1,0xF8,0x7D,0xFD,0xE6,0xCC,0x89,0x00,0x11,0x00,0x08,0x00,0x0C,0x7C,0xC3, +0x02,0x11,0x0C,0x05,0xFF,0xD9,0x1F,0xFD,0x81,0x11,0xB0,0x03,0xFF,0x81,0xB2,0x0B, +0xFF,0x11,0x02,0x81,0x01,0xA1,0x90,0xEC,0x00,0x0C,0x12,0x46,0x0E,0xDA,0x0D,0xEE, +0x01,0xDA,0x21,0xFD,0xFE,0xE2,0xC4,0x89,0x82,0x11,0xB0,0x03,0x0F,0x82,0xB2,0x0B, +0x0F,0x11,0x04,0x81,0x04,0x3A,0x12,0x7E,0x0C,0x00,0x68,0xB8,0x02,0xD8,0x6D,0x34, +0xB0,0x00,0x1A,0xA8,0x02,0x05,0x0A,0x00,0x10,0xE4,0x43,0x6E,0x0F,0xCB,0x37,0x6E, +0x20,0xE4,0x3B,0x6E,0xB2,0x00,0x82,0x2F,0x80,0xC8,0x39,0x6E,0x20,0xC8,0x3B,0xEE, +0xB2,0x00,0x8A,0xAF,0xFD,0x05,0x0A,0x88,0xDC,0x01,0x1C,0xB8,0xE3,0x11,0x4A,0xDA, +0x00,0x0C,0xF8,0x50,0x10,0xC9,0x93,0x03,0x15,0x11,0x3C,0xC6,0x08,0x11,0x3C,0xC6, +0x05,0x11,0x3C,0x46,0x16,0x11,0x3C,0xC6,0xFD,0x05,0x0A,0x88,0x00,0x0C,0xCE,0x5C, +0x14,0x11,0x4A,0xC6,0xFD,0x05,0x0A,0x88,0x00,0x0C,0x82,0xDC,0xB3,0x00,0x82,0x2B, +0x0D,0x11,0x4A,0x46,0x00,0x0C,0xCC,0xD9,0xA0,0x00,0x64,0xAA,0x10,0xE4,0x75,0x6E, +0xB3,0x00,0x72,0x2B,0x02,0x58,0x19,0x30,0xFC,0xFF,0x14,0x38,0x83,0x90,0x14,0x28, +0x8A,0x0C,0x6C,0x2E,0xB3,0x00,0x68,0x2A,0x80,0x0F,0x68,0x6E,0x04,0x0C,0x78,0xB2, +0x01,0x11,0xCE,0xDC,0xFC,0xFF,0x14,0x38,0x83,0x90,0xB0,0x2A,0x00,0x0C,0x6C,0x46, +0x04,0x11,0x78,0xB2,0x02,0x11,0xCE,0xDC,0x10,0x50,0xC1,0xB0,0x20,0x00,0xBC,0x38, +0x69,0x11,0xB8,0x80,0x10,0xE4,0xC9,0x87,0x18,0x11,0xB8,0x04,0x01,0x30,0x14,0x30, +0xFF,0x0A,0x8A,0x7E,0x01,0x0A,0x8A,0x76,0x15,0x0A,0x88,0xF6,0x0A,0x0A,0x4C,0x77, +0x02,0x0A,0x5C,0x77,0xF8,0x0A,0x14,0x88,0xC0,0x0A,0x36,0xF7,0xD0,0x0A,0x38,0xF7, +0xA3,0x00,0xCC,0x29,0x04,0x30,0xB0,0xB3,0x11,0x11,0x00,0x80,0x02,0x22,0x0C,0x30, +0x02,0x20,0x0C,0xB0,0x02,0xDA,0x41,0xB0,0x02,0x46,0x45,0x30,0x03,0xC8,0xD7,0xEE, +0xC0,0xCA,0xD7,0x6E,0x3F,0xCB,0xD7,0xEE,0x04,0x06,0x22,0x30,0x01,0xD8,0xF7,0xB3, 
+0x02,0x20,0xDC,0x33,0x20,0x0B,0xE1,0xFE,0x40,0x9E,0xDD,0x6E,0xF7,0x11,0xEE,0x5F, +0x08,0x48,0x19,0x08,0xB1,0x00,0xEA,0xAF,0xB2,0x00,0x1A,0xAF,0x01,0x9D,0xC9,0x6E, +0x01,0x02,0x90,0x3B,0x80,0x11,0x94,0x83,0x01,0xCD,0x9B,0x03,0x01,0x45,0x9C,0xB3, +0xB3,0x00,0x20,0xAB,0xB3,0x00,0x56,0xAA,0x01,0xFB,0xC7,0xF6,0xB0,0x00,0x06,0x2B, +0x10,0xCE,0xC5,0xFE,0x00,0x0C,0xE0,0x5C,0xBC,0x01,0xC0,0xBF,0x38,0x02,0xC0,0xBF, +0x02,0x9E,0xCD,0xEE,0x04,0x11,0xEA,0xDF,0xFC,0x11,0xEC,0x5F,0x30,0x4C,0x19,0x08, +0x4C,0x0C,0x18,0x28,0x00,0x0C,0xE8,0xDF,0x00,0x0C,0xE0,0xC6,0xB3,0x00,0xDA,0xAB, +0x02,0x06,0x40,0xB0,0x02,0x06,0x44,0x30,0x04,0x3A,0xDC,0xFE,0x04,0xD8,0x69,0xB4, +0x70,0x0B,0x15,0x88,0xFF,0x0A,0xEA,0x7E,0x80,0x80,0xF1,0xEE,0x02,0x82,0xE5,0xB3, +0x00,0x0C,0xF4,0xC6,0x01,0x7C,0xEA,0x7E,0xFF,0xFF,0xE4,0x3B,0x00,0x0C,0xF4,0xC6, +0x01,0x23,0xE4,0xB3,0x01,0x22,0xE6,0xB3,0x08,0x11,0xB8,0x00,0x00,0x00,0x90,0xB9, +0x8F,0x80,0x17,0x08,0x00,0x0B,0x94,0x01,0x09,0x0B,0x15,0x08,0x55,0x11,0x00,0x80, +0x01,0x43,0x05,0x7F,0x08,0x11,0x14,0x00,0x11,0x11,0x00,0x80,0xF6,0x81,0x17,0x08, +0x00,0x0B,0x94,0x01,0x02,0xF2,0x95,0x31,0x08,0x84,0x95,0xB1,0x08,0xA8,0x94,0xB1, +0x08,0x94,0x95,0x31,0x09,0x0A,0x1C,0x77,0x55,0x11,0x00,0x80,0x01,0x43,0x1B,0xEF, +0x08,0x11,0x40,0x01,0x11,0x11,0x00,0x80,0x10,0x04,0xBC,0xB8,0x1C,0x11,0xFA,0x5C, +0x49,0x11,0xB8,0x00,0x01,0xC8,0x91,0x03,0x80,0xCA,0x95,0x03,0xB3,0x00,0x20,0xAB, +0x02,0x96,0x71,0xB1,0x01,0xBA,0x74,0x01,0xFE,0x11,0xEE,0x5F,0xE8,0x11,0x32,0x5A, +0x02,0x7C,0x30,0x7F,0x02,0xE4,0xC9,0x03,0x00,0x0C,0xBC,0xC6,0x02,0x11,0x3A,0x47, +0x01,0x11,0x3A,0x47,0x01,0x0C,0x48,0x30,0xC8,0xCD,0x1D,0x98,0x01,0x11,0x1E,0x00, +0x01,0x30,0x26,0xB0,0x11,0xCD,0x15,0xA8,0x03,0x0E,0x9A,0x8B,0x01,0xCE,0x9D,0x1B, +0xC0,0x0A,0x1C,0x18,0x02,0x32,0x26,0xB4,0x04,0x30,0xB0,0xB3,0x30,0xCB,0xDD,0xEE, +0x44,0x11,0x00,0x80,0x02,0xDA,0xE1,0xB3,0x10,0xCB,0x97,0x03,0x80,0xE1,0x91,0x7C, +0xB3,0x00,0x20,0xAB,0x2F,0x08,0xC0,0xBF,0x04,0x30,0x22,0x30,0x44,0x11,0x00,0x80, +0xB2,0x00,0xC8,0x28,0xA3,0x00,0x20,0x2B,0xB2,0x00,0xB4,0x2A,0xEB,0x00,0xF0,0x3B, +0xB2,0x00,0x4C,0xAA,0xA0,0x00,0x24,0xA8,0x01,0xC7,0x14,0xB0,0x00,0xC5,0x14,0x08, +0x80,0x0A,0x80,0xEF,0x40,0x0A,0x84,0x6F,0x20,0x0A,0x88,0x6F,0x08,0x0A,0xA0,0x6F, +0x04,0x0A,0xB6,0xEF,0x02,0x0A,0xC0,0x6F,0x01,0x0A,0xCA,0x6F,0xFF,0x11,0x22,0x8C, +0x80,0x11,0x8E,0x81,0x20,0xC9,0x93,0x87,0x40,0x11,0x8E,0x81,0xE4,0x11,0x8C,0x47, +0x20,0x11,0x8E,0x81,0xE0,0x11,0x8C,0xC7,0x01,0x11,0x4A,0x80,0x01,0x11,0x1A,0x80, +0x02,0x0C,0x1C,0xB0,0x04,0x12,0xB0,0xB3,0x01,0x00,0x14,0xB8,0x83,0xD8,0xB1,0x2B, +0x00,0x00,0x14,0x38,0x84,0xDA,0xB5,0xAB,0x00,0x0C,0x90,0x4C,0x04,0xD8,0x27,0xB4, +0x01,0xC7,0x19,0x88,0x01,0xCB,0x14,0x08,0x00,0x0C,0xAE,0x77,0xFE,0xC7,0x8F,0x8B, +0x00,0xC7,0x8F,0x83,0x01,0x0A,0xB0,0x7F,0xEF,0x11,0x6E,0xDA,0x08,0x11,0x8E,0x05, +0x08,0x11,0x8E,0x81,0xEC,0x11,0x32,0xDA,0xE8,0x11,0x8C,0x47,0x04,0x11,0x8E,0x81, +0x04,0xC9,0x93,0x03,0x59,0x11,0xA2,0x5A,0xB1,0x00,0xCE,0xAF,0x09,0x11,0xE4,0x85, +0x02,0x11,0x8E,0x81,0x08,0xC9,0x93,0x03,0x58,0x11,0xA2,0xDA,0xB1,0x00,0xCE,0xAF, +0x05,0x11,0xE4,0x85,0x01,0x11,0x8E,0x81,0x40,0xC9,0x93,0x87,0x80,0x11,0x8E,0x81, +0xDF,0xC9,0x93,0x0F,0xB3,0x00,0xC2,0x2A,0xF1,0x07,0x0C,0x38,0x02,0x48,0x15,0xB0, +0x01,0x0B,0x92,0xEB,0x01,0x0A,0x04,0xB0,0x01,0x0B,0x06,0x80,0xE1,0x48,0xE5,0xAF, +0x01,0x11,0x22,0x9C,0x01,0xC5,0x0B,0x34,0x02,0x48,0x91,0x32,0x00,0x0C,0xD6,0x47, +0xA2,0x00,0xA8,0x2F,0xA2,0x00,0xAC,0xAF,0xA2,0x00,0xBC,0x2F,0xA2,0x00,0xC0,0xAF, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, 
+0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x01,0x4E,0x15,0xB0,0x01,0x4C,0x17,0xB0,0x81,0x5A,0x15,0x28,0x02,0x0B,0x0C,0xF8, +0xB1,0x00,0x86,0xA9,0x00,0x0C,0x5C,0x40,0x20,0x0A,0x14,0x78,0x30,0xE4,0xC9,0x83, +0x22,0x11,0x9E,0x82,0xDD,0x5A,0xB5,0x0A,0x02,0x0A,0x1A,0xF8,0x20,0xE4,0xC9,0x03, +0x02,0x11,0x9E,0x02,0x10,0x0A,0x20,0xF8,0x40,0xE4,0xC9,0x03,0x10,0x11,0x9E,0x02, +0x40,0x0A,0x26,0xF8,0xFD,0x4D,0x9B,0x8A,0xFD,0x5B,0xB7,0x8A,0x10,0xCB,0x3D,0xE8, +0xEB,0x0A,0x14,0x08,0xFF,0x0A,0x5A,0xF8,0x20,0xCB,0x5B,0xE8,0x20,0xCB,0x97,0x03, +0x97,0xC5,0x8A,0x89,0x81,0x08,0xC0,0xBB,0x00,0x0C,0xB4,0xDA,0x40,0x0A,0x44,0x78, +0x01,0xCB,0x97,0x03,0x00,0x0C,0x44,0x40,0x40,0x4E,0x9F,0x0A,0x20,0x0A,0x44,0xE8, +0xA9,0x44,0x15,0x88,0xFF,0x0A,0x5A,0xF8,0xA9,0x0A,0x48,0xF8,0x80,0xE4,0xC9,0x03, +0x01,0x0A,0xDC,0xB3,0x00,0x0C,0x0A,0x5B,0x40,0xEF,0x51,0x78,0x01,0xCA,0x95,0x03, +0x20,0xE4,0x55,0x78,0x02,0xEE,0xDD,0x03,0x40,0xE4,0x59,0x78,0x10,0xEE,0xDD,0x03, +0xFE,0x3F,0x7E,0x88,0xFF,0x11,0x9E,0x82,0x01,0x11,0x76,0x04,0x02,0xF0,0x41,0x30, +0x80,0x0A,0x79,0x70,0x81,0x0A,0x4D,0x73,0x00,0x0C,0x5C,0x43,0xB2,0x00,0x60,0x2F, +0x00,0x0C,0x22,0x5B,0xFF,0xF1,0x5F,0x60,0x02,0x05,0x0A,0x00,0x0F,0xCB,0x97,0x0B, +0x40,0x3D,0x7A,0x00,0x01,0x3F,0x7E,0x00,0xB3,0x00,0x20,0xAB,0xA0,0x00,0x24,0xA8, +0xF4,0xCB,0x97,0x8B,0x01,0x0C,0x15,0xB0,0x01,0x0A,0x8A,0x70,0x81,0x0A,0x8E,0x70, +0x02,0x0A,0x92,0x70,0xFF,0x0A,0xC6,0xF8,0x03,0x0A,0x8A,0xF0,0x04,0x0A,0x8A,0x70, +0x00,0x0C,0xEA,0xC0,0x01,0x11,0xE8,0x03,0x00,0x0C,0xA8,0xC0,0x02,0x11,0xE8,0x03, +0x00,0x0C,0xA8,0xC0,0x54,0x11,0x02,0x80,0x02,0x05,0x0A,0x00,0x01,0x0D,0x85,0xB2, +0x55,0x11,0x00,0x80,0x20,0x11,0x9E,0x02,0x20,0x11,0x88,0x82,0x20,0x11,0x9C,0x82, +0x22,0x5A,0xB5,0x82,0xFD,0x05,0x0A,0x88,0x01,0x01,0x22,0xB0,0x00,0x0C,0xAA,0x40, +0x00,0x0C,0x28,0xDA,0x00,0x0C,0xC8,0x5A,0x01,0x11,0xB5,0xF8,0x03,0x11,0x48,0x80, +0x04,0xD8,0x81,0x33,0x00,0x11,0x48,0x08,0x00,0x0C,0xBE,0xC8,0x34,0x08,0x0C,0x38, +0x20,0xEE,0x49,0xEF,0x02,0x11,0x49,0x7F,0x00,0x0C,0xC8,0xC0,0x00,0x0C,0x22,0x5B, +0xFF,0xFF,0x40,0xB8,0x40,0xEF,0xA5,0xE9,0x00,0x0C,0x28,0xC1,0x7E,0x08,0x0C,0xB8, +0x55,0x11,0x02,0x00,0x02,0x05,0x0A,0x00,0x04,0x11,0xB0,0x32,0xFE,0x3F,0x7E,0x88, +0x08,0x11,0x80,0x02,0x00,0x11,0x82,0x8A,0x00,0x11,0x80,0x0A,0xC2,0x60,0xC1,0x02, +0xFF,0xFF,0x94,0x3A,0xFF,0x11,0x9E,0x82,0x01,0x01,0x22,0xB0,0xB3,0x00,0x7C,0x2A, +0x97,0xC5,0x8A,0x89,0xB2,0x00,0x10,0x2B,0x03,0x11,0x90,0xB3,0x08,0x11,0x96,0x03, +0xA3,0x00,0xA2,0x2B,0x10,0x0D,0xF3,0xF8,0x55,0x11,0x00,0x80,0xEF,0x5A,0xB5,0x8A, +0x44,0x11,0x00,0x80,0x00,0x0C,0x68,0x5A,0x10,0x0D,0xFD,0xE8,0x55,0x11,0x00,0x80, +0x10,0x5A,0xB5,0x02,0x44,0x11,0x00,0x80,0x00,0x11,0x78,0x8A,0x03,0x11,0x7A,0xB2, +0x11,0x11,0x66,0x40,0x40,0xEE,0x0F,0xF9,0xB1,0x00,0xCE,0xA9,0x00,0x0C,0x18,0x5B, +0x00,0x0A,0x4E,0xDF,0xBF,0xEE,0xDD,0x8B,0xA9,0xEE,0x6B,0x78,0x14,0xCB,0x97,0x83, +0xF4,0xCB,0x97,0x8B,0x01,0x11,0xE8,0x03,0xFF,0xFF,0x40,0xB8,0x00,0x0C,0xC8,0x5A, +0xB1,0x00,0xCE,0xA9,0x00,0x0C,0x18,0x5B,0x00,0x0A,0x4E,0xDF,0x08,0xEE,0x25,0x79, +0x00,0x0C,0x4C,0x5A,0x00,0x0C,0x6A,0x40,0x80,0xEE,0x6B,0xF8,0x40,0xEF,0xA5,0xE9, +0x00,0x11,0x62,0x5F,0x91,0x00,0x40,0xB9,0x02,0x00,0x40,0xB9,0x01,0x11,0x50,0x5F, +0x02,0xFE,0xF5,0xB3,0xB0,0x00,0x74,0xAF,0xB0,0x00,0x1A,0xA8,0xC0,0xCA,0x5B,0x6F, +0x02,0xF4,0x6F,0xF1,0x00,0x0C,0x2A,0xDB,0xE4,0x11,0x4A,0x5F,0x11,0x11,0x48,0x5B, +0xB1,0x00,0x82,0x2C,0x44,0x11,0x48,0x5B,0xB0,0x00,0x1A,0xA8,0x40,0xCB,0x57,0x69, +0x80,0xCB,0x67,0x69,0x00,0x0C,0x16,0x5A,0x00,0x0C,0xF4,0xC9,0x02,0xCB,0xF7,0x69, 
+0xE7,0x11,0x46,0x5F,0x00,0x0C,0xF8,0xC9,0x00,0x0C,0x5A,0xC7,0x00,0x11,0x48,0x5B, +0xB1,0x00,0x82,0x2C,0x00,0x00,0x90,0xB9,0x00,0x11,0xB2,0x89,0x1C,0xD8,0x94,0x31, +0x40,0x11,0x72,0x00,0x1C,0xF0,0xF4,0x3B,0x00,0x0C,0xD2,0xC1,0x84,0x80,0x40,0x38, +0x0C,0x11,0x4E,0xDF,0xFF,0xFF,0x40,0xB8,0x00,0x0C,0x6A,0x40,0x00,0x0C,0xB4,0xDA, +0x55,0x11,0x00,0x80,0x10,0x11,0x80,0x02,0x44,0x11,0x00,0x80,0x2D,0x11,0x52,0x5F, +0x02,0x05,0x0A,0x00,0x0F,0xCB,0x97,0x0B,0xB3,0x00,0x20,0xAB,0xFD,0x05,0x0A,0x88, +0xB0,0x00,0x1A,0xA8,0x00,0x0C,0x16,0x5A,0x00,0x0C,0x90,0x49,0x08,0xC7,0x8C,0xF9, +0x08,0x11,0x8E,0x81,0x01,0xCB,0x90,0x79,0xFF,0xF1,0x6B,0xE0,0x00,0x0C,0x5A,0xC7, +0x40,0x11,0x72,0x00,0xB0,0x00,0x1A,0xA8,0x55,0x11,0x00,0x80,0x40,0x48,0x9F,0x69, +0x44,0x11,0x00,0x80,0xFF,0xF1,0x6B,0xE0,0x00,0x0C,0x5A,0xC7,0x90,0x11,0x80,0x82, +0x44,0x11,0x00,0x80,0x00,0x0C,0x6A,0x40,0x02,0xCC,0x0D,0xB0,0xB1,0x11,0x14,0x80, +0xB1,0x00,0xFA,0xA8,0x02,0x06,0x14,0x30,0x01,0x0B,0xB2,0x79,0xB0,0x00,0x1A,0xA8, +0x01,0xCD,0x5B,0x6F,0xFE,0xCD,0x9B,0x8B,0xB0,0x00,0x1A,0xA8,0x40,0xCA,0x5B,0xEF, +0xD0,0x01,0x1C,0xB8,0xE8,0x11,0x54,0x5F,0xB0,0x00,0x1A,0xA8,0x40,0xCB,0xCF,0xE9, +0x00,0x0C,0x16,0x5A,0x00,0x0C,0xF4,0xC9,0x02,0xCB,0xF7,0x69,0xD0,0x01,0x1C,0xB8, +0xEB,0x11,0x44,0xDF,0x00,0x0C,0xFA,0x49,0x00,0x0C,0x5A,0xC7,0x00,0x11,0x48,0x5B, +0x14,0x20,0xF4,0x3B,0xB1,0x00,0xCE,0xA9,0x01,0x0A,0xC4,0xB3,0x01,0xFA,0x15,0xB0, +0x00,0x11,0x16,0x88,0xB1,0x00,0x06,0x2A,0xB1,0x00,0x22,0x2A,0x02,0x0C,0x1C,0x98, +0xFF,0xC6,0x27,0x08,0x01,0xFB,0x27,0xB0,0x02,0x0A,0x26,0xB0,0xB1,0x00,0x82,0x2C, +0x00,0xE2,0x4F,0x5F,0xFF,0xFF,0x40,0xB8,0x44,0x11,0x48,0x5B,0xFB,0xCB,0x97,0x8B, +0x01,0xF6,0xEB,0x33,0x00,0x0C,0x6A,0x40,0x01,0x11,0xFC,0x41,0x02,0x11,0xFC,0x41, +0x00,0x11,0xFC,0xC1,0x03,0x11,0xFC,0xC1,0x01,0x0C,0xE4,0x33,0xB1,0x00,0xCE,0xA9, +0x03,0xF2,0x05,0xE2,0x00,0x11,0xEA,0x8B,0x02,0x0C,0x1C,0x98,0x18,0xC6,0x27,0x80, +0x01,0xF2,0x27,0xB0,0x01,0xF5,0x27,0x30,0x00,0x0A,0x4E,0xDF,0x35,0x08,0x0C,0xB8, +0xFF,0xF5,0xC9,0xF8,0xFF,0xF5,0xEB,0x9B,0x00,0x0C,0x4C,0x42,0x55,0x11,0x02,0x00, +0x40,0x4E,0x25,0x7A,0x00,0x11,0x82,0x8A,0xC0,0x11,0x9E,0x82,0x01,0x11,0x76,0x80, +0x01,0x01,0x22,0xB0,0x01,0x10,0x22,0x1C,0x01,0x01,0x22,0xB0,0x01,0x11,0x22,0x9C, +0x00,0x0C,0xB4,0xDA,0x00,0x0C,0x68,0x5A,0x01,0x10,0x9F,0xB3,0x01,0x16,0xED,0xB3, +0x01,0x16,0xEB,0xB3,0x01,0x11,0x3B,0x7A,0x03,0x11,0x48,0x80,0x04,0x12,0x81,0x33, +0x00,0x11,0x48,0x08,0x55,0x11,0x00,0x80,0x03,0x0A,0x7A,0x72,0x04,0x0A,0x7A,0xF2, +0xFD,0x11,0x9E,0x02,0xEB,0x00,0xF0,0x3B,0x10,0x42,0x49,0xEA,0x10,0xF8,0xF1,0x03, +0xFD,0x60,0xC1,0x8A,0x44,0x11,0x00,0x80,0xE3,0x11,0x58,0xDF,0x00,0x0C,0x58,0xD2, +0xB0,0x00,0x1A,0xA8,0xD8,0x01,0x1C,0x38,0xE3,0x11,0x44,0x5F,0x00,0x0C,0x5A,0x57, +0x55,0x11,0x02,0x00,0x00,0x11,0x82,0x8A,0x10,0x11,0x80,0x02,0x90,0x11,0x80,0x82, +0x02,0xF8,0xB5,0xB2,0x01,0x01,0x22,0xB0,0xD8,0x01,0x1C,0x38,0xE0,0x11,0x54,0xC7, +0x54,0x11,0x02,0x80,0x01,0x0F,0xA1,0x32,0x01,0x0D,0x85,0xB2,0x01,0x0E,0x8B,0x32, +0x06,0x11,0xE2,0x02,0x00,0x11,0xE4,0x8A,0x29,0x11,0xE6,0x02,0x01,0x11,0xD8,0x82, +0x01,0x01,0x22,0x34,0x01,0xA2,0x18,0x38,0x03,0x02,0x1C,0x38,0x03,0x0A,0x86,0x72, +0x08,0xA1,0x18,0x38,0x11,0x10,0x1C,0x38,0x01,0xCA,0x95,0x03,0x00,0x11,0x8C,0x0A, +0x3D,0x60,0xC1,0x8A,0x01,0x0C,0x82,0xB2,0x50,0x11,0x80,0x82,0x08,0x11,0xC4,0x83, +0xFF,0xE2,0xC5,0x9B,0xFF,0xE2,0x91,0x6A,0xA0,0x41,0x83,0x82,0x01,0x0E,0x14,0xB0, +0x00,0x49,0xC5,0x8B,0xFF,0xE2,0x99,0x7A,0x01,0x0D,0x82,0x32,0x01,0x0F,0x14,0x30, +0x00,0xE2,0xA5,0xEA,0x00,0x49,0xA3,0xFA,0x80,0x11,0x9C,0x82,0xE0,0x11,0x82,0x82, 
+0x03,0x11,0x8C,0x82,0xA0,0xE4,0xC9,0x83,0x82,0x11,0xDC,0x03,0x00,0x0C,0x0A,0x5B, +0xFF,0x11,0x9E,0x82,0x44,0x11,0x00,0x04,0x04,0xCB,0x97,0x03,0x22,0x11,0x02,0x00, +0x97,0xC5,0x8A,0x89,0xEF,0x11,0x56,0x5F,0xFE,0xC7,0x8F,0x8B,0x01,0x01,0x22,0xB0, +0x08,0x11,0x88,0x00,0x02,0x00,0x40,0xB9,0x00,0x11,0x8A,0x88,0x00,0x11,0x50,0xC7, +0x01,0x3F,0x7E,0x00,0xC0,0x01,0x1C,0x38,0xEC,0x11,0x54,0xDF,0xB0,0x00,0x1A,0xA8, +0x80,0xE4,0xE1,0xFA,0x80,0xEE,0xDD,0xFA,0x02,0xF4,0xDD,0xF2,0xB2,0x00,0x10,0x2B, +0x68,0xC5,0x8A,0x01,0xB1,0x00,0x90,0x29,0xB3,0x00,0x86,0x2A,0x00,0x0C,0xF4,0x42, +0x20,0xE4,0x5B,0xEF,0xC0,0x01,0x1C,0x38,0xEF,0x11,0x44,0x5F,0x00,0x0C,0x5A,0x57, +0x55,0x11,0x02,0x00,0xFE,0x3F,0x7E,0x88,0x01,0x44,0xDD,0x33,0x00,0x0C,0x0A,0x5B, +0xFF,0x11,0x9E,0x82,0x01,0x01,0x22,0xB0,0x1F,0xE4,0xC9,0x8B,0xFF,0x21,0x02,0xF3, +0x11,0x11,0x60,0xDF,0x00,0x11,0x78,0x8A,0x01,0xEE,0x7B,0xB2,0x01,0xEF,0x7D,0x32, +0x01,0xF3,0x7F,0x32,0x01,0xCB,0x14,0x08,0xFE,0xC7,0x8F,0x8B,0x00,0xC7,0x8F,0x83, +0x11,0xEE,0x23,0xAC,0x01,0x43,0xDF,0x33,0x80,0x48,0xE7,0x8B,0xFF,0x11,0x94,0x06, +0x06,0x11,0x96,0x01,0x68,0xC7,0x48,0x7F,0x68,0x11,0x8E,0x81,0x80,0x11,0x74,0x84, +0x02,0x0C,0x1C,0x98,0x10,0xC6,0x27,0x00,0x01,0xEE,0x27,0x30,0x01,0xEF,0x27,0xB0, +0x01,0xF3,0x27,0xB4,0x02,0x00,0xE1,0x33,0xFF,0xF1,0x29,0xE3,0x11,0x11,0x4E,0xC7, +0x12,0x11,0x4E,0xC7,0x11,0x11,0x02,0x00,0x08,0x11,0xB8,0x00,0x01,0xC0,0x23,0xB0, +0x02,0x11,0x4A,0x80,0x03,0xE0,0x15,0x08,0x1C,0x00,0x98,0x38,0xFC,0xE0,0xC1,0x08, +0x07,0xE1,0xC1,0xB0,0x1C,0x11,0xC0,0x00,0x06,0x11,0xC0,0xB0,0x44,0x0A,0xC0,0xA8, +0x00,0x11,0x4A,0x88,0x10,0x04,0xBC,0xB8,0x49,0x11,0xB8,0x00,0x01,0x01,0x22,0x34, +0x80,0xE1,0xC3,0x03,0x01,0x0C,0x00,0xB4,0xB1,0x00,0xB8,0x2A,0x00,0x0C,0x5A,0xCB, +0xF0,0x0D,0x59,0x7B,0x01,0x0C,0x59,0x7B,0xB1,0x00,0xE0,0xAA,0x00,0x0C,0x5A,0xCB, +0x00,0x11,0x66,0x40,0x09,0x11,0x66,0x40,0x01,0x0C,0x15,0xB0,0xFF,0x0A,0x64,0x7B, +0x01,0x0A,0x7A,0x73,0x00,0x0C,0x8C,0xC3,0x02,0x05,0x0A,0x00,0x22,0x11,0x02,0x00, +0x01,0x11,0x4A,0x80,0x42,0x11,0x00,0x80,0x0C,0xE0,0x21,0xB2,0x22,0x11,0x00,0x80, +0x00,0x11,0x4A,0x88,0x45,0x11,0x00,0x00,0x04,0x3C,0x39,0xB2,0x01,0x01,0x22,0xB0, +0x0A,0x11,0x66,0x40,0x02,0x05,0x0A,0x00,0x22,0x11,0x02,0x00,0x01,0x11,0x4A,0x80, +0x0C,0x11,0xC0,0xB3,0x00,0x11,0x4A,0x88,0x55,0x11,0x00,0x80,0x04,0x11,0x78,0xB2, +0x01,0x01,0x22,0xB0,0x00,0x11,0x66,0x40,0x01,0x11,0x04,0x01,0x01,0xE2,0xC4,0x01, +0x00,0x11,0x66,0x40,0x04,0x4C,0xAB,0x7B,0xFB,0xFF,0x18,0xB8,0xB3,0x00,0x0E,0xAB, +0x08,0x11,0x5E,0xDF,0x00,0x0C,0xBC,0x5C,0xCA,0x11,0xCE,0xDB,0x0C,0x28,0x95,0x31, +0x10,0x3F,0xA5,0x7B,0x04,0x34,0x95,0x31,0x10,0x00,0xBC,0x38,0x00,0xC8,0x4C,0xDF, +0x00,0x0C,0xC8,0xC3,0x80,0x9E,0xAF,0xFB,0xA0,0x00,0xE8,0xAB,0x00,0x0C,0x16,0xDE, +0x00,0x0C,0xB4,0xDC,0x14,0x10,0x95,0xB1,0x00,0xC8,0x4C,0xDF,0x08,0x48,0xC5,0xEB, +0x10,0x00,0xBC,0x38,0x11,0x48,0x15,0x88,0x11,0x0A,0xC6,0xE3,0x04,0x0C,0x81,0xB2, +0x00,0x0C,0x1A,0xDE,0x00,0x0C,0xC8,0xC3,0x10,0x20,0xBC,0xB8,0x04,0x11,0x80,0x32, +0x00,0x11,0x94,0x88,0x04,0x11,0xA4,0xB0,0x04,0x11,0x24,0x05,0x01,0x0C,0x1C,0xB0, +0x00,0x11,0x1E,0x08,0x46,0x11,0x26,0x80,0x0F,0x11,0x27,0x08,0x00,0x00,0x26,0x3C, +0x47,0x00,0x0C,0x38,0x01,0xCA,0xE3,0xFB,0xFE,0xCA,0x95,0x8B,0x40,0xCB,0x97,0x03, +0x20,0x11,0xB8,0x84,0x02,0xC8,0xE7,0xFB,0x80,0xC4,0x89,0x03,0x02,0x38,0x0D,0xB4, +0xB1,0x00,0x70,0x2B,0x00,0x0C,0xEE,0x43,0x18,0x11,0xB8,0x80,0x40,0x11,0x90,0x00, +0xFD,0xE4,0xC9,0x8B,0xFF,0xFF,0xC4,0xB8,0xFF,0x11,0x22,0x8C,0x04,0x11,0x5E,0xDF, +0x00,0x0C,0x9C,0x5D,0x0C,0x94,0xFE,0x73,0x10,0x3F,0x7F,0x02,0xB1,0x00,0x54,0xAC, 
+0x00,0x0C,0xEC,0xC3,0x00,0x0C,0xAC,0x5D,0x80,0xB5,0xED,0x6B,0xF9,0x09,0x0C,0x38, +0x08,0x48,0x1F,0x7C,0x21,0xB5,0x15,0xEC,0x18,0x11,0xB8,0x80,0xFF,0xFF,0x4C,0xBB, +0x1B,0x48,0x15,0x88,0x00,0x0C,0xCE,0x44,0x02,0x20,0x0C,0xB0,0xF8,0x1C,0x17,0x08, +0x00,0x0C,0xFE,0xDE,0x02,0x06,0x40,0xB0,0x00,0x0C,0x28,0xC4,0xFF,0xFF,0x4C,0xBB, +0x21,0xB5,0x29,0xEC,0x0B,0x0A,0x29,0xF4,0x40,0x48,0x2B,0x6C,0xA0,0x00,0xFE,0xA9, +0x0C,0x11,0x2C,0x44,0x0B,0x11,0x2C,0xC4,0xA0,0x00,0x02,0xAA,0x00,0x0C,0x42,0x5D, +0x01,0x48,0x39,0x6C,0x00,0x0C,0x9C,0x5D,0x04,0xE4,0xC9,0x03,0x00,0x0C,0xB8,0x45, +0x00,0x0C,0x9C,0x5D,0x00,0x0C,0xC8,0xC5,0x40,0x4C,0x4F,0xFC,0x01,0xB6,0x6B,0x33, +0x1B,0x48,0x15,0x88,0x01,0x0A,0x4E,0xE4,0x80,0x11,0x5E,0xDF,0xFF,0xFF,0x4C,0xBB, +0x02,0x11,0x48,0x00,0x0C,0xD4,0x51,0xB2,0x00,0x11,0x48,0x08,0xB1,0x00,0x4A,0x2B, +0x08,0x49,0xF3,0x6B,0x00,0x0C,0xF4,0xC3,0xF6,0x09,0x0C,0x38,0xB1,0x00,0x54,0xAC, +0x08,0x48,0x71,0xEE,0x04,0x0C,0xB1,0x33,0x00,0x0C,0x20,0xDE,0x00,0x0C,0x0A,0x45, +0x47,0x00,0x0C,0x38,0x02,0xE4,0x6B,0x7C,0xFD,0xE4,0xC9,0x8B,0x02,0x5E,0x14,0xB0, +0x00,0x0C,0xEC,0xC3,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0,0xA1,0x0A,0xA0,0x74, +0x34,0x0A,0x02,0xF4,0x46,0x0A,0x94,0xF4,0x39,0x0A,0x9E,0x74,0x41,0x0A,0x60,0x76, +0x5F,0x0A,0x7E,0xF4,0x27,0x0A,0xE8,0x73,0x00,0x0C,0xEC,0xC3,0x18,0x48,0x15,0x88, +0xFF,0x0A,0xE8,0x6B,0x01,0x11,0x94,0x80,0x20,0x52,0x14,0x08,0x5B,0x0A,0x14,0x28, +0x01,0x0A,0x14,0x18,0x03,0x0A,0x14,0x10,0x00,0x48,0xE9,0x7B,0x01,0x0A,0x30,0xFC, +0x46,0x3A,0x31,0x64,0x00,0x0C,0xE8,0x43,0x00,0x11,0x86,0x09,0x46,0x3A,0xE9,0xE3, +0x01,0x48,0xE9,0xFB,0xB1,0x00,0x58,0x2B,0x00,0x0C,0xF2,0xC3,0x00,0x0C,0xE8,0x43, +0x02,0x11,0x94,0x80,0x77,0x52,0x14,0x88,0x88,0xB5,0x6B,0x0B,0x00,0xB5,0x6B,0x83, +0x49,0x0B,0xC0,0x3B,0x50,0xCA,0x95,0x83,0xB1,0x00,0x92,0xAC,0x02,0x11,0x4A,0x80, +0x02,0x22,0xF8,0xB3,0x00,0x0C,0xF2,0xC3,0x09,0x0A,0xBD,0xF4,0x0A,0x0A,0xD5,0x74, +0x04,0x1F,0xDB,0x7C,0x01,0x00,0x74,0xBE,0x0B,0x48,0xDB,0xFC,0x1B,0x48,0x15,0x88, +0x08,0x0A,0xDA,0xEC,0x11,0x0A,0x04,0xF5,0x12,0x0A,0x0A,0x75,0x13,0x0A,0x10,0x75, +0x01,0x0A,0xE6,0xF4,0x02,0x0A,0xE0,0xF4,0x03,0x0A,0xEC,0x74,0x19,0x0A,0xF8,0xF4, +0x1A,0x0A,0xF2,0xF4,0x1B,0x0A,0xFE,0x74,0x02,0x00,0x74,0x3A,0xFF,0x20,0x9C,0x3A, +0xFB,0x09,0x70,0x3E,0x34,0x00,0x74,0x3A,0xFF,0x00,0x9C,0xBA,0x01,0x0A,0x70,0x3E, +0x5F,0x00,0x74,0xBA,0xFF,0x20,0x9C,0x3A,0x19,0x0A,0x70,0x3E,0x5F,0x20,0x74,0x3A, +0xFF,0x20,0x9C,0x3A,0x1C,0x0A,0x70,0x3E,0x5F,0x00,0x74,0xBA,0xFF,0x00,0x9C,0xBA, +0x17,0x0A,0x70,0xBE,0x41,0x00,0x74,0xBA,0xFF,0x20,0x9C,0x3A,0x29,0x0B,0x70,0xBE, +0x41,0x20,0x74,0x3A,0xFF,0x20,0x9C,0x3A,0x2D,0x0B,0x70,0x3E,0x41,0x00,0x74,0xBA, +0xFF,0x00,0x9C,0xBA,0x27,0x0B,0x70,0x3E,0x46,0x00,0x74,0x3A,0xFF,0x00,0x9C,0xBA, +0x1E,0x0A,0x70,0xBE,0x39,0x00,0x74,0xBA,0xFF,0x00,0x9C,0xBA,0x2A,0x0A,0x70,0x3E, +0x03,0x00,0x74,0x3E,0x20,0x48,0x33,0xED,0x40,0x4C,0x33,0x6D,0x1B,0x48,0x15,0x88, +0x08,0x0A,0x2C,0xFD,0x40,0x01,0x18,0x38,0x01,0x0A,0x20,0x6D,0x28,0x01,0x18,0xB8, +0xB3,0x00,0x40,0x2A,0x00,0x0C,0x48,0x4F,0x7F,0x11,0xC0,0x5F,0x1B,0x48,0x15,0x88, +0x01,0x0A,0xC2,0xEC,0xFF,0x11,0x22,0x8C,0x10,0x0A,0x48,0x6F,0x01,0xB6,0x6B,0x33, +0x00,0x0C,0xC2,0x44,0x1B,0x48,0x15,0x88,0x01,0x0A,0x3C,0x65,0x0A,0x0A,0xDB,0xF4, +0xB0,0x00,0x6E,0x2A,0x00,0x0C,0x10,0xC5,0x08,0x0A,0xDA,0x7C,0x7F,0x11,0xC0,0x5F, +0x00,0x0C,0x10,0xC5,0x01,0x11,0x94,0x80,0x20,0x52,0x18,0x08,0x5B,0x0C,0x18,0x28, +0x01,0x0C,0x18,0x18,0xB0,0x00,0xA0,0xAF,0x00,0x0C,0xBC,0x44,0x00,0x00,0x90,0x38, +0x02,0x3A,0x5B,0x75,0x01,0x3A,0x3F,0x77,0x03,0x3A,0x3F,0xF7,0x02,0x3A,0xA1,0xB0, 
+0x00,0x0C,0x5C,0x45,0x5F,0x00,0xA0,0x38,0x02,0x4E,0xA3,0x30,0x16,0x11,0xA0,0x30, +0x16,0x11,0xA2,0xB0,0x00,0x11,0x90,0x08,0x41,0x50,0x48,0x67,0x10,0x10,0x90,0x38, +0x04,0x24,0xA1,0xB0,0x04,0x10,0xA2,0xB4,0x00,0x11,0x94,0x88,0x01,0x52,0x14,0xB0, +0xA1,0x0A,0x90,0xF5,0x08,0xCE,0x95,0xFD,0x34,0x0A,0x94,0x75,0x41,0x0A,0x8C,0x65, +0x02,0xA4,0x45,0x30,0x1F,0x54,0x14,0x08,0x00,0x0A,0x14,0x98,0x80,0x01,0x18,0x38, +0x00,0x0C,0x18,0x98,0x02,0x12,0x40,0xB0,0xFF,0x21,0x9A,0x75,0x02,0x46,0x45,0x30, +0x02,0x20,0x70,0x33,0x00,0x0C,0x98,0xC5,0x02,0xB8,0x41,0x30,0x00,0x0C,0x96,0x45, +0xFF,0xFF,0x40,0xB8,0x00,0x0C,0x98,0xC5,0x02,0xA6,0x41,0x30,0xFF,0x21,0x9A,0x75, +0x01,0x11,0x22,0x9C,0x01,0x10,0x22,0x1C,0x0F,0x11,0x94,0x00,0x01,0x52,0x6C,0xB3, +0x04,0x4C,0xAD,0xED,0x01,0x48,0xAD,0xFD,0x88,0xB6,0xAD,0x6D,0x40,0x11,0x5E,0xDF, +0x00,0x11,0x94,0x88,0x14,0x52,0x20,0x32,0x02,0x11,0x94,0x80,0x01,0x52,0x6A,0xB3, +0xFB,0xE4,0xC9,0x8B,0x01,0x11,0x94,0x80,0x80,0x52,0x48,0x7F,0x04,0xE4,0xC9,0x87, +0xB1,0x00,0x42,0x2C,0x80,0x48,0xC1,0xFD,0xB1,0x00,0x52,0xAC,0x00,0x0C,0xEC,0xC3, +0xF6,0x09,0x0C,0x38,0x04,0xE4,0x0B,0xFD,0x08,0x48,0x57,0xEC,0xA1,0x00,0x54,0x2C, +0xB1,0x00,0x42,0x2C,0x80,0x48,0xED,0xEB,0x00,0x0C,0x1A,0xDE,0x00,0x0C,0x04,0xDD, +0x00,0x0C,0xEC,0xC3,0xCC,0x01,0x1C,0x38,0xEC,0x11,0x54,0xDF,0xB0,0x00,0x1A,0xA8, +0xCC,0x01,0x1C,0x38,0xEF,0x11,0x44,0x5F,0x00,0x0C,0x5A,0x57,0xFB,0x1F,0x3F,0x8A, +0xA0,0x00,0x76,0x2B,0x11,0x11,0x02,0x00,0x10,0x02,0xE0,0x39,0xFF,0x11,0x22,0x20, +0x04,0x11,0xD0,0x31,0x0F,0x02,0xE0,0xB9,0xFF,0x11,0x22,0x20,0x04,0x11,0xD0,0x31, +0x00,0x00,0xE0,0x39,0x00,0x11,0xB0,0x88,0x32,0x11,0x00,0x00,0x02,0xF2,0x91,0x30, +0x01,0x01,0x22,0x34,0x20,0x84,0x48,0xFF,0x20,0x11,0x08,0x01,0x1C,0x11,0x60,0x47, +0x01,0x9E,0x1D,0xB0,0x08,0x0E,0x08,0x7E,0xB0,0x01,0x18,0x38,0xA3,0x00,0x40,0xAA, +0xFF,0xA7,0x23,0x62,0x01,0x11,0x22,0x9C,0x03,0x0C,0x14,0x08,0xFF,0x0A,0x14,0x10, +0x01,0x0A,0x14,0x18,0x04,0x0A,0x14,0x18,0x03,0x0A,0x14,0x8C,0x00,0x0C,0x0D,0xDE, +0x22,0x0A,0x7E,0xAE,0x00,0x40,0x0D,0x5E,0xFC,0x3F,0x7F,0x0A,0x00,0x3F,0x7F,0x06, +0xFF,0xFF,0x14,0x38,0x89,0xDA,0x2B,0xAE,0x00,0xE0,0x14,0xB8,0x89,0xD8,0x2B,0x2E, +0x04,0xD8,0x81,0x36,0x00,0x20,0x80,0xBA,0x00,0x00,0x84,0x3E,0x02,0xCA,0x95,0x87, +0x08,0x4C,0x27,0xEA,0x0B,0x0A,0x37,0xE6,0x04,0x1F,0x27,0x7A,0x88,0xB5,0x3B,0xFE, +0x01,0x10,0x22,0x1C,0x80,0xB5,0x6B,0x03,0x02,0x20,0x4C,0x33,0x01,0x11,0x22,0x9C, +0x00,0x0C,0x44,0x5E,0x77,0xB5,0x6B,0x8F,0x08,0x48,0x4D,0x7E,0x02,0x0C,0x0C,0x30, +0xB2,0x00,0x28,0x2F,0x02,0x06,0x18,0x30,0xFF,0xFF,0x4C,0x3F,0x00,0x0C,0x42,0x5D, +0x01,0x48,0x5B,0x6E,0x80,0x11,0xAC,0x5F,0x00,0x0C,0xB0,0xDD,0x04,0x94,0x50,0x32, +0x00,0x0C,0xB8,0x45,0x80,0x11,0xAC,0x5F,0x00,0x0C,0xB0,0xDD,0x00,0x0C,0xC8,0xC5, +0xFF,0xC0,0xE8,0x6B,0xFF,0xC1,0xE8,0xEB,0xF0,0xC2,0xE8,0xEB,0x02,0x48,0xE9,0xFB, +0x01,0x11,0x94,0x80,0x20,0x52,0xE8,0x6B,0xB1,0x00,0x52,0xAC,0x00,0x0C,0xE8,0x43, +0x04,0x28,0xB1,0x33,0xB2,0x00,0x20,0x2E,0xFF,0xFF,0x14,0x38,0x82,0x40,0xB1,0x2B, +0x82,0x42,0xB5,0x2B,0x01,0x00,0x14,0xB8,0x83,0xD8,0xB1,0x2B,0x00,0x00,0x14,0x38, +0x84,0xDA,0xB5,0xAB,0x02,0xD8,0x15,0xB0,0x83,0x28,0x51,0x2A,0x02,0xDA,0x15,0x30, +0x84,0x2A,0x55,0xAA,0x28,0x01,0x18,0xB8,0xB3,0x00,0x40,0x2A,0x00,0x0C,0xF2,0xD4, +0x00,0x0C,0x0A,0x45,0x02,0x11,0x4A,0x80,0x02,0xFC,0x45,0xB0,0x10,0x9E,0xD3,0xEE, +0x08,0xCE,0xF5,0xFE,0x00,0x11,0xFE,0x8B,0xFF,0xB4,0x15,0x90,0x01,0x0A,0xFC,0x1B, +0x04,0x11,0x94,0x80,0x01,0x52,0xF6,0xB3,0xFF,0xFB,0xBD,0x7E,0x44,0xFF,0x0F,0xA8, +0x01,0x07,0x14,0xB0,0x00,0xFB,0xC3,0x7E,0x00,0xFB,0xF7,0x13,0x33,0xFF,0x17,0xA8, 
+0x00,0x11,0x4A,0x88,0x00,0x0C,0xFE,0xDE,0x00,0x0C,0xB8,0xCE,0xB0,0x00,0x66,0x2D, +0x02,0x11,0x4A,0x80,0xFF,0xFB,0xC3,0xEE,0xF8,0xFF,0xFF,0x8B,0x08,0xFF,0xFF,0x1B, +0x00,0x0C,0xC4,0xC6,0x01,0xFF,0xFF,0x1B,0x01,0xFE,0x15,0x30,0x00,0xFF,0x15,0x18, +0x80,0x0A,0xCE,0xFE,0x07,0xFF,0xA7,0x6E,0x00,0x0C,0xA2,0xC6,0x00,0x11,0x4A,0x88, +0x01,0xB5,0xF9,0xFE,0x00,0x11,0x4A,0x88,0xB1,0x00,0xCE,0xA9,0x02,0x0C,0x1C,0x98, +0x10,0x9E,0xED,0x6E,0xF2,0x11,0x26,0x80,0x02,0x0A,0x0C,0x30,0x10,0x11,0xAC,0x5F, +0x7F,0x11,0xC0,0x5F,0x02,0x06,0x14,0x30,0x04,0x11,0x60,0x33,0xF5,0x45,0x8A,0x08, +0xF5,0xCE,0x9D,0x0B,0x00,0x0C,0xEE,0x46,0xF3,0x11,0x26,0x00,0x02,0x22,0x26,0xB0, +0x00,0x0A,0x4E,0xDF,0x00,0x0C,0xF8,0xC6,0x06,0x11,0xF8,0x03,0xB0,0x00,0xF2,0xAD, +0x18,0x11,0xB8,0x80,0xEF,0xCA,0x95,0x8B,0xA0,0x00,0x64,0xAA,0x00,0x0C,0x32,0x5F, +0x00,0x12,0x22,0x7A,0xFF,0x0C,0x18,0x98,0x00,0x12,0x26,0x90,0x02,0x22,0x0C,0x30, +0x02,0xA4,0x45,0x30,0x2E,0x0B,0x14,0x28,0x80,0x01,0x18,0x38,0x00,0x0C,0x18,0x98, +0x02,0x0C,0x1C,0xB0,0x02,0x12,0x40,0xB0,0xFF,0xFF,0x26,0xB8,0x02,0x06,0x44,0x30, +0xA1,0x00,0xBA,0x2C,0x08,0x9E,0x1F,0xFF,0x02,0x45,0x8A,0x84,0xF5,0x45,0x8A,0x8C, +0xF8,0x1C,0x17,0x08,0xB2,0x00,0x32,0xAF,0x00,0x12,0x26,0x00,0x00,0x0C,0x30,0xC7, +0xF8,0x1C,0x17,0x08,0xB2,0x00,0x32,0xAF,0xFF,0x0A,0x14,0x10,0x00,0x12,0x26,0x88, +0xFF,0x11,0x22,0x8C,0x6A,0x0B,0x14,0x28,0xB0,0x01,0x18,0x38,0x00,0x0C,0x18,0x98, +0x02,0x0C,0x1C,0xB0,0x11,0x0B,0x0E,0xA8,0x01,0x07,0x14,0x34,0x00,0x00,0x90,0x38, +0x18,0x11,0xA0,0xB0,0x18,0x10,0xA2,0x34,0xA1,0x00,0x4A,0x2A,0xA1,0x00,0x48,0xAA, +0xFF,0x11,0x22,0x8C,0xA1,0x00,0x32,0x2A,0xA3,0x00,0x4E,0x2A,0xA3,0x00,0x2E,0x2A, +0xA1,0x00,0x2A,0x29,0xA1,0x00,0xAC,0xAA,0xA1,0x00,0x34,0x2A,0xA1,0x00,0x6E,0x2A, +0xA1,0x00,0x68,0x2A,0xA0,0x00,0x08,0x28,0xA3,0x00,0xC8,0xAA,0xA3,0x00,0xFE,0xAA, +0xA3,0x00,0xB2,0x2A,0x00,0x11,0x02,0x88,0x01,0x0C,0x8C,0x31,0x01,0x01,0x22,0x34, +0x11,0x11,0x02,0x00,0xF5,0x45,0x8A,0x08,0xF7,0xA0,0x40,0x89,0xFB,0xFC,0x18,0xB8, +0x00,0x0C,0xC2,0x5F,0x00,0x00,0x90,0x3B,0xC0,0xCA,0x95,0x0B,0x02,0x05,0x0A,0x00, +0x40,0x3D,0x7A,0x00,0xFF,0xE0,0xC0,0x81,0xB3,0x00,0x82,0xAA,0x01,0x01,0x22,0xB0, +0x01,0xE4,0xC9,0x87,0x04,0x87,0x88,0xFF,0x80,0xC8,0x91,0x03,0x04,0x11,0x0E,0x01, +0xFF,0x11,0x22,0x8C,0x02,0x87,0x90,0xFF,0x20,0xC8,0x91,0x03,0x02,0x11,0x0E,0x01, +0xFF,0x11,0x22,0x8C,0x40,0x84,0x96,0xFF,0xB1,0x00,0xE8,0xAD,0xFF,0x11,0x22,0x8C, +0xB1,0x00,0xBA,0xAC,0x00,0x0C,0x48,0x4F,0xC0,0xC8,0x23,0xEA,0xB0,0x00,0x92,0x2F, +0x80,0xCC,0x15,0x08,0x80,0x0B,0x17,0x08,0x00,0x0B,0x22,0x72,0x01,0x11,0x22,0x9C, +0x01,0x0C,0x1A,0xB0,0x00,0x11,0xAE,0x47,0x00,0x11,0x1A,0x88,0xB3,0x00,0xC2,0x2A, +0x02,0x9E,0x15,0x30,0x80,0x0C,0x04,0xA8,0xE2,0x9E,0xB9,0x2F,0x01,0xC5,0x0B,0x34, +0x02,0x9E,0x3D,0x33,0x00,0x0C,0xB0,0x47,0x01,0x0C,0x1A,0xB0,0xFF,0x11,0xC2,0x47, +0xFF,0x11,0x1A,0x00,0xB3,0x00,0xC2,0x2A,0x02,0x9E,0x15,0x30,0x81,0x0C,0x04,0x28, +0xE2,0x9E,0xCD,0x2F,0x01,0xC5,0x0B,0x34,0x02,0x9E,0x3D,0x33,0x00,0x0C,0xC4,0x47, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x0C,0x26,0x59,0x00,0x0C,0x06,0xD0,0xA0,0x00,0x10,0xAF,0x00,0x0C,0x40,0x59, 
+0x02,0x86,0xA9,0xB3,0xFF,0xD5,0x1F,0x70,0x02,0xD4,0x45,0xB0,0x84,0x01,0x18,0xB8, +0x00,0x0C,0xA6,0x5B,0x01,0x0A,0x1A,0xE0,0x44,0xC6,0x0F,0xA8,0x01,0x07,0x14,0xB0, +0x00,0x9C,0x69,0x68,0x02,0xAE,0xA9,0xB3,0x00,0x0C,0x0A,0x40,0x00,0x0C,0x3C,0xD9, +0x02,0x84,0x15,0xB0,0x8B,0x10,0x72,0x28,0x83,0x10,0x08,0x2B,0x02,0x80,0xA9,0xB3, +0x02,0xD4,0x45,0xB0,0x02,0xAE,0x15,0x30,0xFF,0xFF,0x5C,0x3B,0x00,0x00,0x44,0x38, +0x02,0x0A,0x00,0x33,0xFF,0x0B,0x36,0x60,0x02,0x0A,0x04,0xB3,0x00,0x0C,0x52,0x59, +0xFF,0x87,0x3F,0xE0,0x02,0xD4,0x0D,0xB3,0x00,0x0C,0x44,0x40,0x02,0x88,0x45,0xB0, +0x02,0xD4,0x5D,0xB3,0x00,0x00,0x44,0x38,0x02,0xD4,0x11,0x33,0x02,0xD4,0x45,0xB0, +0x00,0x11,0xB2,0x89,0x0F,0xD8,0x00,0x0B,0xF0,0xD8,0x02,0x8B,0x0C,0x11,0xB2,0x01, +0x08,0xD8,0x08,0xB3,0x08,0xD8,0x28,0x33,0x00,0x0C,0x1A,0x5A,0x01,0x0A,0x38,0xB3, +0x01,0x0B,0x54,0x33,0x00,0x0C,0x78,0x58,0x04,0x9D,0x3B,0x03,0x01,0x11,0x14,0x00, +0x00,0x9C,0x65,0xF8,0x01,0xA8,0x51,0x1B,0x11,0x0A,0x14,0xA8,0xFF,0x0A,0x60,0x68, +0x02,0x11,0xB2,0x81,0x02,0xD8,0x04,0xB3,0x00,0x0C,0x56,0xD9,0x02,0xD4,0x45,0xB0, +0xA0,0x00,0xDC,0x2E,0x00,0x0C,0x56,0xD9,0x00,0x0C,0x52,0x59,0xA0,0x00,0x12,0x2F, +0x00,0x00,0x44,0x38,0x02,0x86,0x15,0x30,0xFF,0x0B,0xA0,0xF0,0x8A,0xD4,0x9B,0xA8, +0x02,0x0A,0x0C,0x30,0x02,0x0A,0x44,0x30,0x01,0x9C,0x15,0xB0,0x02,0xD4,0x45,0xB0, +0x00,0x9C,0x99,0xF8,0x02,0x06,0x44,0x30,0x02,0xA6,0x0D,0xB0,0x08,0xB4,0xB1,0x33, +0x02,0xD4,0x45,0xB0,0x02,0x06,0x4C,0xB3,0x08,0xD8,0x69,0x33,0x00,0x11,0x6E,0x0F, +0x02,0x06,0x14,0x30,0x02,0x0A,0x44,0x30,0x02,0xAE,0x15,0x30,0x00,0x0C,0x7C,0xC0, +0x02,0xD4,0x45,0x34,0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0x01,0x00,0x18,0xB8, +0x02,0x8C,0x15,0x30,0x83,0x0C,0x04,0xA8,0xE3,0x8C,0xB1,0xA8,0x00,0x0C,0xB4,0x40, +0x02,0x8C,0x19,0x33,0x00,0x0C,0xA8,0xC0,0x02,0x06,0x44,0x30,0x22,0x11,0x02,0x00, +0x02,0x0A,0xEC,0xB3,0x01,0x01,0x22,0xB0,0xB1,0x00,0x92,0xAC,0x61,0x0C,0xC0,0xBB, +0x40,0xCA,0x95,0x87,0x08,0x84,0x49,0x31,0x02,0x22,0x48,0xB1,0x40,0xCE,0xCF,0xF8, +0x22,0x11,0x02,0x00,0x02,0xF6,0x15,0xB0,0x01,0x01,0x22,0xB0,0x02,0x0A,0x48,0xB1, +0xFC,0xFF,0x14,0x38,0x83,0x90,0x48,0xA9,0x02,0x11,0x48,0xB1,0xB1,0x00,0xD2,0x29, +0x01,0x0A,0xC4,0xB3,0x02,0x0C,0x0C,0x30,0x00,0x0C,0x0A,0xD9,0x02,0x06,0x18,0x30, +0x0C,0x00,0x14,0x38,0x83,0x90,0x14,0x28,0xB1,0x00,0x06,0x2A,0x20,0x11,0xB8,0x00, +0xB1,0x00,0x22,0x2A,0x02,0x0A,0x0C,0x30,0x02,0x0C,0x1C,0x98,0x28,0xC6,0x27,0x80, +0x55,0x11,0x02,0x00,0x01,0x43,0xF9,0x68,0x08,0xCC,0xF9,0xE8,0x09,0x11,0xFA,0x40, +0x08,0x11,0xFA,0xC0,0x01,0x01,0x22,0xB0,0x30,0xCC,0x15,0x88,0x80,0x0A,0x14,0x00, +0x00,0x0C,0x26,0x00,0x02,0x06,0x26,0xB0,0xB1,0x00,0x82,0x2C,0x00,0xE2,0x2F,0x5A, +0xA0,0x00,0x64,0xAA,0x00,0x0C,0x26,0x59,0x00,0x0C,0xC6,0x52,0x80,0x45,0xC6,0x7A, +0x01,0xC9,0xC7,0xEA,0x00,0x11,0x8C,0x09,0x53,0x11,0x3A,0xD9,0x01,0xC9,0x93,0x87, +0x02,0x0A,0x0C,0x30,0x00,0x0C,0x26,0x59,0x00,0x0C,0x22,0x51,0x00,0x0C,0x14,0xD9, +0x00,0x11,0x18,0x08,0x02,0x06,0x14,0x30,0xA1,0x00,0x1A,0x29,0x02,0x22,0x0C,0x30, +0x00,0x00,0x44,0x38,0x02,0x90,0x15,0xB0,0x82,0x10,0x14,0x28,0x01,0x10,0x22,0x98, +0x84,0x11,0x14,0xA8,0x83,0x8E,0x15,0xA8,0x02,0x06,0x44,0x30,0x80,0x0B,0x70,0xEB, +0x01,0x11,0x22,0x9C,0xA1,0x00,0xA4,0x2A,0x00,0x80,0x18,0xB8,0x00,0x0C,0x42,0xC1, +0x00,0x40,0x18,0xB8,0x00,0x00,0x44,0x38,0x02,0x8A,0x15,0x30,0x89,0x0C,0x44,0xA9, +0x80,0x0C,0x04,0xA8,0xE3,0x8A,0x4F,0xA9,0xFF,0x11,0x22,0x8C,0x02,0x8A,0x15,0x33, +0x00,0x0C,0x44,0xC1,0xFF,0x7F,0x18,0xB8,0x00,0x0C,0x58,0x41,0xFF,0xBF,0x18,0xB8, +0x00,0x00,0x44,0x38,0x02,0x8A,0x15,0x30,0x81,0x0C,0x04,0x28,0xE3,0x8A,0x63,0x29, 
+0xFF,0x11,0x22,0x8C,0x02,0x8A,0x15,0x33,0x00,0x0C,0x5A,0xC1,0x02,0xA6,0x15,0xB0, +0x8B,0x10,0xC6,0xAA,0x16,0x11,0x2E,0x42,0x04,0x9F,0x75,0x79,0x02,0xAC,0x15,0xB0, +0x89,0x10,0x74,0x29,0x04,0x9F,0x73,0xE9,0x03,0xB4,0xB1,0xB3,0x00,0x11,0xB6,0x8B, +0x02,0xB8,0xA5,0x33,0x22,0x11,0x02,0x00,0x02,0xA2,0xE8,0xB3,0x02,0xD2,0x45,0x31, +0xFF,0xD8,0x8B,0xE9,0xFF,0xD9,0x8B,0x69,0xFF,0xDA,0x8B,0x69,0xF7,0x11,0x2A,0xDA, +0x01,0x01,0x22,0x34,0x04,0xD8,0x51,0xB0,0x02,0x11,0x4A,0x80,0x04,0x28,0xE8,0x33, +0x00,0x0C,0xA6,0xC1,0x22,0x11,0x02,0x00,0x02,0x11,0x4A,0x80,0x80,0xF7,0xA7,0xF9, +0x03,0xB4,0x51,0x30,0x00,0x11,0x56,0x08,0x04,0xF4,0x51,0x30,0xFF,0x11,0x22,0x20, +0x80,0x2B,0xA6,0xF9,0x00,0x11,0xEE,0x0B,0x80,0xC9,0x93,0x03,0x00,0x11,0x4A,0x88, +0x01,0x01,0x22,0x34,0xC0,0x5F,0x15,0x88,0xC0,0x0A,0xC4,0x71,0x18,0x10,0x95,0xB1, +0x18,0x00,0x14,0x38,0x83,0x58,0x99,0xA8,0x04,0x11,0x48,0x5A,0x00,0x11,0x48,0xDA, +0x10,0x50,0xC1,0xB0,0x10,0x01,0xBC,0x3C,0x07,0x11,0x94,0x01,0x13,0x11,0x95,0xB1, +0x04,0x11,0x94,0x31,0x00,0x0C,0xC6,0xC1,0x18,0x10,0x95,0xB1,0x18,0x11,0x94,0xB1, +0x30,0x11,0x4E,0xDA,0x10,0x01,0xBC,0x3C,0x04,0x30,0xB0,0xB3,0x02,0xDA,0x41,0xB0, +0x0F,0xCB,0xF9,0x69,0x01,0xCD,0xF7,0xF9,0x80,0xCC,0xF7,0xE9,0x40,0xCE,0xF7,0xF9, +0x02,0x44,0x15,0xB0,0x88,0xF6,0xF7,0xA9,0x11,0x11,0x00,0x80,0x02,0xDA,0x41,0xB0, +0x02,0x46,0x45,0x30,0x00,0x00,0x90,0xB9,0x10,0x00,0xBC,0x38,0x04,0x58,0x99,0xB0, +0x10,0x50,0xC1,0xB0,0x00,0x11,0xB2,0xDA,0x20,0x13,0x08,0x39,0x49,0x11,0xB8,0x00, +0x00,0x0C,0x56,0xDA,0x80,0xCA,0x95,0x03,0xFF,0x0C,0xC0,0xBF,0x27,0x11,0xFA,0xC1, +0x09,0x11,0xFA,0xC1,0x00,0x0C,0xB2,0xDA,0xA1,0x00,0xDC,0xAE,0xB0,0x00,0x1A,0xA8, +0x02,0x05,0x0A,0x00,0x0F,0xCB,0x0D,0xEA,0x80,0xC8,0x13,0x6A,0x08,0x5D,0x2C,0xFA, +0x10,0xC9,0x93,0x03,0x00,0x0C,0x16,0x42,0x08,0x11,0x18,0x00,0xB0,0x00,0xCA,0x2D, +0x09,0x11,0x14,0xC2,0x05,0x11,0x14,0xC2,0x00,0x0C,0xB2,0xDA,0x15,0x11,0x2E,0x5A, +0xA0,0x00,0x40,0x2D,0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0xB8,0x01,0x18,0xB8, +0x01,0xC6,0x15,0xB0,0x00,0x0C,0x18,0x98,0x01,0x12,0x14,0x30,0x01,0x96,0x17,0x30, +0xA0,0x00,0xF0,0x2F,0xA1,0x00,0x6E,0x2A,0xA0,0x00,0x08,0x28,0x01,0x0C,0xD6,0xB3, +0x02,0x20,0xD8,0xB3,0xB0,0x00,0x1A,0xA8,0x02,0x05,0x0A,0x00,0x04,0x3A,0x2C,0x7A, +0x01,0xEB,0x69,0x30,0x01,0xFA,0x6B,0xB0,0x02,0xEC,0x6D,0x30,0xFD,0x05,0x0A,0x0C, +0x00,0x00,0x14,0x38,0x88,0x12,0x70,0x2B,0x88,0x12,0x70,0x2B,0x01,0x11,0x22,0x9C, +0x01,0x0C,0xF6,0x30,0x01,0x0A,0xF0,0x30,0x02,0x11,0xF2,0x34,0x01,0x0C,0x98,0xB0, +0x03,0x11,0x9A,0xB0,0x01,0x0C,0x14,0x30,0x04,0x11,0x48,0x42,0x01,0x11,0x02,0x80, +0x02,0x22,0x44,0x30,0x21,0x11,0x00,0x80,0x00,0x0C,0x64,0x42,0x10,0x11,0x02,0x80, +0x02,0x22,0x44,0x30,0x20,0x11,0x00,0x00,0x02,0x22,0x44,0x30,0x01,0x01,0x22,0x34, +0x82,0x10,0x14,0x28,0x01,0x10,0x22,0x98,0x84,0x11,0x14,0xA8,0x02,0x0A,0x0C,0x30, +0xFF,0xFF,0x14,0x38,0x84,0x11,0x1C,0x28,0x02,0x06,0x14,0x30,0x83,0x0C,0x18,0x28, +0x00,0x00,0x14,0x38,0x84,0x0E,0x1C,0x2C,0x09,0x10,0x00,0xB1,0x01,0x11,0x74,0x00, +0x00,0x0C,0x9E,0x42,0x00,0x0C,0x86,0x5A,0x00,0x0C,0x90,0xC2,0x00,0x0C,0x94,0x5A, +0x10,0x11,0x06,0x81,0x04,0x11,0x0E,0x01,0x01,0x11,0x10,0x85,0x00,0x0C,0x94,0x5A, +0xC0,0x11,0x08,0x81,0x08,0x11,0x10,0x85,0xFF,0x11,0x00,0x01,0xF0,0xEF,0x04,0xB9, +0x7F,0xFF,0x08,0xB9,0xFF,0xFB,0x0C,0x39,0x10,0x11,0x10,0x01,0x00,0x11,0x02,0x88, +0x00,0x0C,0xAA,0xDA,0x11,0x11,0x00,0x80,0x00,0x0C,0xAC,0xDA,0x01,0x01,0x22,0x34, +0x11,0x00,0xAC,0xF2,0x7F,0x03,0x24,0x39,0xFF,0x11,0x8C,0x00,0xFF,0x11,0x8E,0x80, +0x02,0x11,0x74,0x84,0x00,0x0C,0xC2,0x5A,0x02,0x4C,0x15,0x30,0x01,0x0A,0x04,0x80, 
+0x01,0x0C,0x06,0x30,0xE3,0x4C,0xBF,0xAA,0x01,0xC5,0x0B,0x34,0x02,0x4C,0x99,0x32, +0x00,0x0C,0xB4,0xC2,0x01,0x05,0x8A,0xB3,0x02,0x05,0x0A,0x84,0xFF,0x11,0x22,0x8C, +0x01,0x0C,0x1A,0xB0,0x00,0x11,0xCE,0x42,0x00,0x11,0x1A,0x88,0x00,0x0C,0xC2,0x5A, +0x02,0x48,0x15,0xB0,0x80,0x0C,0x04,0xA8,0xE3,0x48,0xD9,0x2A,0x01,0xC5,0x0B,0x34, +0x02,0x48,0x91,0x32,0x00,0x0C,0xD0,0x42,0x01,0x0C,0x1A,0xB0,0xFF,0x11,0xE0,0x42, +0x00,0x0C,0xC2,0x5A,0x02,0x48,0x15,0xB0,0x81,0x0C,0x04,0x28,0xE3,0x48,0xEB,0xAA, +0x01,0xC5,0x0B,0x34,0x02,0x48,0x91,0x32,0x00,0x0C,0xE2,0xC2,0x00,0x11,0x1A,0x88, +0x00,0x0C,0xC2,0x5A,0x02,0x48,0x15,0xB0,0x82,0x0C,0x04,0x28,0xE3,0x48,0xFB,0x2A, +0x01,0xC5,0x0B,0x34,0x02,0x48,0x91,0x32,0x00,0x0C,0xF2,0x42,0x00,0x11,0x1A,0x88, +0x00,0x0C,0xC2,0x5A,0x02,0x4C,0x15,0x30,0x80,0x0C,0x04,0xA8,0xE3,0x4C,0x0B,0x2B, +0x01,0xC5,0x0B,0x34,0x02,0x4C,0x99,0x32,0x00,0x0C,0x02,0xC3,0x00,0x0C,0xC2,0x5A, +0x02,0x4C,0x15,0x30,0x81,0x0C,0x04,0x28,0xE3,0x4C,0x19,0x2B,0x01,0xC5,0x0B,0x34, +0x02,0x4C,0x99,0x32,0x00,0x0C,0x10,0xC3,0xA2,0x00,0xC0,0xAF,0xA2,0x00,0xAC,0xAF, +0x00,0x0C,0xC2,0x5A,0x02,0x22,0x0C,0x30,0x00,0x00,0x44,0x38,0x44,0xC6,0x0F,0xA8, +0x01,0x07,0x1C,0x30,0x02,0xAE,0x19,0x30,0xFF,0x0E,0x14,0x90,0x00,0x0C,0xB0,0x8B, +0x01,0x0D,0xB2,0xB3,0x0F,0xCB,0x37,0xFB,0x00,0xD9,0xB3,0x0B,0x01,0x0E,0x14,0xB0, +0x03,0xC8,0x41,0xEB,0xC0,0xCA,0x41,0x6B,0x30,0xCB,0x41,0xEB,0x00,0xD8,0xB1,0x83, +0x02,0x0C,0x14,0x30,0x02,0xD8,0x05,0x30,0xE3,0xAE,0x4B,0xAB,0x02,0x06,0x44,0x30, +0x01,0xC5,0x0B,0x34,0x02,0xAE,0x5D,0x33,0x00,0x0C,0x2A,0xC3,0x00,0x0C,0xC2,0x5A, +0x02,0x48,0x15,0xB0,0x02,0x0B,0x50,0x6B,0x01,0x0A,0x04,0xB0,0x02,0x0B,0x06,0x80, +0xE3,0x48,0x5D,0xAB,0x01,0xC5,0x0B,0x34,0x02,0x48,0x91,0x32,0x00,0x0C,0x50,0x43, +0xFF,0xFD,0x18,0xB8,0x00,0x0C,0xE0,0x42,0x00,0x0C,0x4E,0x5B,0x40,0x49,0x6F,0x6B, +0x00,0x0C,0x72,0x5B,0x00,0x0C,0x60,0x5B,0x01,0x11,0x22,0x9C,0x00,0x0C,0x60,0x5B, +0x01,0x10,0x22,0x1C,0x00,0x0C,0xC2,0x5A,0x02,0x4A,0x15,0x30,0x01,0x0A,0x04,0xB0, +0x01,0x0B,0x06,0x98,0xE3,0x4A,0x7F,0x2B,0x01,0xC5,0x0B,0x34,0x02,0x4A,0x95,0x32, +0x00,0x0C,0x74,0x43,0x00,0x0C,0xC2,0x5A,0x02,0x4A,0x15,0x30,0x01,0x0A,0x04,0xB0, +0xFF,0x0B,0x06,0x18,0xE3,0x4A,0x8F,0x2B,0x01,0xC5,0x0B,0x34,0x02,0x4A,0x95,0x32, +0x00,0x0C,0x84,0x43,0x02,0x05,0x0A,0x00,0x02,0x4A,0x15,0x30,0xFF,0x0A,0x04,0x18, +0x01,0x0B,0x06,0xB0,0xE3,0x4A,0x9F,0xAB,0x00,0x0C,0xA2,0xC3,0x02,0x4A,0x95,0x32, +0x00,0x0C,0x94,0xC3,0xF0,0x04,0xC6,0x6A,0xFD,0x05,0x0A,0x0C,0x0C,0x11,0xB2,0x01, +0x01,0x12,0x14,0x30,0x00,0xD8,0xB2,0xE3,0x1F,0xD9,0x14,0x08,0x14,0x0A,0xA8,0x63, +0x01,0x11,0x14,0x84,0xFF,0xD9,0xB2,0x99,0xFF,0x0C,0x18,0x98,0xFF,0x12,0x14,0x10, +0xFF,0x11,0x16,0x00,0x01,0x10,0x22,0x98,0x84,0x11,0x14,0xA8,0x01,0xD8,0xB0,0xB3, +0x00,0x11,0xB2,0x0B,0x83,0xD8,0x15,0xA8,0x80,0x0B,0xC8,0xEB,0x02,0x11,0x14,0x84, +0x00,0x11,0x14,0x8C,0xB3,0x00,0xC2,0x2A,0x02,0xA8,0x15,0x30,0x01,0x0A,0x04,0xB0, +0x01,0x0B,0x06,0x98,0xE3,0xA8,0xD7,0xAB,0x01,0xC5,0x0B,0x34,0x02,0xA8,0x51,0x33, +0x00,0x0C,0xCC,0x43,0x04,0x9D,0xC7,0xFA,0xB3,0x00,0xC2,0x2A,0x02,0xA8,0x15,0x30, +0x01,0x0A,0x04,0xB0,0xFF,0x0B,0x06,0x18,0xE3,0xA8,0xE9,0x2B,0x01,0xC5,0x0B,0x34, +0x02,0xA8,0x51,0x33,0x00,0x0C,0xDE,0x43,0xFF,0x11,0x22,0x8C,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8,0x00,0x00,0x01,0xB8, +}; diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_task.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_task.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_task.c 1969-12-31 
19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_task.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,642 @@
+/*
+ * Aic94xx SAS/SATA Tasks
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file is part of the aic94xx driver.
+ *
+ * The aic94xx driver is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * The aic94xx driver is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the aic94xx driver; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/spinlock.h>
+#include "aic94xx.h"
+#include "aic94xx_sas.h"
+#include "aic94xx_hwi.h"
+
+static void asd_unbuild_ata_ascb(struct asd_ascb *a);
+static void asd_unbuild_smp_ascb(struct asd_ascb *a);
+static void asd_unbuild_ssp_ascb(struct asd_ascb *a);
+
+static inline void asd_can_dequeue(struct asd_ha_struct *asd_ha, int num)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&asd_ha->seq.pend_q_lock, flags);
+        asd_ha->seq.can_queue += num;
+        spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags);
+}
+
+/* PCI_DMA_... to our direction translation.
+ */
+static const u8 data_dir_flags[] = {
+        [PCI_DMA_BIDIRECTIONAL] = DATA_DIR_BYRECIPIENT, /* UNSPECIFIED */
+        [PCI_DMA_TODEVICE]      = DATA_DIR_OUT,         /* OUTBOUND */
+        [PCI_DMA_FROMDEVICE]    = DATA_DIR_IN,          /* INBOUND */
+        [PCI_DMA_NONE]          = DATA_DIR_NONE,        /* NO TRANSFER */
+};
+
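+/*
+ * Build the SCB scatter-gather list for a task.  The SCB itself holds
+ * three inline sg_el entries: a flat buffer (num_scatter == 0) or a
+ * mapped list of up to three segments is described inline, with the
+ * last entry flagged ASD_SG_EL_LIST_EOL.  For longer lists the first
+ * two segments stay inline, the second entry is flagged
+ * ASD_SG_EL_LIST_EOS, and the third entry points to a DMA-coherent
+ * array that holds the complete list.
+ */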
+static inline int asd_map_scatterlist(struct sas_task *task,
+                                      struct sg_el *sg_arr,
+                                      unsigned long gfp_flags)
+{
+        struct asd_ascb *ascb = task->lldd_task;
+        struct asd_ha_struct *asd_ha = ascb->ha;
+        struct scatterlist *sc;
+        int num_sg, res;
+
+        if (task->data_dir == PCI_DMA_NONE)
+                return 0;
+
+        if (task->num_scatter == 0) {
+                void *p = task->scatter;
+                dma_addr_t dma = pci_map_single(asd_ha->pcidev, p,
+                                                task->total_xfer_len,
+                                                task->data_dir);
+                sg_arr[0].bus_addr = cpu_to_le64((u64)dma);
+                sg_arr[0].size = cpu_to_le32(task->total_xfer_len);
+                sg_arr[0].flags |= ASD_SG_EL_LIST_EOL;
+                return 0;
+        }
+
+        num_sg = pci_map_sg(asd_ha->pcidev, task->scatter, task->num_scatter,
+                            task->data_dir);
+        if (num_sg == 0)
+                return -ENOMEM;
+
+        if (num_sg > 3) {
+                int i;
+
+                ascb->sg_arr = asd_alloc_coherent(asd_ha,
+                                                  num_sg*sizeof(struct sg_el),
+                                                  gfp_flags);
+                if (!ascb->sg_arr) {
+                        res = -ENOMEM;
+                        goto err_unmap;
+                }
+                for (sc = task->scatter, i = 0; i < num_sg; i++, sc++) {
+                        struct sg_el *sg =
+                                &((struct sg_el *)ascb->sg_arr->vaddr)[i];
+                        sg->bus_addr = cpu_to_le64((u64)sg_dma_address(sc));
+                        sg->size = cpu_to_le32((u32)sg_dma_len(sc));
+                        if (i == num_sg-1)
+                                sg->flags |= ASD_SG_EL_LIST_EOL;
+                }
+
+                for (sc = task->scatter, i = 0; i < 2; i++, sc++) {
+                        sg_arr[i].bus_addr =
+                                cpu_to_le64((u64)sg_dma_address(sc));
+                        sg_arr[i].size = cpu_to_le32((u32)sg_dma_len(sc));
+                }
+                sg_arr[1].next_sg_offs = 2 * sizeof(*sg_arr);
+                sg_arr[1].flags |= ASD_SG_EL_LIST_EOS;
+
+                memset(&sg_arr[2], 0, sizeof(*sg_arr));
+                sg_arr[2].bus_addr = cpu_to_le64((u64)ascb->sg_arr->dma_handle);
+        } else {
+                int i;
+                for (sc = task->scatter, i = 0; i < num_sg; i++, sc++) {
+                        sg_arr[i].bus_addr =
+                                cpu_to_le64((u64)sg_dma_address(sc));
+                        sg_arr[i].size = cpu_to_le32((u32)sg_dma_len(sc));
+                }
+                sg_arr[i-1].flags |= ASD_SG_EL_LIST_EOL;
+        }
+
+        return 0;
+err_unmap:
+        pci_unmap_sg(asd_ha->pcidev, task->scatter, task->num_scatter,
+                     task->data_dir);
+        return res;
+}
+
+static inline void asd_unmap_scatterlist(struct asd_ascb *ascb)
+{
+        struct asd_ha_struct *asd_ha = ascb->ha;
+        struct sas_task *task = ascb->uldd_task;
+
+        if (task->data_dir == PCI_DMA_NONE)
+                return;
+
+        if (task->num_scatter == 0) {
+                dma_addr_t dma = (dma_addr_t)
+                        le64_to_cpu(ascb->scb->ssp_task.sg_element[0].bus_addr);
+                pci_unmap_single(ascb->ha->pcidev, dma, task->total_xfer_len,
+                                 task->data_dir);
+                return;
+        }
+
+        asd_free_coherent(asd_ha, ascb->sg_arr);
+        pci_unmap_sg(asd_ha->pcidev, task->scatter, task->num_scatter,
+                     task->data_dir);
+}
+
+/* ---------- Task complete tasklet ---------- */
+
+static void asd_get_response_tasklet(struct asd_ascb *ascb,
+                                     struct done_list_struct *dl)
+{
+        struct asd_ha_struct *asd_ha = ascb->ha;
+        struct sas_task *task = ascb->uldd_task;
+        struct task_status_struct *ts = &task->task_status;
+        unsigned long flags;
+        struct tc_resp_sb_struct {
+                __le16 index_escb;
+                u8 len_lsb;
+                u8 flags;
+        } __attribute__ ((packed)) *resp_sb = (void *) dl->status_block;
+
+/*      int size = ((resp_sb->flags & 7) << 8) | resp_sb->len_lsb; */
+        int edb_id = ((resp_sb->flags & 0x70) >> 4)-1;
+        struct asd_ascb *escb;
+        struct asd_dma_tok *edb;
+        void *r;
+
+        spin_lock_irqsave(&asd_ha->seq.tc_index_lock, flags);
+        escb = asd_tc_index_find(&asd_ha->seq,
+                                 (int)le16_to_cpu(resp_sb->index_escb));
+        spin_unlock_irqrestore(&asd_ha->seq.tc_index_lock, flags);
+
+        if (!escb) {
+                ASD_DPRINTK("Uh-oh! No escb for this dl?!\n");
+                return;
+        }
+
+        ts->buf_valid_size = 0;
+        edb = asd_ha->seq.edb_arr[edb_id + escb->edb_index];
+        r = edb->vaddr;
+        if (task->task_proto == SAS_PROTO_SSP) {
+                struct ssp_response_iu *iu =
+                        r + 16 + sizeof(struct ssp_frame_hdr);
+
+                ts->residual = le32_to_cpu(*(__le32 *)r);
+                ts->resp = SAS_TASK_COMPLETE;
+                if (iu->datapres == 0)
+                        ts->stat = iu->status;
+                else if (iu->datapres == 1)
+                        ts->stat = iu->resp_data[3];
+                else if (iu->datapres == 2) {
+                        ts->stat = SAM_CHECK_COND;
+                        ts->buf_valid_size = min((u32) SAS_STATUS_BUF_SIZE,
+                                         be32_to_cpu(iu->sense_data_len));
+                        memcpy(ts->buf, iu->sense_data, ts->buf_valid_size);
+                        if (iu->status != SAM_CHECK_COND) {
+                                ASD_DPRINTK("device %llx sent sense data, but "
+                                            "stat(0x%x) is not CHECK_CONDITION"
+                                            "\n",
+                                            SAS_ADDR(task->dev->sas_addr),
+                                            ts->stat);
+                        }
+                }
+        } else {
+                struct ata_task_resp *resp = (void *) &ts->buf[0];
+
+                ts->residual = le32_to_cpu(*(__le32 *)r);
+
+                if (SAS_STATUS_BUF_SIZE >= sizeof(*resp)) {
+                        resp->frame_len = le16_to_cpu(*(__le16 *)(r+6));
+                        memcpy(&resp->ending_fis[0], r+16, 24);
+                        ts->buf_valid_size = sizeof(*resp);
+                }
+        }
+
+        asd_invalidate_edb(escb, edb_id);
+}
+
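+/*
+ * Map the done-list opcode reported by the sequencer to a libsas
+ * (resp, stat) pair.  TA_I_T_NEXUS_LOSS re-enters the switch with the
+ * underlying opcode taken from the status block.  Unless the upper
+ * layer has already aborted the task, the ascb is freed and
+ * task_done() is invoked.
+ */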
ts->residual = 0; + break; + case TC_SSP_RESP: + case TC_ATA_RESP: + ts->resp = SAS_TASK_COMPLETE; + ts->stat = SAS_PROTO_RESPONSE; + asd_get_response_tasklet(ascb, dl); + break; + case TF_OPEN_REJECT: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_OPEN_REJECT; + if (dl->status_block[1] & 2) + ts->open_rej_reason = 1 + dl->status_block[2]; + else if (dl->status_block[1] & 1) + ts->open_rej_reason = (dl->status_block[2] >> 4)+10; + else + ts->open_rej_reason = SAS_OREJ_UNKNOWN; + break; + case TF_OPEN_TO: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_OPEN_TO; + break; + case TF_PHY_DOWN: + case TU_PHY_DOWN: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_PHY_DOWN; + break; + case TI_PHY_DOWN: + ts->resp = SAS_TASK_COMPLETE; + ts->stat = SAS_PHY_DOWN; + break; + case TI_BREAK: + case TI_PROTO_ERR: + case TI_NAK: + case TI_ACK_NAK_TO: + case TF_SMP_XMIT_RCV_ERR: + case TC_ATA_R_ERR_RECV: + ts->resp = SAS_TASK_COMPLETE; + ts->stat = SAS_INTERRUPTED; + break; + case TF_BREAK: + case TU_BREAK: + case TU_ACK_NAK_TO: + case TF_SMPRSP_TO: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_DEV_NO_RESPONSE; + break; + case TF_NAK_RECV: + ts->resp = SAS_TASK_COMPLETE; + ts->stat = SAS_NAK_R_ERR; + break; + case TA_I_T_NEXUS_LOSS: + opcode = dl->status_block[0]; + goto Again; + break; + case TF_INV_CONN_HANDLE: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_DEVICE_UNKNOWN; + break; + case TF_REQUESTED_N_PENDING: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_PENDING; + break; + case TC_TASK_CLEARED: + case TA_ON_REQ: + ts->resp = SAS_TASK_COMPLETE; + ts->stat = SAS_ABORTED_TASK; + break; + + case TF_NO_SMP_CONN: + case TF_TMF_NO_CTX: + case TF_TMF_NO_TAG: + case TF_TMF_TAG_FREE: + case TF_TMF_TASK_DONE: + case TF_TMF_NO_CONN_HANDLE: + case TF_IRTT_TO: + case TF_IU_SHORT: + case TF_DATA_OFFS_ERR: + ts->resp = SAS_TASK_UNDELIVERED; + ts->stat = SAS_DEV_NO_RESPONSE; + break; + + case TC_LINK_ADM_RESP: + case TC_CONTROL_PHY: + case TC_RESUME: + case TC_PARTIAL_SG_LIST: + default: + ASD_DPRINTK("%s: dl opcode: 0x%x?\n", __FUNCTION__, opcode); + break; + } + + switch (task->task_proto) { + case SATA_PROTO: + case SAS_PROTO_STP: + asd_unbuild_ata_ascb(ascb); + break; + case SAS_PROTO_SMP: + asd_unbuild_smp_ascb(ascb); + break; + case SAS_PROTO_SSP: + asd_unbuild_ssp_ascb(ascb); + default: + break; + } + + spin_lock_irqsave(&task->task_state_lock, flags); + task->task_state_flags &= ~SAS_TASK_STATE_PENDING; + task->task_state_flags |= SAS_TASK_STATE_DONE; + if (unlikely((task->task_state_flags & SAS_TASK_STATE_ABORTED))) { + spin_unlock_irqrestore(&task->task_state_lock, flags); + ASD_DPRINTK("task 0x%p done with opcode 0x%x resp 0x%x " + "stat 0x%x but aborted by upper layer!\n", + task, opcode, ts->resp, ts->stat); + complete(&ascb->completion); + } else { + spin_unlock_irqrestore(&task->task_state_lock, flags); + task->lldd_task = NULL; + asd_ascb_free(ascb); + mb(); + task->task_done(task); + } +} + +/* ---------- ATA ---------- */ + +static int asd_build_ata_ascb(struct asd_ascb *ascb, struct sas_task *task, + unsigned long gfp_flags) +{ + struct domain_device *dev = task->dev; + struct scb *scb; + u8 flags; + int res = 0; + + scb = ascb->scb; + + if (unlikely(task->ata_task.device_control_reg_update)) + scb->header.opcode = CONTROL_ATA_DEV; + else if (dev->sata_dev.command_set == ATA_COMMAND_SET) + scb->header.opcode = INITIATE_ATA_TASK; + else + scb->header.opcode = INITIATE_ATAPI_TASK; + + scb->ata_task.proto_conn_rate = (1 << 5); /* STP */ + if (dev->port->oob_mode == 
SAS_OOB_MODE) + scb->ata_task.proto_conn_rate |= dev->linkrate; + + scb->ata_task.total_xfer_len = cpu_to_le32(task->total_xfer_len); + scb->ata_task.fis = task->ata_task.fis; + scb->ata_task.fis.fis_type = 0x27; + if (likely(!task->ata_task.device_control_reg_update)) + scb->ata_task.fis.flags |= 0x80; /* C=1: update ATA cmd reg */ + scb->ata_task.fis.flags &= 0xF0; /* PM_PORT field shall be 0 */ + if (dev->sata_dev.command_set == ATAPI_COMMAND_SET) + memcpy(scb->ata_task.atapi_packet, task->ata_task.atapi_packet, + 16); + scb->ata_task.sister_scb = cpu_to_le16(0xFFFF); + scb->ata_task.conn_handle = cpu_to_le16( + (u16)(unsigned long)dev->lldd_dev); + + if (likely(!task->ata_task.device_control_reg_update)) { + flags = 0; + if (task->ata_task.dma_xfer) + flags |= DATA_XFER_MODE_DMA; + if (task->ata_task.use_ncq && + dev->sata_dev.command_set != ATAPI_COMMAND_SET) + flags |= ATA_Q_TYPE_NCQ; + flags |= data_dir_flags[task->data_dir]; + scb->ata_task.ata_flags = flags; + + scb->ata_task.retry_count = task->ata_task.retry_count; + + flags = 0; + if (task->ata_task.set_affil_pol) + flags |= SET_AFFIL_POLICY; + if (task->ata_task.stp_affil_pol) + flags |= STP_AFFIL_POLICY; + scb->ata_task.flags = flags; + } + ascb->tasklet_complete = asd_task_tasklet_complete; + + if (likely(!task->ata_task.device_control_reg_update)) + res = asd_map_scatterlist(task, scb->ata_task.sg_element, + gfp_flags); + + return res; +} + +static void asd_unbuild_ata_ascb(struct asd_ascb *a) +{ + asd_unmap_scatterlist(a); +} + +/* ---------- SMP ---------- */ + +static int asd_build_smp_ascb(struct asd_ascb *ascb, struct sas_task *task, + unsigned long gfp_flags) +{ + struct asd_ha_struct *asd_ha = ascb->ha; + struct domain_device *dev = task->dev; + struct scb *scb; + + pci_map_sg(asd_ha->pcidev, &task->smp_task.smp_req, 1, + PCI_DMA_TODEVICE); + pci_map_sg(asd_ha->pcidev, &task->smp_task.smp_resp, 1, + PCI_DMA_FROMDEVICE); + + scb = ascb->scb; + + scb->header.opcode = INITIATE_SMP_TASK; + + scb->smp_task.proto_conn_rate = dev->linkrate; + + scb->smp_task.smp_req.bus_addr = + cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_req)); + scb->smp_task.smp_req.size = + cpu_to_le32((u32)sg_dma_len(&task->smp_task.smp_req)-4); + + scb->smp_task.smp_resp.bus_addr = + cpu_to_le64((u64)sg_dma_address(&task->smp_task.smp_resp)); + scb->smp_task.smp_resp.size = + cpu_to_le32((u32)sg_dma_len(&task->smp_task.smp_resp)-4); + + scb->smp_task.sister_scb = cpu_to_le16(0xFFFF); + scb->smp_task.conn_handle = cpu_to_le16((u16) + (unsigned long)dev->lldd_dev); + + ascb->tasklet_complete = asd_task_tasklet_complete; + + return 0; +} + +static void asd_unbuild_smp_ascb(struct asd_ascb *a) +{ + struct sas_task *task = a->uldd_task; + + BUG_ON(!task); + pci_unmap_sg(a->ha->pcidev, &task->smp_task.smp_req, 1, + PCI_DMA_TODEVICE); + pci_unmap_sg(a->ha->pcidev, &task->smp_task.smp_resp, 1, + PCI_DMA_FROMDEVICE); +} + +/* ---------- SSP ---------- */ + +static int asd_build_ssp_ascb(struct asd_ascb *ascb, struct sas_task *task, + unsigned long gfp_flags) +{ + struct domain_device *dev = task->dev; + struct scb *scb; + int res = 0; + + scb = ascb->scb; + + scb->header.opcode = INITIATE_SSP_TASK; + + scb->ssp_task.proto_conn_rate = (1 << 4); /* SSP */ + scb->ssp_task.proto_conn_rate |= dev->linkrate; + scb->ssp_task.total_xfer_len = cpu_to_le32(task->total_xfer_len); + scb->ssp_task.ssp_frame.frame_type = SSP_DATA; + memcpy(scb->ssp_task.ssp_frame.hashed_dest_addr, dev->hashed_sas_addr, + HASHED_SAS_ADDR_SIZE); + 
memcpy(scb->ssp_task.ssp_frame.hashed_src_addr, + dev->port->ha->hashed_sas_addr, HASHED_SAS_ADDR_SIZE); + scb->ssp_task.ssp_frame.tptt = cpu_to_be16(0xFFFF); + + memcpy(scb->ssp_task.ssp_cmd.lun, task->ssp_task.LUN, 8); + if (task->ssp_task.enable_first_burst) + scb->ssp_task.ssp_cmd.efb_prio_attr |= EFB_MASK; + scb->ssp_task.ssp_cmd.efb_prio_attr |= (task->ssp_task.task_prio << 3); + scb->ssp_task.ssp_cmd.efb_prio_attr |= (task->ssp_task.task_attr & 7); + memcpy(scb->ssp_task.ssp_cmd.cdb, task->ssp_task.cdb, 16); + + scb->ssp_task.sister_scb = cpu_to_le16(0xFFFF); + scb->ssp_task.conn_handle = cpu_to_le16( + (u16)(unsigned long)dev->lldd_dev); + scb->ssp_task.data_dir = data_dir_flags[task->data_dir]; + scb->ssp_task.retry_count = task->ssp_task.retry_count; + + ascb->tasklet_complete = asd_task_tasklet_complete; + + res = asd_map_scatterlist(task, scb->ssp_task.sg_element, gfp_flags); + + return res; +} + +static void asd_unbuild_ssp_ascb(struct asd_ascb *a) +{ + asd_unmap_scatterlist(a); +} + +/* ---------- Execute Task ---------- */ + +static inline int asd_can_queue(struct asd_ha_struct *asd_ha, int num) +{ + int res = 0; + unsigned long flags; + + spin_lock_irqsave(&asd_ha->seq.pend_q_lock, flags); + if ((asd_ha->seq.can_queue - num) < 0) + res = -SAS_QUEUE_FULL; + else + asd_ha->seq.can_queue -= num; + spin_unlock_irqrestore(&asd_ha->seq.pend_q_lock, flags); + + return res; +} + +int asd_execute_task(struct sas_task *task, const int num, + unsigned long gfp_flags) +{ + int res = 0; + LIST_HEAD(alist); + struct sas_task *t = task; + struct asd_ascb *ascb = NULL, *a; + struct asd_ha_struct *asd_ha = task->dev->port->ha->lldd_ha; + + res = asd_can_queue(asd_ha, num); + if (res) + return res; + + res = num; + ascb = asd_ascb_alloc_list(asd_ha, &res, gfp_flags); + if (res) { + res = -ENOMEM; + goto out_err; + } + + __list_add(&alist, ascb->list.prev, &ascb->list); + list_for_each_entry(a, &alist, list) { + a->uldd_task = t; + t->lldd_task = a; + t = list_entry(t->list.next, struct sas_task, list); + } + list_for_each_entry(a, &alist, list) { + t = a->uldd_task; + a->uldd_timer = 1; + if (t->task_proto & SAS_PROTO_STP) + t->task_proto = SAS_PROTO_STP; + switch (t->task_proto) { + case SATA_PROTO: + case SAS_PROTO_STP: + res = asd_build_ata_ascb(a, t, gfp_flags); + break; + case SAS_PROTO_SMP: + res = asd_build_smp_ascb(a, t, gfp_flags); + break; + case SAS_PROTO_SSP: + res = asd_build_ssp_ascb(a, t, gfp_flags); + break; + default: + asd_printk("unknown sas_task proto: 0x%x\n", + t->task_proto); + res = -ENOMEM; + break; + } + if (res) + goto out_err_unmap; + } + list_del_init(&alist); + + res = asd_post_ascb_list(asd_ha, ascb, num); + if (unlikely(res)) { + a = NULL; + __list_add(&alist, ascb->list.prev, &ascb->list); + goto out_err_unmap; + } + + return 0; +out_err_unmap: + { + struct asd_ascb *b = a; + list_for_each_entry(a, &alist, list) { + if (a == b) + break; + t = a->uldd_task; + switch (t->task_proto) { + case SATA_PROTO: + case SAS_PROTO_STP: + asd_unbuild_ata_ascb(a); + break; + case SAS_PROTO_SMP: + asd_unbuild_smp_ascb(a); + break; + case SAS_PROTO_SSP: + asd_unbuild_ssp_ascb(a); + default: + break; + } + t->lldd_task = NULL; + } + } + list_del_init(&alist); +out_err: + if (ascb) + asd_ascb_free_list(ascb); + asd_can_dequeue(asd_ha, num); + return res; +} diff -uprN linux-2.6.18/drivers/scsi/aic94xx/aic94xx_tmf.c linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_tmf.c --- linux-2.6.18/drivers/scsi/aic94xx/aic94xx_tmf.c 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.18.ovz/drivers/scsi/aic94xx/aic94xx_tmf.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,636 @@ +/* + * Aic94xx Task Management Functions + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This file is part of the aic94xx driver. + * + * The aic94xx driver is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2 of the + * License. + * + * The aic94xx driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the aic94xx driver; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <linux/spinlock.h> +#include "aic94xx.h" +#include "aic94xx_sas.h" +#include "aic94xx_hwi.h" + +/* ---------- Internal enqueue ---------- */ + +static int asd_enqueue_internal(struct asd_ascb *ascb, + void (*tasklet_complete)(struct asd_ascb *, + struct done_list_struct *), + void (*timed_out)(unsigned long)) +{ + int res; + + ascb->tasklet_complete = tasklet_complete; + ascb->uldd_timer = 1; + + ascb->timer.data = (unsigned long) ascb; + ascb->timer.function = timed_out; + ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT; + + add_timer(&ascb->timer); + + res = asd_post_ascb_list(ascb->ha, ascb, 1); + if (unlikely(res)) + del_timer(&ascb->timer); + return res; +} + +static inline void asd_timedout_common(unsigned long data) +{ + struct asd_ascb *ascb = (void *) data; + struct asd_seq_data *seq = &ascb->ha->seq; + unsigned long flags; + + spin_lock_irqsave(&seq->pend_q_lock, flags); + seq->pending--; + list_del_init(&ascb->list); + spin_unlock_irqrestore(&seq->pend_q_lock, flags); +} + +/* ---------- CLEAR NEXUS ---------- */ + +static void asd_clear_nexus_tasklet_complete(struct asd_ascb *ascb, + struct done_list_struct *dl) +{ + ASD_DPRINTK("%s: here\n", __FUNCTION__); + if (!del_timer(&ascb->timer)) { + ASD_DPRINTK("%s: couldn't delete timer\n", __FUNCTION__); + return; + } + ASD_DPRINTK("%s: opcode: 0x%x\n", __FUNCTION__, dl->opcode); + ascb->uldd_task = (void *) (unsigned long) dl->opcode; + complete(&ascb->completion); +} + +static void asd_clear_nexus_timedout(unsigned long data) +{ + struct asd_ascb *ascb = (void *) data; + + ASD_DPRINTK("%s: here\n", __FUNCTION__); + asd_timedout_common(data); + ascb->uldd_task = (void *) TMF_RESP_FUNC_FAILED; + complete(&ascb->completion); +} + +#define CLEAR_NEXUS_PRE \ + ASD_DPRINTK("%s: PRE\n", __FUNCTION__); \ + res = 1; \ + ascb = asd_ascb_alloc_list(asd_ha, &res, GFP_KERNEL); \ + if (!ascb) \ + return -ENOMEM; \ + \ + scb = ascb->scb; \ + scb->header.opcode = CLEAR_NEXUS + +#define CLEAR_NEXUS_POST \ + ASD_DPRINTK("%s: POST\n", __FUNCTION__); \ + res = asd_enqueue_internal(ascb, asd_clear_nexus_tasklet_complete, \ + asd_clear_nexus_timedout); \ + if (res) \ + goto out_err; \ + ASD_DPRINTK("%s: clear nexus posted, waiting...\n", __FUNCTION__); \ + wait_for_completion(&ascb->completion); \ + res = (int) (unsigned long) ascb->uldd_task; \ + if (res == TC_NO_ERROR) \ + res = TMF_RESP_FUNC_COMPLETE; \ +out_err: \ + asd_ascb_free(ascb); \ + return res + +int asd_clear_nexus_ha(struct sas_ha_struct *sas_ha) +{ + struct 
asd_ha_struct *asd_ha = sas_ha->lldd_ha; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_ADAPTER; + CLEAR_NEXUS_POST; +} + +int asd_clear_nexus_port(struct asd_sas_port *port) +{ + struct asd_ha_struct *asd_ha = port->ha->lldd_ha; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_PORT; + scb->clear_nexus.conn_mask = port->phy_mask; + CLEAR_NEXUS_POST; +} + +#if 0 +static int asd_clear_nexus_I_T(struct domain_device *dev) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_I_T; + scb->clear_nexus.flags = SEND_Q | EXEC_Q | NOTINQ; + if (dev->tproto) + scb->clear_nexus.flags |= SUSPEND_TX; + scb->clear_nexus.conn_handle = cpu_to_le16((u16)(unsigned long) + dev->lldd_dev); + CLEAR_NEXUS_POST; +} +#endif + +static int asd_clear_nexus_I_T_L(struct domain_device *dev, u8 *lun) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_I_T_L; + scb->clear_nexus.flags = SEND_Q | EXEC_Q | NOTINQ; + if (dev->tproto) + scb->clear_nexus.flags |= SUSPEND_TX; + memcpy(scb->clear_nexus.ssp_task.lun, lun, 8); + scb->clear_nexus.conn_handle = cpu_to_le16((u16)(unsigned long) + dev->lldd_dev); + CLEAR_NEXUS_POST; +} + +static int asd_clear_nexus_tag(struct sas_task *task) +{ + struct asd_ha_struct *asd_ha = task->dev->port->ha->lldd_ha; + struct asd_ascb *tascb = task->lldd_task; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_TAG; + memcpy(scb->clear_nexus.ssp_task.lun, task->ssp_task.LUN, 8); + scb->clear_nexus.ssp_task.tag = tascb->tag; + if (task->dev->tproto) + scb->clear_nexus.conn_handle = cpu_to_le16((u16)(unsigned long) + task->dev->lldd_dev); + CLEAR_NEXUS_POST; +} + +static int asd_clear_nexus_index(struct sas_task *task) +{ + struct asd_ha_struct *asd_ha = task->dev->port->ha->lldd_ha; + struct asd_ascb *tascb = task->lldd_task; + struct asd_ascb *ascb; + struct scb *scb; + int res; + + CLEAR_NEXUS_PRE; + scb->clear_nexus.nexus = NEXUS_TRANS_CX; + if (task->dev->tproto) + scb->clear_nexus.conn_handle = cpu_to_le16((u16)(unsigned long) + task->dev->lldd_dev); + scb->clear_nexus.index = cpu_to_le16(tascb->tc_index); + CLEAR_NEXUS_POST; +} + +/* ---------- TMFs ---------- */ + +static void asd_tmf_timedout(unsigned long data) +{ + struct asd_ascb *ascb = (void *) data; + + ASD_DPRINTK("tmf timed out\n"); + asd_timedout_common(data); + ascb->uldd_task = (void *) TMF_RESP_FUNC_FAILED; + complete(&ascb->completion); +} + +static int asd_get_tmf_resp_tasklet(struct asd_ascb *ascb, + struct done_list_struct *dl) +{ + struct asd_ha_struct *asd_ha = ascb->ha; + unsigned long flags; + struct tc_resp_sb_struct { + __le16 index_escb; + u8 len_lsb; + u8 flags; + } __attribute__ ((packed)) *resp_sb = (void *) dl->status_block; + + int edb_id = ((resp_sb->flags & 0x70) >> 4)-1; + struct asd_ascb *escb; + struct asd_dma_tok *edb; + struct ssp_frame_hdr *fh; + struct ssp_response_iu *ru; + int res = TMF_RESP_FUNC_FAILED; + + ASD_DPRINTK("tmf resp tasklet\n"); + + spin_lock_irqsave(&asd_ha->seq.tc_index_lock, flags); + escb = asd_tc_index_find(&asd_ha->seq, + (int)le16_to_cpu(resp_sb->index_escb)); + spin_unlock_irqrestore(&asd_ha->seq.tc_index_lock, flags); + + if (!escb) { + ASD_DPRINTK("Uh-oh! 
No escb for this dl?!\n"); + return res; + } + + edb = asd_ha->seq.edb_arr[edb_id + escb->edb_index]; + ascb->tag = *(__be16 *)(edb->vaddr+4); + fh = edb->vaddr + 16; + ru = edb->vaddr + 16 + sizeof(*fh); + res = ru->status; + if (ru->datapres == 1) /* Response data present */ + res = ru->resp_data[3]; +#if 0 + ascb->tag = fh->tag; +#endif + ascb->tag_valid = 1; + + asd_invalidate_edb(escb, edb_id); + return res; +} + +static void asd_tmf_tasklet_complete(struct asd_ascb *ascb, + struct done_list_struct *dl) +{ + if (!del_timer(&ascb->timer)) + return; + + ASD_DPRINTK("tmf tasklet complete\n"); + + if (dl->opcode == TC_SSP_RESP) + ascb->uldd_task = (void *) (unsigned long) + asd_get_tmf_resp_tasklet(ascb, dl); + else + ascb->uldd_task = (void *) 0xFF00 + (unsigned long) dl->opcode; + + complete(&ascb->completion); +} + +static inline int asd_clear_nexus(struct sas_task *task) +{ + int res = TMF_RESP_FUNC_FAILED; + struct asd_ascb *tascb = task->lldd_task; + unsigned long flags; + + ASD_DPRINTK("task not done, clearing nexus\n"); + if (tascb->tag_valid) + res = asd_clear_nexus_tag(task); + else + res = asd_clear_nexus_index(task); + wait_for_completion_timeout(&tascb->completion, + AIC94XX_SCB_TIMEOUT); + ASD_DPRINTK("came back from clear nexus\n"); + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) + res = TMF_RESP_FUNC_COMPLETE; + spin_unlock_irqrestore(&task->task_state_lock, flags); + + return res; +} + +/** + * asd_abort_task -- ABORT TASK TMF + * @task: the task to be aborted + * + * Before calling ABORT TASK the task state flags should be ORed with + * SAS_TASK_STATE_ABORTED (unless SAS_TASK_STATE_DONE is set) under + * the task_state_lock IRQ spinlock, then ABORT TASK *must* be called. + * + * Implements the ABORT TASK TMF, I_T_L_Q nexus. + * Returns: SAS TMF responses (see sas_task.h), + * -ENOMEM, + * -SAS_QUEUE_FULL. + * + * When ABORT TASK returns, the caller of ABORT TASK checks first the + * task->task_state_flags, and then the return value of ABORT TASK. + * + * If the task has task state bit SAS_TASK_STATE_DONE set, then the + * task was completed successfully prior to it being aborted. The + * caller of ABORT TASK has responsibility to call task->task_done() + * xor free the task, depending on their framework. The return code + * is TMF_RESP_FUNC_FAILED in this case. + * + * Else the SAS_TASK_STATE_DONE bit is not set, + * If the return code is TMF_RESP_FUNC_COMPLETE, then + * the task was aborted successfully. The caller of + * ABORT TASK has responsibility to call task->task_done() + * to finish the task, xor free the task depending on their + * framework. + * else + * the ABORT TASK returned some kind of error. The task + * was _not_ cancelled. Nothing can be assumed. + * The caller of ABORT TASK may wish to retry. 
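+ *
+ * A minimal sketch of the caller protocol described above (illustrative
+ * pseudo-code only, not part of the driver; locals and error handling
+ * are elided):
+ *
+ *	spin_lock_irqsave(&task->task_state_lock, flags);
+ *	if (!(task->task_state_flags & SAS_TASK_STATE_DONE))
+ *		task->task_state_flags |= SAS_TASK_STATE_ABORTED;
+ *	spin_unlock_irqrestore(&task->task_state_lock, flags);
+ *	res = asd_abort_task(task);
+ *	if ((task->task_state_flags & SAS_TASK_STATE_DONE) ||
+ *	    res == TMF_RESP_FUNC_COMPLETE)
+ *		task->task_done(task);	/* xor free it, per the framework */
+ *	/* else: not cancelled, nothing can be assumed; maybe retry */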
+ */ +int asd_abort_task(struct sas_task *task) +{ + struct asd_ascb *tascb = task->lldd_task; + struct asd_ha_struct *asd_ha = tascb->ha; + int res = 1; + unsigned long flags; + struct asd_ascb *ascb = NULL; + struct scb *scb; + + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) { + spin_unlock_irqrestore(&task->task_state_lock, flags); + res = TMF_RESP_FUNC_COMPLETE; + ASD_DPRINTK("%s: task 0x%p done\n", __FUNCTION__, task); + goto out_done; + } + spin_unlock_irqrestore(&task->task_state_lock, flags); + + ascb = asd_ascb_alloc_list(asd_ha, &res, GFP_KERNEL); + if (!ascb) + return -ENOMEM; + scb = ascb->scb; + + scb->header.opcode = ABORT_TASK; + + switch (task->task_proto) { + case SATA_PROTO: + case SAS_PROTO_STP: + scb->abort_task.proto_conn_rate = (1 << 5); /* STP */ + break; + case SAS_PROTO_SSP: + scb->abort_task.proto_conn_rate = (1 << 4); /* SSP */ + scb->abort_task.proto_conn_rate |= task->dev->linkrate; + break; + case SAS_PROTO_SMP: + break; + default: + break; + } + + if (task->task_proto == SAS_PROTO_SSP) { + scb->abort_task.ssp_frame.frame_type = SSP_TASK; + memcpy(scb->abort_task.ssp_frame.hashed_dest_addr, + task->dev->hashed_sas_addr, HASHED_SAS_ADDR_SIZE); + memcpy(scb->abort_task.ssp_frame.hashed_src_addr, + task->dev->port->ha->hashed_sas_addr, + HASHED_SAS_ADDR_SIZE); + scb->abort_task.ssp_frame.tptt = cpu_to_be16(0xFFFF); + + memcpy(scb->abort_task.ssp_task.lun, task->ssp_task.LUN, 8); + scb->abort_task.ssp_task.tmf = TMF_ABORT_TASK; + scb->abort_task.ssp_task.tag = cpu_to_be16(0xFFFF); + } + + scb->abort_task.sister_scb = cpu_to_le16(0xFFFF); + scb->abort_task.conn_handle = cpu_to_le16( + (u16)(unsigned long)task->dev->lldd_dev); + scb->abort_task.retry_count = 1; + scb->abort_task.index = cpu_to_le16((u16)tascb->tc_index); + scb->abort_task.itnl_to = cpu_to_le16(ITNL_TIMEOUT_CONST); + + res = asd_enqueue_internal(ascb, asd_tmf_tasklet_complete, + asd_tmf_timedout); + if (res) + goto out; + wait_for_completion(&ascb->completion); + ASD_DPRINTK("tmf came back\n"); + + res = (int) (unsigned long) ascb->uldd_task; + tascb->tag = ascb->tag; + tascb->tag_valid = ascb->tag_valid; + + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) { + spin_unlock_irqrestore(&task->task_state_lock, flags); + res = TMF_RESP_FUNC_COMPLETE; + ASD_DPRINTK("%s: task 0x%p done\n", __FUNCTION__, task); + goto out_done; + } + spin_unlock_irqrestore(&task->task_state_lock, flags); + + switch (res) { + /* The task to be aborted has been sent to the device. + * We got a Response IU for the ABORT TASK TMF. */ + case TC_NO_ERROR + 0xFF00: + case TMF_RESP_FUNC_COMPLETE: + case TMF_RESP_FUNC_FAILED: + res = asd_clear_nexus(task); + break; + case TMF_RESP_INVALID_FRAME: + case TMF_RESP_OVERLAPPED_TAG: + case TMF_RESP_FUNC_ESUPP: + case TMF_RESP_NO_LUN: + goto out_done; break; + } + /* In the following we assume that the managing layer + * will _never_ make a mistake, when issuing ABORT TASK. + */ + switch (res) { + default: + res = asd_clear_nexus(task); + /* fallthrough */ + case TC_NO_ERROR + 0xFF00: + case TMF_RESP_FUNC_COMPLETE: + break; + /* The task hasn't been sent to the device xor we never got + * a (sane) Response IU for the ABORT TASK TMF. 
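+ *
+ * (Note, inferred from asd_tmf_tasklet_complete() above: the
+ * "+ 0xFF00" case labels are raw firmware done-list opcodes, which
+ * that tasklet stores as 0xFF00 + dl->opcode precisely so they can
+ * never collide with the TMF response codes parsed out of a
+ * Response IU.)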
+ */ + case TF_NAK_RECV + 0xFF00: + res = TMF_RESP_INVALID_FRAME; + break; + case TF_TMF_TASK_DONE + 0xFF00: /* done but not reported yet */ + res = TMF_RESP_FUNC_FAILED; + wait_for_completion_timeout(&tascb->completion, + AIC94XX_SCB_TIMEOUT); + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) + res = TMF_RESP_FUNC_COMPLETE; + spin_unlock_irqrestore(&task->task_state_lock, flags); + goto out_done; + case TF_TMF_NO_TAG + 0xFF00: + case TF_TMF_TAG_FREE + 0xFF00: /* the tag is in the free list */ + case TF_TMF_NO_CONN_HANDLE + 0xFF00: /* no such device */ + res = TMF_RESP_FUNC_COMPLETE; + goto out_done; + case TF_TMF_NO_CTX + 0xFF00: /* not in seq, or proto != SSP */ + res = TMF_RESP_FUNC_ESUPP; + goto out; + } +out_done: + if (res == TMF_RESP_FUNC_COMPLETE) { + task->lldd_task = NULL; + mb(); + asd_ascb_free(tascb); + } +out: + asd_ascb_free(ascb); + ASD_DPRINTK("task 0x%p aborted, res: 0x%x\n", task, res); + return res; +} + +/** + * asd_initiate_ssp_tmf -- send a TMF to an I_T_L or I_T_L_Q nexus + * @dev: pointer to struct domain_device of interest + * @lun: pointer to u8[8] which is the LUN + * @tmf: the TMF to be performed (see sas_task.h or the SAS spec) + * @index: the transaction context of the task to be queried if QT TMF + * + * This function is used to send ABORT TASK SET, CLEAR ACA, + * CLEAR TASK SET, LU RESET and QUERY TASK TMFs. + * + * No SCBs should be queued to the I_T_L nexus when this SCB is + * pending. + * + * Returns: TMF response code (see sas_task.h or the SAS spec) + */ +static int asd_initiate_ssp_tmf(struct domain_device *dev, u8 *lun, + int tmf, int index) +{ + struct asd_ha_struct *asd_ha = dev->port->ha->lldd_ha; + struct asd_ascb *ascb; + int res = 1; + struct scb *scb; + + if (!(dev->tproto & SAS_PROTO_SSP)) + return TMF_RESP_FUNC_ESUPP; + + ascb = asd_ascb_alloc_list(asd_ha, &res, GFP_KERNEL); + if (!ascb) + return -ENOMEM; + scb = ascb->scb; + + if (tmf == TMF_QUERY_TASK) + scb->header.opcode = QUERY_SSP_TASK; + else + scb->header.opcode = INITIATE_SSP_TMF; + + scb->ssp_tmf.proto_conn_rate = (1 << 4); /* SSP */ + scb->ssp_tmf.proto_conn_rate |= dev->linkrate; + /* SSP frame header */ + scb->ssp_tmf.ssp_frame.frame_type = SSP_TASK; + memcpy(scb->ssp_tmf.ssp_frame.hashed_dest_addr, + dev->hashed_sas_addr, HASHED_SAS_ADDR_SIZE); + memcpy(scb->ssp_tmf.ssp_frame.hashed_src_addr, + dev->port->ha->hashed_sas_addr, HASHED_SAS_ADDR_SIZE); + scb->ssp_tmf.ssp_frame.tptt = cpu_to_be16(0xFFFF); + /* SSP Task IU */ + memcpy(scb->ssp_tmf.ssp_task.lun, lun, 8); + scb->ssp_tmf.ssp_task.tmf = tmf; + + scb->ssp_tmf.sister_scb = cpu_to_le16(0xFFFF); + scb->ssp_tmf.conn_handle= cpu_to_le16((u16)(unsigned long) + dev->lldd_dev); + scb->ssp_tmf.retry_count = 1; + scb->ssp_tmf.itnl_to = cpu_to_le16(ITNL_TIMEOUT_CONST); + if (tmf == TMF_QUERY_TASK) + scb->ssp_tmf.index = cpu_to_le16(index); + + res = asd_enqueue_internal(ascb, asd_tmf_tasklet_complete, + asd_tmf_timedout); + if (res) + goto out_err; + wait_for_completion(&ascb->completion); + res = (int) (unsigned long) ascb->uldd_task; + + switch (res) { + case TC_NO_ERROR + 0xFF00: + res = TMF_RESP_FUNC_COMPLETE; + break; + case TF_NAK_RECV + 0xFF00: + res = TMF_RESP_INVALID_FRAME; + break; + case TF_TMF_TASK_DONE + 0xFF00: + res = TMF_RESP_FUNC_FAILED; + break; + case TF_TMF_NO_TAG + 0xFF00: + case TF_TMF_TAG_FREE + 0xFF00: /* the tag is in the free list */ + case TF_TMF_NO_CONN_HANDLE + 0xFF00: /* no such device */ + res = TMF_RESP_FUNC_COMPLETE; + break; + case TF_TMF_NO_CTX 
+ 0xFF00: /* not in seq, or proto != SSP */ + res = TMF_RESP_FUNC_ESUPP; + break; + default: + ASD_DPRINTK("%s: converting result 0x%x to TMF_RESP_FUNC_FAILED\n", + __FUNCTION__, res); + res = TMF_RESP_FUNC_FAILED; + break; + } +out_err: + asd_ascb_free(ascb); + return res; +} + +int asd_abort_task_set(struct domain_device *dev, u8 *lun) +{ + int res = asd_initiate_ssp_tmf(dev, lun, TMF_ABORT_TASK_SET, 0); + + if (res == TMF_RESP_FUNC_COMPLETE) + asd_clear_nexus_I_T_L(dev, lun); + return res; +} + +int asd_clear_aca(struct domain_device *dev, u8 *lun) +{ + int res = asd_initiate_ssp_tmf(dev, lun, TMF_CLEAR_ACA, 0); + + if (res == TMF_RESP_FUNC_COMPLETE) + asd_clear_nexus_I_T_L(dev, lun); + return res; +} + +int asd_clear_task_set(struct domain_device *dev, u8 *lun) +{ + int res = asd_initiate_ssp_tmf(dev, lun, TMF_CLEAR_TASK_SET, 0); + + if (res == TMF_RESP_FUNC_COMPLETE) + asd_clear_nexus_I_T_L(dev, lun); + return res; +} + +int asd_lu_reset(struct domain_device *dev, u8 *lun) +{ + int res = asd_initiate_ssp_tmf(dev, lun, TMF_LU_RESET, 0); + + if (res == TMF_RESP_FUNC_COMPLETE) + asd_clear_nexus_I_T_L(dev, lun); + return res; +} + +/** + * asd_query_task -- send a QUERY TASK TMF to an I_T_L_Q nexus + * task: pointer to sas_task struct of interest + * + * Returns: TMF_RESP_FUNC_COMPLETE if the task is not in the task set, + * or TMF_RESP_FUNC_SUCC if the task is in the task set. + * + * Normally the management layer sets the task to aborted state, + * and then calls query task and then abort task. + */ +int asd_query_task(struct sas_task *task) +{ + struct asd_ascb *ascb = task->lldd_task; + int index; + + if (ascb) { + index = ascb->tc_index; + return asd_initiate_ssp_tmf(task->dev, task->ssp_task.LUN, + TMF_QUERY_TASK, index); + } + return TMF_RESP_FUNC_COMPLETE; +} diff -uprN linux-2.6.18/drivers/scsi/libsas/Kconfig linux-2.6.18.ovz/drivers/scsi/libsas/Kconfig --- linux-2.6.18/drivers/scsi/libsas/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,39 @@ +# +# Kernel configuration file for the SAS Class +# +# Copyright (C) 2005 Adaptec, Inc. All rights reserved. +# Copyright (C) 2005 Luben Tuikov +# +# This file is licensed under GPLv2. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; version 2 of the +# License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA +# + +config SCSI_SAS_LIBSAS + tristate "SAS Domain Transport Attributes" + depends on SCSI + select SCSI_SAS_ATTRS + help + This provides transport specific helpers for SAS drivers which + use the domain device construct (like the aic94xxx). + +config SCSI_SAS_LIBSAS_DEBUG + bool "Compile the SAS Domain Transport Attributes in debug mode" + default y + depends on SCSI_SAS_LIBSAS + help + Compiles the SAS Layer in debug mode. In debug mode, the + SAS Layer prints diagnostic and debug messages. 
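The SCSI_SAS_LIBSAS_DEBUG option above acts purely at compile time: the Makefile below turns it into -DSAS_DEBUG, and libsas's SAS_DPRINTK macro (defined in sas_internal.h) compiles to a printk only when that symbol is set. A minimal sketch of the gating pattern, illustrative rather than the verbatim driver source:

#ifdef SAS_DEBUG
#define SAS_DPRINTK(fmt, ...) printk(KERN_DEBUG "sas: " fmt, ## __VA_ARGS__)
#else
#define SAS_DPRINTK(fmt, ...)
#endif

With the option off, the empty variant removes both the calls and their format strings from the object code, so a non-debug build carries no overhead for these messages.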
diff -uprN linux-2.6.18/drivers/scsi/libsas/Makefile linux-2.6.18.ovz/drivers/scsi/libsas/Makefile --- linux-2.6.18/drivers/scsi/libsas/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/Makefile 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,36 @@ +# +# Kernel Makefile for the libsas helpers +# +# Copyright (C) 2005 Adaptec, Inc. All rights reserved. +# Copyright (C) 2005 Luben Tuikov +# +# This file is licensed under GPLv2. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; version 2 of the +# License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA + +ifeq ($(CONFIG_SCSI_SAS_LIBSAS_DEBUG),y) + EXTRA_CFLAGS += -DSAS_DEBUG +endif + +obj-$(CONFIG_SCSI_SAS_LIBSAS) += libsas.o +libsas-y += sas_init.o \ + sas_phy.o \ + sas_port.o \ + sas_event.o \ + sas_dump.o \ + sas_discover.o \ + sas_expander.o \ + sas_scsi_host.o diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_discover.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_discover.c --- linux-2.6.18/drivers/scsi/libsas/sas_discover.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_discover.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,749 @@ +/* + * Serial Attached SCSI (SAS) Discover process + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/scatterlist.h> +#include "sas_internal.h" + +#include <scsi/scsi_host.h> +#include <scsi/scsi_eh.h> +#include "../scsi_sas_internal.h" + +/* ---------- Basic task processing for discovery purposes ---------- */ + +void sas_init_dev(struct domain_device *dev) +{ + INIT_LIST_HEAD(&dev->siblings); + INIT_LIST_HEAD(&dev->dev_list_node); + switch (dev->dev_type) { + case SAS_END_DEV: + break; + case EDGE_DEV: + case FANOUT_DEV: + INIT_LIST_HEAD(&dev->ex_dev.children); + break; + case SATA_DEV: + case SATA_PM: + case SATA_PM_PORT: + INIT_LIST_HEAD(&dev->sata_dev.children); + break; + default: + break; + } +} + +static void sas_task_timedout(unsigned long _task) +{ + struct sas_task *task = (void *) _task; + unsigned long flags; + + spin_lock_irqsave(&task->task_state_lock, flags); + if (!(task->task_state_flags & SAS_TASK_STATE_DONE)) + task->task_state_flags |= SAS_TASK_STATE_ABORTED; + spin_unlock_irqrestore(&task->task_state_lock, flags); + + complete(&task->completion); +} + +static void sas_disc_task_done(struct sas_task *task) +{ + if (!del_timer(&task->timer)) + return; + complete(&task->completion); +} + +#define SAS_DEV_TIMEOUT 10 + +/** + * sas_execute_task -- Basic task processing for discovery + * @task: the task to be executed + * @buffer: pointer to buffer to do I/O + * @size: size of @buffer + * @pci_dma_dir: PCI_DMA_... + */ +static int sas_execute_task(struct sas_task *task, void *buffer, int size, + int pci_dma_dir) +{ + int res = 0; + struct scatterlist *scatter = NULL; + struct task_status_struct *ts = &task->task_status; + int num_scatter = 0; + int retries = 0; + struct sas_internal *i = + to_sas_internal(task->dev->port->ha->core.shost->transportt); + + if (pci_dma_dir != PCI_DMA_NONE) { + scatter = kzalloc(sizeof(*scatter), GFP_KERNEL); + if (!scatter) + goto out; + + sg_init_one(scatter, buffer, size); + num_scatter = 1; + } + + task->task_proto = task->dev->tproto; + task->scatter = scatter; + task->num_scatter = num_scatter; + task->total_xfer_len = size; + task->data_dir = pci_dma_dir; + task->task_done = sas_disc_task_done; + + for (retries = 0; retries < 5; retries++) { + task->task_state_flags = SAS_TASK_STATE_PENDING; + init_completion(&task->completion); + + task->timer.data = (unsigned long) task; + task->timer.function = sas_task_timedout; + task->timer.expires = jiffies + SAS_DEV_TIMEOUT*HZ; + add_timer(&task->timer); + + res = i->dft->lldd_execute_task(task, 1, GFP_KERNEL); + if (res) { + del_timer(&task->timer); + SAS_DPRINTK("executing SAS discovery task failed:%d\n", + res); + goto ex_err; + } + wait_for_completion(&task->completion); + res = -ETASK; + if (task->task_state_flags & SAS_TASK_STATE_ABORTED) { + int res2; + SAS_DPRINTK("task aborted, flags:0x%x\n", + task->task_state_flags); + res2 = i->dft->lldd_abort_task(task); + SAS_DPRINTK("came back from abort task\n"); + if (!(task->task_state_flags & SAS_TASK_STATE_DONE)) { + if (res2 == TMF_RESP_FUNC_COMPLETE) + continue; /* Retry the task */ + else + goto ex_err; + } + } + if (task->task_status.stat == SAM_BUSY || + task->task_status.stat == SAM_TASK_SET_FULL || + task->task_status.stat == SAS_QUEUE_FULL) { + SAS_DPRINTK("task: q busy, sleeping...\n"); + schedule_timeout_interruptible(HZ); + } else if (task->task_status.stat == SAM_CHECK_COND) { + struct scsi_sense_hdr shdr; + + if 
(!scsi_normalize_sense(ts->buf, ts->buf_valid_size, + &shdr)) { + SAS_DPRINTK("couldn't normalize sense\n"); + continue; + } + if ((shdr.sense_key == 6 && shdr.asc == 0x29) || + (shdr.sense_key == 2 && shdr.asc == 4 && + shdr.ascq == 1)) { + SAS_DPRINTK("device %016llx LUN: %016llx " + "powering up or not ready yet, " + "sleeping...\n", + SAS_ADDR(task->dev->sas_addr), + SAS_ADDR(task->ssp_task.LUN)); + + schedule_timeout_interruptible(5*HZ); + } else if (shdr.sense_key == 1) { + res = 0; + break; + } else if (shdr.sense_key == 5) { + break; + } else { + SAS_DPRINTK("dev %016llx LUN: %016llx " + "sense key:0x%x ASC:0x%x ASCQ:0x%x" + "\n", + SAS_ADDR(task->dev->sas_addr), + SAS_ADDR(task->ssp_task.LUN), + shdr.sense_key, + shdr.asc, shdr.ascq); + } + } else if (task->task_status.resp != SAS_TASK_COMPLETE || + task->task_status.stat != SAM_GOOD) { + SAS_DPRINTK("task finished with resp:0x%x, " + "stat:0x%x\n", + task->task_status.resp, + task->task_status.stat); + goto ex_err; + } else { + res = 0; + break; + } + } +ex_err: + if (pci_dma_dir != PCI_DMA_NONE) + kfree(scatter); +out: + return res; +} + +/* ---------- Domain device discovery ---------- */ + +/** + * sas_get_port_device -- Discover devices which caused port creation + * @port: pointer to struct sas_port of interest + * + * Devices directly attached to a HA port, have no parent. This is + * how we know they are (domain) "root" devices. All other devices + * do, and should have their "parent" pointer set appropriately as + * soon as a child device is discovered. + */ +static int sas_get_port_device(struct asd_sas_port *port) +{ + unsigned long flags; + struct asd_sas_phy *phy; + struct sas_rphy *rphy; + struct domain_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + spin_lock_irqsave(&port->phy_list_lock, flags); + if (list_empty(&port->phy_list)) { + spin_unlock_irqrestore(&port->phy_list_lock, flags); + kfree(dev); + return -ENODEV; + } + phy = container_of(port->phy_list.next, struct asd_sas_phy, port_phy_el); + spin_lock(&phy->frame_rcvd_lock); + memcpy(dev->frame_rcvd, phy->frame_rcvd, min(sizeof(dev->frame_rcvd), + (size_t)phy->frame_rcvd_size)); + spin_unlock(&phy->frame_rcvd_lock); + spin_unlock_irqrestore(&port->phy_list_lock, flags); + + if (dev->frame_rcvd[0] == 0x34 && port->oob_mode == SATA_OOB_MODE) { + struct dev_to_host_fis *fis = + (struct dev_to_host_fis *) dev->frame_rcvd; + if (fis->interrupt_reason == 1 && fis->lbal == 1 && + fis->byte_count_low==0x69 && fis->byte_count_high == 0x96 + && (fis->device & ~0x10) == 0) + dev->dev_type = SATA_PM; + else + dev->dev_type = SATA_DEV; + dev->tproto = SATA_PROTO; + } else { + struct sas_identify_frame *id = + (struct sas_identify_frame *) dev->frame_rcvd; + dev->dev_type = id->dev_type; + dev->iproto = id->initiator_bits; + dev->tproto = id->target_bits; + } + + sas_init_dev(dev); + + switch (dev->dev_type) { + case SAS_END_DEV: + rphy = sas_end_device_alloc(port->port); + break; + case EDGE_DEV: + rphy = sas_expander_alloc(port->port, + SAS_EDGE_EXPANDER_DEVICE); + break; + case FANOUT_DEV: + rphy = sas_expander_alloc(port->port, + SAS_FANOUT_EXPANDER_DEVICE); + break; + case SATA_DEV: + default: + printk("ERROR: Unidentified device type %d\n", dev->dev_type); + rphy = NULL; + break; + } + + if (!rphy) { + kfree(dev); + return -ENODEV; + } + rphy->identify.phy_identifier = phy->phy->identify.phy_identifier; + memcpy(dev->sas_addr, port->attached_sas_addr, SAS_ADDR_SIZE); + sas_fill_in_rphy(dev, rphy); + 
sas_hash_addr(dev->hashed_sas_addr, dev->sas_addr); + port->port_dev = dev; + dev->port = port; + dev->linkrate = port->linkrate; + dev->min_linkrate = port->linkrate; + dev->max_linkrate = port->linkrate; + dev->pathways = port->num_phys; + memset(port->disc.fanout_sas_addr, 0, SAS_ADDR_SIZE); + memset(port->disc.eeds_a, 0, SAS_ADDR_SIZE); + memset(port->disc.eeds_b, 0, SAS_ADDR_SIZE); + port->disc.max_level = 0; + + dev->rphy = rphy; + spin_lock(&port->dev_list_lock); + list_add_tail(&dev->dev_list_node, &port->dev_list); + spin_unlock(&port->dev_list_lock); + + return 0; +} + +/* ---------- Discover and Revalidate ---------- */ + +/* ---------- SATA ---------- */ + +static void sas_get_ata_command_set(struct domain_device *dev) +{ + struct dev_to_host_fis *fis = + (struct dev_to_host_fis *) dev->frame_rcvd; + + if ((fis->sector_count == 1 && /* ATA */ + fis->lbal == 1 && + fis->lbam == 0 && + fis->lbah == 0 && + fis->device == 0) + || + (fis->sector_count == 0 && /* CE-ATA (mATA) */ + fis->lbal == 0 && + fis->lbam == 0xCE && + fis->lbah == 0xAA && + (fis->device & ~0x10) == 0)) + + dev->sata_dev.command_set = ATA_COMMAND_SET; + + else if ((fis->interrupt_reason == 1 && /* ATAPI */ + fis->lbal == 1 && + fis->byte_count_low == 0x14 && + fis->byte_count_high == 0xEB && + (fis->device & ~0x10) == 0)) + + dev->sata_dev.command_set = ATAPI_COMMAND_SET; + + else if ((fis->sector_count == 1 && /* SEMB */ + fis->lbal == 1 && + fis->lbam == 0x3C && + fis->lbah == 0xC3 && + fis->device == 0) + || + (fis->interrupt_reason == 1 && /* SATA PM */ + fis->lbal == 1 && + fis->byte_count_low == 0x69 && + fis->byte_count_high == 0x96 && + (fis->device & ~0x10) == 0)) + + /* Treat it as a superset? */ + dev->sata_dev.command_set = ATAPI_COMMAND_SET; +} + +/** + * sas_issue_ata_cmd -- Basic SATA command processing for discovery + * @dev: the device to send the command to + * @command: the command register + * @features: the features register + * @buffer: pointer to buffer to do I/O + * @size: size of @buffer + * @pci_dma_dir: PCI_DMA_... + */ +static int sas_issue_ata_cmd(struct domain_device *dev, u8 command, + u8 features, void *buffer, int size, + int pci_dma_dir) +{ + int res = 0; + struct sas_task *task; + struct dev_to_host_fis *d2h_fis = (struct dev_to_host_fis *) + &dev->frame_rcvd[0]; + + res = -ENOMEM; + task = sas_alloc_task(GFP_KERNEL); + if (!task) + goto out; + + task->dev = dev; + + task->ata_task.fis.command = command; + task->ata_task.fis.features = features; + task->ata_task.fis.device = d2h_fis->device; + task->ata_task.retry_count = 1; + + res = sas_execute_task(task, buffer, size, pci_dma_dir); + + sas_free_task(task); +out: + return res; +} + +static void sas_sata_propagate_sas_addr(struct domain_device *dev) +{ + unsigned long flags; + struct asd_sas_port *port = dev->port; + struct asd_sas_phy *phy; + + BUG_ON(dev->parent); + + memcpy(port->attached_sas_addr, dev->sas_addr, SAS_ADDR_SIZE); + spin_lock_irqsave(&port->phy_list_lock, flags); + list_for_each_entry(phy, &port->phy_list, port_phy_el) + memcpy(phy->attached_sas_addr, dev->sas_addr, SAS_ADDR_SIZE); + spin_unlock_irqrestore(&port->phy_list_lock, flags); +} + +#define ATA_IDENTIFY_DEV 0xEC +#define ATA_IDENTIFY_PACKET_DEV 0xA1 +#define ATA_SET_FEATURES 0xEF +#define ATA_FEATURE_PUP_STBY_SPIN_UP 0x07 + +/** + * sas_discover_sata_dev -- discover a STP/SATA device (SATA_DEV) + * @dev: STP/SATA device of interest (ATA/ATAPI) + * + * The LLDD has already been notified of this device, so that we can + * send FISes to it. 
Here we try to get IDENTIFY + PACKET DEVICE, if ATAPI device, so that the LLDD can fine-tune its + * performance for this device. + */ +static int sas_discover_sata_dev(struct domain_device *dev) +{ + int res; + __le16 *identify_x; + u8 command; + + identify_x = kzalloc(512, GFP_KERNEL); + if (!identify_x) + return -ENOMEM; + + if (dev->sata_dev.command_set == ATA_COMMAND_SET) { + dev->sata_dev.identify_device = identify_x; + command = ATA_IDENTIFY_DEV; + } else { + dev->sata_dev.identify_packet_device = identify_x; + command = ATA_IDENTIFY_PACKET_DEV; + } + + res = sas_issue_ata_cmd(dev, command, 0, identify_x, 512, + PCI_DMA_FROMDEVICE); + if (res) + goto out_err; + + /* lives on the media? */ + if (le16_to_cpu(identify_x[0]) & 4) { + /* incomplete response */ + SAS_DPRINTK("sending SET FEATURE/PUP_STBY_SPIN_UP to " + "dev %llx\n", SAS_ADDR(dev->sas_addr)); + if (!(le16_to_cpu(identify_x[83]) & (1<<6))) + goto cont1; + res = sas_issue_ata_cmd(dev, ATA_SET_FEATURES, + ATA_FEATURE_PUP_STBY_SPIN_UP, + NULL, 0, PCI_DMA_NONE); + if (res) + goto cont1; + + schedule_timeout_interruptible(5*HZ); /* More time? */ + res = sas_issue_ata_cmd(dev, command, 0, identify_x, 512, + PCI_DMA_FROMDEVICE); + if (res) + goto out_err; + } +cont1: + /* Get WWN */ + if (dev->port->oob_mode != SATA_OOB_MODE) { + memcpy(dev->sas_addr, dev->sata_dev.rps_resp.rps.stp_sas_addr, + SAS_ADDR_SIZE); + } else if (dev->sata_dev.command_set == ATA_COMMAND_SET && + (le16_to_cpu(dev->sata_dev.identify_device[108]) & 0xF000) + == 0x5000) { + int i; + + for (i = 0; i < 4; i++) { + dev->sas_addr[2*i] = + (le16_to_cpu(dev->sata_dev.identify_device[108+i]) & 0xFF00) >> 8; + dev->sas_addr[2*i+1] = + le16_to_cpu(dev->sata_dev.identify_device[108+i]) & 0x00FF; + } + } + sas_hash_addr(dev->hashed_sas_addr, dev->sas_addr); + if (!dev->parent) + sas_sata_propagate_sas_addr(dev); + + /* XXX Hint: register this SATA device with SATL. + When this returns, dev->sata_dev->lu is alive and + present. + sas_satl_register_dev(dev); + */ + return 0; +out_err: + dev->sata_dev.identify_packet_device = NULL; + dev->sata_dev.identify_device = NULL; + kfree(identify_x); + return res; +} + +static int sas_discover_sata_pm(struct domain_device *dev) +{ + return -ENODEV; +} + +int sas_notify_lldd_dev_found(struct domain_device *dev) +{ + int res = 0; + struct sas_ha_struct *sas_ha = dev->port->ha; + struct Scsi_Host *shost = sas_ha->core.shost; + struct sas_internal *i = to_sas_internal(shost->transportt); + + if (i->dft->lldd_dev_found) { + res = i->dft->lldd_dev_found(dev); + if (res) { + printk("sas: driver on pcidev %s cannot handle " + "device %llx, error:%d\n", + pci_name(sas_ha->pcidev), + SAS_ADDR(dev->sas_addr), res); + } + } + return res; +} + + +void sas_notify_lldd_dev_gone(struct domain_device *dev) +{ + struct sas_ha_struct *sas_ha = dev->port->ha; + struct Scsi_Host *shost = sas_ha->core.shost; + struct sas_internal *i = to_sas_internal(shost->transportt); + + if (i->dft->lldd_dev_gone) + i->dft->lldd_dev_gone(dev); +} + +/* ---------- Common/dispatchers ---------- */ + +/** + * sas_discover_sata -- discover an STP/SATA domain device + * @dev: pointer to struct domain_device of interest + * + * First we notify the LLDD of this device, so we can send frames to + * it. Then depending on the type of device we call the appropriate + * discover functions. Once device discover is done, we notify the + * LLDD so that it can fine-tune its parameters for the device, by + * removing it and then adding it. 
That is, the second time around, + * the driver would have certain fields, that it is looking at, set. + * Finally we initialize the kobj so that the device can be added to + * the system at registration time. Devices directly attached to a HA + * port, have no parents. All other devices do, and should have their + * "parent" pointer set appropriately before calling this function. + */ +int sas_discover_sata(struct domain_device *dev) +{ + int res; + + sas_get_ata_command_set(dev); + + res = sas_notify_lldd_dev_found(dev); + if (res) + return res; + + switch (dev->dev_type) { + case SATA_DEV: + res = sas_discover_sata_dev(dev); + break; + case SATA_PM: + res = sas_discover_sata_pm(dev); + break; + default: + break; + } + + sas_notify_lldd_dev_gone(dev); + if (!res) { + sas_notify_lldd_dev_found(dev); + } + return res; +} + +/** + * sas_discover_end_dev -- discover an end device (SSP, etc) + * @end: pointer to domain device of interest + * + * See comment in sas_discover_sata(). + */ +int sas_discover_end_dev(struct domain_device *dev) +{ + int res; + + res = sas_notify_lldd_dev_found(dev); + if (res) + return res; + + res = sas_rphy_add(dev->rphy); + if (res) + goto out_err; + + /* do this to get the end device port attributes which will have + * been scanned in sas_rphy_add */ + sas_notify_lldd_dev_gone(dev); + sas_notify_lldd_dev_found(dev); + + return 0; + +out_err: + sas_notify_lldd_dev_gone(dev); + return res; +} + +/* ---------- Device registration and unregistration ---------- */ + +static inline void sas_unregister_common_dev(struct domain_device *dev) +{ + sas_notify_lldd_dev_gone(dev); + if (!dev->parent) + dev->port->port_dev = NULL; + else + list_del_init(&dev->siblings); + list_del_init(&dev->dev_list_node); +} + +void sas_unregister_dev(struct domain_device *dev) +{ + if (dev->rphy) { + sas_remove_children(&dev->rphy->dev); + sas_rphy_delete(dev->rphy); + dev->rphy = NULL; + } + if (dev->dev_type == EDGE_DEV || dev->dev_type == FANOUT_DEV) { + /* remove the phys and ports, everything else should be gone */ + kfree(dev->ex_dev.ex_phy); + dev->ex_dev.ex_phy = NULL; + } + sas_unregister_common_dev(dev); +} + +void sas_unregister_domain_devices(struct asd_sas_port *port) +{ + struct domain_device *dev, *n; + + list_for_each_entry_safe_reverse(dev,n,&port->dev_list,dev_list_node) + sas_unregister_dev(dev); + + port->port->rphy = NULL; + +} + +/* ---------- Discovery and Revalidation ---------- */ + +/** + * sas_discover_domain -- discover the domain + * @port: port to the domain of interest + * + * NOTE: this process _must_ quit (return) as soon as any connection + * errors are encountered. Connection recovery is done elsewhere. + * Discover process only interrogates devices in order to discover the + * domain. 
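+ *
+ * (Execution context, inferred from sas_init_disc() and
+ * sas_discover_event() below: this handler runs as a work item on
+ * the Scsi_Host's workqueue, in process context; sas_begin_event()
+ * clears the event's pending bit so the event may be queued again
+ * while discovery is still in progress.)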
+ */ +static void sas_discover_domain(void *data) +{ + int error = 0; + struct asd_sas_port *port = data; + + sas_begin_event(DISCE_DISCOVER_DOMAIN, &port->disc.disc_event_lock, + &port->disc.pending); + + if (port->port_dev) + return ; + else { + error = sas_get_port_device(port); + if (error) + return; + } + + SAS_DPRINTK("DOING DISCOVERY on port %d, pid:%d\n", port->id, + current->pid); + + switch (port->port_dev->dev_type) { + case SAS_END_DEV: + error = sas_discover_end_dev(port->port_dev); + break; + case EDGE_DEV: + case FANOUT_DEV: + error = sas_discover_root_expander(port->port_dev); + break; + case SATA_DEV: + case SATA_PM: + error = sas_discover_sata(port->port_dev); + break; + default: + SAS_DPRINTK("unhandled device %d\n", port->port_dev->dev_type); + break; + } + + if (error) { + kfree(port->port_dev); /* not kobject_register-ed yet */ + port->port_dev = NULL; + } + + SAS_DPRINTK("DONE DISCOVERY on port %d, pid:%d, result:%d\n", port->id, + current->pid, error); +} + +static void sas_revalidate_domain(void *data) +{ + int res = 0; + struct asd_sas_port *port = data; + + sas_begin_event(DISCE_REVALIDATE_DOMAIN, &port->disc.disc_event_lock, + &port->disc.pending); + + SAS_DPRINTK("REVALIDATING DOMAIN on port %d, pid:%d\n", port->id, + current->pid); + if (port->port_dev) + res = sas_ex_revalidate_domain(port->port_dev); + + SAS_DPRINTK("done REVALIDATING DOMAIN on port %d, pid:%d, res 0x%x\n", + port->id, current->pid, res); +} + +/* ---------- Events ---------- */ + +int sas_discover_event(struct asd_sas_port *port, enum discover_event ev) +{ + struct sas_discovery *disc; + + if (!port) + return 0; + disc = &port->disc; + + BUG_ON(ev >= DISC_NUM_EVENTS); + + sas_queue_event(ev, &disc->disc_event_lock, &disc->pending, + &disc->disc_work[ev], port->ha->core.shost); + + return 0; +} + +/** + * sas_init_disc -- initialize the discovery struct in the port + * @port: pointer to struct port + * + * Called when the ports are being initialized. + */ +void sas_init_disc(struct sas_discovery *disc, struct asd_sas_port *port) +{ + int i; + + static void (*sas_event_fns[DISC_NUM_EVENTS])(void *) = { + [DISCE_DISCOVER_DOMAIN] = sas_discover_domain, + [DISCE_REVALIDATE_DOMAIN] = sas_revalidate_domain, + }; + + spin_lock_init(&disc->disc_event_lock); + disc->pending = 0; + for (i = 0; i < DISC_NUM_EVENTS; i++) + INIT_WORK(&disc->disc_work[i], sas_event_fns[i], port); +} diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_dump.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_dump.c --- linux-2.6.18/drivers/scsi/libsas/sas_dump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_dump.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,76 @@ +/* + * Serial Attached SCSI (SAS) Dump/Debugging routines + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "sas_dump.h" + +#ifdef SAS_DEBUG + +static const char *sas_hae_str[] = { + [0] = "HAE_RESET", +}; + +static const char *sas_porte_str[] = { + [0] = "PORTE_BYTES_DMAED", + [1] = "PORTE_BROADCAST_RCVD", + [2] = "PORTE_LINK_RESET_ERR", + [3] = "PORTE_TIMER_EVENT", + [4] = "PORTE_HARD_RESET", +}; + +static const char *sas_phye_str[] = { + [0] = "PHYE_LOSS_OF_SIGNAL", + [1] = "PHYE_OOB_DONE", + [2] = "PHYE_OOB_ERROR", + [3] = "PHYE_SPINUP_HOLD", +}; + +void sas_dprint_porte(int phyid, enum port_event pe) +{ + SAS_DPRINTK("phy%d: port event: %s\n", phyid, sas_porte_str[pe]); +} +void sas_dprint_phye(int phyid, enum phy_event pe) +{ + SAS_DPRINTK("phy%d: phy event: %s\n", phyid, sas_phye_str[pe]); +} + +void sas_dprint_hae(struct sas_ha_struct *sas_ha, enum ha_event he) +{ + SAS_DPRINTK("ha %s: %s event\n", pci_name(sas_ha->pcidev), + sas_hae_str[he]); +} + +void sas_dump_port(struct asd_sas_port *port) +{ + SAS_DPRINTK("port%d: class:0x%x\n", port->id, port->class); + SAS_DPRINTK("port%d: sas_addr:%llx\n", port->id, + SAS_ADDR(port->sas_addr)); + SAS_DPRINTK("port%d: attached_sas_addr:%llx\n", port->id, + SAS_ADDR(port->attached_sas_addr)); + SAS_DPRINTK("port%d: iproto:0x%x\n", port->id, port->iproto); + SAS_DPRINTK("port%d: tproto:0x%x\n", port->id, port->tproto); + SAS_DPRINTK("port%d: oob_mode:0x%x\n", port->id, port->oob_mode); + SAS_DPRINTK("port%d: num_phys:%d\n", port->id, port->num_phys); +} + +#endif /* SAS_DEBUG */ diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_dump.h linux-2.6.18.ovz/drivers/scsi/libsas/sas_dump.h --- linux-2.6.18/drivers/scsi/libsas/sas_dump.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_dump.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,42 @@ +/* + * Serial Attached SCSI (SAS) Dump/Debugging routines header file + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "sas_internal.h" + +#ifdef SAS_DEBUG + +void sas_dprint_porte(int phyid, enum port_event pe); +void sas_dprint_phye(int phyid, enum phy_event pe); +void sas_dprint_hae(struct sas_ha_struct *sas_ha, enum ha_event he); +void sas_dump_port(struct asd_sas_port *port); + +#else /* SAS_DEBUG */ + +static inline void sas_dprint_porte(int phyid, enum port_event pe) { } +static inline void sas_dprint_phye(int phyid, enum phy_event pe) { } +static inline void sas_dprint_hae(struct sas_ha_struct *sas_ha, + enum ha_event he) { } +static inline void sas_dump_port(struct asd_sas_port *port) { } + +#endif /* SAS_DEBUG */ diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_event.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_event.c --- linux-2.6.18/drivers/scsi/libsas/sas_event.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_event.c 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,75 @@ +/* + * Serial Attached SCSI (SAS) Event processing + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <scsi/scsi_host.h>
+#include "sas_internal.h"
+#include "sas_dump.h"
+
+static void notify_ha_event(struct sas_ha_struct *sas_ha, enum ha_event event)
+{
+	BUG_ON(event >= HA_NUM_EVENTS);
+
+	sas_queue_event(event, &sas_ha->event_lock, &sas_ha->pending,
+			&sas_ha->ha_events[event], sas_ha->core.shost);
+}
+
+static void notify_port_event(struct asd_sas_phy *phy, enum port_event event)
+{
+	struct sas_ha_struct *ha = phy->ha;
+
+	BUG_ON(event >= PORT_NUM_EVENTS);
+
+	sas_queue_event(event, &ha->event_lock, &phy->port_events_pending,
+			&phy->port_events[event], ha->core.shost);
+}
+
+static void notify_phy_event(struct asd_sas_phy *phy, enum phy_event event)
+{
+	struct sas_ha_struct *ha = phy->ha;
+
+	BUG_ON(event >= PHY_NUM_EVENTS);
+
+	sas_queue_event(event, &ha->event_lock, &phy->phy_events_pending,
+			&phy->phy_events[event], ha->core.shost);
+}
+
+int sas_init_events(struct sas_ha_struct *sas_ha)
+{
+	static void (*sas_ha_event_fns[HA_NUM_EVENTS])(void *) = {
+		[HAE_RESET] = sas_hae_reset,
+	};
+
+	int i;
+
+	spin_lock_init(&sas_ha->event_lock);
+
+	for (i = 0; i < HA_NUM_EVENTS; i++)
+		INIT_WORK(&sas_ha->ha_events[i], sas_ha_event_fns[i], sas_ha);
+
+	sas_ha->notify_ha_event = notify_ha_event;
+	sas_ha->notify_port_event = notify_port_event;
+	sas_ha->notify_phy_event = notify_phy_event;
+
+	return 0;
+}
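Note: the sas_queue_event() helper the three notifiers above rely on is defined in sas_internal.h, further down in this patch. As a reading aid, a self-contained sketch of the pattern it implements follows -- a per-source pending bitmap so each event type is queued at most once until its handler runs and sas_begin_event() clears the bit. queue_once() and the use of schedule_work() in place of scsi_queue_work() are illustrative stand-ins, not code from the patch.

#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Illustrative only: mirror of the sas_queue_event() dedup logic. */
static int queue_once(int event, spinlock_t *lock, unsigned long *pending,
		      struct work_struct *work)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	if (test_bit(event, pending)) {		/* already queued; drop duplicate */
		spin_unlock_irqrestore(lock, flags);
		return 0;
	}
	__set_bit(event, pending);		/* mark queued under the lock */
	spin_unlock_irqrestore(lock, flags);
	schedule_work(work);			/* stand-in for scsi_queue_work() */
	return 1;
}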
diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_expander.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_expander.c
--- linux-2.6.18/drivers/scsi/libsas/sas_expander.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_expander.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,1862 @@
+/*
+ * Serial Attached SCSI (SAS) Expander discovery and configuration
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/scatterlist.h>
+
+#include "sas_internal.h"
+
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_sas.h>
+#include "../scsi_sas_internal.h"
+
+static int sas_discover_expander(struct domain_device *dev);
+static int sas_configure_routing(struct domain_device *dev, u8 *sas_addr);
+static int sas_configure_phy(struct domain_device *dev, int phy_id,
+			     u8 *sas_addr, int include);
+static int sas_disable_routing(struct domain_device *dev, u8 *sas_addr);
+
+#if 0
+/* FIXME: smp needs to migrate into the sas class */
+static ssize_t smp_portal_read(struct kobject *, char *, loff_t, size_t);
+static ssize_t smp_portal_write(struct kobject *, char *, loff_t, size_t);
+#endif
+
+/* ---------- SMP task management ---------- */
+
+static void smp_task_timedout(unsigned long _task)
+{
+	struct sas_task *task = (void *) _task;
+	unsigned long flags;
+
+	spin_lock_irqsave(&task->task_state_lock, flags);
+	if (!(task->task_state_flags & SAS_TASK_STATE_DONE))
+		task->task_state_flags |= SAS_TASK_STATE_ABORTED;
+	spin_unlock_irqrestore(&task->task_state_lock, flags);
+
+	complete(&task->completion);
+}
+
+static void smp_task_done(struct sas_task *task)
+{
+	if (!del_timer(&task->timer))
+		return;
+	complete(&task->completion);
+}
+
+/* Give it some long enough timeout. In seconds. */
+#define SMP_TIMEOUT 10
+
+static int smp_execute_task(struct domain_device *dev, void *req, int req_size,
+			    void *resp, int resp_size)
+{
+	int res;
+	struct sas_task *task = sas_alloc_task(GFP_KERNEL);
+	struct sas_internal *i =
+		to_sas_internal(dev->port->ha->core.shost->transportt);
+
+	if (!task)
+		return -ENOMEM;
+
+	task->dev = dev;
+	task->task_proto = dev->tproto;
+	sg_init_one(&task->smp_task.smp_req, req, req_size);
+	sg_init_one(&task->smp_task.smp_resp, resp, resp_size);
+
+	task->task_done = smp_task_done;
+
+	task->timer.data = (unsigned long) task;
+	task->timer.function = smp_task_timedout;
+	task->timer.expires = jiffies + SMP_TIMEOUT*HZ;
+	add_timer(&task->timer);
+
+	res = i->dft->lldd_execute_task(task, 1, GFP_KERNEL);
+
+	if (res) {
+		del_timer(&task->timer);
+		SAS_DPRINTK("executing SMP task failed:%d\n", res);
+		goto ex_err;
+	}
+
+	wait_for_completion(&task->completion);
+	res = -ETASK;
+	if ((task->task_state_flags & SAS_TASK_STATE_ABORTED)) {
+		SAS_DPRINTK("smp task timed out or aborted\n");
+		i->dft->lldd_abort_task(task);
+		if (!(task->task_state_flags & SAS_TASK_STATE_DONE)) {
+			SAS_DPRINTK("SMP task aborted and not done\n");
+			goto ex_err;
+		}
+	}
+	if (task->task_status.resp == SAS_TASK_COMPLETE &&
+	    task->task_status.stat == SAM_GOOD)
+		res = 0;
+	else
+		SAS_DPRINTK("%s: task to dev %016llx response: 0x%x "
+			    "status 0x%x\n", __FUNCTION__,
+			    SAS_ADDR(dev->sas_addr),
+			    task->task_status.resp,
+			    task->task_status.stat);
+ex_err:
+	sas_free_task(task);
+	return res;
+}
+
+/* ---------- Allocations ---------- */
+
+static inline void *alloc_smp_req(int size)
+{
+	u8 *p = kzalloc(size, GFP_KERNEL);
+	if (p)
+		p[0] = SMP_REQUEST;
+	return p;
+}
+
+static inline void *alloc_smp_resp(int size)
+{
+	return kzalloc(size, GFP_KERNEL);
+}
+
+/* ---------- Expander configuration ---------- */
+
+static void sas_set_ex_phy(struct domain_device *dev, int phy_id,
+			   void *disc_resp)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *phy = &ex->ex_phy[phy_id];
+	struct smp_resp *resp = disc_resp;
+	struct discover_resp *dr = &resp->disc;
+	struct sas_rphy *rphy = dev->rphy;
+	int rediscover = (phy->phy != NULL);
+
+	if (!rediscover) {
+		phy->phy = sas_phy_alloc(&rphy->dev, phy_id);
+
+		/* FIXME: error_handling */
+		BUG_ON(!phy->phy);
+	}
+
+	switch (resp->result) {
+	case SMP_RESP_PHY_VACANT:
+		phy->phy_state = PHY_VACANT;
+		return;
+	default:
+		phy->phy_state = PHY_NOT_PRESENT;
+		return;
+	case SMP_RESP_FUNC_ACC:
+		phy->phy_state = PHY_EMPTY; /* do not know yet */
+		break;
+	}
+
+	phy->phy_id = phy_id;
+	phy->attached_dev_type = dr->attached_dev_type;
+	phy->linkrate = dr->linkrate;
+	phy->attached_sata_host = dr->attached_sata_host;
+	phy->attached_sata_dev = dr->attached_sata_dev;
+	phy->attached_sata_ps = dr->attached_sata_ps;
+	phy->attached_iproto = dr->iproto << 1;
+	phy->attached_tproto = dr->tproto << 1;
+	memcpy(phy->attached_sas_addr, dr->attached_sas_addr, SAS_ADDR_SIZE);
+	phy->attached_phy_id = dr->attached_phy_id;
+	phy->phy_change_count = dr->change_count;
+	phy->routing_attr = dr->routing_attr;
+	phy->virtual = dr->virtual;
+	phy->last_da_index = -1;
+
+	phy->phy->identify.initiator_port_protocols = phy->attached_iproto;
+	phy->phy->identify.target_port_protocols = phy->attached_tproto;
+	phy->phy->identify.phy_identifier = phy_id;
+	phy->phy->minimum_linkrate_hw = SAS_LINK_RATE_1_5_GBPS;
+	phy->phy->maximum_linkrate_hw = SAS_LINK_RATE_3_0_GBPS;
+	phy->phy->minimum_linkrate = SAS_LINK_RATE_1_5_GBPS;
+	phy->phy->maximum_linkrate = SAS_LINK_RATE_3_0_GBPS;
+
+	switch (phy->linkrate) {
+	case PHY_LINKRATE_1_5:
+		phy->phy->negotiated_linkrate = SAS_LINK_RATE_1_5_GBPS;
+		break;
+	case PHY_LINKRATE_3:
+		phy->phy->negotiated_linkrate = SAS_LINK_RATE_3_0_GBPS;
+		break;
+	case PHY_LINKRATE_6:
+		phy->phy->negotiated_linkrate = SAS_LINK_RATE_6_0_GBPS;
+		break;
+	default:
+		phy->phy->negotiated_linkrate = SAS_LINK_RATE_UNKNOWN;
+		break;
+	}
+
+	if (!rediscover)
+		sas_phy_add(phy->phy);
+
+	SAS_DPRINTK("ex %016llx phy%02d:%c attached: %016llx\n",
+		    SAS_ADDR(dev->sas_addr), phy->phy_id,
+		    phy->routing_attr == TABLE_ROUTING ? 'T' :
+		    phy->routing_attr == DIRECT_ROUTING ? 'D' :
+		    phy->routing_attr == SUBTRACTIVE_ROUTING ? 'S' : '?',
+		    SAS_ADDR(phy->attached_sas_addr));
+
+	return;
+}
+
+#define DISCOVER_REQ_SIZE 16
+#define DISCOVER_RESP_SIZE 56
+
+static int sas_ex_phy_discover(struct domain_device *dev, int single)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int res = 0;
+	u8 *disc_req;
+	u8 *disc_resp;
+
+	disc_req = alloc_smp_req(DISCOVER_REQ_SIZE);
+	if (!disc_req)
+		return -ENOMEM;
+
+	disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
+	if (!disc_resp) {
+		kfree(disc_req);
+		return -ENOMEM;
+	}
+
+	disc_req[1] = SMP_DISCOVER;
+
+	if (0 <= single && single < ex->num_phys) {
+		disc_req[9] = single;
+		res = smp_execute_task(dev, disc_req, DISCOVER_REQ_SIZE,
+				       disc_resp, DISCOVER_RESP_SIZE);
+		if (res)
+			goto out_err;
+		sas_set_ex_phy(dev, single, disc_resp);
+	} else {
+		int i;
+
+		for (i = 0; i < ex->num_phys; i++) {
+			disc_req[9] = i;
+			res = smp_execute_task(dev, disc_req,
+					       DISCOVER_REQ_SIZE, disc_resp,
+					       DISCOVER_RESP_SIZE);
+			if (res)
+				goto out_err;
+			sas_set_ex_phy(dev, i, disc_resp);
+		}
+	}
+out_err:
+	kfree(disc_resp);
+	kfree(disc_req);
+	return res;
+}
+
+static int sas_expander_discover(struct domain_device *dev)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int res = -ENOMEM;
+
+	ex->ex_phy = kzalloc(sizeof(*ex->ex_phy)*ex->num_phys, GFP_KERNEL);
+	if (!ex->ex_phy)
+		return -ENOMEM;
+
+	res = sas_ex_phy_discover(dev, -1);
+	if (res)
+		goto out_err;
+
+	return 0;
+ out_err:
+	kfree(ex->ex_phy);
+	ex->ex_phy = NULL;
+	return res;
+}
+
+#define MAX_EXPANDER_PHYS 128
+
+static void ex_assign_report_general(struct domain_device *dev,
+				     struct smp_resp *resp)
+{
+	struct report_general_resp *rg = &resp->rg;
+
+	dev->ex_dev.ex_change_count = be16_to_cpu(rg->change_count);
+	dev->ex_dev.max_route_indexes = be16_to_cpu(rg->route_indexes);
+	dev->ex_dev.num_phys = min(rg->num_phys, (u8)MAX_EXPANDER_PHYS);
+	dev->ex_dev.conf_route_table = rg->conf_route_table;
+	dev->ex_dev.configuring = rg->configuring;
+	memcpy(dev->ex_dev.enclosure_logical_id, rg->enclosure_logical_id, 8);
+}
+
+#define RG_REQ_SIZE 8
+#define RG_RESP_SIZE 32
+
+static int sas_ex_general(struct domain_device *dev)
+{
+	u8 *rg_req;
+	struct smp_resp *rg_resp;
+	int res;
+	int i;
+
+	rg_req = alloc_smp_req(RG_REQ_SIZE);
+	if (!rg_req)
+		return -ENOMEM;
+
+	rg_resp = alloc_smp_resp(RG_RESP_SIZE);
+	if (!rg_resp) {
+		kfree(rg_req);
+		return -ENOMEM;
+	}
+
+	rg_req[1] = SMP_REPORT_GENERAL;
+
+	for (i = 0; i < 5; i++) {
+		res = smp_execute_task(dev, rg_req, RG_REQ_SIZE, rg_resp,
+				       RG_RESP_SIZE);
+
+		if (res) {
+			SAS_DPRINTK("RG to ex %016llx failed:0x%x\n",
+				    SAS_ADDR(dev->sas_addr), res);
+			goto out;
+		} else if (rg_resp->result != SMP_RESP_FUNC_ACC) {
+			SAS_DPRINTK("RG:ex %016llx returned SMP result:0x%x\n",
+				    SAS_ADDR(dev->sas_addr), rg_resp->result);
+			res = rg_resp->result;
+			goto out;
+		}
+
+		ex_assign_report_general(dev, rg_resp);
+
+		if (dev->ex_dev.configuring) {
+			SAS_DPRINTK("RG: ex %llx self-configuring...\n",
+				    SAS_ADDR(dev->sas_addr));
+			schedule_timeout_interruptible(5*HZ);
+		} else
+			break;
+	}
+out:
+	kfree(rg_req);
+	kfree(rg_resp);
+	return res;
+}
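ex_assign_report_general() above pulls two big-endian 16-bit fields out of the REPORT GENERAL response through struct smp_resp. As a reading aid, the equivalent raw-buffer decode is sketched below; the byte offsets (change count at byte 4, route indexes at byte 6) are inferred from the patch's struct layout and are an assumption here, not quoted from a spec.

#include <linux/types.h>

/*
 * Illustrative decode of the two REPORT GENERAL fields used above.
 * Offsets are assumed; the real code goes through
 * struct report_general_resp instead of raw bytes.
 */
static inline void rg_decode(const u8 *raw, u16 *change_count,
			     u16 *route_indexes)
{
	*change_count  = (raw[4] << 8) | raw[5];
	*route_indexes = (raw[6] << 8) | raw[7];
}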
+
+static void ex_assign_manuf_info(struct domain_device *dev, void
+					*_mi_resp)
+{
+	u8 *mi_resp = _mi_resp;
+	struct sas_rphy *rphy = dev->rphy;
+	struct sas_expander_device *edev = rphy_to_expander_device(rphy);
+
+	memcpy(edev->vendor_id, mi_resp + 12, SAS_EXPANDER_VENDOR_ID_LEN);
+	memcpy(edev->product_id, mi_resp + 20, SAS_EXPANDER_PRODUCT_ID_LEN);
+	memcpy(edev->product_rev, mi_resp + 36,
+	       SAS_EXPANDER_PRODUCT_REV_LEN);
+
+	if (mi_resp[8] & 1) {
+		memcpy(edev->component_vendor_id, mi_resp + 40,
+		       SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN);
+		edev->component_id = mi_resp[48] << 8 | mi_resp[49];
+		edev->component_revision_id = mi_resp[50];
+	}
+}
+
+#define MI_REQ_SIZE 8
+#define MI_RESP_SIZE 64
+
+static int sas_ex_manuf_info(struct domain_device *dev)
+{
+	u8 *mi_req;
+	u8 *mi_resp;
+	int res;
+
+	mi_req = alloc_smp_req(MI_REQ_SIZE);
+	if (!mi_req)
+		return -ENOMEM;
+
+	mi_resp = alloc_smp_resp(MI_RESP_SIZE);
+	if (!mi_resp) {
+		kfree(mi_req);
+		return -ENOMEM;
+	}
+
+	mi_req[1] = SMP_REPORT_MANUF_INFO;
+
+	res = smp_execute_task(dev, mi_req, MI_REQ_SIZE, mi_resp, MI_RESP_SIZE);
+	if (res) {
+		SAS_DPRINTK("MI: ex %016llx failed:0x%x\n",
+			    SAS_ADDR(dev->sas_addr), res);
+		goto out;
+	} else if (mi_resp[2] != SMP_RESP_FUNC_ACC) {
+		SAS_DPRINTK("MI ex %016llx returned SMP result:0x%x\n",
+			    SAS_ADDR(dev->sas_addr), mi_resp[2]);
+		goto out;
+	}
+
+	ex_assign_manuf_info(dev, mi_resp);
+out:
+	kfree(mi_req);
+	kfree(mi_resp);
+	return res;
+}
+
+#define PC_REQ_SIZE 44
+#define PC_RESP_SIZE 8
+
+int sas_smp_phy_control(struct domain_device *dev, int phy_id,
+			enum phy_func phy_func)
+{
+	u8 *pc_req;
+	u8 *pc_resp;
+	int res;
+
+	pc_req = alloc_smp_req(PC_REQ_SIZE);
+	if (!pc_req)
+		return -ENOMEM;
+
+	pc_resp = alloc_smp_resp(PC_RESP_SIZE);
+	if (!pc_resp) {
+		kfree(pc_req);
+		return -ENOMEM;
+	}
+
+	pc_req[1] = SMP_PHY_CONTROL;
+	pc_req[9] = phy_id;
+	pc_req[10] = phy_func;
+
+	res = smp_execute_task(dev, pc_req, PC_REQ_SIZE, pc_resp, PC_RESP_SIZE);
+
+	kfree(pc_resp);
+	kfree(pc_req);
+	return res;
+}
+
+static void sas_ex_disable_phy(struct domain_device *dev, int phy_id)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *phy = &ex->ex_phy[phy_id];
+
+	sas_smp_phy_control(dev, phy_id, PHY_FUNC_DISABLE);
+	phy->linkrate = PHY_DISABLED;
+}
+
+static void sas_ex_disable_port(struct domain_device *dev, u8 *sas_addr)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int i;
+
+	for (i = 0; i < ex->num_phys; i++) {
+		struct ex_phy *phy = &ex->ex_phy[i];
+
+		if (phy->phy_state == PHY_VACANT ||
+		    phy->phy_state == PHY_NOT_PRESENT)
+			continue;
+
+		if (SAS_ADDR(phy->attached_sas_addr) == SAS_ADDR(sas_addr))
+			sas_ex_disable_phy(dev, i);
+	}
+}
+
+static int sas_dev_present_in_domain(struct asd_sas_port *port,
+				     u8 *sas_addr)
+{
+	struct domain_device *dev;
+
+	if (SAS_ADDR(port->sas_addr) == SAS_ADDR(sas_addr))
+		return 1;
+	list_for_each_entry(dev, &port->dev_list, dev_list_node) {
+		if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr))
+			return 1;
+	}
+	return 0;
+}
+
+#define RPEL_REQ_SIZE 16
+#define RPEL_RESP_SIZE 32
+int sas_smp_get_phy_events(struct sas_phy *phy)
+{
+	int res;
+	struct sas_rphy *rphy = dev_to_rphy(phy->dev.parent);
+	struct domain_device *dev = sas_find_dev_by_rphy(rphy);
+	u8 *req = alloc_smp_req(RPEL_REQ_SIZE);
+	u8 *resp = kzalloc(RPEL_RESP_SIZE, GFP_KERNEL);
+
+	if (!req || !resp) {
+		kfree(req);
+		kfree(resp);
+		return -ENOMEM;
+	}
+
+	req[1] = SMP_REPORT_PHY_ERR_LOG;
+	req[9] = phy->number;
+
+	res = smp_execute_task(dev, req, RPEL_REQ_SIZE,
+			       resp, RPEL_RESP_SIZE);
+	if (res)
+		goto out;
+
+	phy->invalid_dword_count = scsi_to_u32(&resp[12]);
+	phy->running_disparity_error_count = scsi_to_u32(&resp[16]);
+	phy->loss_of_dword_sync_count = scsi_to_u32(&resp[20]);
+	phy->phy_reset_problem_count = scsi_to_u32(&resp[24]);
+
+ out:
+	kfree(req);
+	kfree(resp);
+	return res;
+}
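sas_smp_get_phy_events() assumes scsi_to_u32() folds four big-endian bytes into a host u32; that helper lives elsewhere in the SCSI tree and is not shown in this patch. A minimal stand-in with the same behavior, for reference:

#include <linux/types.h>

/*
 * Stand-in for scsi_to_u32(): big-endian bytes to host u32.  In the
 * REPORT PHY ERROR LOG response above, offsets 12/16/20/24 carry the
 * invalid-dword, running-disparity-error, loss-of-dword-sync and
 * phy-reset-problem counts, matching the assignments in the function.
 */
static inline u32 be_bytes_to_u32(const u8 *p)
{
	return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
	       ((u32)p[2] << 8) | (u32)p[3];
}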
+
+#define RPS_REQ_SIZE 16
+#define RPS_RESP_SIZE 60
+
+static int sas_get_report_phy_sata(struct domain_device *dev,
+				   int phy_id,
+				   struct smp_resp *rps_resp)
+{
+	int res;
+	u8 *rps_req = alloc_smp_req(RPS_REQ_SIZE);
+
+	if (!rps_req)
+		return -ENOMEM;
+
+	rps_req[1] = SMP_REPORT_PHY_SATA;
+	rps_req[9] = phy_id;
+
+	res = smp_execute_task(dev, rps_req, RPS_REQ_SIZE,
+			       rps_resp, RPS_RESP_SIZE);
+
+	kfree(rps_req);
+	return res;
+}
+
+static void sas_ex_get_linkrate(struct domain_device *parent,
+				struct domain_device *child,
+				struct ex_phy *parent_phy)
+{
+	struct expander_device *parent_ex = &parent->ex_dev;
+	struct sas_port *port;
+	int i;
+
+	child->pathways = 0;
+
+	port = parent_phy->port;
+
+	for (i = 0; i < parent_ex->num_phys; i++) {
+		struct ex_phy *phy = &parent_ex->ex_phy[i];
+
+		if (phy->phy_state == PHY_VACANT ||
+		    phy->phy_state == PHY_NOT_PRESENT)
+			continue;
+
+		if (SAS_ADDR(phy->attached_sas_addr) ==
+		    SAS_ADDR(child->sas_addr)) {
+
+			child->min_linkrate = min(parent->min_linkrate,
+						  phy->linkrate);
+			child->max_linkrate = max(parent->max_linkrate,
+						  phy->linkrate);
+			child->pathways++;
+			sas_port_add_phy(port, phy->phy);
+		}
+	}
+	child->linkrate = min(parent_phy->linkrate, child->max_linkrate);
+	child->pathways = min(child->pathways, parent->pathways);
+}
+
+static struct domain_device *sas_ex_discover_end_dev(
+	struct domain_device *parent, int phy_id)
+{
+	struct expander_device *parent_ex = &parent->ex_dev;
+	struct ex_phy *phy = &parent_ex->ex_phy[phy_id];
+	struct domain_device *child = NULL;
+	struct sas_rphy *rphy;
+	int res;
+
+	if (phy->attached_sata_host || phy->attached_sata_ps)
+		return NULL;
+
+	child = kzalloc(sizeof(*child), GFP_KERNEL);
+	if (!child)
+		return NULL;
+
+	child->parent = parent;
+	child->port = parent->port;
+	child->iproto = phy->attached_iproto;
+	memcpy(child->sas_addr, phy->attached_sas_addr, SAS_ADDR_SIZE);
+	sas_hash_addr(child->hashed_sas_addr, child->sas_addr);
+	phy->port = sas_port_alloc(&parent->rphy->dev, phy_id);
+	BUG_ON(!phy->port);
+	/* FIXME: better error handling*/
+	BUG_ON(sas_port_add(phy->port) != 0);
+	sas_ex_get_linkrate(parent, child, phy);
+
+	if ((phy->attached_tproto & SAS_PROTO_STP) || phy->attached_sata_dev) {
+		child->dev_type = SATA_DEV;
+		if (phy->attached_tproto & SAS_PROTO_STP)
+			child->tproto = phy->attached_tproto;
+		if (phy->attached_sata_dev)
+			child->tproto |= SATA_DEV;
+		res = sas_get_report_phy_sata(parent, phy_id,
+					      &child->sata_dev.rps_resp);
+		if (res) {
+			SAS_DPRINTK("report phy sata to %016llx:0x%x returned "
+				    "0x%x\n", SAS_ADDR(parent->sas_addr),
+				    phy_id, res);
+			kfree(child);
+			return NULL;
+		}
+		memcpy(child->frame_rcvd, &child->sata_dev.rps_resp.rps.fis,
+		       sizeof(struct dev_to_host_fis));
+		sas_init_dev(child);
+		res = sas_discover_sata(child);
+		if (res) {
+			SAS_DPRINTK("sas_discover_sata() for device %16llx at "
+				    "%016llx:0x%x returned 0x%x\n",
+				    SAS_ADDR(child->sas_addr),
+				    SAS_ADDR(parent->sas_addr), phy_id, res);
+			kfree(child);
+			return NULL;
+		}
+	} else if (phy->attached_tproto & SAS_PROTO_SSP) {
+		child->dev_type = SAS_END_DEV;
+		rphy = sas_end_device_alloc(phy->port);
+		/* FIXME: error handling */
+		BUG_ON(!rphy);
+		child->tproto = phy->attached_tproto;
+		sas_init_dev(child);
+
+		child->rphy = rphy;
+		sas_fill_in_rphy(child, rphy);
+
+		spin_lock(&parent->port->dev_list_lock);
+		list_add_tail(&child->dev_list_node, &parent->port->dev_list);
+		spin_unlock(&parent->port->dev_list_lock);
+
+		res = sas_discover_end_dev(child);
+		if (res) {
+			SAS_DPRINTK("sas_discover_end_dev() for device %16llx "
+				    "at %016llx:0x%x returned 0x%x\n",
+				    SAS_ADDR(child->sas_addr),
+				    SAS_ADDR(parent->sas_addr), phy_id, res);
+			/* FIXME: this kfrees list elements without removing them */
+			//kfree(child);
+			return NULL;
+		}
+	} else {
+		SAS_DPRINTK("target proto 0x%x at %016llx:0x%x not handled\n",
+			    phy->attached_tproto, SAS_ADDR(parent->sas_addr),
+			    phy_id);
+	}
+
+	list_add_tail(&child->siblings, &parent_ex->children);
+	return child;
+}
+
+static struct domain_device *sas_ex_discover_expander(
+	struct domain_device *parent, int phy_id)
+{
+	struct sas_expander_device *parent_ex = rphy_to_expander_device(parent->rphy);
+	struct ex_phy *phy = &parent->ex_dev.ex_phy[phy_id];
+	struct domain_device *child = NULL;
+	struct sas_rphy *rphy;
+	struct sas_expander_device *edev;
+	struct asd_sas_port *port;
+	int res;
+
+	if (phy->routing_attr == DIRECT_ROUTING) {
+		SAS_DPRINTK("ex %016llx:0x%x:D <--> ex %016llx:0x%x is not "
+			    "allowed\n",
+			    SAS_ADDR(parent->sas_addr), phy_id,
+			    SAS_ADDR(phy->attached_sas_addr),
+			    phy->attached_phy_id);
+		return NULL;
+	}
+	child = kzalloc(sizeof(*child), GFP_KERNEL);
+	if (!child)
+		return NULL;
+
+	phy->port = sas_port_alloc(&parent->rphy->dev, phy_id);
+	/* FIXME: better error handling */
+	BUG_ON(sas_port_add(phy->port) != 0);
+
+	switch (phy->attached_dev_type) {
+	case EDGE_DEV:
+		rphy = sas_expander_alloc(phy->port,
+					  SAS_EDGE_EXPANDER_DEVICE);
+		break;
+	case FANOUT_DEV:
+		rphy = sas_expander_alloc(phy->port,
+					  SAS_FANOUT_EXPANDER_DEVICE);
+		break;
+	default:
+		rphy = NULL;	/* shut gcc up */
+		BUG();
+	}
+	port = parent->port;
+	child->rphy = rphy;
+	edev = rphy_to_expander_device(rphy);
+	child->dev_type = phy->attached_dev_type;
+	child->parent = parent;
+	child->port = port;
+	child->iproto = phy->attached_iproto;
+	child->tproto = phy->attached_tproto;
+	memcpy(child->sas_addr, phy->attached_sas_addr, SAS_ADDR_SIZE);
+	sas_hash_addr(child->hashed_sas_addr, child->sas_addr);
+	sas_ex_get_linkrate(parent, child, phy);
+	edev->level = parent_ex->level + 1;
+	parent->port->disc.max_level = max(parent->port->disc.max_level,
+					   edev->level);
+	sas_init_dev(child);
+	sas_fill_in_rphy(child, rphy);
+	sas_rphy_add(rphy);
+
+	spin_lock(&parent->port->dev_list_lock);
+	list_add_tail(&child->dev_list_node, &parent->port->dev_list);
+	spin_unlock(&parent->port->dev_list_lock);
+
+	res = sas_discover_expander(child);
+	if (res) {
+		kfree(child);
+		return NULL;
+	}
+	list_add_tail(&child->siblings, &parent->ex_dev.children);
+	return child;
+}
+
+static int sas_ex_discover_dev(struct domain_device *dev, int phy_id)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *ex_phy = &ex->ex_phy[phy_id];
+	struct domain_device *child = NULL;
+	int res = 0;
+
+	/* Phy state */
+	if (ex_phy->linkrate == PHY_SPINUP_HOLD) {
+		if (!sas_smp_phy_control(dev, phy_id, PHY_FUNC_LINK_RESET))
+			res = sas_ex_phy_discover(dev, phy_id);
+		if (res)
+			return res;
+	}
+
+	/* Parent and domain coherency */
+	if (!dev->parent && (SAS_ADDR(ex_phy->attached_sas_addr) ==
+			     SAS_ADDR(dev->port->sas_addr))) {
+		sas_add_parent_port(dev, phy_id);
+		return 0;
+	}
+	if (dev->parent && (SAS_ADDR(ex_phy->attached_sas_addr) ==
+			    SAS_ADDR(dev->parent->sas_addr))) {
+		sas_add_parent_port(dev, phy_id);
+		if (ex_phy->routing_attr == TABLE_ROUTING)
+			sas_configure_phy(dev, phy_id, dev->port->sas_addr, 1);
+		return 0;
+	}
+
+	if (sas_dev_present_in_domain(dev->port, ex_phy->attached_sas_addr))
+		sas_ex_disable_port(dev, ex_phy->attached_sas_addr);
+
+	if (ex_phy->attached_dev_type == NO_DEVICE) {
+		if (ex_phy->routing_attr == DIRECT_ROUTING) {
+			memset(ex_phy->attached_sas_addr, 0, SAS_ADDR_SIZE);
+			sas_configure_routing(dev, ex_phy->attached_sas_addr);
+		}
+		return 0;
+	} else if (ex_phy->linkrate == PHY_LINKRATE_UNKNOWN)
+		return 0;
+
+	if (ex_phy->attached_dev_type != SAS_END_DEV &&
+	    ex_phy->attached_dev_type != FANOUT_DEV &&
+	    ex_phy->attached_dev_type != EDGE_DEV) {
+		SAS_DPRINTK("unknown device type(0x%x) attached to ex %016llx "
+			    "phy 0x%x\n", ex_phy->attached_dev_type,
+			    SAS_ADDR(dev->sas_addr),
+			    phy_id);
+		return 0;
+	}
+
+	res = sas_configure_routing(dev, ex_phy->attached_sas_addr);
+	if (res) {
+		SAS_DPRINTK("configure routing for dev %016llx "
+			    "reported 0x%x. Forgotten\n",
+			    SAS_ADDR(ex_phy->attached_sas_addr), res);
+		sas_disable_routing(dev, ex_phy->attached_sas_addr);
+		return res;
+	}
+
+	switch (ex_phy->attached_dev_type) {
+	case SAS_END_DEV:
+		child = sas_ex_discover_end_dev(dev, phy_id);
+		break;
+	case FANOUT_DEV:
+		if (SAS_ADDR(dev->port->disc.fanout_sas_addr)) {
+			SAS_DPRINTK("second fanout expander %016llx phy 0x%x "
+				    "attached to ex %016llx phy 0x%x\n",
+				    SAS_ADDR(ex_phy->attached_sas_addr),
+				    ex_phy->attached_phy_id,
+				    SAS_ADDR(dev->sas_addr),
+				    phy_id);
+			sas_ex_disable_phy(dev, phy_id);
+			break;
+		} else
+			memcpy(dev->port->disc.fanout_sas_addr,
+			       ex_phy->attached_sas_addr, SAS_ADDR_SIZE);
+		/* fallthrough */
+	case EDGE_DEV:
+		child = sas_ex_discover_expander(dev, phy_id);
+		break;
+	default:
+		break;
+	}
+
+	if (child) {
+		int i;
+
+		for (i = 0; i < ex->num_phys; i++) {
+			if (ex->ex_phy[i].phy_state == PHY_VACANT ||
+			    ex->ex_phy[i].phy_state == PHY_NOT_PRESENT)
+				continue;
+
+			if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) ==
+			    SAS_ADDR(child->sas_addr))
+				ex->ex_phy[i].phy_state = PHY_DEVICE_DISCOVERED;
+		}
+	}
+
+	return res;
+}
+
+static int sas_find_sub_addr(struct domain_device *dev, u8 *sub_addr)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int i;
+
+	for (i = 0; i < ex->num_phys; i++) {
+		struct ex_phy *phy = &ex->ex_phy[i];
+
+		if (phy->phy_state == PHY_VACANT ||
+		    phy->phy_state == PHY_NOT_PRESENT)
+			continue;
+
+		if ((phy->attached_dev_type == EDGE_DEV ||
+		     phy->attached_dev_type == FANOUT_DEV) &&
+		    phy->routing_attr == SUBTRACTIVE_ROUTING) {
+
+			memcpy(sub_addr, phy->attached_sas_addr, SAS_ADDR_SIZE);
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int sas_check_level_subtractive_boundary(struct domain_device *dev)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct domain_device *child;
+	u8 sub_addr[8] = {0, };
+
+	list_for_each_entry(child, &ex->children, siblings) {
+		if (child->dev_type != EDGE_DEV &&
+		    child->dev_type != FANOUT_DEV)
+			continue;
+		if (sub_addr[0] == 0) {
+			sas_find_sub_addr(child, sub_addr);
+			continue;
+		} else {
+			u8 s2[8];
+
+			if (sas_find_sub_addr(child, s2) &&
+			    (SAS_ADDR(sub_addr) != SAS_ADDR(s2))) {
+
+				SAS_DPRINTK("ex %016llx->%016llx-?->%016llx "
+					    "diverges from subtractive "
+					    "boundary %016llx\n",
+					    SAS_ADDR(dev->sas_addr),
+					    SAS_ADDR(child->sas_addr),
+					    SAS_ADDR(s2),
+					    SAS_ADDR(sub_addr));
+
+				sas_ex_disable_port(child, s2);
+			}
+		}
+	}
+	return 0;
+}
+
+/**
+ * sas_ex_discover_devices -- discover devices attached to this expander
+ * dev: pointer to the expander domain device
+ * single: if you want to do a single phy, else set to -1;
+ *
+ * Configure this expander for use with its devices and register the
+ * devices of this expander.
+ */
+static int sas_ex_discover_devices(struct domain_device *dev, int single)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int i = 0, end = ex->num_phys;
+	int res = 0;
+
+	if (0 <= single && single < end) {
+		i = single;
+		end = i+1;
+	}
+
+	for ( ; i < end; i++) {
+		struct ex_phy *ex_phy = &ex->ex_phy[i];
+
+		if (ex_phy->phy_state == PHY_VACANT ||
+		    ex_phy->phy_state == PHY_NOT_PRESENT ||
+		    ex_phy->phy_state == PHY_DEVICE_DISCOVERED)
+			continue;
+
+		switch (ex_phy->linkrate) {
+		case PHY_DISABLED:
+		case PHY_RESET_PROBLEM:
+		case PHY_PORT_SELECTOR:
+			continue;
+		default:
+			res = sas_ex_discover_dev(dev, i);
+			if (res)
+				break;
+			continue;
+		}
+	}
+
+	if (!res)
+		sas_check_level_subtractive_boundary(dev);
+
+	return res;
+}
+
+static int sas_check_ex_subtractive_boundary(struct domain_device *dev)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int i;
+	u8 *sub_sas_addr = NULL;
+
+	if (dev->dev_type != EDGE_DEV)
+		return 0;
+
+	for (i = 0; i < ex->num_phys; i++) {
+		struct ex_phy *phy = &ex->ex_phy[i];
+
+		if (phy->phy_state == PHY_VACANT ||
+		    phy->phy_state == PHY_NOT_PRESENT)
+			continue;
+
+		if ((phy->attached_dev_type == FANOUT_DEV ||
+		     phy->attached_dev_type == EDGE_DEV) &&
+		    phy->routing_attr == SUBTRACTIVE_ROUTING) {
+
+			if (!sub_sas_addr)
+				sub_sas_addr = &phy->attached_sas_addr[0];
+			else if (SAS_ADDR(sub_sas_addr) !=
+				 SAS_ADDR(phy->attached_sas_addr)) {
+
+				SAS_DPRINTK("ex %016llx phy 0x%x "
+					    "diverges(%016llx) on subtractive "
+					    "boundary(%016llx). Disabled\n",
+					    SAS_ADDR(dev->sas_addr), i,
+					    SAS_ADDR(phy->attached_sas_addr),
+					    SAS_ADDR(sub_sas_addr));
+				sas_ex_disable_phy(dev, i);
+			}
+		}
+	}
+	return 0;
+}
+
+static void sas_print_parent_topology_bug(struct domain_device *child,
+					  struct ex_phy *parent_phy,
+					  struct ex_phy *child_phy)
+{
+	static const char ra_char[] = {
+		[DIRECT_ROUTING] = 'D',
+		[SUBTRACTIVE_ROUTING] = 'S',
+		[TABLE_ROUTING] = 'T',
+	};
+	static const char *ex_type[] = {
+		[EDGE_DEV] = "edge",
+		[FANOUT_DEV] = "fanout",
+	};
+	struct domain_device *parent = child->parent;
+
+	sas_printk("%s ex %016llx phy 0x%x <--> %s ex %016llx phy 0x%x "
+		   "has %c:%c routing link!\n",
+
+		   ex_type[parent->dev_type],
+		   SAS_ADDR(parent->sas_addr),
+		   parent_phy->phy_id,
+
+		   ex_type[child->dev_type],
+		   SAS_ADDR(child->sas_addr),
+		   child_phy->phy_id,
+
+		   ra_char[parent_phy->routing_attr],
+		   ra_char[child_phy->routing_attr]);
+}
+
+static int sas_check_eeds(struct domain_device *child,
+			  struct ex_phy *parent_phy,
+			  struct ex_phy *child_phy)
+{
+	int res = 0;
+	struct domain_device *parent = child->parent;
+
+	if (SAS_ADDR(parent->port->disc.fanout_sas_addr) != 0) {
+		res = -ENODEV;
+		SAS_DPRINTK("edge ex %016llx phy S:0x%x <--> edge ex %016llx "
+			    "phy S:0x%x, while there is a fanout ex %016llx\n",
+			    SAS_ADDR(parent->sas_addr),
+			    parent_phy->phy_id,
+			    SAS_ADDR(child->sas_addr),
+			    child_phy->phy_id,
+			    SAS_ADDR(parent->port->disc.fanout_sas_addr));
+	} else if (SAS_ADDR(parent->port->disc.eeds_a) == 0) {
+		memcpy(parent->port->disc.eeds_a, parent->sas_addr,
+		       SAS_ADDR_SIZE);
+		memcpy(parent->port->disc.eeds_b, child->sas_addr,
+		       SAS_ADDR_SIZE);
+	} else if (((SAS_ADDR(parent->port->disc.eeds_a) ==
+		     SAS_ADDR(parent->sas_addr)) ||
+		    (SAS_ADDR(parent->port->disc.eeds_a) ==
+		     SAS_ADDR(child->sas_addr)))
+		   &&
+		   ((SAS_ADDR(parent->port->disc.eeds_b) ==
+		     SAS_ADDR(parent->sas_addr)) ||
+		    (SAS_ADDR(parent->port->disc.eeds_b) ==
+		     SAS_ADDR(child->sas_addr))))
+		;
+	else {
+		res = -ENODEV;
+		SAS_DPRINTK("edge ex %016llx phy 0x%x <--> edge ex %016llx "
+			    "phy 0x%x link forms a third EEDS!\n",
+			    SAS_ADDR(parent->sas_addr),
+			    parent_phy->phy_id,
+			    SAS_ADDR(child->sas_addr),
+			    child_phy->phy_id);
+	}
+
+	return res;
+}
+
+/* Here we spill over 80 columns. It is intentional.
+ */
+static int sas_check_parent_topology(struct domain_device *child)
+{
+	struct expander_device *child_ex = &child->ex_dev;
+	struct expander_device *parent_ex;
+	int i;
+	int res = 0;
+
+	if (!child->parent)
+		return 0;
+
+	if (child->parent->dev_type != EDGE_DEV &&
+	    child->parent->dev_type != FANOUT_DEV)
+		return 0;
+
+	parent_ex = &child->parent->ex_dev;
+
+	for (i = 0; i < parent_ex->num_phys; i++) {
+		struct ex_phy *parent_phy = &parent_ex->ex_phy[i];
+		struct ex_phy *child_phy;
+
+		if (parent_phy->phy_state == PHY_VACANT ||
+		    parent_phy->phy_state == PHY_NOT_PRESENT)
+			continue;
+
+		if (SAS_ADDR(parent_phy->attached_sas_addr) != SAS_ADDR(child->sas_addr))
+			continue;
+
+		child_phy = &child_ex->ex_phy[parent_phy->attached_phy_id];
+
+		switch (child->parent->dev_type) {
+		case EDGE_DEV:
+			if (child->dev_type == FANOUT_DEV) {
+				if (parent_phy->routing_attr != SUBTRACTIVE_ROUTING ||
+				    child_phy->routing_attr != TABLE_ROUTING) {
+					sas_print_parent_topology_bug(child, parent_phy, child_phy);
+					res = -ENODEV;
+				}
+			} else if (parent_phy->routing_attr == SUBTRACTIVE_ROUTING) {
+				if (child_phy->routing_attr == SUBTRACTIVE_ROUTING) {
+					res = sas_check_eeds(child, parent_phy, child_phy);
+				} else if (child_phy->routing_attr != TABLE_ROUTING) {
+					sas_print_parent_topology_bug(child, parent_phy, child_phy);
+					res = -ENODEV;
+				}
+			} else if (parent_phy->routing_attr == TABLE_ROUTING &&
+				   child_phy->routing_attr != SUBTRACTIVE_ROUTING) {
+				sas_print_parent_topology_bug(child, parent_phy, child_phy);
+				res = -ENODEV;
+			}
+			break;
+		case FANOUT_DEV:
+			if (parent_phy->routing_attr != TABLE_ROUTING ||
+			    child_phy->routing_attr != SUBTRACTIVE_ROUTING) {
+				sas_print_parent_topology_bug(child, parent_phy, child_phy);
+				res = -ENODEV;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return res;
+}
+
+#define RRI_REQ_SIZE 16
+#define RRI_RESP_SIZE 44
+
+static int sas_configure_present(struct domain_device *dev, int phy_id,
+				 u8 *sas_addr, int *index, int *present)
+{
+	int i, res = 0;
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *phy = &ex->ex_phy[phy_id];
+	u8 *rri_req;
+	u8 *rri_resp;
+
+	*present = 0;
+	*index = 0;
+
+	rri_req = alloc_smp_req(RRI_REQ_SIZE);
+	if (!rri_req)
+		return -ENOMEM;
+
+	rri_resp = alloc_smp_resp(RRI_RESP_SIZE);
+	if (!rri_resp) {
+		kfree(rri_req);
+		return -ENOMEM;
+	}
+
+	rri_req[1] = SMP_REPORT_ROUTE_INFO;
+	rri_req[9] = phy_id;
+
+	for (i = 0; i < ex->max_route_indexes; i++) {
+		*(__be16 *)(rri_req+6) = cpu_to_be16(i);
+		res = smp_execute_task(dev, rri_req, RRI_REQ_SIZE, rri_resp,
+				       RRI_RESP_SIZE);
+		if (res)
+			goto out;
+		res = rri_resp[2];
+		if (res == SMP_RESP_NO_INDEX) {
+			SAS_DPRINTK("overflow of indexes: dev %016llx "
+				    "phy 0x%x index 0x%x\n",
+				    SAS_ADDR(dev->sas_addr), phy_id, i);
+			goto out;
+		} else if (res != SMP_RESP_FUNC_ACC) {
+			SAS_DPRINTK("%s: dev %016llx phy 0x%x index 0x%x "
+				    "result 0x%x\n", __FUNCTION__,
+				    SAS_ADDR(dev->sas_addr), phy_id, i, res);
+			goto out;
+		}
+		if (SAS_ADDR(sas_addr) != 0) {
+			if (SAS_ADDR(rri_resp+16) == SAS_ADDR(sas_addr)) {
+				*index = i;
+				if ((rri_resp[12] & 0x80) == 0x80)
+					*present = 0;
+				else
+					*present = 1;
+				goto out;
+			} else if (SAS_ADDR(rri_resp+16) == 0) {
+				*index = i;
+				*present = 0;
+				goto out;
+			}
+		} else if (SAS_ADDR(rri_resp+16) == 0 &&
+			   phy->last_da_index < i) {
+			phy->last_da_index = i;
+			*index = i;
+			*present = 0;
+			goto out;
+		}
+	}
+	res = -1;
+out:
+	kfree(rri_req);
+	kfree(rri_resp);
+	return res;
+}
+
+#define CRI_REQ_SIZE 44
+#define CRI_RESP_SIZE 8
+
+static int sas_configure_set(struct domain_device *dev, int phy_id,
+			     u8 *sas_addr, int index, int include)
+{
+	int res;
+	u8 *cri_req;
+	u8 *cri_resp;
+
+	cri_req = alloc_smp_req(CRI_REQ_SIZE);
+	if (!cri_req)
+		return -ENOMEM;
+
+	cri_resp = alloc_smp_resp(CRI_RESP_SIZE);
+	if (!cri_resp) {
+		kfree(cri_req);
+		return -ENOMEM;
+	}
+
+	cri_req[1] = SMP_CONF_ROUTE_INFO;
+	*(__be16 *)(cri_req+6) = cpu_to_be16(index);
+	cri_req[9] = phy_id;
+	if (SAS_ADDR(sas_addr) == 0 || !include)
+		cri_req[12] |= 0x80;
+	memcpy(cri_req+16, sas_addr, SAS_ADDR_SIZE);
+
+	res = smp_execute_task(dev, cri_req, CRI_REQ_SIZE, cri_resp,
+			       CRI_RESP_SIZE);
+	if (res)
+		goto out;
+	res = cri_resp[2];
+	if (res == SMP_RESP_NO_INDEX) {
+		SAS_DPRINTK("overflow of indexes: dev %016llx phy 0x%x "
+			    "index 0x%x\n",
+			    SAS_ADDR(dev->sas_addr), phy_id, index);
+	}
+out:
+	kfree(cri_req);
+	kfree(cri_resp);
+	return res;
+}
+
+static int sas_configure_phy(struct domain_device *dev, int phy_id,
+			     u8 *sas_addr, int include)
+{
+	int index;
+	int present;
+	int res;
+
+	res = sas_configure_present(dev, phy_id, sas_addr, &index, &present);
+	if (res)
+		return res;
+	if (include ^ present)
+		return sas_configure_set(dev, phy_id, sas_addr, index, include);
+
+	return res;
+}
+
+/**
+ * sas_configure_parent -- configure routing table of parent
+ * parent: parent expander
+ * child: child expander
+ * sas_addr: SAS port identifier of device directly attached to child
+ */
+static int sas_configure_parent(struct domain_device *parent,
+				struct domain_device *child,
+				u8 *sas_addr, int include)
+{
+	struct expander_device *ex_parent = &parent->ex_dev;
+	int res = 0;
+	int i;
+
+	if (parent->parent) {
+		res = sas_configure_parent(parent->parent, parent, sas_addr,
+					   include);
+		if (res)
+			return res;
+	}
+
+	if (ex_parent->conf_route_table == 0) {
+		SAS_DPRINTK("ex %016llx has self-configuring routing table\n",
+			    SAS_ADDR(parent->sas_addr));
+		return 0;
+	}
+
+	for (i = 0; i < ex_parent->num_phys; i++) {
+		struct ex_phy *phy = &ex_parent->ex_phy[i];
+
+		if ((phy->routing_attr == TABLE_ROUTING) &&
+		    (SAS_ADDR(phy->attached_sas_addr) ==
+		     SAS_ADDR(child->sas_addr))) {
+			res = sas_configure_phy(parent, i, sas_addr, include);
+			if (res)
+				return res;
+		}
+	}
+
+	return res;
+}
+
+/**
+ * sas_configure_routing -- configure routing
+ * dev: expander device
+ * sas_addr: port identifier of device directly attached to the expander device
+ */
+static int sas_configure_routing(struct domain_device *dev, u8 *sas_addr)
+{
+	if (dev->parent)
+		return sas_configure_parent(dev->parent, dev, sas_addr, 1);
+	return 0;
+}
+
+static int sas_disable_routing(struct domain_device *dev, u8 *sas_addr)
+{
+	if (dev->parent)
+		return sas_configure_parent(dev->parent, dev, sas_addr, 0);
+	return 0;
+}
+
+#if 0
+#define SMP_BIN_ATTR_NAME "smp_portal"
+
+static void sas_ex_smp_hook(struct domain_device *dev)
+{
+	struct expander_device *ex_dev = &dev->ex_dev;
+	struct bin_attribute *bin_attr = &ex_dev->smp_bin_attr;
+
+	memset(bin_attr, 0, sizeof(*bin_attr));
+
+	bin_attr->attr.name = SMP_BIN_ATTR_NAME;
+	bin_attr->attr.owner = THIS_MODULE;
+	bin_attr->attr.mode = 0600;
+
+	bin_attr->size = 0;
+	bin_attr->private = NULL;
+	bin_attr->read = smp_portal_read;
+	bin_attr->write = smp_portal_write;
+	bin_attr->mmap = NULL;
+
+	ex_dev->smp_portal_pid = -1;
+	init_MUTEX(&ex_dev->smp_sema);
+}
+#endif
+
+/**
+ * sas_discover_expander -- expander discovery
+ * @ex: pointer to expander domain device
+ *
+ * See comment in sas_discover_sata().
+ */
+static int sas_discover_expander(struct domain_device *dev)
+{
+	int res;
+
+	res = sas_notify_lldd_dev_found(dev);
+	if (res)
+		return res;
+
+	res = sas_ex_general(dev);
+	if (res)
+		goto out_err;
+	res = sas_ex_manuf_info(dev);
+	if (res)
+		goto out_err;
+
+	res = sas_expander_discover(dev);
+	if (res) {
+		SAS_DPRINTK("expander %016llx discovery failed(0x%x)\n",
+			    SAS_ADDR(dev->sas_addr), res);
+		goto out_err;
+	}
+
+	sas_check_ex_subtractive_boundary(dev);
+	res = sas_check_parent_topology(dev);
+	if (res)
+		goto out_err;
+	return 0;
+out_err:
+	sas_notify_lldd_dev_gone(dev);
+	return res;
+}
+
+static int sas_ex_level_discovery(struct asd_sas_port *port, const int level)
+{
+	int res = 0;
+	struct domain_device *dev;
+
+	list_for_each_entry(dev, &port->dev_list, dev_list_node) {
+		if (dev->dev_type == EDGE_DEV ||
+		    dev->dev_type == FANOUT_DEV) {
+			struct sas_expander_device *ex =
+				rphy_to_expander_device(dev->rphy);
+
+			if (level == ex->level)
+				res = sas_ex_discover_devices(dev, -1);
+			else if (level > 0)
+				res = sas_ex_discover_devices(port->port_dev, -1);
+		}
+	}
+
+	return res;
+}
+
+static int sas_ex_bfs_disc(struct asd_sas_port *port)
+{
+	int res;
+	int level;
+
+	do {
+		level = port->disc.max_level;
+		res = sas_ex_level_discovery(port, level);
+		mb();
+	} while (level < port->disc.max_level);
+
+	return res;
+}
+
+int sas_discover_root_expander(struct domain_device *dev)
+{
+	int res;
+	struct sas_expander_device *ex = rphy_to_expander_device(dev->rphy);
+
+	sas_rphy_add(dev->rphy);
+
+	ex->level = dev->port->disc.max_level; /* 0 */
+	res = sas_discover_expander(dev);
+	if (!res)
+		sas_ex_bfs_disc(dev->port);
+
+	return res;
+}
+
+/* ---------- Domain revalidation ---------- */
+
+static int sas_get_phy_discover(struct domain_device *dev,
+				int phy_id, struct smp_resp *disc_resp)
+{
+	int res;
+	u8 *disc_req;
+
+	disc_req = alloc_smp_req(DISCOVER_REQ_SIZE);
+	if (!disc_req)
+		return -ENOMEM;
+
+	disc_req[1] = SMP_DISCOVER;
+	disc_req[9] = phy_id;
+
+	res = smp_execute_task(dev, disc_req, DISCOVER_REQ_SIZE,
+			       disc_resp, DISCOVER_RESP_SIZE);
+	if (res)
+		goto out;
+	else if (disc_resp->result != SMP_RESP_FUNC_ACC) {
+		res = disc_resp->result;
+		goto out;
+	}
+out:
+	kfree(disc_req);
+	return res;
+}
+
+static int sas_get_phy_change_count(struct domain_device *dev,
+				    int phy_id, int *pcc)
+{
+	int res;
+	struct smp_resp *disc_resp;
+
+	disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
+	if (!disc_resp)
+		return -ENOMEM;
+
+	res = sas_get_phy_discover(dev, phy_id, disc_resp);
+	if (!res)
+		*pcc = disc_resp->disc.change_count;
+
+	kfree(disc_resp);
+	return res;
+}
+
+static int sas_get_phy_attached_sas_addr(struct domain_device *dev,
+					 int phy_id, u8 *attached_sas_addr)
+{
+	int res;
+	struct smp_resp *disc_resp;
+	struct discover_resp *dr;
+
+	disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
+	if (!disc_resp)
+		return -ENOMEM;
+	dr = &disc_resp->disc;
+
+	res = sas_get_phy_discover(dev, phy_id, disc_resp);
+	if (!res) {
+		memcpy(attached_sas_addr, disc_resp->disc.attached_sas_addr, 8);
+		if (dr->attached_dev_type == 0)
+			memset(attached_sas_addr, 0, 8);
+	}
+	kfree(disc_resp);
+	return res;
+}
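The revalidation machinery that follows hinges on change counts: REPORT GENERAL yields an expander-level count, SMP DISCOVER a per-phy one, and a mismatch against the cached value pinpoints where the topology changed. A condensed sketch of that idea -- poll_changed_phy() and read_cc are hypothetical; sas_find_bcast_phy() below is the real implementation:

/* Illustrative only: per-phy change-count polling in miniature. */
static int poll_changed_phy(struct expander_device *ex,
			    int (*read_cc)(int phy_id))
{
	int i;

	for (i = 0; i < ex->num_phys; i++) {
		int cc = read_cc(i);	/* fresh SMP DISCOVER change count */

		if (cc != ex->ex_phy[i].phy_change_count) {
			ex->ex_phy[i].phy_change_count = cc;
			return i;	/* this phy saw a topology change */
		}
	}
	return -1;			/* domain unchanged */
}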
+
+static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id,
+			      int from_phy)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int res = 0;
+	int i;
+
+	for (i = from_phy; i < ex->num_phys; i++) {
+		int phy_change_count = 0;
+
+		res = sas_get_phy_change_count(dev, i, &phy_change_count);
+		if (res)
+			goto out;
+		else if (phy_change_count != ex->ex_phy[i].phy_change_count) {
+			ex->ex_phy[i].phy_change_count = phy_change_count;
+			*phy_id = i;
+			return 0;
+		}
+	}
+out:
+	return res;
+}
+
+static int sas_get_ex_change_count(struct domain_device *dev, int *ecc)
+{
+	int res;
+	u8 *rg_req;
+	struct smp_resp *rg_resp;
+
+	rg_req = alloc_smp_req(RG_REQ_SIZE);
+	if (!rg_req)
+		return -ENOMEM;
+
+	rg_resp = alloc_smp_resp(RG_RESP_SIZE);
+	if (!rg_resp) {
+		kfree(rg_req);
+		return -ENOMEM;
+	}
+
+	rg_req[1] = SMP_REPORT_GENERAL;
+
+	res = smp_execute_task(dev, rg_req, RG_REQ_SIZE, rg_resp,
+			       RG_RESP_SIZE);
+	if (res)
+		goto out;
+	if (rg_resp->result != SMP_RESP_FUNC_ACC) {
+		res = rg_resp->result;
+		goto out;
+	}
+
+	*ecc = be16_to_cpu(rg_resp->rg.change_count);
+out:
+	kfree(rg_resp);
+	kfree(rg_req);
+	return res;
+}
+
+static int sas_find_bcast_dev(struct domain_device *dev,
+			      struct domain_device **src_dev)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	int ex_change_count = -1;
+	int res;
+
+	res = sas_get_ex_change_count(dev, &ex_change_count);
+	if (res)
+		goto out;
+	if (ex_change_count != -1 &&
+	    ex_change_count != ex->ex_change_count) {
+		*src_dev = dev;
+		ex->ex_change_count = ex_change_count;
+	} else {
+		struct domain_device *ch;
+
+		list_for_each_entry(ch, &ex->children, siblings) {
+			if (ch->dev_type == EDGE_DEV ||
+			    ch->dev_type == FANOUT_DEV) {
+				res = sas_find_bcast_dev(ch, src_dev);
+				if (*src_dev)
+					return res;
+			}
+		}
+	}
+out:
+	return res;
+}
+
+static void sas_unregister_ex_tree(struct domain_device *dev)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct domain_device *child, *n;
+
+	list_for_each_entry_safe(child, n, &ex->children, siblings) {
+		if (child->dev_type == EDGE_DEV ||
+		    child->dev_type == FANOUT_DEV)
+			sas_unregister_ex_tree(child);
+		else
+			sas_unregister_dev(child);
+	}
+	sas_unregister_dev(dev);
+}
+
+static void sas_unregister_devs_sas_addr(struct domain_device *parent,
+					 int phy_id)
+{
+	struct expander_device *ex_dev = &parent->ex_dev;
+	struct ex_phy *phy = &ex_dev->ex_phy[phy_id];
+	struct domain_device *child, *n;
+
+	list_for_each_entry_safe(child, n, &ex_dev->children, siblings) {
+		if (SAS_ADDR(child->sas_addr) ==
+		    SAS_ADDR(phy->attached_sas_addr)) {
+			if (child->dev_type == EDGE_DEV ||
+			    child->dev_type == FANOUT_DEV)
+				sas_unregister_ex_tree(child);
+			else
+				sas_unregister_dev(child);
+			break;
+		}
+	}
+	sas_disable_routing(parent, phy->attached_sas_addr);
+	memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE);
+	sas_port_delete_phy(phy->port, phy->phy);
+	if (phy->port->num_phys == 0)
+		sas_port_delete(phy->port);
+	phy->port = NULL;
+}
+
+static int sas_discover_bfs_by_root_level(struct domain_device *root,
+					  const int level)
+{
+	struct expander_device *ex_root = &root->ex_dev;
+	struct domain_device *child;
+	int res = 0;
+
+	list_for_each_entry(child, &ex_root->children, siblings) {
+		if (child->dev_type == EDGE_DEV ||
+		    child->dev_type == FANOUT_DEV) {
+			struct sas_expander_device *ex =
+				rphy_to_expander_device(child->rphy);
+
+			if (level > ex->level)
+				res = sas_discover_bfs_by_root_level(child,
+								     level);
+			else if (level == ex->level)
+				res = sas_ex_discover_devices(child, -1);
+		}
+	}
+	return res;
+}
+
+static int sas_discover_bfs_by_root(struct domain_device *dev)
+{
+	int res;
+	struct sas_expander_device *ex = rphy_to_expander_device(dev->rphy);
+	int level = ex->level+1;
+
+	res = sas_ex_discover_devices(dev, -1);
+	if (res)
+		goto out;
+	do {
+		res = sas_discover_bfs_by_root_level(dev, level);
+		mb();
+		level += 1;
+	} while (level <= dev->port->disc.max_level);
+out:
+	return res;
+}
+
+static int sas_discover_new(struct domain_device *dev, int phy_id)
+{
+	struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id];
+	struct domain_device *child;
+	int res;
+
+	SAS_DPRINTK("ex %016llx phy%d new device attached\n",
+		    SAS_ADDR(dev->sas_addr), phy_id);
+	res = sas_ex_phy_discover(dev, phy_id);
+	if (res)
+		goto out;
+	res = sas_ex_discover_devices(dev, phy_id);
+	if (res)
+		goto out;
+	list_for_each_entry(child, &dev->ex_dev.children, siblings) {
+		if (SAS_ADDR(child->sas_addr) ==
+		    SAS_ADDR(ex_phy->attached_sas_addr)) {
+			if (child->dev_type == EDGE_DEV ||
+			    child->dev_type == FANOUT_DEV)
+				res = sas_discover_bfs_by_root(child);
+			break;
+		}
+	}
+out:
+	return res;
+}
+
+static int sas_rediscover_dev(struct domain_device *dev, int phy_id)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *phy = &ex->ex_phy[phy_id];
+	u8 attached_sas_addr[8];
+	int res;
+
+	res = sas_get_phy_attached_sas_addr(dev, phy_id, attached_sas_addr);
+	switch (res) {
+	case SMP_RESP_NO_PHY:
+		phy->phy_state = PHY_NOT_PRESENT;
+		sas_unregister_devs_sas_addr(dev, phy_id);
+		goto out;
+	case SMP_RESP_PHY_VACANT:
+		phy->phy_state = PHY_VACANT;
+		sas_unregister_devs_sas_addr(dev, phy_id);
+		goto out;
+	case SMP_RESP_FUNC_ACC:
+		break;
+	}
+
+	if (SAS_ADDR(attached_sas_addr) == 0) {
+		phy->phy_state = PHY_EMPTY;
+		sas_unregister_devs_sas_addr(dev, phy_id);
+	} else if (SAS_ADDR(attached_sas_addr) ==
+		   SAS_ADDR(phy->attached_sas_addr)) {
+		SAS_DPRINTK("ex %016llx phy 0x%x broadcast flutter\n",
+			    SAS_ADDR(dev->sas_addr), phy_id);
+	} else
+		res = sas_discover_new(dev, phy_id);
+out:
+	return res;
+}
+
+static int sas_rediscover(struct domain_device *dev, const int phy_id)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *changed_phy = &ex->ex_phy[phy_id];
+	int res = 0;
+	int i;
+
+	SAS_DPRINTK("ex %016llx phy%d originated BROADCAST(CHANGE)\n",
+		    SAS_ADDR(dev->sas_addr), phy_id);
+
+	if (SAS_ADDR(changed_phy->attached_sas_addr) != 0) {
+		for (i = 0; i < ex->num_phys; i++) {
+			struct ex_phy *phy = &ex->ex_phy[i];
+
+			if (i == phy_id)
+				continue;
+			if (SAS_ADDR(phy->attached_sas_addr) ==
+			    SAS_ADDR(changed_phy->attached_sas_addr)) {
+				SAS_DPRINTK("phy%d part of wide port with "
+					    "phy%d\n", phy_id, i);
+				goto out;
+			}
+		}
+		res = sas_rediscover_dev(dev, phy_id);
+	} else
+		res = sas_discover_new(dev, phy_id);
+out:
+	return res;
+}
+
+/**
+ * sas_revalidate_domain -- revalidate the domain
+ * @port: port to the domain of interest
+ *
+ * NOTE: this process _must_ quit (return) as soon as any connection
+ * errors are encountered. Connection recovery is done elsewhere.
+ * Discover process only interrogates devices in order to discover the
+ * domain.
+ */
+int sas_ex_revalidate_domain(struct domain_device *port_dev)
+{
+	int res;
+	struct domain_device *dev = NULL;
+
+	res = sas_find_bcast_dev(port_dev, &dev);
+	if (res)
+		goto out;
+	if (dev) {
+		struct expander_device *ex = &dev->ex_dev;
+		int i = 0, phy_id;
+
+		do {
+			phy_id = -1;
+			res = sas_find_bcast_phy(dev, &phy_id, i);
+			if (phy_id == -1)
+				break;
+			res = sas_rediscover(dev, phy_id);
+			i = phy_id + 1;
+		} while (i < ex->num_phys);
+	}
+out:
+	return res;
+}
+
+#if 0
+/* ---------- SMP portal ---------- */
+
+static ssize_t smp_portal_write(struct kobject *kobj, char *buf, loff_t offs,
+				size_t size)
+{
+	struct domain_device *dev = to_dom_device(kobj);
+	struct expander_device *ex = &dev->ex_dev;
+
+	if (offs != 0)
+		return -EFBIG;
+	else if (size == 0)
+		return 0;
+
+	down_interruptible(&ex->smp_sema);
+	if (ex->smp_req)
+		kfree(ex->smp_req);
+	ex->smp_req = kzalloc(size, GFP_USER);
+	if (!ex->smp_req) {
+		up(&ex->smp_sema);
+		return -ENOMEM;
+	}
+	memcpy(ex->smp_req, buf, size);
+	ex->smp_req_size = size;
+	ex->smp_portal_pid = current->pid;
+	up(&ex->smp_sema);
+
+	return size;
+}
+
+static ssize_t smp_portal_read(struct kobject *kobj, char *buf, loff_t offs,
+			       size_t size)
+{
+	struct domain_device *dev = to_dom_device(kobj);
+	struct expander_device *ex = &dev->ex_dev;
+	u8 *smp_resp;
+	int res = -EINVAL;
+
+	/* XXX: sysfs gives us an offset of 0x10 or 0x8 while in fact
+	 * it should be 0.
+	 */
+
+	down_interruptible(&ex->smp_sema);
+	if (!ex->smp_req || ex->smp_portal_pid != current->pid)
+		goto out;
+
+	res = 0;
+	if (size == 0)
+		goto out;
+
+	res = -ENOMEM;
+	smp_resp = alloc_smp_resp(size);
+	if (!smp_resp)
+		goto out;
+	res = smp_execute_task(dev, ex->smp_req, ex->smp_req_size,
+			       smp_resp, size);
+	if (!res) {
+		memcpy(buf, smp_resp, size);
+		res = size;
+	}
+
+	kfree(smp_resp);
+out:
+	kfree(ex->smp_req);
+	ex->smp_req = NULL;
+	ex->smp_req_size = 0;
+	ex->smp_portal_pid = -1;
+	up(&ex->smp_sema);
+	return res;
+}
+#endif
diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_init.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_init.c
--- linux-2.6.18/drivers/scsi/libsas/sas_init.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_init.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,227 @@
+/*
+ * Serial Attached SCSI (SAS) Transport Layer initialization
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/spinlock.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_sas.h>
+
+#include "sas_internal.h"
+
+#include "../scsi_sas_internal.h"
+
+kmem_cache_t *sas_task_cache;
+
+/*------------ SAS addr hash -----------*/
+void sas_hash_addr(u8 *hashed, const u8 *sas_addr)
+{
+	const u32 poly = 0x00DB2777;
+	u32 r = 0;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		int b;
+		for (b = 7; b >= 0; b--) {
+			r <<= 1;
+			if ((1 << b) & sas_addr[i]) {
+				if (!(r & 0x01000000))
+					r ^= poly;
+			} else if (r & 0x01000000)
+				r ^= poly;
+		}
+	}
+
+	hashed[0] = (r >> 16) & 0xFF;
+	hashed[1] = (r >> 8) & 0xFF;
+	hashed[2] = r & 0xFF;
+}
+
+/* ---------- HA events ---------- */
+
+void sas_hae_reset(void *data)
+{
+	struct sas_ha_struct *ha = data;
+
+	sas_begin_event(HAE_RESET, &ha->event_lock,
+			&ha->pending);
+}
+
+int sas_register_ha(struct sas_ha_struct *sas_ha)
+{
+	int error = 0;
+
+	spin_lock_init(&sas_ha->phy_port_lock);
+	sas_hash_addr(sas_ha->hashed_sas_addr, sas_ha->sas_addr);
+
+	if (sas_ha->lldd_queue_size == 0)
+		sas_ha->lldd_queue_size = 1;
+	else if (sas_ha->lldd_queue_size == -1)
+		sas_ha->lldd_queue_size = 128; /* Sanity */
+
+	error = sas_register_phys(sas_ha);
+	if (error) {
+		printk(KERN_NOTICE "couldn't register sas phys:%d\n", error);
+		return error;
+	}
+
+	error = sas_register_ports(sas_ha);
+	if (error) {
+		printk(KERN_NOTICE "couldn't register sas ports:%d\n", error);
+		goto Undo_phys;
+	}
+
+	error = sas_init_events(sas_ha);
+	if (error) {
+		printk(KERN_NOTICE "couldn't start event thread:%d\n", error);
+		goto Undo_ports;
+	}
+
+	if (sas_ha->lldd_max_execute_num > 1) {
+		error = sas_init_queue(sas_ha);
+		if (error) {
+			printk(KERN_NOTICE "couldn't start queue thread:%d, "
+			       "running in direct mode\n", error);
+			sas_ha->lldd_max_execute_num = 1;
+		}
+	}
+
+	return 0;
+
+Undo_ports:
+	sas_unregister_ports(sas_ha);
+Undo_phys:
+
+	return error;
+}
+
+int sas_unregister_ha(struct sas_ha_struct *sas_ha)
+{
+	if (sas_ha->lldd_max_execute_num > 1) {
+		sas_shutdown_queue(sas_ha);
+	}
+
+	sas_unregister_ports(sas_ha);
+
+	return 0;
+}
+
+static int sas_get_linkerrors(struct sas_phy *phy)
+{
+	if (scsi_is_sas_phy_local(phy))
+		/* FIXME: we have no local phy stats
+		 * gathering at this time */
+		return -EINVAL;
+
+	return sas_smp_get_phy_events(phy);
+}
+
+static int sas_phy_reset(struct sas_phy *phy, int hard_reset)
+{
+	int ret;
+	enum phy_func reset_type;
+
+	if (hard_reset)
+		reset_type = PHY_FUNC_HARD_RESET;
+	else
+		reset_type = PHY_FUNC_LINK_RESET;
+
+	if (scsi_is_sas_phy_local(phy)) {
+		struct Scsi_Host *shost = dev_to_shost(phy->dev.parent);
+		struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost);
+		struct asd_sas_phy *asd_phy = sas_ha->sas_phy[phy->number];
+		struct sas_internal *i =
+			to_sas_internal(sas_ha->core.shost->transportt);
+
+		ret = i->dft->lldd_control_phy(asd_phy, reset_type);
+	} else {
+		struct sas_rphy *rphy = dev_to_rphy(phy->dev.parent);
+		struct domain_device *ddev = sas_find_dev_by_rphy(rphy);
+		ret = sas_smp_phy_control(ddev, phy->number, reset_type);
+	}
+	return ret;
+}
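For orientation, roughly how a low-level driver consumes this API: attach the domain transport template, then register the host adapter. Everything prefixed my_ below is hypothetical; the function-template field names and call signatures are taken from their use sites in this patch (lldd_execute_task, lldd_control_phy), not from a header shown here.

/* Hypothetical LLDD bring-up sketch -- not part of this patch. */
static int my_execute_task(struct sas_task *task, int num, gfp_t gfp_flags);
static int my_control_phy(struct asd_sas_phy *phy, enum phy_func func);

static struct sas_domain_function_template my_dft = {
	.lldd_execute_task = my_execute_task,
	.lldd_control_phy = my_control_phy,
};

static struct sas_ha_struct my_sas_ha; /* sas_addr, num_phys etc. set by probe */

static int my_attach(void)
{
	struct scsi_transport_template *stt;

	stt = sas_domain_attach_transport(&my_dft);
	if (!stt)
		return -ENOMEM;

	/* stt would normally be handed to scsi_host_alloc(); elided here */
	return sas_register_ha(&my_sas_ha);
}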
+
+static struct sas_function_template sft = {
+	.phy_reset = sas_phy_reset,
+	.get_linkerrors = sas_get_linkerrors,
+};
+
+struct scsi_transport_template *
+sas_domain_attach_transport(struct sas_domain_function_template *dft)
+{
+	struct scsi_transport_template *stt = sas_attach_transport(&sft);
+	struct sas_internal *i;
+
+	if (!stt)
+		return stt;
+
+	i = to_sas_internal(stt);
+	i->dft = dft;
+	stt->create_work_queue = 1;
+	stt->eh_timed_out = sas_scsi_timed_out;
+	stt->eh_strategy_handler = sas_scsi_recover_host;
+
+	return stt;
+}
+EXPORT_SYMBOL_GPL(sas_domain_attach_transport);
+
+void sas_domain_release_transport(struct scsi_transport_template *stt)
+{
+	sas_release_transport(stt);
+}
+EXPORT_SYMBOL_GPL(sas_domain_release_transport);
+
+/* ---------- SAS Class register/unregister ---------- */
+
+static int __init sas_class_init(void)
+{
+	sas_task_cache = kmem_cache_create("sas_task", sizeof(struct sas_task),
+					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!sas_task_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit sas_class_exit(void)
+{
+	kmem_cache_destroy(sas_task_cache);
+}
+
+MODULE_AUTHOR("Luben Tuikov <luben_tuikov@adaptec.com>");
+MODULE_DESCRIPTION("SAS Transport Layer");
+MODULE_LICENSE("GPL v2");
+
+module_init(sas_class_init);
+module_exit(sas_class_exit);
+
+EXPORT_SYMBOL_GPL(sas_register_ha);
+EXPORT_SYMBOL_GPL(sas_unregister_ha);
diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_internal.h linux-2.6.18.ovz/drivers/scsi/libsas/sas_internal.h
--- linux-2.6.18/drivers/scsi/libsas/sas_internal.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_internal.h 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,146 @@
+/*
+ * Serial Attached SCSI (SAS) class internal header file
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#ifndef _SAS_INTERNAL_H_
+#define _SAS_INTERNAL_H_
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_transport_sas.h>
+#include <scsi/libsas.h>
+
+#define sas_printk(fmt, ...) printk(KERN_NOTICE "sas: " fmt, ## __VA_ARGS__)
+
+#ifdef SAS_DEBUG
+#define SAS_DPRINTK(fmt, ...) printk(KERN_NOTICE "sas: " fmt, ## __VA_ARGS__)
+#else
+#define SAS_DPRINTK(fmt, ...)
+#endif
+
+void sas_scsi_recover_host(struct Scsi_Host *shost);
+
+int sas_show_class(enum sas_class class, char *buf);
+int sas_show_proto(enum sas_proto proto, char *buf);
+int sas_show_linkrate(enum sas_phy_linkrate linkrate, char *buf);
+int sas_show_oob_mode(enum sas_oob_mode oob_mode, char *buf);
+
+int sas_register_phys(struct sas_ha_struct *sas_ha);
+void sas_unregister_phys(struct sas_ha_struct *sas_ha);
+
+int sas_register_ports(struct sas_ha_struct *sas_ha);
+void sas_unregister_ports(struct sas_ha_struct *sas_ha);
+
+enum scsi_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *);
+
+int sas_init_queue(struct sas_ha_struct *sas_ha);
+int sas_init_events(struct sas_ha_struct *sas_ha);
+void sas_shutdown_queue(struct sas_ha_struct *sas_ha);
+
+void sas_deform_port(struct asd_sas_phy *phy);
+
+void sas_porte_bytes_dmaed(void *);
+void sas_porte_broadcast_rcvd(void *);
+void sas_porte_link_reset_err(void *);
+void sas_porte_timer_event(void *);
+void sas_porte_hard_reset(void *);
+
+int sas_notify_lldd_dev_found(struct domain_device *);
+void sas_notify_lldd_dev_gone(struct domain_device *);
+
+int sas_smp_phy_control(struct domain_device *dev, int phy_id,
+			enum phy_func phy_func);
+int sas_smp_get_phy_events(struct sas_phy *phy);
+
+struct domain_device *sas_find_dev_by_rphy(struct sas_rphy *rphy);
+
+void sas_hae_reset(void *);
+
+static inline void sas_queue_event(int event, spinlock_t *lock,
+				   unsigned long *pending,
+				   struct work_struct *work,
+				   struct Scsi_Host *shost)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(lock, flags);
+	if (test_bit(event, pending)) {
+		spin_unlock_irqrestore(lock, flags);
+		return;
+	}
+	__set_bit(event, pending);
+	spin_unlock_irqrestore(lock, flags);
+	scsi_queue_work(shost, work);
+}
+
+static inline void sas_begin_event(int event, spinlock_t *lock,
+				   unsigned long *pending)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(lock, flags);
+	__clear_bit(event, pending);
+	spin_unlock_irqrestore(lock, flags);
+}
+
+static inline void sas_fill_in_rphy(struct domain_device *dev,
+				    struct sas_rphy *rphy)
+{
+	rphy->identify.sas_address = SAS_ADDR(dev->sas_addr);
+	rphy->identify.initiator_port_protocols = dev->iproto;
+	rphy->identify.target_port_protocols = dev->tproto;
+	switch (dev->dev_type) {
+	case SATA_DEV:
+		/* FIXME: need sata device type */
+	case SAS_END_DEV:
+		rphy->identify.device_type = SAS_END_DEVICE;
+		break;
+	case EDGE_DEV:
+		rphy->identify.device_type = SAS_EDGE_EXPANDER_DEVICE;
+		break;
+	case FANOUT_DEV:
+		rphy->identify.device_type = SAS_FANOUT_EXPANDER_DEVICE;
+		break;
+	default:
+		rphy->identify.device_type = SAS_PHY_UNUSED;
+		break;
+	}
+}
+
+static inline void sas_add_parent_port(struct domain_device *dev, int phy_id)
+{
+	struct expander_device *ex = &dev->ex_dev;
+	struct ex_phy *ex_phy = &ex->ex_phy[phy_id];
+
+	if (!ex->parent_port) {
+		ex->parent_port = sas_port_alloc(&dev->rphy->dev, phy_id);
+		/* FIXME: error handling */
+		BUG_ON(!ex->parent_port);
+		BUG_ON(sas_port_add(ex->parent_port));
+		sas_port_mark_backlink(ex->parent_port);
+	}
+	sas_port_add_phy(ex->parent_port, ex_phy->phy);
+}
+
+#endif /* _SAS_INTERNAL_H_ */
diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_phy.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_phy.c
--- linux-2.6.18/drivers/scsi/libsas/sas_phy.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_phy.c 2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,157 @@
+/*
+ * Serial Attached SCSI (SAS) Phy class
+ *
+ * Copyright (C) 2005 Adaptec, Inc. All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "sas_internal.h"
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_sas.h>
+#include "../scsi_sas_internal.h"
+
+/* ---------- Phy events ---------- */
+
+static void sas_phye_loss_of_signal(void *data)
+{
+	struct asd_sas_phy *phy = data;
+
+	sas_begin_event(PHYE_LOSS_OF_SIGNAL, &phy->ha->event_lock,
+			&phy->phy_events_pending);
+	phy->error = 0;
+	sas_deform_port(phy);
+}
+
+static void sas_phye_oob_done(void *data)
+{
+	struct asd_sas_phy *phy = data;
+
+	sas_begin_event(PHYE_OOB_DONE, &phy->ha->event_lock,
+			&phy->phy_events_pending);
+	phy->error = 0;
+}
+
+static void sas_phye_oob_error(void *data)
+{
+	struct asd_sas_phy *phy = data;
+	struct sas_ha_struct *sas_ha = phy->ha;
+	struct asd_sas_port *port = phy->port;
+	struct sas_internal *i =
+		to_sas_internal(sas_ha->core.shost->transportt);
+
+	sas_begin_event(PHYE_OOB_ERROR, &phy->ha->event_lock,
+			&phy->phy_events_pending);
+
+	sas_deform_port(phy);
+
+	if (!port && phy->enabled && i->dft->lldd_control_phy) {
+		phy->error++;
+		switch (phy->error) {
+		case 1:
+		case 2:
+			i->dft->lldd_control_phy(phy, PHY_FUNC_HARD_RESET);
+			break;
+		case 3:
+		default:
+			phy->error = 0;
+			phy->enabled = 0;
+			i->dft->lldd_control_phy(phy, PHY_FUNC_DISABLE);
+			break;
+		}
+	}
+}
+
+static void sas_phye_spinup_hold(void *data)
+{
+	struct asd_sas_phy *phy = data;
+	struct sas_ha_struct *sas_ha = phy->ha;
+	struct sas_internal *i =
+		to_sas_internal(sas_ha->core.shost->transportt);
+
+	sas_begin_event(PHYE_SPINUP_HOLD, &phy->ha->event_lock,
+			&phy->phy_events_pending);
+
+	phy->error = 0;
+	i->dft->lldd_control_phy(phy, PHY_FUNC_RELEASE_SPINUP_HOLD);
+}
+
+/* ---------- Phy class registration ---------- */
+
+int sas_register_phys(struct sas_ha_struct *sas_ha)
+{
+	int i;
+
+	static void (*sas_phy_event_fns[PHY_NUM_EVENTS])(void *) = {
+		[PHYE_LOSS_OF_SIGNAL] = sas_phye_loss_of_signal,
+		[PHYE_OOB_DONE] = sas_phye_oob_done,
+		[PHYE_OOB_ERROR] = sas_phye_oob_error,
+		[PHYE_SPINUP_HOLD] = sas_phye_spinup_hold,
+	};
+
+	static void (*sas_port_event_fns[PORT_NUM_EVENTS])(void *) = {
+		[PORTE_BYTES_DMAED] = sas_porte_bytes_dmaed,
+		[PORTE_BROADCAST_RCVD] = sas_porte_broadcast_rcvd,
+		[PORTE_LINK_RESET_ERR] = sas_porte_link_reset_err,
+		[PORTE_TIMER_EVENT] = sas_porte_timer_event,
+		[PORTE_HARD_RESET] = sas_porte_hard_reset,
+	};
+
+	/* Now register the phys.
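+	 * For each phy: wire its port and phy event work structs to the
+	 * handler tables above, initialize its locks and list heads, and
+	 * allocate a sas_phy transport object with default link rates.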
+	 */
+	for (i = 0; i < sas_ha->num_phys; i++) {
+		int k;
+		struct asd_sas_phy *phy = sas_ha->sas_phy[i];
+
+		phy->error = 0;
+		INIT_LIST_HEAD(&phy->port_phy_el);
+		for (k = 0; k < PORT_NUM_EVENTS; k++)
+			INIT_WORK(&phy->port_events[k], sas_port_event_fns[k],
+				  phy);
+
+		for (k = 0; k < PHY_NUM_EVENTS; k++)
+			INIT_WORK(&phy->phy_events[k], sas_phy_event_fns[k],
+				  phy);
+		phy->port = NULL;
+		phy->ha = sas_ha;
+		spin_lock_init(&phy->frame_rcvd_lock);
+		spin_lock_init(&phy->sas_prim_lock);
+		phy->frame_rcvd_size = 0;
+
+		phy->phy = sas_phy_alloc(&sas_ha->core.shost->shost_gendev,
+					 i);
+		if (!phy->phy)
+			return -ENOMEM;
+
+		phy->phy->identify.initiator_port_protocols =
+			phy->iproto;
+		phy->phy->identify.target_port_protocols = phy->tproto;
+		phy->phy->identify.sas_address = SAS_ADDR(sas_ha->sas_addr);
+		phy->phy->identify.phy_identifier = i;
+		phy->phy->minimum_linkrate_hw = SAS_LINK_RATE_1_5_GBPS;
+		phy->phy->maximum_linkrate_hw = SAS_LINK_RATE_3_0_GBPS;
+		phy->phy->minimum_linkrate = SAS_LINK_RATE_1_5_GBPS;
+		phy->phy->maximum_linkrate = SAS_LINK_RATE_3_0_GBPS;
+		phy->phy->negotiated_linkrate = SAS_LINK_RATE_UNKNOWN;
+
+		sas_phy_add(phy->phy);
+	}
+
+	return 0;
+}
diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_port.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_port.c
--- linux-2.6.18/drivers/scsi/libsas/sas_port.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_port.c	2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,279 @@
+/*
+ * Serial Attached SCSI (SAS) Port class
+ *
+ * Copyright (C) 2005 Adaptec, Inc.  All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "sas_internal.h"
+
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_sas.h>
+#include "../scsi_sas_internal.h"
+
+/**
+ * sas_form_port -- add this phy to a port
+ * @phy: the phy of interest
+ *
+ * This function adds this phy to an existing port, thus creating a wide
+ * port, or it creates a port and adds the phy to the port.
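+ *
+ * A phy joins an existing port only when that port already has at
+ * least one phy and the same attached SAS address (forming a wide
+ * port); otherwise the first free port slot -- no phys, zero SAS
+ * address -- is claimed for a new port.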
+ */ +static void sas_form_port(struct asd_sas_phy *phy) +{ + int i; + struct sas_ha_struct *sas_ha = phy->ha; + struct asd_sas_port *port = phy->port; + struct sas_internal *si = + to_sas_internal(sas_ha->core.shost->transportt); + + if (port) { + if (memcmp(port->attached_sas_addr, phy->attached_sas_addr, + SAS_ADDR_SIZE) == 0) + sas_deform_port(phy); + else { + SAS_DPRINTK("%s: phy%d belongs to port%d already(%d)!\n", + __FUNCTION__, phy->id, phy->port->id, + phy->port->num_phys); + return; + } + } + + /* find a port */ + spin_lock(&sas_ha->phy_port_lock); + for (i = 0; i < sas_ha->num_phys; i++) { + port = sas_ha->sas_port[i]; + spin_lock(&port->phy_list_lock); + if (*(u64 *) port->sas_addr && + memcmp(port->attached_sas_addr, + phy->attached_sas_addr, SAS_ADDR_SIZE) == 0 && + port->num_phys > 0) { + /* wide port */ + SAS_DPRINTK("phy%d matched wide port%d\n", phy->id, + port->id); + break; + } else if (*(u64 *) port->sas_addr == 0 && port->num_phys==0) { + memcpy(port->sas_addr, phy->sas_addr, SAS_ADDR_SIZE); + break; + } + spin_unlock(&port->phy_list_lock); + } + + if (i >= sas_ha->num_phys) { + printk(KERN_NOTICE "%s: couldn't find a free port, bug?\n", + __FUNCTION__); + spin_unlock(&sas_ha->phy_port_lock); + return; + } + + /* add the phy to the port */ + list_add_tail(&phy->port_phy_el, &port->phy_list); + phy->port = port; + port->num_phys++; + port->phy_mask |= (1U << phy->id); + + if (!port->phy) + port->phy = phy->phy; + + SAS_DPRINTK("phy%d added to port%d, phy_mask:0x%x\n", phy->id, + port->id, port->phy_mask); + + if (*(u64 *)port->attached_sas_addr == 0) { + port->class = phy->class; + memcpy(port->attached_sas_addr, phy->attached_sas_addr, + SAS_ADDR_SIZE); + port->iproto = phy->iproto; + port->tproto = phy->tproto; + port->oob_mode = phy->oob_mode; + port->linkrate = phy->linkrate; + } else + port->linkrate = max(port->linkrate, phy->linkrate); + spin_unlock(&port->phy_list_lock); + spin_unlock(&sas_ha->phy_port_lock); + + if (!port->port) { + port->port = sas_port_alloc(phy->phy->dev.parent, port->id); + BUG_ON(!port->port); + sas_port_add(port->port); + } + sas_port_add_phy(port->port, phy->phy); + + if (port->port_dev) + port->port_dev->pathways = port->num_phys; + + /* Tell the LLDD about this port formation. */ + if (si->dft->lldd_port_formed) + si->dft->lldd_port_formed(phy); + + sas_discover_event(phy->port, DISCE_DISCOVER_DOMAIN); +} + +/** + * sas_deform_port -- remove this phy from the port it belongs to + * @phy: the phy of interest + * + * This is called when the physical link to the other phy has been + * lost (on this phy), in Event thread context. We cannot delay here. 
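+ *
+ * If this was the last phy in the port, the domain devices are
+ * unregistered and the transport port is deleted; otherwise only this
+ * phy is removed from the wide port.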
+ */ +void sas_deform_port(struct asd_sas_phy *phy) +{ + struct sas_ha_struct *sas_ha = phy->ha; + struct asd_sas_port *port = phy->port; + struct sas_internal *si = + to_sas_internal(sas_ha->core.shost->transportt); + + if (!port) + return; /* done by a phy event */ + + if (port->port_dev) + port->port_dev->pathways--; + + if (port->num_phys == 1) { + sas_unregister_domain_devices(port); + sas_port_delete(port->port); + port->port = NULL; + } else + sas_port_delete_phy(port->port, phy->phy); + + + if (si->dft->lldd_port_deformed) + si->dft->lldd_port_deformed(phy); + + spin_lock(&sas_ha->phy_port_lock); + spin_lock(&port->phy_list_lock); + + list_del_init(&phy->port_phy_el); + phy->port = NULL; + port->num_phys--; + port->phy_mask &= ~(1U << phy->id); + + if (port->num_phys == 0) { + INIT_LIST_HEAD(&port->phy_list); + memset(port->sas_addr, 0, SAS_ADDR_SIZE); + memset(port->attached_sas_addr, 0, SAS_ADDR_SIZE); + port->class = 0; + port->iproto = 0; + port->tproto = 0; + port->oob_mode = 0; + port->phy_mask = 0; + } + spin_unlock(&port->phy_list_lock); + spin_unlock(&sas_ha->phy_port_lock); + + return; +} + +/* ---------- SAS port events ---------- */ + +void sas_porte_bytes_dmaed(void *data) +{ + struct asd_sas_phy *phy = data; + + sas_begin_event(PORTE_BYTES_DMAED, &phy->ha->event_lock, + &phy->port_events_pending); + + sas_form_port(phy); +} + +void sas_porte_broadcast_rcvd(void *data) +{ + unsigned long flags; + u32 prim; + struct asd_sas_phy *phy = data; + + sas_begin_event(PORTE_BROADCAST_RCVD, &phy->ha->event_lock, + &phy->port_events_pending); + + spin_lock_irqsave(&phy->sas_prim_lock, flags); + prim = phy->sas_prim; + spin_unlock_irqrestore(&phy->sas_prim_lock, flags); + + SAS_DPRINTK("broadcast received: %d\n", prim); + sas_discover_event(phy->port, DISCE_REVALIDATE_DOMAIN); +} + +void sas_porte_link_reset_err(void *data) +{ + struct asd_sas_phy *phy = data; + + sas_begin_event(PORTE_LINK_RESET_ERR, &phy->ha->event_lock, + &phy->port_events_pending); + + sas_deform_port(phy); +} + +void sas_porte_timer_event(void *data) +{ + struct asd_sas_phy *phy = data; + + sas_begin_event(PORTE_TIMER_EVENT, &phy->ha->event_lock, + &phy->port_events_pending); + + sas_deform_port(phy); +} + +void sas_porte_hard_reset(void *data) +{ + struct asd_sas_phy *phy = data; + + sas_begin_event(PORTE_HARD_RESET, &phy->ha->event_lock, + &phy->port_events_pending); + + sas_deform_port(phy); +} + +/* ---------- SAS port registration ---------- */ + +static void sas_init_port(struct asd_sas_port *port, + struct sas_ha_struct *sas_ha, int i) +{ + port->id = i; + INIT_LIST_HEAD(&port->dev_list); + spin_lock_init(&port->phy_list_lock); + INIT_LIST_HEAD(&port->phy_list); + port->num_phys = 0; + port->phy_mask = 0; + port->ha = sas_ha; + + spin_lock_init(&port->dev_list_lock); +} + +int sas_register_ports(struct sas_ha_struct *sas_ha) +{ + int i; + + /* initialize the ports and discovery */ + for (i = 0; i < sas_ha->num_phys; i++) { + struct asd_sas_port *port = sas_ha->sas_port[i]; + + sas_init_port(port, sas_ha, i); + sas_init_disc(&port->disc, port); + } + return 0; +} + +void sas_unregister_ports(struct sas_ha_struct *sas_ha) +{ + int i; + + for (i = 0; i < sas_ha->num_phys; i++) + if (sas_ha->sas_phy[i]->port) + sas_deform_port(sas_ha->sas_phy[i]); + +} diff -uprN linux-2.6.18/drivers/scsi/libsas/sas_scsi_host.c linux-2.6.18.ovz/drivers/scsi/libsas/sas_scsi_host.c --- linux-2.6.18/drivers/scsi/libsas/sas_scsi_host.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/drivers/scsi/libsas/sas_scsi_host.c 
2007-06-13 06:55:08.000000000 -0400
@@ -0,0 +1,786 @@
+/*
+ * Serial Attached SCSI (SAS) class SCSI Host glue.
+ *
+ * Copyright (C) 2005 Adaptec, Inc.  All rights reserved.
+ * Copyright (C) 2005 Luben Tuikov <luben_tuikov@adaptec.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include "sas_internal.h"
+
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_sas.h>
+#include "../scsi_sas_internal.h"
+
+#include <linux/err.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+/* ---------- SCSI Host glue ---------- */
+
+#define TO_SAS_TASK(_scsi_cmd)  ((void *)(_scsi_cmd)->host_scribble)
+#define ASSIGN_SAS_TASK(_sc, _t) do { (_sc)->host_scribble = (void *) _t; } while (0)
+
+static void sas_scsi_task_done(struct sas_task *task)
+{
+	struct task_status_struct *ts = &task->task_status;
+	struct scsi_cmnd *sc = task->uldd_task;
+	unsigned ts_flags = task->task_state_flags;
+	int hs = 0, stat = 0;
+
+	if (unlikely(!sc)) {
+		SAS_DPRINTK("task_done called with non existing SCSI cmnd!\n");
+		list_del_init(&task->list);
+		sas_free_task(task);
+		return;
+	}
+
+	if (ts->resp == SAS_TASK_UNDELIVERED) {
+		/* transport error */
+		hs = DID_NO_CONNECT;
+	} else { /* ts->resp == SAS_TASK_COMPLETE */
+		/* task delivered, what happened afterwards? */
+		switch (ts->stat) {
+		case SAS_DEV_NO_RESPONSE:
+		case SAS_INTERRUPTED:
+		case SAS_PHY_DOWN:
+		case SAS_NAK_R_ERR:
+		case SAS_OPEN_TO:
+			hs = DID_NO_CONNECT;
+			break;
+		case SAS_DATA_UNDERRUN:
+			sc->resid = ts->residual;
+			if (sc->request_bufflen - sc->resid < sc->underflow)
+				hs = DID_ERROR;
+			break;
+		case SAS_DATA_OVERRUN:
+			hs = DID_ERROR;
+			break;
+		case SAS_QUEUE_FULL:
+			hs = DID_SOFT_ERROR; /* retry */
+			break;
+		case SAS_DEVICE_UNKNOWN:
+			hs = DID_BAD_TARGET;
+			break;
+		case SAS_SG_ERR:
+			hs = DID_PARITY;
+			break;
+		case SAS_OPEN_REJECT:
+			if (ts->open_rej_reason == SAS_OREJ_RSVD_RETRY)
+				hs = DID_SOFT_ERROR; /* retry */
+			else
+				hs = DID_ERROR;
+			break;
+		case SAS_PROTO_RESPONSE:
+			SAS_DPRINTK("LLDD:%s sent SAS_PROTO_RESP for an SSP "
+				    "task; please report this\n",
+				    task->dev->port->ha->sas_ha_name);
+			break;
+		case SAS_ABORTED_TASK:
+			hs = DID_ABORT;
+			break;
+		case SAM_CHECK_COND:
+			memcpy(sc->sense_buffer, ts->buf,
+			       max(SCSI_SENSE_BUFFERSIZE, ts->buf_valid_size));
+			stat = SAM_CHECK_COND;
+			break;
+		default:
+			stat = ts->stat;
+			break;
+		}
+	}
+	ASSIGN_SAS_TASK(sc, NULL);
+	sc->result = (hs << 16) | stat;
+	list_del_init(&task->list);
+	sas_free_task(task);
+	/* This is very ugly but this is how SCSI Core works.
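+	 * A command whose task was marked aborted (it timed out and is
+	 * owned by the error handler) is completed via
+	 * scsi_finish_command(); normal completions go back through the
+	 * ->scsi_done() callback.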
*/ + if (ts_flags & SAS_TASK_STATE_ABORTED) + scsi_finish_command(sc); + else + sc->scsi_done(sc); +} + +static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd) +{ + enum task_attribute ta = TASK_ATTR_SIMPLE; + if (cmd->request && blk_rq_tagged(cmd->request)) { + if (cmd->device->ordered_tags && + (cmd->request->flags & REQ_HARDBARRIER)) + ta = TASK_ATTR_HOQ; + } + return ta; +} + +static struct sas_task *sas_create_task(struct scsi_cmnd *cmd, + struct domain_device *dev, + unsigned long gfp_flags) +{ + struct sas_task *task = sas_alloc_task(gfp_flags); + struct scsi_lun lun; + + if (!task) + return NULL; + + *(u32 *)cmd->sense_buffer = 0; + task->uldd_task = cmd; + ASSIGN_SAS_TASK(cmd, task); + + task->dev = dev; + task->task_proto = task->dev->tproto; /* BUG_ON(!SSP) */ + + task->ssp_task.retry_count = 1; + int_to_scsilun(cmd->device->lun, &lun); + memcpy(task->ssp_task.LUN, &lun.scsi_lun, 8); + task->ssp_task.task_attr = sas_scsi_get_task_attr(cmd); + memcpy(task->ssp_task.cdb, cmd->cmnd, 16); + + task->scatter = cmd->request_buffer; + task->num_scatter = cmd->use_sg; + task->total_xfer_len = cmd->request_bufflen; + task->data_dir = cmd->sc_data_direction; + + task->task_done = sas_scsi_task_done; + + return task; +} + +static int sas_queue_up(struct sas_task *task) +{ + struct sas_ha_struct *sas_ha = task->dev->port->ha; + struct scsi_core *core = &sas_ha->core; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&core->task_queue_lock, flags); + if (sas_ha->lldd_queue_size < core->task_queue_size + 1) { + spin_unlock_irqrestore(&core->task_queue_lock, flags); + return -SAS_QUEUE_FULL; + } + list_add_tail(&task->list, &core->task_queue); + core->task_queue_size += 1; + spin_unlock_irqrestore(&core->task_queue_lock, flags); + up(&core->queue_thread_sema); + + return 0; +} + +/** + * sas_queuecommand -- Enqueue a command for processing + * @parameters: See SCSI Core documentation + * + * Note: XXX: Remove the host unlock/lock pair when SCSI Core can + * call us without holding an IRQ spinlock... + */ +int sas_queuecommand(struct scsi_cmnd *cmd, + void (*scsi_done)(struct scsi_cmnd *)) +{ + int res = 0; + struct domain_device *dev = cmd_to_domain_dev(cmd); + struct Scsi_Host *host = cmd->device->host; + struct sas_internal *i = to_sas_internal(host->transportt); + + spin_unlock_irq(host->host_lock); + + { + struct sas_ha_struct *sas_ha = dev->port->ha; + struct sas_task *task; + + res = -ENOMEM; + task = sas_create_task(cmd, dev, GFP_ATOMIC); + if (!task) + goto out; + + cmd->scsi_done = scsi_done; + /* Queue up, Direct Mode or Task Collector Mode. 
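+		 * lldd_max_execute_num < 2 selects Direct Mode: the task
+		 * is handed straight to the LLDD.  Otherwise it goes on
+		 * the scsi_core task queue for the Task Collector thread.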
*/ + if (sas_ha->lldd_max_execute_num < 2) + res = i->dft->lldd_execute_task(task, 1, GFP_ATOMIC); + else + res = sas_queue_up(task); + + /* Examine */ + if (res) { + SAS_DPRINTK("lldd_execute_task returned: %d\n", res); + ASSIGN_SAS_TASK(cmd, NULL); + sas_free_task(task); + if (res == -SAS_QUEUE_FULL) { + cmd->result = DID_SOFT_ERROR << 16; /* retry */ + res = 0; + scsi_done(cmd); + } + goto out; + } + } +out: + spin_lock_irq(host->host_lock); + return res; +} + +static void sas_scsi_clear_queue_lu(struct list_head *error_q, struct scsi_cmnd *my_cmd) +{ + struct scsi_cmnd *cmd, *n; + + list_for_each_entry_safe(cmd, n, error_q, eh_entry) { + if (cmd == my_cmd) + list_del_init(&cmd->eh_entry); + } +} + +static void sas_scsi_clear_queue_I_T(struct list_head *error_q, + struct domain_device *dev) +{ + struct scsi_cmnd *cmd, *n; + + list_for_each_entry_safe(cmd, n, error_q, eh_entry) { + struct domain_device *x = cmd_to_domain_dev(cmd); + + if (x == dev) + list_del_init(&cmd->eh_entry); + } +} + +static void sas_scsi_clear_queue_port(struct list_head *error_q, + struct asd_sas_port *port) +{ + struct scsi_cmnd *cmd, *n; + + list_for_each_entry_safe(cmd, n, error_q, eh_entry) { + struct domain_device *dev = cmd_to_domain_dev(cmd); + struct asd_sas_port *x = dev->port; + + if (x == port) + list_del_init(&cmd->eh_entry); + } +} + +enum task_disposition { + TASK_IS_DONE, + TASK_IS_ABORTED, + TASK_IS_AT_LU, + TASK_IS_NOT_AT_LU, +}; + +static enum task_disposition sas_scsi_find_task(struct sas_task *task) +{ + struct sas_ha_struct *ha = task->dev->port->ha; + unsigned long flags; + int i, res; + struct sas_internal *si = + to_sas_internal(task->dev->port->ha->core.shost->transportt); + + if (ha->lldd_max_execute_num > 1) { + struct scsi_core *core = &ha->core; + struct sas_task *t, *n; + + spin_lock_irqsave(&core->task_queue_lock, flags); + list_for_each_entry_safe(t, n, &core->task_queue, list) { + if (task == t) { + list_del_init(&t->list); + spin_unlock_irqrestore(&core->task_queue_lock, + flags); + SAS_DPRINTK("%s: task 0x%p aborted from " + "task_queue\n", + __FUNCTION__, task); + return TASK_IS_ABORTED; + } + } + spin_unlock_irqrestore(&core->task_queue_lock, flags); + } + + for (i = 0; i < 5; i++) { + SAS_DPRINTK("%s: aborting task 0x%p\n", __FUNCTION__, task); + res = si->dft->lldd_abort_task(task); + + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) { + spin_unlock_irqrestore(&task->task_state_lock, flags); + SAS_DPRINTK("%s: task 0x%p is done\n", __FUNCTION__, + task); + return TASK_IS_DONE; + } + spin_unlock_irqrestore(&task->task_state_lock, flags); + + if (res == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("%s: task 0x%p is aborted\n", + __FUNCTION__, task); + return TASK_IS_ABORTED; + } else if (si->dft->lldd_query_task) { + SAS_DPRINTK("%s: querying task 0x%p\n", + __FUNCTION__, task); + res = si->dft->lldd_query_task(task); + if (res == TMF_RESP_FUNC_SUCC) { + SAS_DPRINTK("%s: task 0x%p at LU\n", + __FUNCTION__, task); + return TASK_IS_AT_LU; + } else if (res == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("%s: task 0x%p not at LU\n", + __FUNCTION__, task); + return TASK_IS_NOT_AT_LU; + } + } + } + return res; +} + +static int sas_recover_lu(struct domain_device *dev, struct scsi_cmnd *cmd) +{ + int res = TMF_RESP_FUNC_FAILED; + struct scsi_lun lun; + struct sas_internal *i = + to_sas_internal(dev->port->ha->core.shost->transportt); + + int_to_scsilun(cmd->device->lun, &lun); + + SAS_DPRINTK("eh: device %llx LUN %x has the task\n", + 
SAS_ADDR(dev->sas_addr), + cmd->device->lun); + + if (i->dft->lldd_abort_task_set) + res = i->dft->lldd_abort_task_set(dev, lun.scsi_lun); + + if (res == TMF_RESP_FUNC_FAILED) { + if (i->dft->lldd_clear_task_set) + res = i->dft->lldd_clear_task_set(dev, lun.scsi_lun); + } + + if (res == TMF_RESP_FUNC_FAILED) { + if (i->dft->lldd_lu_reset) + res = i->dft->lldd_lu_reset(dev, lun.scsi_lun); + } + + return res; +} + +static int sas_recover_I_T(struct domain_device *dev) +{ + int res = TMF_RESP_FUNC_FAILED; + struct sas_internal *i = + to_sas_internal(dev->port->ha->core.shost->transportt); + + SAS_DPRINTK("I_T nexus reset for dev %016llx\n", + SAS_ADDR(dev->sas_addr)); + + if (i->dft->lldd_I_T_nexus_reset) + res = i->dft->lldd_I_T_nexus_reset(dev); + + return res; +} + +void sas_scsi_recover_host(struct Scsi_Host *shost) +{ + struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); + unsigned long flags; + LIST_HEAD(error_q); + struct scsi_cmnd *cmd, *n; + enum task_disposition res = TASK_IS_DONE; + int tmf_resp; + struct sas_internal *i = to_sas_internal(shost->transportt); + + spin_lock_irqsave(shost->host_lock, flags); + list_splice_init(&shost->eh_cmd_q, &error_q); + spin_unlock_irqrestore(shost->host_lock, flags); + + SAS_DPRINTK("Enter %s\n", __FUNCTION__); + + /* All tasks on this list were marked SAS_TASK_STATE_ABORTED + * by sas_scsi_timed_out() callback. + */ +Again: + SAS_DPRINTK("going over list...\n"); + list_for_each_entry_safe(cmd, n, &error_q, eh_entry) { + struct sas_task *task = TO_SAS_TASK(cmd); + + SAS_DPRINTK("trying to find task 0x%p\n", task); + list_del_init(&cmd->eh_entry); + res = sas_scsi_find_task(task); + + cmd->eh_eflags = 0; + shost->host_failed--; + + switch (res) { + case TASK_IS_DONE: + SAS_DPRINTK("%s: task 0x%p is done\n", __FUNCTION__, + task); + task->task_done(task); + continue; + case TASK_IS_ABORTED: + SAS_DPRINTK("%s: task 0x%p is aborted\n", + __FUNCTION__, task); + task->task_done(task); + continue; + case TASK_IS_AT_LU: + SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); + tmf_resp = sas_recover_lu(task->dev, cmd); + if (tmf_resp == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("dev %016llx LU %x is " + "recovered\n", + SAS_ADDR(task->dev), + cmd->device->lun); + task->task_done(task); + sas_scsi_clear_queue_lu(&error_q, cmd); + goto Again; + } + /* fallthrough */ + case TASK_IS_NOT_AT_LU: + SAS_DPRINTK("task 0x%p is not at LU: I_T recover\n", + task); + tmf_resp = sas_recover_I_T(task->dev); + if (tmf_resp == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("I_T %016llx recovered\n", + SAS_ADDR(task->dev->sas_addr)); + task->task_done(task); + sas_scsi_clear_queue_I_T(&error_q, task->dev); + goto Again; + } + /* Hammer time :-) */ + if (i->dft->lldd_clear_nexus_port) { + struct asd_sas_port *port = task->dev->port; + SAS_DPRINTK("clearing nexus for port:%d\n", + port->id); + res = i->dft->lldd_clear_nexus_port(port); + if (res == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("clear nexus port:%d " + "succeeded\n", port->id); + task->task_done(task); + sas_scsi_clear_queue_port(&error_q, + port); + goto Again; + } + } + if (i->dft->lldd_clear_nexus_ha) { + SAS_DPRINTK("clear nexus ha\n"); + res = i->dft->lldd_clear_nexus_ha(ha); + if (res == TMF_RESP_FUNC_COMPLETE) { + SAS_DPRINTK("clear nexus ha " + "succeeded\n"); + task->task_done(task); + goto out; + } + } + /* If we are here -- this means that no amount + * of effort could recover from errors. Quite + * possibly the HA just disappeared. 
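+			 * Task abort, LU recovery, I_T nexus reset and
+			 * clearing the nexus per port and per HA have all
+			 * been tried above.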
+ */ + SAS_DPRINTK("error from device %llx, LUN %x " + "couldn't be recovered in any way\n", + SAS_ADDR(task->dev->sas_addr), + cmd->device->lun); + + task->task_done(task); + goto clear_q; + } + } +out: + SAS_DPRINTK("--- Exit %s\n", __FUNCTION__); + return; +clear_q: + SAS_DPRINTK("--- Exit %s -- clear_q\n", __FUNCTION__); + list_for_each_entry_safe(cmd, n, &error_q, eh_entry) { + struct sas_task *task = TO_SAS_TASK(cmd); + list_del_init(&cmd->eh_entry); + task->task_done(task); + } +} + +enum scsi_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd) +{ + struct sas_task *task = TO_SAS_TASK(cmd); + unsigned long flags; + + if (!task) { + SAS_DPRINTK("command 0x%p, task 0x%p, timed out: EH_HANDLED\n", + cmd, task); + return EH_HANDLED; + } + + spin_lock_irqsave(&task->task_state_lock, flags); + if (task->task_state_flags & SAS_TASK_STATE_DONE) { + spin_unlock_irqrestore(&task->task_state_lock, flags); + SAS_DPRINTK("command 0x%p, task 0x%p, timed out: EH_HANDLED\n", + cmd, task); + return EH_HANDLED; + } + task->task_state_flags |= SAS_TASK_STATE_ABORTED; + spin_unlock_irqrestore(&task->task_state_lock, flags); + + SAS_DPRINTK("command 0x%p, task 0x%p, timed out: EH_NOT_HANDLED\n", + cmd, task); + + return EH_NOT_HANDLED; +} + +struct domain_device *sas_find_dev_by_rphy(struct sas_rphy *rphy) +{ + struct Scsi_Host *shost = dev_to_shost(rphy->dev.parent); + struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); + struct domain_device *found_dev = NULL; + int i; + + spin_lock(&ha->phy_port_lock); + for (i = 0; i < ha->num_phys; i++) { + struct asd_sas_port *port = ha->sas_port[i]; + struct domain_device *dev; + + spin_lock(&port->dev_list_lock); + list_for_each_entry(dev, &port->dev_list, dev_list_node) { + if (rphy == dev->rphy) { + found_dev = dev; + spin_unlock(&port->dev_list_lock); + goto found; + } + } + spin_unlock(&port->dev_list_lock); + } + found: + spin_unlock(&ha->phy_port_lock); + + return found_dev; +} + +static inline struct domain_device *sas_find_target(struct scsi_target *starget) +{ + struct sas_rphy *rphy = dev_to_rphy(starget->dev.parent); + + return sas_find_dev_by_rphy(rphy); +} + +int sas_target_alloc(struct scsi_target *starget) +{ + struct domain_device *found_dev = sas_find_target(starget); + + if (!found_dev) + return -ENODEV; + + starget->hostdata = found_dev; + return 0; +} + +#define SAS_DEF_QD 32 +#define SAS_MAX_QD 64 + +int sas_slave_configure(struct scsi_device *scsi_dev) +{ + struct domain_device *dev = sdev_to_domain_dev(scsi_dev); + struct sas_ha_struct *sas_ha; + + BUG_ON(dev->rphy->identify.device_type != SAS_END_DEVICE); + + sas_ha = dev->port->ha; + + sas_read_port_mode_page(scsi_dev); + + if (scsi_dev->tagged_supported) { + scsi_set_tag_type(scsi_dev, MSG_SIMPLE_TAG); + scsi_activate_tcq(scsi_dev, SAS_DEF_QD); + } else { + SAS_DPRINTK("device %llx, LUN %x doesn't support " + "TCQ\n", SAS_ADDR(dev->sas_addr), + scsi_dev->lun); + scsi_dev->tagged_supported = 0; + scsi_set_tag_type(scsi_dev, 0); + scsi_deactivate_tcq(scsi_dev, 1); + } + + return 0; +} + +void sas_slave_destroy(struct scsi_device *scsi_dev) +{ +} + +int sas_change_queue_depth(struct scsi_device *scsi_dev, int new_depth) +{ + int res = min(new_depth, SAS_MAX_QD); + + if (scsi_dev->tagged_supported) + scsi_adjust_queue_depth(scsi_dev, scsi_get_tag_type(scsi_dev), + res); + else { + struct domain_device *dev = sdev_to_domain_dev(scsi_dev); + sas_printk("device %llx LUN %x queue depth changed to 1\n", + SAS_ADDR(dev->sas_addr), + scsi_dev->lun); + scsi_adjust_queue_depth(scsi_dev, 0, 1); 
+ res = 1; + } + + return res; +} + +int sas_change_queue_type(struct scsi_device *scsi_dev, int qt) +{ + if (!scsi_dev->tagged_supported) + return 0; + + scsi_deactivate_tcq(scsi_dev, 1); + + scsi_set_tag_type(scsi_dev, qt); + scsi_activate_tcq(scsi_dev, scsi_dev->queue_depth); + + return qt; +} + +int sas_bios_param(struct scsi_device *scsi_dev, + struct block_device *bdev, + sector_t capacity, int *hsc) +{ + hsc[0] = 255; + hsc[1] = 63; + sector_div(capacity, 255*63); + hsc[2] = capacity; + + return 0; +} + +/* ---------- Task Collector Thread implementation ---------- */ + +static void sas_queue(struct sas_ha_struct *sas_ha) +{ + struct scsi_core *core = &sas_ha->core; + unsigned long flags; + LIST_HEAD(q); + int can_queue; + int res; + struct sas_internal *i = to_sas_internal(core->shost->transportt); + + spin_lock_irqsave(&core->task_queue_lock, flags); + while (!core->queue_thread_kill && + !list_empty(&core->task_queue)) { + + can_queue = sas_ha->lldd_queue_size - core->task_queue_size; + if (can_queue >= 0) { + can_queue = core->task_queue_size; + list_splice_init(&core->task_queue, &q); + } else { + struct list_head *a, *n; + + can_queue = sas_ha->lldd_queue_size; + list_for_each_safe(a, n, &core->task_queue) { + list_move_tail(a, &q); + if (--can_queue == 0) + break; + } + can_queue = sas_ha->lldd_queue_size; + } + core->task_queue_size -= can_queue; + spin_unlock_irqrestore(&core->task_queue_lock, flags); + { + struct sas_task *task = list_entry(q.next, + struct sas_task, + list); + list_del_init(&q); + res = i->dft->lldd_execute_task(task, can_queue, + GFP_KERNEL); + if (unlikely(res)) + __list_add(&q, task->list.prev, &task->list); + } + spin_lock_irqsave(&core->task_queue_lock, flags); + if (res) { + list_splice_init(&q, &core->task_queue); /*at head*/ + core->task_queue_size += can_queue; + } + } + spin_unlock_irqrestore(&core->task_queue_lock, flags); +} + +static DECLARE_COMPLETION(queue_th_comp); + +/** + * sas_queue_thread -- The Task Collector thread + * @_sas_ha: pointer to struct sas_ha + */ +static int sas_queue_thread(void *_sas_ha) +{ + struct sas_ha_struct *sas_ha = _sas_ha; + struct scsi_core *core = &sas_ha->core; + + daemonize("sas_queue_%d", core->shost->host_no); + current->flags |= PF_NOFREEZE; + + complete(&queue_th_comp); + + while (1) { + down_interruptible(&core->queue_thread_sema); + sas_queue(sas_ha); + if (core->queue_thread_kill) + break; + } + + complete(&queue_th_comp); + + return 0; +} + +int sas_init_queue(struct sas_ha_struct *sas_ha) +{ + int res; + struct scsi_core *core = &sas_ha->core; + + spin_lock_init(&core->task_queue_lock); + core->task_queue_size = 0; + INIT_LIST_HEAD(&core->task_queue); + init_MUTEX_LOCKED(&core->queue_thread_sema); + + res = kernel_thread(sas_queue_thread, sas_ha, 0); + if (res >= 0) + wait_for_completion(&queue_th_comp); + + return res < 0 ? 
res : 0; +} + +void sas_shutdown_queue(struct sas_ha_struct *sas_ha) +{ + unsigned long flags; + struct scsi_core *core = &sas_ha->core; + struct sas_task *task, *n; + + init_completion(&queue_th_comp); + core->queue_thread_kill = 1; + up(&core->queue_thread_sema); + wait_for_completion(&queue_th_comp); + + if (!list_empty(&core->task_queue)) + SAS_DPRINTK("HA: %llx: scsi core task queue is NOT empty!?\n", + SAS_ADDR(sas_ha->sas_addr)); + + spin_lock_irqsave(&core->task_queue_lock, flags); + list_for_each_entry_safe(task, n, &core->task_queue, list) { + struct scsi_cmnd *cmd = task->uldd_task; + + list_del_init(&task->list); + + ASSIGN_SAS_TASK(cmd, NULL); + sas_free_task(task); + cmd->result = DID_ABORT << 16; + cmd->scsi_done(cmd); + } + spin_unlock_irqrestore(&core->task_queue_lock, flags); +} + +EXPORT_SYMBOL_GPL(sas_queuecommand); +EXPORT_SYMBOL_GPL(sas_target_alloc); +EXPORT_SYMBOL_GPL(sas_slave_configure); +EXPORT_SYMBOL_GPL(sas_slave_destroy); +EXPORT_SYMBOL_GPL(sas_change_queue_depth); +EXPORT_SYMBOL_GPL(sas_change_queue_type); +EXPORT_SYMBOL_GPL(sas_bios_param); diff -uprN linux-2.6.18/drivers/scsi/lpfc/lpfc_ct.c linux-2.6.18.ovz/drivers/scsi/lpfc/lpfc_ct.c --- linux-2.6.18/drivers/scsi/lpfc/lpfc_ct.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/lpfc/lpfc_ct.c 2007-06-13 06:55:07.000000000 -0400 @@ -958,8 +958,8 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, st ae = (ATTRIBUTE_ENTRY *) ((uint8_t *) rh + size); ae->ad.bits.AttrType = be16_to_cpu(OS_NAME_VERSION); sprintf(ae->un.OsNameVersion, "%s %s %s", - system_utsname.sysname, system_utsname.release, - system_utsname.version); + init_utsname()->sysname, init_utsname()->release, + init_utsname()->version); len = strlen(ae->un.OsNameVersion); len += (len & 3) ? (4 - (len & 3)) : 4; ae->ad.bits.AttrLen = be16_to_cpu(FOURBYTES + len); @@ -1077,7 +1077,7 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, st size); ae->ad.bits.AttrType = be16_to_cpu(HOST_NAME); sprintf(ae->un.HostName, "%s", - system_utsname.nodename); + init_utsname()->nodename); len = strlen(ae->un.HostName); len += (len & 3) ? 
(4 - (len & 3)) : 4; ae->ad.bits.AttrLen = @@ -1165,7 +1165,7 @@ lpfc_fdmi_tmo_handler(struct lpfc_hba *p ndlp = lpfc_findnode_did(phba, NLP_SEARCH_ALL, FDMI_DID); if (ndlp) { - if (system_utsname.nodename[0] != '\0') { + if (init_utsname()->nodename[0] != '\0') { lpfc_fdmi_cmd(phba, ndlp, SLI_MGMT_DHBA); } else { mod_timer(&phba->fc_fdmitmo, jiffies + HZ * 60); diff -uprN linux-2.6.18/drivers/scsi/sata_mv.c linux-2.6.18.ovz/drivers/scsi/sata_mv.c --- linux-2.6.18/drivers/scsi/sata_mv.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/sata_mv.c 2007-06-13 06:55:07.000000000 -0400 @@ -463,6 +463,7 @@ static const struct ata_port_operations .qc_prep = mv_qc_prep_iie, .qc_issue = mv_qc_issue, + .data_xfer = ata_mmio_data_xfer, .eng_timeout = mv_eng_timeout, diff -uprN linux-2.6.18/drivers/scsi/scsi_lib.c linux-2.6.18.ovz/drivers/scsi/scsi_lib.c --- linux-2.6.18/drivers/scsi/scsi_lib.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/scsi_lib.c 2007-06-13 06:55:07.000000000 -0400 @@ -191,6 +191,7 @@ int scsi_execute(struct scsi_device *sde goto out; req->cmd_len = COMMAND_SIZE(cmd[0]); + memset(req->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */ memcpy(req->cmd, cmd, req->cmd_len); req->sense = sense; req->sense_len = 0; @@ -408,6 +409,7 @@ int scsi_execute_async(struct scsi_devic goto free_req; req->cmd_len = cmd_len; + memset(req->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */ memcpy(req->cmd, cmd, req->cmd_len); req->sense = sioc->sense; req->sense_len = 0; diff -uprN linux-2.6.18/drivers/scsi/scsi_transport_sas.c linux-2.6.18.ovz/drivers/scsi/scsi_transport_sas.c --- linux-2.6.18/drivers/scsi/scsi_transport_sas.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/scsi/scsi_transport_sas.c 2007-06-13 06:55:08.000000000 -0400 @@ -266,9 +266,6 @@ show_sas_phy_##field(struct class_device struct sas_internal *i = to_sas_internal(shost->transportt); \ int error; \ \ - if (!phy->local_attached) \ - return -EINVAL; \ - \ error = i->f->get_linkerrors ? i->f->get_linkerrors(phy) : 0; \ if (error) \ return error; \ @@ -299,9 +296,6 @@ static ssize_t do_sas_phy_reset(struct c struct sas_internal *i = to_sas_internal(shost->transportt); int error; - if (!phy->local_attached) - return -EINVAL; - error = i->f->phy_reset(phy, hard_reset); if (error) return error; @@ -849,7 +843,7 @@ show_sas_rphy_enclosure_identifier(struc * Only devices behind an expander are supported, because the * enclosure identifier is a SMP feature. 
*/ - if (phy->local_attached) + if (scsi_is_sas_phy_local(phy)) return -EINVAL; error = i->f->get_enclosure_identifier(rphy, &identifier); @@ -870,7 +864,7 @@ show_sas_rphy_bay_identifier(struct clas struct sas_internal *i = to_sas_internal(shost->transportt); int val; - if (phy->local_attached) + if (scsi_is_sas_phy_local(phy)) return -EINVAL; val = i->f->get_bay_identifier(rphy); diff -uprN linux-2.6.18/drivers/serial/serial_core.c linux-2.6.18.ovz/drivers/serial/serial_core.c --- linux-2.6.18/drivers/serial/serial_core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/serial/serial_core.c 2007-06-13 06:55:07.000000000 -0400 @@ -1932,6 +1932,9 @@ int uart_suspend_port(struct uart_driver if (state->info && state->info->flags & UIF_INITIALIZED) { const struct uart_ops *ops = port->ops; + state->info->flags = (state->info->flags & ~UIF_INITIALIZED) + | UIF_SUSPENDED; + spin_lock_irq(&port->lock); ops->stop_tx(port); ops->set_mctrl(port, 0); @@ -1991,7 +1994,7 @@ int uart_resume_port(struct uart_driver console_start(port->cons); } - if (state->info && state->info->flags & UIF_INITIALIZED) { + if (state->info && state->info->flags & UIF_SUSPENDED) { const struct uart_ops *ops = port->ops; int ret; @@ -2003,15 +2006,17 @@ int uart_resume_port(struct uart_driver ops->set_mctrl(port, port->mctrl); ops->start_tx(port); spin_unlock_irq(&port->lock); + state->info->flags |= UIF_INITIALIZED; } else { /* * Failed to resume - maybe hardware went away? * Clear the "initialized" flag so we won't try * to call the low level drivers shutdown method. */ - state->info->flags &= ~UIF_INITIALIZED; uart_shutdown(state); } + + state->info->flags &= ~UIF_SUSPENDED; } mutex_unlock(&state->mutex); diff -uprN linux-2.6.18/drivers/serial/serial_cs.c linux-2.6.18.ovz/drivers/serial/serial_cs.c --- linux-2.6.18/drivers/serial/serial_cs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/serial/serial_cs.c 2007-06-13 06:55:07.000000000 -0400 @@ -185,14 +185,12 @@ static int serial_suspend(struct pcmcia_ static int serial_resume(struct pcmcia_device *link) { - if (pcmcia_dev_present(link)) { - struct serial_info *info = link->priv; - int i; + struct serial_info *info = link->priv; + int i; - for (i = 0; i < info->ndev; i++) - serial8250_resume_port(info->line[i]); - wakeup_card(info); - } + for (i = 0; i < info->ndev; i++) + serial8250_resume_port(info->line[i]); + wakeup_card(info); return 0; } diff -uprN linux-2.6.18/drivers/usb/class/usblp.c linux-2.6.18.ovz/drivers/usb/class/usblp.c --- linux-2.6.18/drivers/usb/class/usblp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/class/usblp.c 2007-06-13 06:55:07.000000000 -0400 @@ -701,6 +701,7 @@ static ssize_t usblp_write(struct file * usblp->wcomplete = 0; err = usb_submit_urb(usblp->writeurb, GFP_KERNEL); if (err) { + usblp->wcomplete = 1; if (err != -ENOMEM) count = -EIO; else diff -uprN linux-2.6.18/drivers/usb/core/devio.c linux-2.6.18.ovz/drivers/usb/core/devio.c --- linux-2.6.18/drivers/usb/core/devio.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/core/devio.c 2007-06-13 06:55:07.000000000 -0400 @@ -59,6 +59,9 @@ #define USB_DEVICE_MAX USB_MAXBUS * 128 static struct class *usb_device_class; +/* Mutual exclusion for removal, open, and release */ +DEFINE_MUTEX(usbfs_mutex); + struct async { struct list_head asynclist; struct dev_state *ps; @@ -541,15 +544,13 @@ static int usbdev_open(struct inode *ino struct dev_state *ps; int ret; - /* - * no locking necessary here, as chrdev_open has the 
kernel lock - * (still acquire the kernel lock for safety) - */ + /* Protect against simultaneous removal or release */ + mutex_lock(&usbfs_mutex); + ret = -ENOMEM; if (!(ps = kmalloc(sizeof(struct dev_state), GFP_KERNEL))) - goto out_nolock; + goto out; - lock_kernel(); ret = -ENOENT; /* check if we are called from a real node or usbfs */ if (imajor(inode) == USB_DEVICE_MAJOR) @@ -579,9 +580,8 @@ static int usbdev_open(struct inode *ino list_add_tail(&ps->list, &dev->filelist); file->private_data = ps; out: - unlock_kernel(); - out_nolock: - return ret; + mutex_unlock(&usbfs_mutex); + return ret; } static int usbdev_release(struct inode *inode, struct file *file) @@ -591,7 +591,12 @@ static int usbdev_release(struct inode * unsigned int ifnum; usb_lock_device(dev); + + /* Protect against simultaneous open */ + mutex_lock(&usbfs_mutex); list_del_init(&ps->list); + mutex_unlock(&usbfs_mutex); + for (ifnum = 0; ps->ifclaimed && ifnum < 8*sizeof(ps->ifclaimed); ifnum++) { if (test_bit(ifnum, &ps->ifclaimed)) @@ -600,9 +605,8 @@ static int usbdev_release(struct inode * destroy_all_async(ps); usb_unlock_device(dev); usb_put_dev(dev); - ps->dev = NULL; kfree(ps); - return 0; + return 0; } static int proc_control(struct dev_state *ps, void __user *arg) diff -uprN linux-2.6.18/drivers/usb/core/hcd.c linux-2.6.18.ovz/drivers/usb/core/hcd.c --- linux-2.6.18/drivers/usb/core/hcd.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/core/hcd.c 2007-06-13 06:55:07.000000000 -0400 @@ -317,8 +317,8 @@ static int rh_string ( // id 3 == vendor description } else if (id == 3) { - snprintf (buf, sizeof buf, "%s %s %s", system_utsname.sysname, - system_utsname.release, hcd->driver->description); + snprintf (buf, sizeof buf, "%s %s %s", init_utsname()->sysname, + init_utsname()->release, hcd->driver->description); // unsupported IDs --> "protocol stall" } else diff -uprN linux-2.6.18/drivers/usb/core/notify.c linux-2.6.18.ovz/drivers/usb/core/notify.c --- linux-2.6.18/drivers/usb/core/notify.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/core/notify.c 2007-06-13 06:55:07.000000000 -0400 @@ -50,8 +50,11 @@ void usb_notify_add_device(struct usb_de void usb_notify_remove_device(struct usb_device *udev) { + /* Protect against simultaneous usbfs open */ + mutex_lock(&usbfs_mutex); blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev); + mutex_unlock(&usbfs_mutex); } void usb_notify_add_bus(struct usb_bus *ubus) diff -uprN linux-2.6.18/drivers/usb/core/usb.h linux-2.6.18.ovz/drivers/usb/core/usb.h --- linux-2.6.18/drivers/usb/core/usb.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/core/usb.h 2007-06-13 06:55:07.000000000 -0400 @@ -59,6 +59,7 @@ static inline int is_active(struct usb_i extern const char *usbcore_name; /* usbfs stuff */ +extern struct mutex usbfs_mutex; extern struct usb_driver usbfs_driver; extern struct file_operations usbfs_devices_fops; extern struct file_operations usbfs_device_file_operations; diff -uprN linux-2.6.18/drivers/usb/gadget/ether.c linux-2.6.18.ovz/drivers/usb/gadget/ether.c --- linux-2.6.18/drivers/usb/gadget/ether.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/gadget/ether.c 2007-06-13 06:55:07.000000000 -0400 @@ -262,7 +262,7 @@ MODULE_PARM_DESC(host_addr, "Host Ethern #define DEV_CONFIG_CDC #endif -#ifdef CONFIG_USB_GADGET_MUSBHDRC +#ifdef CONFIG_USB_GADGET_MUSB_HDRC #define DEV_CONFIG_CDC #endif @@ -2257,7 +2257,7 @@ eth_bind (struct usb_gadget *gadget) return -ENODEV; } 
snprintf (manufacturer, sizeof manufacturer, "%s %s/%s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); /* If there's an RNDIS configuration, that's what Windows wants to @@ -2564,7 +2564,7 @@ static struct usb_gadget_driver eth_driv .function = (char *) driver_desc, .bind = eth_bind, - .unbind = __exit_p(eth_unbind), + .unbind = eth_unbind, .setup = eth_setup, .disconnect = eth_disconnect, diff -uprN linux-2.6.18/drivers/usb/gadget/file_storage.c linux-2.6.18.ovz/drivers/usb/gadget/file_storage.c --- linux-2.6.18/drivers/usb/gadget/file_storage.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/gadget/file_storage.c 2007-06-13 06:55:07.000000000 -0400 @@ -3982,7 +3982,7 @@ static int __init fsg_bind(struct usb_ga usb_gadget_set_selfpowered(gadget); snprintf(manufacturer, sizeof manufacturer, "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); /* On a real device, serial[] would be loaded from permanent diff -uprN linux-2.6.18/drivers/usb/gadget/serial.c linux-2.6.18.ovz/drivers/usb/gadget/serial.c --- linux-2.6.18/drivers/usb/gadget/serial.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/gadget/serial.c 2007-06-13 06:55:07.000000000 -0400 @@ -1431,7 +1431,7 @@ static int __init gs_bind(struct usb_gad return -ENOMEM; snprintf(manufacturer, sizeof(manufacturer), "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); memset(dev, 0, sizeof(struct gs_dev)); diff -uprN linux-2.6.18/drivers/usb/gadget/zero.c linux-2.6.18.ovz/drivers/usb/gadget/zero.c --- linux-2.6.18/drivers/usb/gadget/zero.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/gadget/zero.c 2007-06-13 06:55:07.000000000 -0400 @@ -1242,7 +1242,7 @@ autoconf_fail: EP_OUT_NAME, EP_IN_NAME); snprintf (manufacturer, sizeof manufacturer, "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); return 0; diff -uprN linux-2.6.18/drivers/usb/input/hid-core.c linux-2.6.18.ovz/drivers/usb/input/hid-core.c --- linux-2.6.18/drivers/usb/input/hid-core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/input/hid-core.c 2007-06-13 06:55:07.000000000 -0400 @@ -1734,10 +1734,10 @@ static const struct hid_blacklist { { USB_VENDOR_ID_APPLE, 0x020E, HID_QUIRK_POWERBOOK_HAS_FN }, { USB_VENDOR_ID_APPLE, 0x020F, HID_QUIRK_POWERBOOK_HAS_FN }, { USB_VENDOR_ID_APPLE, 0x0214, HID_QUIRK_POWERBOOK_HAS_FN }, - { USB_VENDOR_ID_APPLE, 0x0215, HID_QUIRK_POWERBOOK_HAS_FN }, + { USB_VENDOR_ID_APPLE, 0x0215, HID_QUIRK_POWERBOOK_HAS_FN | HID_QUIRK_POWERBOOK_ISO_KEYBOARD}, { USB_VENDOR_ID_APPLE, 0x0216, HID_QUIRK_POWERBOOK_HAS_FN }, { USB_VENDOR_ID_APPLE, 0x0217, HID_QUIRK_POWERBOOK_HAS_FN }, - { USB_VENDOR_ID_APPLE, 0x0218, HID_QUIRK_POWERBOOK_HAS_FN }, + { USB_VENDOR_ID_APPLE, 0x0218, HID_QUIRK_POWERBOOK_HAS_FN | HID_QUIRK_POWERBOOK_ISO_KEYBOARD}, { USB_VENDOR_ID_APPLE, 0x0219, HID_QUIRK_POWERBOOK_HAS_FN }, { USB_VENDOR_ID_APPLE, 0x030A, HID_QUIRK_POWERBOOK_HAS_FN }, { USB_VENDOR_ID_APPLE, 0x030B, HID_QUIRK_POWERBOOK_HAS_FN }, diff -uprN linux-2.6.18/drivers/usb/input/hid-input.c linux-2.6.18.ovz/drivers/usb/input/hid-input.c --- linux-2.6.18/drivers/usb/input/hid-input.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/input/hid-input.c 2007-06-13 
06:55:07.000000000 -0400 @@ -123,6 +123,12 @@ static struct hidinput_key_translation p { } }; +static struct hidinput_key_translation powerbook_iso_keyboard[] = { + { KEY_GRAVE, KEY_102ND }, + { KEY_102ND, KEY_GRAVE }, + { } +}; + static int usbhid_pb_fnmode = 1; module_param_named(pb_fnmode, usbhid_pb_fnmode, int, 0644); MODULE_PARM_DESC(pb_fnmode, @@ -197,6 +203,14 @@ static int hidinput_pb_event(struct hid_ } } + if (hid->quirks & HID_QUIRK_POWERBOOK_ISO_KEYBOARD) { + trans = find_translation(powerbook_iso_keyboard, usage->code); + if (trans) { + input_event(input, usage->type, trans->to, value); + return 1; + } + } + return 0; } @@ -212,6 +226,9 @@ static void hidinput_pb_setup(struct inp for (trans = powerbook_numlock_keys; trans->from; trans++) set_bit(trans->to, input->keybit); + + for (trans = powerbook_iso_keyboard; trans->from; trans++) + set_bit(trans->to, input->keybit); } #else static inline int hidinput_pb_event(struct hid_device *hid, struct input_dev *input, diff -uprN linux-2.6.18/drivers/usb/input/hid.h linux-2.6.18.ovz/drivers/usb/input/hid.h --- linux-2.6.18/drivers/usb/input/hid.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/input/hid.h 2007-06-13 06:55:07.000000000 -0400 @@ -260,6 +260,7 @@ struct hid_item { #define HID_QUIRK_POWERBOOK_HAS_FN 0x00001000 #define HID_QUIRK_POWERBOOK_FN_ON 0x00002000 #define HID_QUIRK_INVERT_HWHEEL 0x00004000 +#define HID_QUIRK_POWERBOOK_ISO_KEYBOARD 0x00010000 /* * This is the global environment of the parser. This information is diff -uprN linux-2.6.18/drivers/usb/input/usbtouchscreen.c linux-2.6.18.ovz/drivers/usb/input/usbtouchscreen.c --- linux-2.6.18/drivers/usb/input/usbtouchscreen.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/usb/input/usbtouchscreen.c 2007-06-13 06:55:07.000000000 -0400 @@ -522,7 +522,7 @@ static int usbtouch_probe(struct usb_int type->max_press, 0, 0); usb_fill_int_urb(usbtouch->irq, usbtouch->udev, - usb_rcvintpipe(usbtouch->udev, 0x81), + usb_rcvintpipe(usbtouch->udev, endpoint->bEndpointAddress), usbtouch->data, type->rept_size, usbtouch_irq, usbtouch, endpoint->bInterval); diff -uprN linux-2.6.18/drivers/video/fbmem.c linux-2.6.18.ovz/drivers/video/fbmem.c --- linux-2.6.18/drivers/video/fbmem.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/fbmem.c 2007-06-13 06:55:07.000000000 -0400 @@ -554,7 +554,8 @@ static int fbmem_read_proc(char *buf, ch int clen; clen = 0; - for (fi = registered_fb; fi < ®istered_fb[FB_MAX] && len < 4000; fi++) + for (fi = registered_fb; fi < ®istered_fb[FB_MAX] && clen < 4000; + fi++) if (*fi) clen += sprintf(buf + clen, "%d %s\n", (*fi)->node, diff -uprN linux-2.6.18/drivers/video/fbsysfs.c linux-2.6.18.ovz/drivers/video/fbsysfs.c --- linux-2.6.18/drivers/video/fbsysfs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/fbsysfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -397,6 +397,12 @@ static ssize_t store_bl_curve(struct cla u8 tmp_curve[FB_BACKLIGHT_LEVELS]; unsigned int i; + /* Some drivers don't use framebuffer_alloc(), but those also + * don't have backlights. + */ + if (!fb_info || !fb_info->bl_dev) + return -ENODEV; + if (count != (FB_BACKLIGHT_LEVELS / 8 * 24)) return -EINVAL; @@ -430,6 +436,12 @@ static ssize_t show_bl_curve(struct clas ssize_t len = 0; unsigned int i; + /* Some drivers don't use framebuffer_alloc(), but those also + * don't have backlights. 
+ */ + if (!fb_info || !fb_info->bl_dev) + return -ENODEV; + mutex_lock(&fb_info->bl_mutex); for (i = 0; i < FB_BACKLIGHT_LEVELS; i += 8) len += snprintf(&buf[len], PAGE_SIZE, diff -uprN linux-2.6.18/drivers/video/nvidia/nv_hw.c linux-2.6.18.ovz/drivers/video/nvidia/nv_hw.c --- linux-2.6.18/drivers/video/nvidia/nv_hw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/nvidia/nv_hw.c 2007-06-13 06:55:07.000000000 -0400 @@ -145,12 +145,18 @@ static void nvGetClocks(struct nvidia_pa if (par->Architecture >= NV_ARCH_40) { pll = NV_RD32(par->PMC, 0x4020); - P = (pll >> 16) & 0x03; + P = (pll >> 16) & 0x07; pll = NV_RD32(par->PMC, 0x4024); M = pll & 0xFF; N = (pll >> 8) & 0xFF; - MB = (pll >> 16) & 0xFF; - NB = (pll >> 24) & 0xFF; + if (((par->Chipset & 0xfff0) == 0x0290) || + ((par->Chipset & 0xfff0) == 0x0390)) { + MB = 1; + NB = 1; + } else { + MB = (pll >> 16) & 0xFF; + NB = (pll >> 24) & 0xFF; + } *MClk = ((N * NB * par->CrystalFreqKHz) / (M * MB)) >> P; pll = NV_RD32(par->PMC, 0x4000); diff -uprN linux-2.6.18/drivers/video/nvidia/nv_setup.c linux-2.6.18.ovz/drivers/video/nvidia/nv_setup.c --- linux-2.6.18/drivers/video/nvidia/nv_setup.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/nvidia/nv_setup.c 2007-06-13 06:55:07.000000000 -0400 @@ -359,6 +359,7 @@ int NVCommonSetup(struct fb_info *info) case 0x0186: case 0x0187: case 0x018D: + case 0x0228: case 0x0286: case 0x028C: case 0x0316: @@ -382,6 +383,10 @@ int NVCommonSetup(struct fb_info *info) case 0x034C: case 0x0160: case 0x0166: + case 0x0169: + case 0x016B: + case 0x016C: + case 0x016D: case 0x00C8: case 0x00CC: case 0x0144: @@ -639,12 +644,23 @@ int NVCommonSetup(struct fb_info *info) par->fpHeight = NV_RD32(par->PRAMDAC, 0x0800) + 1; par->fpSyncs = NV_RD32(par->PRAMDAC, 0x0848) & 0x30000033; - printk("Panel size is %i x %i\n", par->fpWidth, par->fpHeight); + printk("nvidiafb: Panel size is %i x %i\n", par->fpWidth, par->fpHeight); } if (monA) info->monspecs = *monA; + if (!par->FlatPanel || !par->twoHeads) + par->FPDither = 0; + + par->LVDS = 0; + if (par->FlatPanel && par->twoHeads) { + NV_WR32(par->PRAMDAC0, 0x08B0, 0x00010004); + if (par->PRAMDAC0[0x08b4] & 1) + par->LVDS = 1; + printk("nvidiafb: Panel is %s\n", par->LVDS ? 
"LVDS" : "TMDS"); + } + kfree(edidA); kfree(edidB); done: diff -uprN linux-2.6.18/drivers/video/nvidia/nv_type.h linux-2.6.18.ovz/drivers/video/nvidia/nv_type.h --- linux-2.6.18/drivers/video/nvidia/nv_type.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/nvidia/nv_type.h 2007-06-13 06:55:07.000000000 -0400 @@ -129,6 +129,7 @@ struct nvidia_par { int fpHeight; int PanelTweak; int paneltweak; + int LVDS; int pm_state; u32 crtcSync_read; u32 fpSyncs; diff -uprN linux-2.6.18/drivers/video/nvidia/nvidia.c linux-2.6.18.ovz/drivers/video/nvidia/nvidia.c --- linux-2.6.18/drivers/video/nvidia/nvidia.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/drivers/video/nvidia/nvidia.c 2007-06-13 06:55:07.000000000 -0400 @@ -1145,20 +1145,20 @@ static u32 __devinit nvidia_get_arch(str case 0x0340: /* GeForceFX 5700 */ arch = NV_ARCH_30; break; - case 0x0040: - case 0x00C0: - case 0x0120: + case 0x0040: /* GeForce 6800 */ + case 0x00C0: /* GeForce 6800 */ + case 0x0120: /* GeForce 6800 */ case 0x0130: - case 0x0140: - case 0x0160: - case 0x01D0: - case 0x0090: - case 0x0210: - case 0x0220: + case 0x0140: /* GeForce 6600 */ + case 0x0160: /* GeForce 6200 */ + case 0x01D0: /* GeForce 7200, 7300, 7400 */ + case 0x0090: /* GeForce 7800 */ + case 0x0210: /* GeForce 6800 */ + case 0x0220: /* GeForce 6200 */ case 0x0230: - case 0x0240: - case 0x0290: - case 0x0390: + case 0x0240: /* GeForce 6100 */ + case 0x0290: /* GeForce 7900 */ + case 0x0390: /* GeForce 7600 */ arch = NV_ARCH_40; break; case 0x0020: /* TNT, TNT2 */ diff -uprN linux-2.6.18/fs/Kconfig linux-2.6.18.ovz/fs/Kconfig --- linux-2.6.18/fs/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/Kconfig 2007-06-13 06:55:07.000000000 -0400 @@ -440,6 +440,15 @@ config QUOTA with the quota tools. Probably the quota support is only useful for multi user systems. If unsure, say N. +config QUOTA_COMPAT + bool "Compatibility with older quotactl interface" + depends on QUOTA + help + This option enables compatibility layer for older version + of quotactl interface with byte granularity (QUOTAON at 0x0100, + GETQUOTA at 0x0D00). Interface versions older than that one and + with block granularity are still not supported. + config QFMT_V1 tristate "Old quota format support" depends on QUOTA @@ -455,6 +464,39 @@ config QFMT_V2 This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. +config SIM_FS + tristate "VPS filesystem" + depends on VZ_QUOTA + default m + help + This file system is a part of Virtuozzo. It intoduces a fake + superblock and blockdev to VE to hide real device and show + statfs results taken from quota. + +config VZ_QUOTA + tristate "Virtuozzo Disk Quota support" + depends on QUOTA + select VZ_DEV + default m + help + Virtuozzo Disk Quota imposes disk quota on directories with their + files and subdirectories in total. Such disk quota is used to + account and limit disk usage by Virtuozzo VPS, but also may be used + separately. + +config VZ_QUOTA_UNLOAD + bool "Unloadable Virtuozzo Disk Quota module" + depends on VZ_QUOTA=m + default n + help + Make Virtuozzo Disk Quota module unloadable. + Doesn't work reliably now. 
+ +config VZ_QUOTA_UGID + bool "Per-user and per-group quota in Virtuozzo quota partitions" + depends on VZ_QUOTA!=n + default y + config QUOTACTL bool depends on XFS_QUOTA || QUOTA diff -uprN linux-2.6.18/fs/Makefile linux-2.6.18.ovz/fs/Makefile --- linux-2.6.18/fs/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/Makefile 2007-06-13 06:55:07.000000000 -0400 @@ -40,9 +40,15 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_VZ_QUOTA) += vzdquota.o +vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o obj-$(CONFIG_DNOTIFY) += dnotify.o +obj-$(CONFIG_SIM_FS) += simfs.o + obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ diff -uprN linux-2.6.18/fs/aio.c linux-2.6.18.ovz/fs/aio.c --- linux-2.6.18/fs/aio.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/aio.c 2007-06-13 06:55:07.000000000 -0400 @@ -41,13 +41,16 @@ #endif /*------ sysctl variables----*/ -static DEFINE_SPINLOCK(aio_nr_lock); +DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +EXPORT_SYMBOL_GPL(aio_nr_lock); +EXPORT_SYMBOL_GPL(aio_nr); /*----end sysctl variables---*/ static kmem_cache_t *kiocb_cachep; -static kmem_cache_t *kioctx_cachep; +kmem_cache_t *kioctx_cachep; +EXPORT_SYMBOL_GPL(kioctx_cachep); static struct workqueue_struct *aio_wq; @@ -58,7 +61,7 @@ static DECLARE_WORK(fput_work, aio_fput_ static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); -static void aio_kick_handler(void *); +void aio_kick_handler(void *); static void aio_queue_work(struct kioctx *); /* aio_setup @@ -292,7 +295,7 @@ static void aio_cancel_all(struct kioctx spin_unlock_irq(&ctx->ctx_lock); } -static void wait_for_all_aios(struct kioctx *ctx) +void wait_for_all_aios(struct kioctx *ctx) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -309,6 +312,7 @@ static void wait_for_all_aios(struct kio __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&ctx->wait, &wait); } +EXPORT_SYMBOL_GPL(wait_for_all_aios); /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. @@ -855,7 +859,7 @@ static inline void aio_run_all_iocbs(str * space. * Run on aiod's context. */ -static void aio_kick_handler(void *data) +void aio_kick_handler(void *data) { struct kioctx *ctx = data; mm_segment_t oldfs = get_fs(); @@ -874,6 +878,7 @@ static void aio_kick_handler(void *data) if (requeue) queue_work(aio_wq, &ctx->wq); } +EXPORT_SYMBOL_GPL(aio_kick_handler); /* diff -uprN linux-2.6.18/fs/autofs/autofs_i.h linux-2.6.18.ovz/fs/autofs/autofs_i.h --- linux-2.6.18/fs/autofs/autofs_i.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs/autofs_i.h 2007-06-13 06:55:07.000000000 -0400 @@ -124,7 +124,7 @@ static inline struct autofs_sb_info *aut filesystem without "magic".) 
*/ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || process_group(current) == sbi->oz_pgrp; + return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; } /* Hash operations */ diff -uprN linux-2.6.18/fs/autofs/init.c linux-2.6.18.ovz/fs/autofs/init.c --- linux-2.6.18/fs/autofs/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs/init.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = kill_anon_super, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs_fs(void) diff -uprN linux-2.6.18/fs/autofs/inode.c linux-2.6.18.ovz/fs/autofs/inode.c --- linux-2.6.18/fs/autofs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -66,7 +66,7 @@ static int parse_options(char *options, *uid = current->uid; *gid = current->gid; - *pgrp = process_group(current); + *pgrp = virt_pgid(current); *minproto = *maxproto = AUTOFS_PROTO_VERSION; @@ -138,7 +138,7 @@ int autofs_fill_super(struct super_block sbi->magic = AUTOFS_SBI_MAGIC; sbi->catatonic = 0; sbi->exp_timeout = 0; - sbi->oz_pgrp = process_group(current); + sbi->oz_pgrp = virt_pgid(current); autofs_initialize_hash(&sbi->dirhash); sbi->queues = NULL; memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN); diff -uprN linux-2.6.18/fs/autofs/root.c linux-2.6.18.ovz/fs/autofs/root.c --- linux-2.6.18/fs/autofs/root.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs/root.c 2007-06-13 06:55:07.000000000 -0400 @@ -354,7 +354,7 @@ static int autofs_root_unlink(struct ino /* This allows root to remove symlinks */ lock_kernel(); - if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) { + if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) { unlock_kernel(); return -EACCES; } @@ -541,7 +541,7 @@ static int autofs_root_ioctl(struct inod _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) return -ENOTTY; - if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) + if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) return -EPERM; switch(cmd) { diff -uprN linux-2.6.18/fs/autofs4/autofs_i.h linux-2.6.18.ovz/fs/autofs4/autofs_i.h --- linux-2.6.18/fs/autofs4/autofs_i.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs4/autofs_i.h 2007-06-13 06:55:07.000000000 -0400 @@ -130,7 +130,7 @@ static inline struct autofs_info *autofs filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || process_group(current) == sbi->oz_pgrp; + return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? 
*/ diff -uprN linux-2.6.18/fs/autofs4/init.c linux-2.6.18.ovz/fs/autofs4/init.c --- linux-2.6.18/fs/autofs4/init.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs4/init.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = kill_anon_super, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs4_fs(void) diff -uprN linux-2.6.18/fs/autofs4/inode.c linux-2.6.18.ovz/fs/autofs4/inode.c --- linux-2.6.18/fs/autofs4/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs4/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -219,7 +219,7 @@ static int parse_options(char *options, *uid = current->uid; *gid = current->gid; - *pgrp = process_group(current); + *pgrp = virt_pgid(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -318,7 +318,7 @@ int autofs4_fill_super(struct super_bloc sbi->pipefd = -1; sbi->catatonic = 0; sbi->exp_timeout = 0; - sbi->oz_pgrp = process_group(current); + sbi->oz_pgrp = virt_pgid(current); sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; diff -uprN linux-2.6.18/fs/autofs4/root.c linux-2.6.18.ovz/fs/autofs4/root.c --- linux-2.6.18/fs/autofs4/root.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/autofs4/root.c 2007-06-13 06:55:07.000000000 -0400 @@ -605,7 +605,7 @@ static int autofs4_dir_unlink(struct ino struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) + if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) return -EACCES; if (atomic_dec_and_test(&ino->count)) { @@ -812,7 +812,7 @@ static int autofs4_root_ioctl(struct ino _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) return -ENOTTY; - if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) + if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) return -EPERM; switch(cmd) { diff -uprN linux-2.6.18/fs/bad_inode.c linux-2.6.18.ovz/fs/bad_inode.c --- linux-2.6.18/fs/bad_inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/bad_inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,61 +14,319 @@ #include #include #include +#include -static int return_EIO(void) +static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) { return -EIO; } -#define EIO_ERROR ((void *) (return_EIO)) +static ssize_t bad_file_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_read(struct kiocb *iocb, char __user *buf, + size_t size, loff_t pos) +{ + return -EIO; +} + +static ssize_t bad_file_write(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t size, loff_t pos) +{ + return -EIO; +} + +static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + return -EIO; +} + +static unsigned int bad_file_poll(struct file *filp, poll_table *wait) +{ + return POLLERR; +} + +static int bad_file_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + return -EIO; +} + +static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + return -EIO; +} + +static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EIO; +} + +static int bad_file_mmap(struct file 
*file, struct vm_area_struct *vma) +{ + return -EIO; +} + +static int bad_file_open(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_flush(struct file *file, fl_owner_t id) +{ + return -EIO; +} + +static int bad_file_release(struct inode *inode, struct file *filp) +{ + return -EIO; +} + +static int bad_file_fsync(struct file *file, struct dentry *dentry, + int datasync) +{ + return -EIO; +} + +static int bad_file_aio_fsync(struct kiocb *iocb, int datasync) +{ + return -EIO; +} + +static int bad_file_fasync(int fd, struct file *filp, int on) +{ + return -EIO; +} + +static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_readv(struct file *filp, const struct iovec *iov, + unsigned long vlen, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_writev(struct file *filp, const struct iovec *iov, + unsigned long vlen, loff_t *ppos) +{ + return -EIO; +} + +static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos, + size_t count, read_actor_t actor, void *target) +{ + return -EIO; +} + +static ssize_t bad_file_sendpage(struct file *file, struct page *page, + int off, size_t len, loff_t *pos, int more) +{ + return -EIO; +} + +static unsigned long bad_file_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return -EIO; +} + +static int bad_file_check_flags(int flags) +{ + return -EIO; +} + +static int bad_file_dir_notify(struct file *file, unsigned long arg) +{ + return -EIO; +} + +static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + return -EIO; +} + +static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags) +{ + return -EIO; +} + +static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + return -EIO; +} static const struct file_operations bad_file_ops = { - .llseek = EIO_ERROR, - .aio_read = EIO_ERROR, - .read = EIO_ERROR, - .write = EIO_ERROR, - .aio_write = EIO_ERROR, - .readdir = EIO_ERROR, - .poll = EIO_ERROR, - .ioctl = EIO_ERROR, - .mmap = EIO_ERROR, - .open = EIO_ERROR, - .flush = EIO_ERROR, - .release = EIO_ERROR, - .fsync = EIO_ERROR, - .aio_fsync = EIO_ERROR, - .fasync = EIO_ERROR, - .lock = EIO_ERROR, - .readv = EIO_ERROR, - .writev = EIO_ERROR, - .sendfile = EIO_ERROR, - .sendpage = EIO_ERROR, - .get_unmapped_area = EIO_ERROR, + .llseek = bad_file_llseek, + .read = bad_file_read, + .aio_read = bad_file_aio_read, + .write = bad_file_write, + .aio_write = bad_file_aio_write, + .readdir = bad_file_readdir, + .poll = bad_file_poll, + .ioctl = bad_file_ioctl, + .unlocked_ioctl = bad_file_unlocked_ioctl, + .compat_ioctl = bad_file_compat_ioctl, + .mmap = bad_file_mmap, + .open = bad_file_open, + .flush = bad_file_flush, + .release = bad_file_release, + .fsync = bad_file_fsync, + .aio_fsync = bad_file_aio_fsync, + .fasync = bad_file_fasync, + .lock = bad_file_lock, + .readv = bad_file_readv, + .writev = bad_file_writev, + .sendfile = bad_file_sendfile, + .sendpage = bad_file_sendpage, + .get_unmapped_area = bad_file_get_unmapped_area, + .check_flags = bad_file_check_flags, + .dir_notify = bad_file_dir_notify, + .flock = bad_file_flock, + .splice_write = bad_file_splice_write, + .splice_read = bad_file_splice_read, }; +static int bad_inode_create (struct inode *dir, struct dentry *dentry, + int mode, struct nameidata 
*nd) +{ + return -EIO; +} + +static struct dentry *bad_inode_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return ERR_PTR(-EIO); +} + +static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, + const char *symname) +{ + return -EIO; +} + +static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, + int mode) +{ + return -EIO; +} + +static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) +{ + return -EIO; +} + +static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + return -EIO; +} + +static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + return -EIO; +} + +static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + return -EIO; +} + +static int bad_inode_permission(struct inode *inode, int mask, + struct nameidata *nd) +{ + return -EIO; +} + +static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +{ + return -EIO; +} + +static int bad_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + return -EIO; +} + +static int bad_inode_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EIO; +} + +static ssize_t bad_inode_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + return -EIO; +} + +static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EIO; +} + +static int bad_inode_removexattr(struct dentry *dentry, const char *name) +{ + return -EIO; +} + static struct inode_operations bad_inode_ops = { - .create = EIO_ERROR, - .lookup = EIO_ERROR, - .link = EIO_ERROR, - .unlink = EIO_ERROR, - .symlink = EIO_ERROR, - .mkdir = EIO_ERROR, - .rmdir = EIO_ERROR, - .mknod = EIO_ERROR, - .rename = EIO_ERROR, - .readlink = EIO_ERROR, + .create = bad_inode_create, + .lookup = bad_inode_lookup, + .link = bad_inode_link, + .unlink = bad_inode_unlink, + .symlink = bad_inode_symlink, + .mkdir = bad_inode_mkdir, + .rmdir = bad_inode_rmdir, + .mknod = bad_inode_mknod, + .rename = bad_inode_rename, + .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ - .truncate = EIO_ERROR, - .permission = EIO_ERROR, - .getattr = EIO_ERROR, - .setattr = EIO_ERROR, - .setxattr = EIO_ERROR, - .getxattr = EIO_ERROR, - .listxattr = EIO_ERROR, - .removexattr = EIO_ERROR, + /* put_link returns void */ + /* truncate returns void */ + .permission = bad_inode_permission, + .getattr = bad_inode_getattr, + .setattr = bad_inode_setattr, + .setxattr = bad_inode_setxattr, + .getxattr = bad_inode_getxattr, + .listxattr = bad_inode_listxattr, + .removexattr = bad_inode_removexattr, }; diff -uprN linux-2.6.18/fs/binfmt_aout.c linux-2.6.18.ovz/fs/binfmt_aout.c --- linux-2.6.18/fs/binfmt_aout.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/binfmt_aout.c 2007-06-13 06:55:07.000000000 -0400 @@ -446,9 +446,11 @@ beyond_if: #endif start_thread(regs, ex.a_entry, current->mm->start_stack); if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) + if (current->ptrace & PT_TRACE_EXEC) { + set_pn_state(current, PN_STOP_EXEC); 
ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else + clear_pn_state(current); + } else send_sig(SIGTRAP, current, 0); } return 0; diff -uprN linux-2.6.18/fs/binfmt_elf.c linux-2.6.18.ovz/fs/binfmt_elf.c --- linux-2.6.18/fs/binfmt_elf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/binfmt_elf.c 2007-06-13 06:55:07.000000000 -0400 @@ -361,7 +361,7 @@ static unsigned long load_elf_interp(str eppnt = elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { - int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO; int elf_prot = 0; unsigned long vaddr = 0; unsigned long k, map_addr; @@ -683,6 +683,15 @@ static int load_elf_binary(struct linux_ retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) goto out_free_interp; + + /* + * If the binary is not readable, then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. + */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); if (retval != BINPRM_BUF_SIZE) { @@ -846,7 +855,8 @@ static int load_elf_binary(struct linux_ if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_EXECPRIO; vaddr = elf_ppnt->p_vaddr; if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { @@ -863,6 +873,8 @@ static int load_elf_binary(struct linux_ elf_prot, elf_flags); if (BAD_ADDR(error)) { send_sig(SIGKILL, current, 0); + retval = IS_ERR((void *)error) ? + PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } @@ -892,6 +904,7 @@ static int load_elf_binary(struct linux_ TASK_SIZE - elf_ppnt->p_memsz < k) { /* set_brk can never work. Avoid overflows. */ send_sig(SIGKILL, current, 0); + retval = -EINVAL; goto out_free_dentry; } @@ -1016,9 +1029,11 @@ static int load_elf_binary(struct linux_ start_thread(regs, elf_entry, bprm->p); if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) + if (current->ptrace & PT_TRACE_EXEC) { + set_pn_state(current, PN_STOP_EXEC); ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else + clear_pn_state(current); + } else send_sig(SIGTRAP, current, 0); } retval = 0; @@ -1037,10 +1052,8 @@ out_free_interp: out_free_file: sys_close(elf_exec_fileno); out_free_fh: - if (files) { - put_files_struct(current->files); - current->files = files; - } + if (files) + reset_files_struct(current, files); out_free_ph: kfree(elf_phdata); goto out; @@ -1295,10 +1308,10 @@ static void fill_prstatus(struct elf_prs prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; - prstatus->pr_pid = p->pid; - prstatus->pr_ppid = p->parent->pid; - prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->signal->session; + prstatus->pr_pid = virt_pid(p); + prstatus->pr_ppid = virt_pid(p->parent); + prstatus->pr_pgrp = virt_pgid(p); + prstatus->pr_sid = virt_sid(p); if (thread_group_leader(p)) { /* * This is the record for the group leader.
Add in the @@ -1341,10 +1354,10 @@ static int fill_psinfo(struct elf_prpsin psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; - psinfo->pr_pid = p->pid; - psinfo->pr_ppid = p->parent->pid; - psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->signal->session; + psinfo->pr_pid = virt_pid(p); + psinfo->pr_ppid = virt_pid(p->parent); + psinfo->pr_pgrp = virt_pgid(p); + psinfo->pr_sid = virt_sid(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; @@ -1481,7 +1494,7 @@ static int elf_core_dump(long signr, str if (signr) { struct elf_thread_status *tmp; read_lock(&tasklist_lock); - do_each_thread(g,p) + do_each_thread_ve(g,p) if (current->mm == p->mm && current != p) { tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (!tmp) { @@ -1492,7 +1505,7 @@ static int elf_core_dump(long signr, str tmp->thread = p; list_add(&tmp->list, &thread_list); } - while_each_thread(g,p); + while_each_thread_ve(g,p); read_unlock(&tasklist_lock); list_for_each(t, &thread_list) { struct elf_thread_status *tmp; diff -uprN linux-2.6.18/fs/binfmt_elf_fdpic.c linux-2.6.18.ovz/fs/binfmt_elf_fdpic.c --- linux-2.6.18/fs/binfmt_elf_fdpic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/binfmt_elf_fdpic.c 2007-06-13 06:55:07.000000000 -0400 @@ -237,6 +237,14 @@ static int load_elf_fdpic_binary(struct goto error; } + /* + * If the binary is not readable, then enforce + * mm->dumpable = 0 regardless of the interpreter's + * permissions. + */ + if (file_permission(interpreter, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); if (retval < 0) diff -uprN linux-2.6.18/fs/binfmt_misc.c linux-2.6.18.ovz/fs/binfmt_misc.c --- linux-2.6.18/fs/binfmt_misc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/binfmt_misc.c 2007-06-13 06:55:07.000000000 -0400 @@ -215,10 +215,8 @@ _error: bprm->interp_flags = 0; bprm->interp_data = 0; _unshare: - if (files) { - put_files_struct(current->files); - current->files = files; - } + if (files) + reset_files_struct(current, files); goto _ret; } diff -uprN linux-2.6.18/fs/block_dev.c linux-2.6.18.ovz/fs/block_dev.c --- linux-2.6.18/fs/block_dev.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/block_dev.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,7 @@ #include #include #include +#include #include struct bdev_inode { @@ -947,9 +948,15 @@ do_open(struct block_device *bdev, struc { struct module *owner = NULL; struct gendisk *disk; - int ret = -ENXIO; + int ret; int part; + ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev, + file->f_mode & (FMODE_READ | FMODE_WRITE)); + if (ret) + return ret; + + ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); disk = get_gendisk(bdev->bd_dev, &part); @@ -1215,7 +1222,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); * namespace if possible and return it. Return ERR_PTR(error) * otherwise.
*/ -struct block_device *lookup_bdev(const char *path) +struct block_device *lookup_bdev(const char *path, int mode) { struct block_device *bdev; struct inode *inode; @@ -1233,6 +1240,11 @@ struct block_device *lookup_bdev(const c error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; + + error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode); + if (error) + goto fail; + error = -EACCES; if (nd.mnt->mnt_flags & MNT_NODEV) goto fail; @@ -1264,12 +1276,13 @@ struct block_device *open_bdev_excl(cons mode_t mode = FMODE_READ; int error = 0; - bdev = lookup_bdev(path); + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = lookup_bdev(path, mode); if (IS_ERR(bdev)) return bdev; - if (!(flags & MS_RDONLY)) - mode |= FMODE_WRITE; error = blkdev_get(bdev, mode, 0); if (error) return ERR_PTR(error); diff -uprN linux-2.6.18/fs/buffer.c linux-2.6.18.ovz/fs/buffer.c --- linux-2.6.18/fs/buffer.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/buffer.c 2007-06-13 06:55:07.000000000 -0400 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,8 @@ #include #include +#include + static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void invalidate_bh_lrus(void); @@ -77,6 +80,7 @@ EXPORT_SYMBOL(__lock_buffer); void fastcall unlock_buffer(struct buffer_head *bh) { + smp_mb__before_clear_bit(); clear_buffer_locked(bh); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); @@ -280,7 +284,14 @@ static void do_sync(unsigned long wait) asmlinkage long sys_sync(void) { + struct user_beancounter *ub; + + ub = get_exec_ub(); + ub_percpu_inc(ub, sync); + do_sync(1); + + ub_percpu_inc(ub, sync_done); return 0; } @@ -323,6 +334,7 @@ long do_fsync(struct file *file, int dat int ret; int err; struct address_space *mapping = file->f_mapping; + struct user_beancounter *ub; if (!file->f_op || !file->f_op->fsync) { /* Why? We can still call filemap_fdatawrite */ @@ -330,6 +342,12 @@ long do_fsync(struct file *file, int dat goto out; } + ub = get_exec_ub(); + if (datasync) + ub_percpu_inc(ub, fdsync); + else + ub_percpu_inc(ub, fsync); + ret = filemap_fdatawrite(mapping); /* @@ -344,6 +362,11 @@ long do_fsync(struct file *file, int dat err = filemap_fdatawait(mapping); if (!ret) ret = err; + + if (datasync) + ub_percpu_inc(ub, fdsync_done); + else + ub_percpu_inc(ub, fsync_done); out: return ret; } @@ -838,7 +861,11 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); */ int __set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + int acct; + struct address_space * const mapping = page_mapping(page); + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -854,15 +881,20 @@ int __set_page_dirty_buffers(struct page if (!TestSetPageDirty(page)) { write_lock_irq(&mapping->tree_lock); + acct = 0; if (page->mapping) { /* Race with truncate? 
*/ - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + acct = 1; + } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } write_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); return 1; } return 0; @@ -1176,8 +1208,21 @@ grow_buffers(struct block_device *bdev, } while ((size << sizebits) < PAGE_SIZE); index = block >> sizebits; - block = index << sizebits; + /* + * Check for a block which wants to lie outside our maximum possible + * pagecache index. (this comparison is done using sector_t types). + */ + if (unlikely(index != block >> sizebits)) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "%s: requested out-of-range block %llu for " + "device %s\n", + __FUNCTION__, (unsigned long long)block, + bdevname(bdev, b)); + return -EIO; + } + block = index << sizebits; /* Create a page with the proper size buffers.. */ page = grow_dev_page(bdev, block, index, size); if (!page) @@ -1204,12 +1249,16 @@ __getblk_slow(struct block_device *bdev, for (;;) { struct buffer_head * bh; + int ret; bh = __find_get_block(bdev, block, size); if (bh) return bh; - if (!grow_buffers(bdev, block, size)) + ret = grow_buffers(bdev, block, size); + if (ret < 0) + return NULL; + if (ret == 0) free_more_memory(); } } @@ -2860,6 +2909,9 @@ void ll_rw_block(int rw, int nr, struct { int i; + if (likely(nr) && !(rw & WRITE)) + task_io_account_read(nr * bhs[0]->b_size); + for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; @@ -2995,8 +3047,13 @@ int try_to_free_buffers(struct page *pag * could encounter a non-uptodate page, which is unresolvable. * This only applies in the rare case where try_to_free_buffers * succeeds but the page is not freed. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. */ - clear_page_dirty(page); + if (test_clear_page_dirty(page)) + task_io_account_cancelled_write(PAGE_CACHE_SIZE); } spin_unlock(&mapping->private_lock); out: diff -uprN linux-2.6.18/fs/char_dev.c linux-2.6.18.ovz/fs/char_dev.c --- linux-2.6.18/fs/char_dev.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/char_dev.c 2007-06-13 06:55:07.000000000 -0400 @@ -20,6 +20,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -296,6 +298,11 @@ int chrdev_open(struct inode * inode, st struct cdev *new = NULL; int ret = 0; + ret = get_device_perms_ve(S_IFCHR, inode->i_rdev, + filp->f_mode & (FMODE_READ | FMODE_WRITE)); + if (ret) + return ret; + spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { diff -uprN linux-2.6.18/fs/cifs/CHANGES linux-2.6.18.ovz/fs/cifs/CHANGES --- linux-2.6.18/fs/cifs/CHANGES 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/cifs/CHANGES 2007-06-13 06:55:07.000000000 -0400 @@ -6,7 +6,11 @@ on requests on other threads. Improve P (lock cancel now works, and unlock of merged range works even to Windows servers now). Fix oops on mount to lanman servers (win9x, os/2 etc.) when null password. Do not send listxattr -(SMB to query all EAs) if nouser_xattr specified. +(SMB to query all EAs) if nouser_xattr specified. Return error +in rename 2nd attempt retry (ie report if rename by handle also +fails, after rename by path fails, we were not reporting whether +the retry worked or not). 
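The last CHANGES item corresponds to the cifs_rename() hunk further below: when rename-by-path fails and the code retries through an open file handle, the retry's return code used to be dropped. A self-contained sketch of that control flow (the two stubs only stand in for the real CIFSSMBOpen/CIFSSMBRenameOpenFile helpers; they are not the cifs functions):

#include <stdio.h>
#include <errno.h>

static int open_target(void)      { return 0; }		/* open succeeds */
static int rename_by_handle(void) { return -EACCES; }	/* retry fails */

int main(void)
{
	int rc = open_target();

	if (rc == 0) {
		/* Before the fix the next call's result was discarded,
		 * so rc stayed 0 and a failed retry looked like success. */
		rc = rename_by_handle();
	}
	printf("reported rc = %d\n", rc);
	return 0;
}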
+ Version 1.44 ------------ diff -uprN linux-2.6.18/fs/cifs/connect.c linux-2.6.18.ovz/fs/cifs/connect.c --- linux-2.6.18/fs/cifs/connect.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/cifs/connect.c 2007-06-13 06:55:07.000000000 -0400 @@ -771,12 +771,12 @@ cifs_parse_mount_options(char *options, separator[1] = 0; memset(vol->source_rfc1001_name,0x20,15); - for(i=0;i < strnlen(system_utsname.nodename,15);i++) { + for(i = 0; i < strnlen(utsname()->nodename, 15); i++) { /* does not have to be a perfect mapping since the field is informational, only used for servers that do not support port 445 and it can be overridden at mount time */ vol->source_rfc1001_name[i] = - toupper(system_utsname.nodename[i]); + toupper(utsname()->nodename[i]); } vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name @@ -2111,7 +2111,7 @@ CIFSSessSetup(unsigned int xid, struct c 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; @@ -2138,8 +2138,8 @@ CIFSSessSetup(unsigned int xid, struct c } strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; } @@ -2403,7 +2403,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null terminate Linux version */ @@ -2420,8 +2420,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i } else { /* ASCII */ strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* empty domain field */ @@ -2794,7 +2794,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null term version string */ @@ -2846,8 +2846,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* null domain */ diff -uprN linux-2.6.18/fs/cifs/file.c linux-2.6.18.ovz/fs/cifs/file.c --- linux-2.6.18/fs/cifs/file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/cifs/file.c 2007-06-13 06:55:07.000000000 -0400 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include "cifsfs.h" @@ -752,6 +753,7 @@ int cifs_lock(struct file *file, int cmd int stored_rc = 0; struct cifsLockInfo *li, *tmp; + rc = 0; 
down(&fid->lock_sem); list_for_each_entry_safe(li, tmp, &fid->llist, llist) { if (pfLock->fl_start <= li->offset && @@ -766,7 +768,7 @@ int cifs_lock(struct file *file, int cmd kfree(li); } } - up(&fid->lock_sem); + up(&fid->lock_sem); } } @@ -1815,6 +1817,7 @@ static int cifs_readpages(struct file *f } break; } else if (bytes_read > 0) { + task_io_account_read(bytes_read); pSMBr = (struct smb_com_read_rsp *)smb_read_data; cifs_copy_cache_pages(mapping, page_list, bytes_read, smb_read_data + 4 /* RFC1001 hdr */ + diff -uprN linux-2.6.18/fs/cifs/inode.c linux-2.6.18.ovz/fs/cifs/inode.c --- linux-2.6.18/fs/cifs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/cifs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -880,10 +880,14 @@ int cifs_rename(struct inode *source_ino kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); if (info_buf_source != NULL) { info_buf_target = info_buf_source + 1; - rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, - info_buf_source, cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->capabilities & CAP_UNIX) + rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, + info_buf_source, + cifs_sb_source->local_nls, + cifs_sb_source->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + /* else rc is still EEXIST so will fall through to + unlink the target and retry rename */ if (rc == 0) { rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName, info_buf_target, @@ -932,7 +936,7 @@ int cifs_rename(struct inode *source_ino cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc==0) { - CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName, + rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName, cifs_sb_source->local_nls, cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); diff -uprN linux-2.6.18/fs/cifs/sess.c linux-2.6.18.ovz/fs/cifs/sess.c --- linux-2.6.18/fs/cifs/sess.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/cifs/sess.c 2007-06-13 06:55:07.000000000 -0400 @@ -111,7 +111,7 @@ static void unicode_ssetup_strings(char bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, nls_cp); bcc_ptr += 2 * bytes_ret; - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, 32, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* trailing null */ @@ -158,8 +158,8 @@ static void ascii_ssetup_strings(char ** strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; diff -uprN linux-2.6.18/fs/compat.c linux-2.6.18.ovz/fs/compat.c --- linux-2.6.18/fs/compat.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/compat.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include /* for SIOCDEVPRIVATE */ @@ -46,6 +47,7 @@ #include #include #include +#include #include /* siocdevprivate_ioctl */ @@ -69,6 +71,18 @@ int compat_printk(const char *fmt, ...) return ret; } +int ve_compat_printk(int dst, const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = ve_vprintk(dst, fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
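The comment above and the hunk that follows implement the compat utime in terms of utimes by expanding the access/modification pair into two timevals with zero microseconds. The same idea in a userspace sketch (the path is a placeholder):

#include <sys/time.h>
#include <utime.h>

/* utime() semantics built on utimes(), mirroring compat_sys_utime(). */
static int utime_via_utimes(const char *path, const struct utimbuf *t)
{
	struct timeval tv[2];

	if (t == NULL)
		return utimes(path, NULL);	/* "set to now" case */
	tv[0].tv_sec = t->actime;  tv[0].tv_usec = 0;
	tv[1].tv_sec = t->modtime; tv[1].tv_usec = 0;
	return utimes(path, tv);
}

int main(void)
{
	struct utimbuf t = { .actime = 0, .modtime = 0 };

	return utime_via_utimes("/tmp/example", &t) != 0;
}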
@@ -84,7 +98,7 @@ asmlinkage long compat_sys_utime(char __ tv[0].tv_usec = 0; tv[1].tv_usec = 0; } - return do_utimes(AT_FDCWD, filename, t ? tv : NULL); + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) @@ -98,7 +112,7 @@ asmlinkage long compat_sys_futimesat(uns get_user(tv[1].tv_usec, &t[1].tv_usec)) return -EFAULT; } - return do_utimes(dfd, filename, t ? tv : NULL); + return do_utimes(dfd, filename, t ? tv : NULL, 0); } asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t) @@ -213,6 +227,8 @@ asmlinkage long compat_sys_statfs(const struct kstatfs tmp; error = vfs_statfs(nd.dentry, &tmp); if (!error) + error = faudit_statfs(nd.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); path_release(&nd); } @@ -231,6 +247,8 @@ asmlinkage long compat_sys_fstatfs(unsig goto out; error = vfs_statfs(file->f_dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); fput(file); out: @@ -281,6 +299,8 @@ asmlinkage long compat_sys_statfs64(cons struct kstatfs tmp; error = vfs_statfs(nd.dentry, &tmp); if (!error) + error = faudit_statfs(nd.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); path_release(&nd); } @@ -302,6 +322,8 @@ asmlinkage long compat_sys_fstatfs64(uns goto out; error = vfs_statfs(file->f_dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); out: @@ -873,7 +895,7 @@ asmlinkage long compat_sys_mount(char __ retval = -EINVAL; - if (type_page) { + if (type_page && data_page) { if (!strcmp((char *)type_page, SMBFS_NAME)) { do_smb_super_data_conv((void *)data_page); } else if (!strcmp((char *)type_page, NCPFS_NAME)) { @@ -1480,6 +1502,61 @@ out: return ret; } +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + long ret; + unsigned int cmds; + mm_segment_t old_fs; + struct if_dqblk dqblk; + struct if32_dqblk { + __u32 dqb_bhardlimit[2]; + __u32 dqb_bsoftlimit[2]; + __u32 dqb_curspace[2]; + __u32 dqb_ihardlimit[2]; + __u32 dqb_isoftlimit[2]; + __u32 dqb_curinodes[2]; + __u32 dqb_btime[2]; + __u32 dqb_itime[2]; + __u32 dqb_valid; + } dqblk32; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_quotactl(cmd, special, id, &dqblk); + set_fs(old_fs); + if (ret < 0) + break; + + memcpy(&dqblk32, &dqblk, sizeof(dqblk32)); + dqblk32.dqb_valid = dqblk.dqb_valid; + if (copy_to_user(addr, &dqblk32, sizeof(dqblk32))) + ret = -EFAULT; + + break; + case Q_SETQUOTA: + ret = -EFAULT; + if (copy_from_user(&dqblk32, addr, sizeof(dqblk32))) + break; + memcpy(&dqblk, &dqblk32, sizeof(dqblk32)); + dqblk.dqb_valid = dqblk32.dqb_valid; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_quotactl(cmd, special, id, &dqblk); + set_fs(old_fs); + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + break; + } + return ret; +} + #ifdef CONFIG_MMU #define free_arg_pages(bprm) do { } while (0) @@ -1513,6 +1590,10 @@ int compat_do_execve(char * filename, int retval; int i; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) diff -uprN linux-2.6.18/fs/dcache.c linux-2.6.18.ovz/fs/dcache.c --- 
linux-2.6.18/fs/dcache.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/dcache.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,12 +27,17 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -42,7 +47,7 @@ static __cacheline_aligned_in_smp DEFINE EXPORT_SYMBOL(dcache_lock); -static kmem_cache_t *dentry_cache __read_mostly; +kmem_cache_t *dentry_cache __read_mostly; #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) @@ -112,6 +117,29 @@ static void dentry_iput(struct dentry * } } +/** + * d_kill - kill dentry and return parent + * @dentry: dentry to kill + * + * Called with dcache_lock and d_lock, releases both. The dentry must + * already be unhashed and removed from the LRU. + * + * If this is the root of the dentry tree, return NULL. + */ +static struct dentry *d_kill(struct dentry *dentry) +{ + struct dentry *parent; + + list_del(&dentry->d_u.d_child); + dentry_stat.nr_dentry--; /* For d_free, below */ + preempt_enable_no_resched(); + /*drops the locks, at that point nobody can reach this dentry */ + dentry_iput(dentry); + parent = dentry->d_parent; + d_free(dentry); + return dentry == parent ? NULL : parent; +} + /* * This is dput * @@ -139,25 +167,18 @@ static void dentry_iput(struct dentry * * they too may now get deleted. * * no dcache lock, please. + * preemption is disabled by the caller. */ -void dput(struct dentry *dentry) +static void dput_recursive(struct dentry *dentry) { - if (!dentry) - return; - repeat: - if (atomic_read(&dentry->d_count) == 1) - might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; + goto out_preempt; spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - return; - } + if (atomic_read(&dentry->d_count)) + goto out_unlock; /* * AV: ->d_delete() is _NOT_ allowed to block now. 
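The new d_kill() above is the building block for turning recursive dentry release into a loop: free one dentry, get its parent back, and iterate until a still-referenced ancestor (or the root, signalled by NULL) stops the walk. The shape of that pattern, reduced to a self-contained toy outside the kernel:

#include <stddef.h>

struct node {
	struct node *parent;
	int count;		/* stands in for d_count */
};

/* Free one node, hand back its parent; NULL stops the walk at the
 * root, like "return dentry == parent ? NULL : parent" in d_kill(). */
static struct node *kill_node(struct node *n)
{
	struct node *parent = n->parent;

	return parent == n ? NULL : parent;
}

static void put_node(struct node *n)
{
	while (n != NULL && --n->count == 0)
		n = kill_node(n);	/* iterate instead of recursing */
}

int main(void)
{
	struct node root  = { .parent = &root, .count = 2 };
	struct node child = { .parent = &root, .count = 1 };

	put_node(&child);	/* frees child, drops one ref on root */
	return root.count != 1;
}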
@@ -174,36 +195,43 @@ repeat: list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; } +out_unlock: spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); +out_preempt: + preempt_enable(); return; unhash_it: __d_drop(dentry); - -kill_it: { - struct dentry *parent; - - /* If dentry was on d_lru list - * delete it from there - */ - if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); - dentry_stat.nr_unused--; - } - list_del(&dentry->d_u.d_child); - dentry_stat.nr_dentry--; /* For d_free, below */ - /*drops the locks, at that point nobody can reach this dentry */ - dentry_iput(dentry); - parent = dentry->d_parent; - d_free(dentry); - if (dentry == parent) - return; - dentry = parent; +kill_it: + /* If dentry was on d_lru list + * delete it from there + */ + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry_stat.nr_unused--; + } + dentry = d_kill(dentry); + if (dentry) { + preempt_disable(); goto repeat; } } +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); + + preempt_disable(); + ub_dentry_uncharge(dentry); + dput_recursive(dentry); +} + /** * d_invalidate - invalidate a dentry * @dentry: dentry to invalidate @@ -270,6 +298,8 @@ static inline struct dentry * __dget_loc dentry_stat.nr_unused--; list_del_init(&dentry->d_lru); } + + ub_dentry_charge_nofail(dentry); return dentry; } @@ -361,22 +391,49 @@ restart: * Throw away a dentry - free the inode, dput the parent. This requires that * the LRU list has already been removed. * + * If prune_parents is true, try to prune ancestors as well. + * * Called with dcache_lock, drops it and then regains. * Called with dentry->d_lock held, drops it. */ -static void prune_one_dentry(struct dentry * dentry) +static void prune_one_dentry(struct dentry * dentry, int prune_parents) { - struct dentry * parent; - __d_drop(dentry); - list_del(&dentry->d_u.d_child); - dentry_stat.nr_dentry--; /* For d_free, below */ - dentry_iput(dentry); - parent = dentry->d_parent; - d_free(dentry); - if (parent != dentry) - dput(parent); + preempt_disable(); + dentry = d_kill(dentry); + if (!prune_parents) { + /* + * dentry is not in use, only child (not outside) + * references change, so parent->d_inuse does not change + */ + if (dentry) { + preempt_disable(); + dput_recursive(dentry); + } + spin_lock(&dcache_lock); + return; + } + + /* + * Prune ancestors. Locking is simpler than in dput(), + * because dcache_lock needs to be taken anyway. + */ spin_lock(&dcache_lock); + while (dentry) { + if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) + return; + + if (dentry->d_op && dentry->d_op->d_delete) + dentry->d_op->d_delete(dentry); + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry_stat.nr_unused--; + } + __d_drop(dentry); + preempt_disable(); + dentry = d_kill(dentry); + spin_lock(&dcache_lock); + } } /** @@ -384,6 +441,7 @@ static void prune_one_dentry(struct dent * @count: number of entries to try and free * @sb: if given, ignore dentries for other superblocks * which are being unmounted. + * @prune_parents: if true, try to prune ancestors as well in one go * * Shrink the dcache. This is done when we need * more memory, or simply when we need to unmount @@ -394,7 +452,7 @@ static void prune_one_dentry(struct dent * all the dentries are in use. 
*/ -static void prune_dcache(int count, struct super_block *sb) +static void prune_dcache(int count, struct super_block *sb, int prune_parents) { spin_lock(&dcache_lock); for (; count ; count--) { @@ -454,7 +512,7 @@ static void prune_dcache(int count, stru * without taking the s_umount lock (I already hold it). */ if (sb && dentry->d_sb == sb) { - prune_one_dentry(dentry); + prune_one_dentry(dentry, prune_parents); continue; } /* @@ -469,7 +527,7 @@ static void prune_dcache(int count, stru s_umount = &dentry->d_sb->s_umount; if (down_read_trylock(s_umount)) { if (dentry->d_sb->s_root != NULL) { - prune_one_dentry(dentry); + prune_one_dentry(dentry, prune_parents); up_read(s_umount); continue; } @@ -539,7 +597,7 @@ repeat: spin_unlock(&dentry->d_lock); continue; } - prune_one_dentry(dentry); + prune_one_dentry(dentry, 1); cond_resched_lock(&dcache_lock); goto repeat; } @@ -682,7 +740,7 @@ void shrink_dcache_parent(struct dentry int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found, parent->d_sb); + prune_dcache(found, parent->d_sb, 1); } /* @@ -699,12 +757,18 @@ void shrink_dcache_parent(struct dentry */ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { + int res = -1; + + KSTAT_PERF_ENTER(shrink_dcache) if (nr) { if (!(gfp_mask & __GFP_FS)) - return -1; - prune_dcache(nr, NULL); + goto out; + prune_dcache(nr, NULL, 1); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +out: + KSTAT_PERF_LEAVE(shrink_dcache) + return res; } /** @@ -722,21 +786,26 @@ struct dentry *d_alloc(struct dentry * p struct dentry *dentry; char *dname; + dname = NULL; + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) + goto err_name; + } + + ub_dentry_alloc_start(); dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) - return NULL; + goto err_alloc; - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { - kmem_cache_free(dentry_cache, dentry); - return NULL; - } - } else { + preempt_disable(); + if (dname == NULL) dname = dentry->d_iname; - } dentry->d_name.name = dname; + if (ub_dentry_alloc(dentry)) + goto err_charge; + dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); @@ -767,12 +836,27 @@ struct dentry *d_alloc(struct dentry * p } spin_lock(&dcache_lock); - if (parent) + if (parent) { list_add(&dentry->d_u.d_child, &parent->d_subdirs); + if (parent->d_flags & DCACHE_VIRTUAL) + dentry->d_flags |= DCACHE_VIRTUAL; + } dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + preempt_enable(); + ub_dentry_alloc_end(); return dentry; + +err_charge: + preempt_enable(); + kmem_cache_free(dentry_cache, dentry); +err_alloc: + if (name->len > DNAME_INLINE_LEN - 1) + kfree(dname); + ub_dentry_alloc_end(); +err_name: + return NULL; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) @@ -1060,12 +1144,12 @@ struct dentry * __d_lookup(struct dentry unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; - struct dentry *dentry; + struct dentry *dentry, *found; rcu_read_lock(); + found = NULL; hlist_for_each_entry_rcu(dentry, node, head, d_hash) { struct qstr *qstr; @@ -1102,6 +1186,8 @@ struct dentry * __d_lookup(struct dentry if (!d_unhashed(dentry)) { atomic_inc(&dentry->d_count); found = dentry; + if 
(ub_dentry_charge(found)) + goto charge_failure; } spin_unlock(&dentry->d_lock); break; @@ -1111,6 +1197,14 @@ next: rcu_read_unlock(); return found; + +charge_failure: + spin_unlock(&found->d_lock); + rcu_read_unlock(); + /* dentry is now unhashed, just kill it */ + dput(found); + /* ... and fail lookup */ + return NULL; } /** @@ -1387,6 +1481,32 @@ already_unhashed: } /** + * __d_path_add_deleted - prepend "(deleted) " text + * @end: a pointer to the character after free space at the beginning of the + * buffer + * @buflen: remaining free space + */ +static inline char * __d_path_add_deleted(char * end, int buflen) +{ + buflen -= 10; + if (buflen < 0) + return ERR_PTR(-ENAMETOOLONG); + end -= 10; + memcpy(end, "(deleted) ", 10); + return end; +} + +/** + * d_root_check - checks if dentry is accessible from current's fs root + * @dentry: dentry to be verified + * @vfsmnt: vfsmnt to which the dentry belongs + */ +int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt) +{ + return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0)); +} + +/** * d_path - return the path of a dentry * @dentry: dentry to report * @vfsmnt: vfsmnt to which the dentry belongs @@ -1402,41 +1522,40 @@ already_unhashed: * * "buflen" should be positive. Caller holds the dcache_lock. */ -static char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, +char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, struct dentry *root, struct vfsmount *rootmnt, char *buffer, int buflen) { char * end = buffer+buflen; - char * retval; + char * retval = NULL; int namelen; + int deleted; + struct vfsmount *oldvfsmnt; - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + oldvfsmnt = vfsmnt; + deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); + if (buffer != NULL) { + *--end = '\0'; + buflen--; + + if (buflen < 1) goto Elong; - memcpy(end, " (deleted)", 10); + /* Get '/' right */ + retval = end-1; + *retval = '/'; } - if (buflen < 1) - goto Elong; - /* Get '/' right */ - retval = end-1; - *retval = '/'; - for (;;) { struct dentry * parent; if (dentry == root && vfsmnt == rootmnt) break; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - /* Global root? */ + /* root of a tree? */ spin_lock(&vfsmount_lock); if (vfsmnt->mnt_parent == vfsmnt) { spin_unlock(&vfsmount_lock); - goto global_root; + goto other_root; } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; @@ -1445,30 +1564,55 @@ static char * __d_path( struct dentry *d } parent = dentry->d_parent; prefetch(parent); + if (buffer != NULL) { + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + retval = end; + } + dentry = parent; + } + /* the given root point is reached */ +finish: + if (buffer != NULL && deleted) + retval = __d_path_add_deleted(end, buflen); + return retval; + +other_root: + /* + * We traversed the tree upward and reached a root, but the given + * lookup terminal point wasn't encountered. It means either that the + * dentry is out of our scope or belongs to an abstract space like + * sock_mnt or pipe_mnt. Check for it. + * + * There are different options to check it. + * We may assume that any dentry tree is unreachable unless it's + * connected to `root' (defined as fs root of init aka child reaper) + * and expose all paths that are not connected to it. 
+ * The other option is to explicitly expose known abstract spaces + * and hide the path information for other cases. + * This approach is safer, so let's take it. 2001/04/22 SAW + */ + if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER)) + return ERR_PTR(-EINVAL); + if (buffer != NULL) { namelen = dentry->d_name.len; - buflen -= namelen + 1; + buflen -= namelen; if (buflen < 0) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; - retval = end; - dentry = parent; + retval -= namelen-1; /* hit the slash */ + memcpy(retval, dentry->d_name.name, namelen); } + goto finish; - return retval; - -global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) - goto Elong; - retval -= namelen-1; /* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); - return retval; Elong: return ERR_PTR(-ENAMETOOLONG); } +EXPORT_SYMBOL(__d_path); /* write full pathname into buffer and return start of pathname */ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, @@ -1490,6 +1634,229 @@ char * d_path(struct dentry *dentry, str return res; } +#ifdef CONFIG_VE +#include +#include +#include +#include +#include + +static void mark_sub_tree_virtual(struct dentry *d) +{ + struct dentry *orig_root; + + orig_root = d; + while (1) { + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_VIRTUAL; + spin_unlock(&d->d_lock); + + if (!list_empty(&d->d_subdirs)) { + d = list_entry(d->d_subdirs.next, + struct dentry, d_u.d_child); + continue; + } + if (d == orig_root) + break; + while (d == list_entry(d->d_parent->d_subdirs.prev, + struct dentry, d_u.d_child)) { + d = d->d_parent; + if (d == orig_root) + goto out; + } + d = list_entry(d->d_u.d_child.next, + struct dentry, d_u.d_child); + } +out: + return; +} + +void mark_tree_virtual(struct vfsmount *m, struct dentry *d) +{ + struct vfsmount *orig_rootmnt; + + spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); + orig_rootmnt = m; + while (1) { + mark_sub_tree_virtual(d); + if (!list_empty(&m->mnt_mounts)) { + m = list_entry(m->mnt_mounts.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + continue; + } + if (m == orig_rootmnt) + break; + while (m == list_entry(m->mnt_parent->mnt_mounts.prev, + struct vfsmount, mnt_child)) { + m = m->mnt_parent; + if (m == orig_rootmnt) + goto out; + } + m = list_entry(m->mnt_child.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + } +out: + spin_unlock(&vfsmount_lock); + spin_unlock(&dcache_lock); +} +EXPORT_SYMBOL(mark_tree_virtual); + +static struct vz_rate_info area_ri = { 20, 10*HZ }; +#define VE_AREA_ACC_CHECK 0x0001 +#define VE_AREA_ACC_DENY 0x0002 +#define VE_AREA_EXEC_CHECK 0x0010 +#define VE_AREA_EXEC_DENY 0x0020 +#define VE0_AREA_ACC_CHECK 0x0100 +#define VE0_AREA_ACC_DENY 0x0200 +#define VE0_AREA_EXEC_CHECK 0x1000 +#define VE0_AREA_EXEC_DENY 0x2000 +int ve_area_access_check = 0; + +static void print_connection_info(struct task_struct *tsk) +{ + struct files_struct *files; + struct fdtable *fdt; + int fd; + + files = get_files_struct(tsk); + if (!files) + return; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + struct file *file; + struct inode *inode; + struct socket *socket; + struct sock *sk; + struct inet_sock *inet; + + file = fdt->fd[fd]; + if (file == NULL) + continue; + + inode = file->f_dentry->d_inode; + if (!S_ISSOCK(inode->i_mode)) + continue; + + socket = SOCKET_I(inode); + if (socket == NULL) + continue; + + sk = socket->sk; + if ((sk->sk_family != PF_INET &&
sk->sk_family != PF_INET6) + || sk->sk_type != SOCK_STREAM) + continue; + + inet = inet_sk(sk); + printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", + NIPQUAD(inet->daddr), ntohs(inet->dport), + inet->num); + } + spin_unlock(&files->file_lock); + put_files_struct(files); +} + +static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry, + char *str) +{ + struct task_struct *tsk; + unsigned long page; + struct super_block *sb; + char *p; + + if (!vz_ratelimit(&area_ri)) + return; + + tsk = current; + p = ERR_PTR(-ENOMEM); + page = __get_free_page(GFP_KERNEL); + if (page) { + spin_lock(&dcache_lock); + p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt, + (char *)page, PAGE_SIZE); + spin_unlock(&dcache_lock); + } + if (IS_ERR(p)) + p = "(undefined)"; + + sb = dentry->d_sb; + printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" + "Task %d/%d[%s] from VE%d, execenv %d\n", + str, p, sb->s_type->owner_env->veid, + sb->s_type->name, sb->s_dev, + tsk->pid, virt_pid(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid, + get_exec_env()->veid); + + free_page(page); + + print_connection_info(tsk); + + read_lock(&tasklist_lock); + tsk = tsk->real_parent; + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", + tsk->pid, virt_pid(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid); + + print_connection_info(tsk); + put_task_struct(tsk); + dump_stack(); +} +#endif + +int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_ACC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_ACC_DENY; + } else { + check = ve_area_access_check & VE_AREA_ACC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_ACC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Access"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + +int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_EXEC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_EXEC_DENY; + } else { + check = ve_area_access_check & VE_AREA_EXEC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_EXEC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Exec"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + /* * NOTE! The user-level library version returns a * character pointer. 
The kernel system call just @@ -1626,10 +1993,12 @@ resume: goto repeat; } atomic_dec(&dentry->d_count); + ub_dentry_uncharge_locked(dentry); } if (this_parent != root) { next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); + ub_dentry_uncharge_locked(this_parent); this_parent = this_parent->d_parent; goto resume; } @@ -1765,7 +2134,7 @@ void __init vfs_caches_init(unsigned lon SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff -uprN linux-2.6.18/fs/devpts/inode.c linux-2.6.18.ovz/fs/devpts/inode.c --- linux-2.6.18/fs/devpts/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/devpts/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -22,16 +23,17 @@ #define DEVPTS_SUPER_MAGIC 0x1cd1 +struct devpts_config devpts_config = {.mode = 0600}; + +#ifndef CONFIG_VE static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; - -static struct { - int setuid; - int setgid; - uid_t uid; - gid_t gid; - umode_t mode; -} config = {.mode = 0600}; +#define config devpts_config +#else +#define devpts_mnt (get_exec_env()->devpts_mnt) +#define devpts_root (get_exec_env()->devpts_root) +#define config (*(get_exec_env()->devpts_config)) +#endif enum { Opt_uid, Opt_gid, Opt_mode, @@ -83,7 +85,8 @@ static int devpts_remount(struct super_b config.mode = option & ~S_IFMT; break; default: - printk(KERN_ERR "devpts: called with bogus options\n"); + ve_printk(VE_LOG, KERN_ERR + "devpts: called with bogus options\n"); return -EINVAL; } } @@ -136,13 +139,15 @@ static int devpts_get_sb(struct file_sys return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } -static struct file_system_type devpts_fs_type = { +struct file_system_type devpts_fs_type = { .owner = THIS_MODULE, .name = "devpts", .get_sb = devpts_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(devpts_fs_type); + /* * The normal naming convention is simply /dev/pts/; this conforms * to the System V naming convention @@ -234,6 +239,7 @@ static int __init init_devpts_fs(void) static void __exit exit_devpts_fs(void) { + /* the code is never called, the argument is irrelevant */ unregister_filesystem(&devpts_fs_type); mntput(devpts_mnt); } diff -uprN linux-2.6.18/fs/direct-io.c linux-2.6.18.ovz/fs/direct-io.c --- linux-2.6.18/fs/direct-io.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/direct-io.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -675,6 +676,13 @@ submit_page_section(struct dio *dio, str { int ret = 0; + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(page, len, 1); + } + /* * Can we just grow the current page's presence in the dio? 
*/ diff -uprN linux-2.6.18/fs/dquot.c linux-2.6.18.ovz/fs/dquot.c --- linux-2.6.18/fs/dquot.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/dquot.c 2007-06-13 06:55:07.000000000 -0400 @@ -158,7 +158,9 @@ static struct quota_format_type *find_qu struct quota_format_type *actqf; spin_lock(&dq_list_lock); - for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + for (actqf = quota_formats; + actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL); + actqf = actqf->qf_next); if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; diff -uprN linux-2.6.18/fs/eventpoll.c linux-2.6.18.ovz/fs/eventpoll.c --- linux-2.6.18/fs/eventpoll.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/eventpoll.c 2007-06-13 06:55:07.000000000 -0400 @@ -106,11 +106,6 @@ #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) -struct epoll_filefd { - struct file *file; - int fd; -}; - /* * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". * It is used to keep track on all tasks that are currently inside the wake_up() code @@ -133,36 +128,6 @@ struct poll_safewake { spinlock_t lock; }; -/* - * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll - * interface. - */ -struct eventpoll { - /* Protect the this structure access */ - rwlock_t lock; - - /* - * This semaphore is used to ensure that files are not removed - * while epoll is using them. This is read-held during the event - * collection loop and it is write-held during the file cleanup - * path, the epoll file exit code and the ctl operations. - */ - struct rw_semaphore sem; - - /* Wait queue used by sys_epoll_wait() */ - wait_queue_head_t wq; - - /* Wait queue used by file->poll() */ - wait_queue_head_t poll_wait; - - /* List of ready file descriptors */ - struct list_head rdllist; - - /* RB-Tree root used to store monitored fd structs */ - struct rb_root rbr; -}; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -181,51 +146,6 @@ struct eppoll_entry { wait_queue_head_t *whead; }; -/* - * Each file descriptor added to the eventpoll interface will - * have an entry of this type linked to the hash. - */ -struct epitem { - /* RB-Tree node used to link this structure to the eventpoll rb-tree */ - struct rb_node rbn; - - /* List header used to link this structure to the eventpoll ready list */ - struct list_head rdllink; - - /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; - - /* Number of active wait queue attached to poll operations */ - int nwait; - - /* List containing poll wait queues */ - struct list_head pwqlist; - - /* The "container" of this item */ - struct eventpoll *ep; - - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; - - /* - * Used to keep track of the usage count of the structure. This avoids - * that the structure will desappear from underneath our processing. - */ - atomic_t usecnt; - - /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; - - /* List header used to link the item to the transfer list */ - struct list_head txlink; - - /* - * This is used during the collection/transfer of events to userspace - * to pin items empty events set. 
- */ - unsigned int revents; -}; - /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; @@ -240,14 +160,10 @@ static int ep_getfd(int *efd, struct ino struct eventpoll *ep); static int ep_alloc(struct eventpoll **pep); static void ep_free(struct eventpoll *ep); -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); static void ep_use_epitem(struct epitem *epi); -static void ep_release_epitem(struct epitem *epi); static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt); static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi); -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, - struct file *tfile, int fd); static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event); static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi); @@ -275,7 +191,8 @@ static int eventpollfs_get_sb(struct fil /* * This semaphore is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +struct mutex epmutex; +EXPORT_SYMBOL_GPL(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -290,10 +207,11 @@ static kmem_cache_t *pwq_cache __read_mo static struct vfsmount *eventpoll_mnt __read_mostly; /* File callbacks that implement the eventpoll file behaviour */ -static const struct file_operations eventpoll_fops = { +const struct file_operations eventpoll_fops = { .release = ep_eventpoll_close, .poll = ep_eventpoll_poll }; +EXPORT_SYMBOL_GPL(eventpoll_fops); /* * This is used to register the virtual file system from where @@ -534,7 +452,7 @@ eexit_1: current, size, error)); return error; } - +EXPORT_SYMBOL_GPL(sys_epoll_create); /* * The following function implements the controller interface for @@ -844,7 +762,7 @@ static void ep_free(struct eventpoll *ep * the returned item, so the caller must call ep_release_epitem() * after finished using the "struct epitem". */ -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; unsigned long flags; @@ -874,6 +792,7 @@ static struct epitem *ep_find(struct eve return epir; } +EXPORT_SYMBOL_GPL(ep_find); /* @@ -892,13 +811,13 @@ static void ep_use_epitem(struct epitem * has finished using the structure. It might lead to freeing the * structure itself if the count goes to zero. 
*/ -static void ep_release_epitem(struct epitem *epi) +void ep_release_epitem(struct epitem *epi) { if (atomic_dec_and_test(&epi->usecnt)) kmem_cache_free(epi_cache, epi); } - +EXPORT_SYMBOL_GPL(ep_release_epitem); /* * This is the callback that is used to add our wait queue to the @@ -944,7 +863,7 @@ static void ep_rbtree_insert(struct even } -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; @@ -1036,6 +955,7 @@ eexit_2: eexit_1: return error; } +EXPORT_SYMBOL_GPL(ep_insert); /* diff -uprN linux-2.6.18/fs/exec.c linux-2.6.18.ovz/fs/exec.c --- linux-2.6.18/fs/exec.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/exec.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -64,6 +67,8 @@ int suid_dumpable = 0; EXPORT_SYMBOL(suid_dumpable); /* The maximal length of core_pattern is also specified in sysctl.c */ +int sysctl_at_vsyscall; + static struct linux_binfmt *formats; static DEFINE_RWLOCK(binfmt_lock); @@ -308,6 +313,10 @@ void install_arg_page(struct vm_area_str struct mm_struct *mm = vma->vm_mm; pte_t * pte; spinlock_t *ptl; + struct page_beancounter *pb; + + if (unlikely(pb_alloc(&pb))) + goto out_nopb; if (unlikely(anon_vma_prepare(vma))) goto out; @@ -324,12 +333,17 @@ void install_arg_page(struct vm_area_str lru_cache_add_active(page); set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); + pb_add_ref(page, mm, &pb); + ub_unused_privvm_dec(mm, vma); + pb_free(&pb); page_add_new_anon_rmap(page, vma, address); pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ return; out: + pb_free(&pb); +out_nopb: __free_page(page); force_sig(SIGKILL, current); } @@ -404,9 +418,14 @@ int setup_arg_pages(struct linux_binprm bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + ret = -ENOMEM; + if (ub_memory_charge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, + NULL, UB_SOFT)) + goto fail_charge; + + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | __GFP_SOFT_UBC); if (!mpnt) - return -ENOMEM; + goto fail_alloc; memset(mpnt, 0, sizeof(*mpnt)); @@ -431,11 +450,8 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_flags |= mm->def_flags; mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7]; - if ((ret = insert_vm_struct(mm, mpnt))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } + if ((ret = insert_vm_struct(mm, mpnt))) + goto fail_insert; mm->stack_vm = mm->total_vm = vma_pages(mpnt); } @@ -450,6 +466,14 @@ int setup_arg_pages(struct linux_binprm up_write(&mm->mmap_sem); return 0; + +fail_insert: + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, mpnt); +fail_alloc: + ub_memory_uncharge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, NULL); +fail_charge: + return ret; } EXPORT_SYMBOL(setup_arg_pages); @@ -525,10 +549,11 @@ int kernel_read(struct file *file, unsig EXPORT_SYMBOL(kernel_read); -static int exec_mmap(struct mm_struct *mm) +static int exec_mmap(struct linux_binprm *bprm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm, *mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -550,6 +575,10 @@ static 
int exec_mmap(struct mm_struct *m return -EINTR; } } + + ret = 0; + mm = bprm->mm; + mm->vps_dumpable = 1; task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; @@ -557,14 +586,24 @@ static int exec_mmap(struct mm_struct *m activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); + bprm->mm = NULL; /* We're using it now */ + +#ifdef CONFIG_VZ_GENCALLS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP, + bprm) & NOTIFY_FAIL) { + /* similar to binfmt_elf */ + send_sig(SIGKILL, current, 0); + ret = -ENOMEM; + } +#endif if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); mmput(old_mm); - return 0; + return ret; } mmdrop(active_mm); - return 0; + return ret; } /* @@ -704,7 +743,14 @@ static int de_thread(struct task_struct attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); + set_virt_tgid(leader, virt_pid(current)); + set_virt_pid(leader, virt_pid(current)); + set_virt_pid(current, virt_tgid(current)); list_replace_rcu(&leader->tasks, ¤t->tasks); +#ifdef CONFIG_VE + list_replace_rcu(&leader->ve_task_info.vetask_list, + ¤t->ve_task_info.vetask_list); +#endif current->group_leader = current; leader->group_leader = current; @@ -845,12 +891,10 @@ int flush_old_exec(struct linux_binprm * /* * Release all of the old mmap stuff */ - retval = exec_mmap(bprm->mm); + retval = exec_mmap(bprm); if (retval) goto mmap_failed; - bprm->mm = NULL; /* We're using it now */ - /* This is the point of no return */ put_files_struct(files); @@ -901,8 +945,7 @@ int flush_old_exec(struct linux_binprm * return 0; mmap_failed: - put_files_struct(current->files); - current->files = files; + reset_files_struct(current, files); out: return retval; } @@ -1133,6 +1176,10 @@ int do_execve(char * filename, int retval; int i; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) @@ -1278,7 +1325,7 @@ static void format_corename(char *corena case 'p': pid_in_pattern = 1; rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->tgid); + "%d", virt_tgid(current)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1322,7 +1369,7 @@ static void format_corename(char *corena case 'h': down_read(&uts_sem); rc = snprintf(out_ptr, out_end - out_ptr, - "%s", system_utsname.nodename); + "%s", utsname()->nodename); up_read(&uts_sem); if (rc > out_end - out_ptr) goto out; @@ -1350,7 +1397,7 @@ static void format_corename(char *corena if (!pid_in_pattern && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", current->tgid); + ".%d", virt_tgid(current)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1397,7 +1444,7 @@ static inline int zap_threads(struct tas goto done; rcu_read_lock(); - for_each_process(g) { + for_each_process_ve(g) { if (g == tsk->group_leader) continue; @@ -1472,7 +1519,7 @@ int do_coredump(long signr, int exit_cod if (!binfmt || !binfmt->core_dump) goto fail; down_write(&mm->mmap_sem); - if (!mm->dumpable) { + if (!mm->dumpable || mm->vps_dumpable != 1) { up_write(&mm->mmap_sem); goto fail; } diff -uprN linux-2.6.18/fs/ext2/namei.c linux-2.6.18.ovz/fs/ext2/namei.c --- linux-2.6.18/fs/ext2/namei.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext2/namei.c 2007-06-13 06:55:07.000000000 -0400 @@ -31,6 +31,7 @@ */ #include +#include #include "ext2.h" #include "xattr.h" 
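The fs/exec.c hunks above consistently replace global identifiers with their virtualized counterparts: format_corename() emits virt_tgid(current) instead of current->tgid, and zap_threads() walks for_each_process_ve(), so a core dump produced inside a VE is named after the in-container pid. A minimal userspace sketch of that bounds-checked %p substitution (virt_tgid_demo() is a hypothetical stand-in for virt_tgid(current)):

	#include <stdio.h>

	/* hypothetical stand-in for virt_tgid(current): inside a VE the
	 * core name should carry the in-container pid, not the global one */
	static int virt_tgid_demo(void)
	{
		return 1234;
	}

	/* substitute "%p" in a core-name pattern, stopping when the output
	 * buffer would overflow, the same check the hunk above performs */
	static void format_core_demo(char *out, size_t len, const char *pat)
	{
		char *out_ptr = out, *out_end = out + len - 1;

		while (*pat && out_ptr < out_end) {
			if (pat[0] == '%' && pat[1] == 'p') {
				int rc = snprintf(out_ptr, out_end - out_ptr,
						  "%d", virt_tgid_demo());
				if (rc > out_end - out_ptr)
					break;	/* no room left: stop early */
				out_ptr += rc;
				pat += 2;
			} else {
				*out_ptr++ = *pat++;
			}
		}
		*out_ptr = '\0';
	}

	int main(void)
	{
		char name[64];

		format_core_demo(name, sizeof(name), "core.%p");
		printf("%s\n", name);	/* prints "core.1234" */
		return 0;
	}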
#include "acl.h" @@ -257,6 +258,8 @@ static int ext2_unlink(struct inode * di struct page * page; int err = -ENOENT; + DQUOT_INIT(inode); + de = ext2_find_entry (dir, dentry, &page); if (!de) goto out; @@ -299,6 +302,9 @@ static int ext2_rename (struct inode * o struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + if (new_inode) + DQUOT_INIT(new_inode); + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); if (!old_de) goto out; diff -uprN linux-2.6.18/fs/ext2/super.c linux-2.6.18.ovz/fs/ext2/super.c --- linux-2.6.18/fs/ext2/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext2/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -365,7 +365,6 @@ static int parse_options (char * options { char * p; substring_t args[MAX_OPT_ARGS]; - unsigned long kind = EXT2_MOUNT_ERRORS_CONT; int option; if (!options) @@ -405,13 +404,19 @@ static int parse_options (char * options /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - kind = EXT2_MOUNT_ERRORS_PANIC; + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); break; case Opt_err_ro: - kind = EXT2_MOUNT_ERRORS_RO; + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); break; case Opt_err_cont: - kind = EXT2_MOUNT_ERRORS_CONT; + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); break; case Opt_nouid32: set_opt (sbi->s_mount_opt, NO_UID32); @@ -490,7 +495,6 @@ static int parse_options (char * options return 0; } } - sbi->s_mount_opt |= kind; return 1; } @@ -701,15 +705,21 @@ static int ext2_fill_super(struct super_ set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT2_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT2_FS_XATTR if (def_mount_opts & EXT2_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL if (def_mount_opts & EXT2_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO) set_opt(sbi->s_mount_opt, ERRORS_RO); + else + set_opt(sbi->s_mount_opt, ERRORS_CONT); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -1032,7 +1042,7 @@ static int ext2_remount (struct super_bl es = sbi->s_es; if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) + invalidate_inodes(sb, 0)) ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\ "xip remain in cache (no functional problem)"); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) @@ -1242,7 +1252,7 @@ static struct file_system_type ext2_fs_t .name = "ext2", .get_sb = ext2_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext2_fs(void) diff -uprN linux-2.6.18/fs/ext3/inode.c linux-2.6.18.ovz/fs/ext3/inode.c --- linux-2.6.18/fs/ext3/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext3/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -2669,8 +2669,10 @@ void ext3_read_inode(struct inode * inod */ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT3_INODE_SIZE(inode->i_sb)) + EXT3_INODE_SIZE(inode->i_sb)) { + brelse(bh); goto bad_inode; + 
} if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext3_inode) - diff -uprN linux-2.6.18/fs/ext3/ioctl.c linux-2.6.18.ovz/fs/ext3/ioctl.c --- linux-2.6.18/fs/ext3/ioctl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext3/ioctl.c 2007-06-13 06:55:07.000000000 -0400 @@ -72,7 +72,7 @@ int ext3_ioctl (struct inode * inode, st * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { + if (!capable(CAP_SYS_ADMIN)) { mutex_unlock(&inode->i_mutex); return -EPERM; } diff -uprN linux-2.6.18/fs/ext3/namei.c linux-2.6.18.ovz/fs/ext3/namei.c --- linux-2.6.18/fs/ext3/namei.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext3/namei.c 2007-06-13 06:55:07.000000000 -0400 @@ -959,6 +959,7 @@ static struct buffer_head * ext3_dx_find (block<b_data))) { brelse (bh); + *err = ERR_BAD_DX_DIR; goto errout; } *res_dir = de; @@ -1009,6 +1010,11 @@ static struct dentry *ext3_lookup(struct if (!inode) return ERR_PTR(-EACCES); + + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } } return d_splice_alias(inode, dentry); } @@ -1044,6 +1050,11 @@ struct dentry *ext3_get_parent(struct de if (!inode) return ERR_PTR(-EACCES); + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + parent = d_alloc_anon(inode); if (!parent) { iput(inode); @@ -1124,9 +1135,9 @@ static struct ext3_dir_entry_2 *do_split char *data1 = (*bh)->b_data, *data2; unsigned split; struct ext3_dir_entry_2 *de = NULL, *de2; - int err; + int err = 0; - bh2 = ext3_append (handle, dir, &newblock, error); + bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { brelse(*bh); *bh = NULL; @@ -1135,14 +1146,9 @@ static struct ext3_dir_entry_2 *do_split BUFFER_TRACE(*bh, "get_write_access"); err = ext3_journal_get_write_access(handle, *bh); - if (err) { - journal_error: - brelse(*bh); - brelse(bh2); - *bh = NULL; - ext3_std_error(dir->i_sb, err); - goto errout; - } + if (err) + goto journal_error; + BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) @@ -1185,8 +1191,16 @@ static struct ext3_dir_entry_2 *do_split goto journal_error; brelse (bh2); dxtrace(dx_show_index ("frame", frame->entries)); -errout: return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; } #endif @@ -1283,7 +1297,7 @@ static int add_dirent_to_buf(handle_t *h if (err) ext3_std_error(dir->i_sb, err); brelse(bh); - return 0; + return err; } #ifdef CONFIG_EXT3_INDEX @@ -2181,6 +2195,12 @@ static int ext3_link (struct dentry * ol if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. 
+ */ + if (inode->i_nlink == 0) + return -ENOENT; retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + diff -uprN linux-2.6.18/fs/ext3/super.c linux-2.6.18.ovz/fs/ext3/super.c --- linux-2.6.18/fs/ext3/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ext3/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -159,20 +159,21 @@ static void ext3_handle_error(struct sup if (sb->s_flags & MS_RDONLY) return; - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; - } else { + if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; if (journal) journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (device %s): panic forced after error\n", - sb->s_id); + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } ext3_commit_super(sb, es, 1); + if (test_opt (sb, ERRORS_PANIC)) + panic ("EXT3-fs (device %s): panic forced after error\n", + sb->s_id); } void ext3_error (struct super_block * sb, const char * function, @@ -1451,10 +1452,14 @@ static int ext3_fill_super (struct super set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT3_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3_FS_XATTR if (def_mount_opts & EXT3_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL if (def_mount_opts & EXT3_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) @@ -1466,6 +1471,8 @@ static int ext3_fill_super (struct super set_opt(sbi->s_mount_opt, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO) set_opt(sbi->s_mount_opt, ERRORS_RO); + else + set_opt(sbi->s_mount_opt, ERRORS_CONT); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -2716,7 +2723,7 @@ static struct file_system_type ext3_fs_t .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext3_fs(void) diff -uprN linux-2.6.18/fs/fcntl.c linux-2.6.18.ovz/fs/fcntl.c --- linux-2.6.18/fs/fcntl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fcntl.c 2007-06-13 06:55:07.000000000 -0400 @@ -189,6 +189,7 @@ out_fput: fput(file); goto out; } +EXPORT_SYMBOL_GPL(sys_dup2); asmlinkage long sys_dup(unsigned int fildes) { @@ -253,6 +254,7 @@ static int setfl(int fd, struct file * f static void f_modown(struct file *filp, unsigned long pid, uid_t uid, uid_t euid, int force) { + pid = comb_vpid_to_pid(pid); write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { filp->f_owner.pid = pid; @@ -319,7 +321,7 @@ static long do_fcntl(int fd, unsigned in * current syscall conventions, the only way * to fix this will be in libc. 
*/ - err = filp->f_owner.pid; + err = comb_pid_to_vpid(filp->f_owner.pid); force_successful_syscall_return(); break; case F_SETOWN: @@ -470,23 +472,29 @@ static void send_sigio_to_task(struct ta void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; + struct file *f; + struct ve_struct *ve; int pid; read_lock(&fown->lock); pid = fown->pid; if (!pid) goto out_unlock_fown; + + /* hack: fown's are always embedded in struct file */ + f = container_of(fown, struct file, f_owner); + ve = f->owner_env; read_lock(&tasklist_lock); if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { + p = find_task_by_pid_all(pid); + if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { send_sigio_to_task(p, fown, fd, band); } } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { + __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { send_sigio_to_task(p, fown, fd, band); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); + } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); } read_unlock(&tasklist_lock); out_unlock_fown: @@ -503,6 +511,8 @@ static void send_sigurg_to_task(struct t int send_sigurg(struct fown_struct *fown) { struct task_struct *p; + struct file *f; + struct ve_struct *ve; int pid, ret = 0; read_lock(&fown->lock); @@ -511,17 +521,19 @@ int send_sigurg(struct fown_struct *fown goto out_unlock_fown; ret = 1; + f = container_of(fown, struct file, f_owner); + ve = f->owner_env; read_lock(&tasklist_lock); if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { + p = find_task_by_pid_all(pid); + if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { send_sigurg_to_task(p, fown); } } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { + __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { send_sigurg_to_task(p, fown); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); + } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); } read_unlock(&tasklist_lock); out_unlock_fown: diff -uprN linux-2.6.18/fs/file.c linux-2.6.18.ovz/fs/file.c --- linux-2.6.18/fs/file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/file.c 2007-06-13 06:55:07.000000000 -0400 @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include +#include + struct fdtable_defer { spinlock_t lock; struct work_struct wq; @@ -44,9 +47,9 @@ struct file ** alloc_fd_array(int num) int size = num * sizeof(struct file *); if (size <= PAGE_SIZE) - new_fds = (struct file **) kmalloc(size, GFP_KERNEL); + new_fds = (struct file **) ub_kmalloc(size, GFP_KERNEL); else - new_fds = (struct file **) vmalloc(size); + new_fds = (struct file **) ub_vmalloc(size); return new_fds; } @@ -213,9 +216,9 @@ fd_set * alloc_fdset(int num) int size = num / 8; if (size <= PAGE_SIZE) - new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); + new_fdset = (fd_set *) ub_kmalloc(size, GFP_KERNEL); else - new_fdset = (fd_set *) vmalloc(size); + new_fdset = (fd_set *) ub_vmalloc(size); return new_fdset; } @@ -236,7 +239,7 @@ static struct fdtable *alloc_fdtable(int fd_set *new_openset = NULL, *new_execset = NULL; struct file **new_fds; - fdt = kzalloc(sizeof(*fdt), GFP_KERNEL); + fdt = kzalloc(sizeof(*fdt), GFP_KERNEL_UBC); if (!fdt) goto out; @@ -294,7 +297,7 @@ out: * both fd array and fdset. It is expected to be called with the * files_lock held. 
*/ -static int expand_fdtable(struct files_struct *files, int nr) +int expand_fdtable(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -330,6 +333,7 @@ static int expand_fdtable(struct files_s out: return error; } +EXPORT_SYMBOL_GPL(expand_fdtable); /* * Expand files. diff -uprN linux-2.6.18/fs/file_table.c linux-2.6.18.ovz/fs/file_table.c --- linux-2.6.18/fs/file_table.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/file_table.c 2007-06-13 06:55:07.000000000 -0400 @@ -24,6 +24,10 @@ #include +#include +#include +#include + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -37,12 +41,15 @@ static struct percpu_counter nr_files __ static inline void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + put_ve(f->owner_env); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); + if (f->f_ub == get_ub0()) + percpu_counter_dec(&nr_files); + ub_file_uncharge(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -90,11 +97,14 @@ struct file *get_empty_filp(void) struct task_struct *tsk; static int old_max; struct file * f; + int acct; + acct = (get_exec_ub() == get_ub0()); /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (acct && get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -106,9 +116,16 @@ struct file *get_empty_filp(void) f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; - - percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); + + if (ub_file_charge(f)) + goto fail_ch; + + if (acct) + percpu_counter_inc(&nr_files); + + f->owner_env = get_ve(get_exec_env()); + if (security_file_alloc(f)) goto fail_sec; @@ -135,6 +152,10 @@ fail_sec: file_free(f); fail: return NULL; + +fail_ch: + kmem_cache_free(filp_cachep, f); + return NULL; } EXPORT_SYMBOL(get_empty_filp); diff -uprN linux-2.6.18/fs/filesystems.c linux-2.6.18.ovz/fs/filesystems.c --- linux-2.6.18/fs/filesystems.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/filesystems.c 2007-06-13 06:55:07.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include /* for 'current' */ +#include #include /* @@ -22,8 +23,8 @@ * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it + * 2) we hold the reference to the element. + * The latter can be guaranteed by call of try_filesystem(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. 
*/ @@ -31,23 +32,45 @@ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); +int try_get_filesystem(struct file_system_type *fs) +{ + if (try_module_get(fs->owner)) { + get_ve(fs->owner_env); + return 1; + } + return 0; +} + /* WARNING: This can be used only if we _already_ own a reference */ void get_filesystem(struct file_system_type *fs) { + get_ve(fs->owner_env); __module_get(fs->owner); } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); + put_ve(fs->owner_env); +} + +static inline int check_ve_fstype(struct file_system_type *p, + struct ve_struct *env) +{ + return ((p->fs_flags & FS_VIRTUALIZED) || + ve_accessible_strict(p->owner_env, env)); } -static struct file_system_type **find_filesystem(const char *name) +static struct file_system_type **find_filesystem(const char *name, + struct ve_struct *env) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) + for (p=&file_systems; *p; p=&(*p)->next) { + if (!check_ve_fstype(*p, env)) + continue; if (strcmp((*p)->name,name) == 0) break; + } return p; } @@ -74,8 +97,12 @@ int register_filesystem(struct file_syst if (fs->next) return -EBUSY; INIT_LIST_HEAD(&fs->fs_supers); + if (fs->owner_env == NULL) + fs->owner_env = get_ve0(); + if (fs->proto == NULL) + fs->proto = fs; write_lock(&file_systems_lock); - p = find_filesystem(fs->name); + p = find_filesystem(fs->name, fs->owner_env); if (*p) res = -EBUSY; else @@ -119,6 +146,75 @@ int unregister_filesystem(struct file_sy EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_VE +int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template, + struct file_system_type **p_fs_type, struct vfsmount **p_mnt) +{ + struct vfsmount *mnt; + struct file_system_type *local_fs_type; + int ret; + + local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *), + GFP_KERNEL); + if (local_fs_type == NULL) + return -ENOMEM; + + local_fs_type->name = template->name; + local_fs_type->fs_flags = template->fs_flags; + local_fs_type->get_sb = template->get_sb; + local_fs_type->kill_sb = template->kill_sb; + local_fs_type->owner = template->owner; + local_fs_type->owner_env = ve; + local_fs_type->proto = template; + + get_filesystem(local_fs_type); /* get_ve() inside */ + + ret = register_filesystem(local_fs_type); + if (ret) + goto reg_err; + + if (p_mnt == NULL) + goto done; + + mnt = kern_mount(local_fs_type); + if (IS_ERR(mnt)) + goto mnt_err; + + *p_mnt = mnt; +done: + *p_fs_type = local_fs_type; + return 0; + +mnt_err: + ret = PTR_ERR(mnt); + unregister_filesystem(local_fs_type); /* does not put */ + +reg_err: + put_filesystem(local_fs_type); + kfree(local_fs_type); + printk(KERN_DEBUG + "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); + return ret; +} + +EXPORT_SYMBOL(register_ve_fs_type); + +void unregister_ve_fs_type(struct file_system_type *local_fs_type, + struct vfsmount *local_fs_mount) +{ + if (local_fs_mount == NULL && local_fs_type == NULL) + return; + + unregister_filesystem(local_fs_type); + umount_ve_fs_type(local_fs_type); + if (local_fs_mount) + kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ + put_filesystem(local_fs_type); +} + +EXPORT_SYMBOL(unregister_ve_fs_type); +#endif + static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -132,11 +228,14 @@ static int fs_index(const char __user * err = -EINVAL; read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + for (tmp=file_systems, index=0 ; tmp 
; tmp=tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; if (strcmp(tmp->name,name) == 0) { err = index; break; } + index++; } read_unlock(&file_systems_lock); putname(name); @@ -149,9 +248,15 @@ static int fs_name(unsigned int index, c int len, res; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) - break; + for (tmp = file_systems; tmp; tmp = tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; + if (!index) { + if (try_get_filesystem(tmp)) + break; + } else + index--; + } read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; @@ -169,8 +274,9 @@ static int fs_maxindex(void) int index; read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) + if (check_ve_fstype(tmp, get_exec_env())) + index++; read_unlock(&file_systems_lock); return index; } @@ -206,9 +312,10 @@ int get_filesystem_list(char * buf) read_lock(&file_systems_lock); tmp = file_systems; while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); + if (check_ve_fstype(tmp, get_exec_env())) + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); @@ -220,14 +327,14 @@ struct file_system_type *get_fs_type(con struct file_system_type *fs; read_lock(&file_systems_lock); - fs = *(find_filesystem(name)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); if (!fs && (request_module("%s", name) == 0)) { read_lock(&file_systems_lock); - fs = *(find_filesystem(name)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); } @@ -235,3 +342,5 @@ struct file_system_type *get_fs_type(con } EXPORT_SYMBOL(get_fs_type); +EXPORT_SYMBOL(get_filesystem); +EXPORT_SYMBOL(put_filesystem); diff -uprN linux-2.6.18/fs/fuse/control.c linux-2.6.18.ovz/fs/fuse/control.c --- linux-2.6.18/fs/fuse/control.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fuse/control.c 2007-06-13 06:55:07.000000000 -0400 @@ -10,6 +10,7 @@ #include #include +#include #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -17,7 +18,11 @@ * This is non-NULL when the single instance of the control filesystem * exists. 
Protected by fuse_mutex */ +#ifdef CONFIG_VE +#define fuse_control_sb (get_exec_env()->_fuse_control_sb) +#else static struct super_block *fuse_control_sb; +#endif static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { @@ -207,12 +212,51 @@ static struct file_system_type fuse_ctl_ .kill_sb = fuse_ctl_kill_sb, }; +#ifdef CONFIG_VE +static int fuse_ctl_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type != NULL) + return -EBUSY; + + return register_ve_fs_type(ve, &fuse_ctl_fs_type, + &ve->fuse_ctl_fs_type, NULL); +} + +static void fuse_ctl_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL); + ve->fuse_ctl_fs_type = NULL; +} + +static struct ve_hook fuse_ctl_ve_hook = { + .init = fuse_ctl_start, + .fini = fuse_ctl_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + int __init fuse_ctl_init(void) { - return register_filesystem(&fuse_ctl_fs_type); + int err; + + err = register_filesystem(&fuse_ctl_fs_type); + if (err == 0) + ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook); + return err; } void fuse_ctl_cleanup(void) { + ve_hook_unregister(&fuse_ctl_ve_hook); unregister_filesystem(&fuse_ctl_fs_type); } diff -uprN linux-2.6.18/fs/fuse/dir.c linux-2.6.18.ovz/fs/fuse/dir.c --- linux-2.6.18/fs/fuse/dir.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fuse/dir.c 2007-06-13 06:55:07.000000000 -0400 @@ -138,6 +138,7 @@ static int fuse_dentry_revalidate(struct struct fuse_entry_out outarg; struct fuse_conn *fc; struct fuse_req *req; + struct fuse_req *forget_req; /* Doesn't hurt to "reset" the validity timeout */ fuse_invalidate_entry_cache(entry); @@ -151,21 +152,29 @@ static int fuse_dentry_revalidate(struct if (IS_ERR(req)) return 0; + forget_req = fuse_get_req(fc); + if (IS_ERR(forget_req)) { + fuse_put_request(fc, req); + return 0; + } + fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); request_send(fc, req); err = req->out.h.error; + fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT */ if (!err && !outarg.nodeid) err = -ENOENT; if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { - fuse_send_forget(fc, req, outarg.nodeid, 1); + fuse_send_forget(fc, forget_req, + outarg.nodeid, 1); return 0; } fi->nlookup ++; } - fuse_put_request(fc, req); + fuse_put_request(fc, forget_req); if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) return 0; @@ -214,6 +223,7 @@ static struct dentry *fuse_lookup(struct struct inode *inode = NULL; struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; + struct fuse_req *forget_req; if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -222,9 +232,16 @@ static struct dentry *fuse_lookup(struct if (IS_ERR(req)) return ERR_PTR(PTR_ERR(req)); + forget_req = fuse_get_req(fc); + if (IS_ERR(forget_req)) { + fuse_put_request(fc, req); + return ERR_PTR(PTR_ERR(forget_req)); + } + fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; + fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (!err && outarg.nodeid && (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode))) @@ -233,11 +250,11 @@ static struct dentry *fuse_lookup(struct inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { - fuse_send_forget(fc, req, outarg.nodeid, 1); 
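A pattern repeated throughout these fuse/dir.c hunks: the request needed by the failure path (forget_req) is allocated before the lookup is sent, so that issuing the FORGET after a late failure can never itself fail on allocation. A compact userspace sketch of this "reserve the undo first" idiom, with hypothetical get_req()/do_lookup() helpers standing in for fuse_get_req() and the operation proper:

	#include <stdio.h>
	#include <stdlib.h>

	/* hypothetical request object and allocators */
	struct req {
		int id;
	};

	static struct req *get_req(void)
	{
		return malloc(sizeof(struct req));
	}

	static void put_req(struct req *r)
	{
		free(r);
	}

	/* the operation; if it fails after the remote side has already done
	 * work, 'undo' is guaranteed to be on hand for the cleanup message */
	static int do_lookup(struct req *op, struct req *undo)
	{
		(void)op;
		(void)undo;
		return 0;
	}

	int main(void)
	{
		struct req *op, *undo;
		int err;

		op = get_req();
		if (!op)
			return 1;
		undo = get_req();	/* reserve the undo *before* the op */
		if (!undo) {
			put_req(op);
			return 1;
		}
		err = do_lookup(op, undo);
		put_req(op);
		put_req(undo);
		return err ? 1 : 0;
	}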
+ fuse_send_forget(fc, forget_req, outarg.nodeid, 1); return ERR_PTR(-ENOMEM); } } - fuse_put_request(fc, req); + fuse_put_request(fc, forget_req); if (err && err != -ENOENT) return ERR_PTR(err); @@ -375,6 +392,13 @@ static int create_new_entry(struct fuse_ struct fuse_entry_out outarg; struct inode *inode; int err; + struct fuse_req *forget_req; + + forget_req = fuse_get_req(fc); + if (IS_ERR(forget_req)) { + fuse_put_request(fc, req); + return PTR_ERR(forget_req); + } req->in.h.nodeid = get_node_id(dir); req->out.numargs = 1; @@ -382,24 +406,24 @@ static int create_new_entry(struct fuse_ req->out.args[0].value = &outarg; request_send(fc, req); err = req->out.h.error; - if (err) { - fuse_put_request(fc, req); - return err; - } + fuse_put_request(fc, req); + if (err) + goto out_put_forget_req; + err = -EIO; if (invalid_nodeid(outarg.nodeid)) - goto out_put_request; + goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) - goto out_put_request; + goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { - fuse_send_forget(fc, req, outarg.nodeid, 1); + fuse_send_forget(fc, forget_req, outarg.nodeid, 1); return -ENOMEM; } - fuse_put_request(fc, req); + fuse_put_request(fc, forget_req); if (dir_alias(inode)) { iput(inode); @@ -411,8 +435,8 @@ static int create_new_entry(struct fuse_ fuse_invalidate_attr(dir); return 0; - out_put_request: - fuse_put_request(fc, req); + out_put_forget_req: + fuse_put_request(fc, forget_req); return err; } @@ -935,14 +959,30 @@ static void iattr_to_fattr(struct iattr } } +static void fuse_vmtruncate(struct inode *inode, loff_t offset) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int need_trunc; + + spin_lock(&fc->lock); + need_trunc = inode->i_size > offset; + i_size_write(inode, offset); + spin_unlock(&fc->lock); + + if (need_trunc) { + struct address_space *mapping = inode->i_mapping; + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + } +} + /* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request * may fail, in which case we don't want to touch the mapping. - * vmtruncate() doesn't allow for this case. So do the rlimit - * checking by hand and call vmtruncate() only after the file has - * actually been truncated. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. 
*/ static int fuse_setattr(struct dentry *entry, struct iattr *attr) { @@ -993,12 +1033,8 @@ static int fuse_setattr(struct dentry *e make_bad_inode(inode); err = -EIO; } else { - if (is_truncate) { - loff_t origsize = i_size_read(inode); - i_size_write(inode, outarg.attr.size); - if (origsize > outarg.attr.size) - vmtruncate(inode, outarg.attr.size); - } + if (is_truncate) + fuse_vmtruncate(inode, outarg.attr.size); fuse_change_attributes(inode, &outarg.attr); fi->i_time = time_to_jiffies(outarg.attr_valid, outarg.attr_valid_nsec); diff -uprN linux-2.6.18/fs/fuse/file.c linux-2.6.18.ovz/fs/fuse/file.c --- linux-2.6.18/fs/fuse/file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fuse/file.c 2007-06-13 06:55:07.000000000 -0400 @@ -481,8 +481,10 @@ static int fuse_commit_write(struct file err = -EIO; if (!err) { pos += count; - if (pos > i_size_read(inode)) + spin_lock(&fc->lock); + if (pos > inode->i_size) i_size_write(inode, pos); + spin_unlock(&fc->lock); if (offset == 0 && to == PAGE_CACHE_SIZE) { clear_page_dirty(page); @@ -586,8 +588,12 @@ static ssize_t fuse_direct_io(struct fil } fuse_put_request(fc, req); if (res > 0) { - if (write && pos > i_size_read(inode)) - i_size_write(inode, pos); + if (write) { + spin_lock(&fc->lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); + } *ppos = pos; } fuse_invalidate_attr(inode); diff -uprN linux-2.6.18/fs/fuse/fuse_i.h linux-2.6.18.ovz/fs/fuse/fuse_i.h --- linux-2.6.18/fs/fuse/fuse_i.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fuse/fuse_i.h 2007-06-13 06:55:07.000000000 -0400 @@ -38,7 +38,11 @@ #define FUSE_ALLOW_OTHER (1 << 1) /** List of active connections */ +#ifdef CONFIG_VE +#define fuse_conn_list (get_exec_env()->_fuse_conn_list) +#else extern struct list_head fuse_conn_list; +#endif /** Global mutex protecting fuse_conn_list and the control filesystem */ extern struct mutex fuse_mutex; diff -uprN linux-2.6.18/fs/fuse/inode.c linux-2.6.18.ovz/fs/fuse/inode.c --- linux-2.6.18/fs/fuse/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/fuse/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -17,13 +17,16 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static kmem_cache_t *fuse_inode_cachep; +#ifndef CONFIG_VE struct list_head fuse_conn_list; +#endif DEFINE_MUTEX(fuse_mutex); #define FUSE_SUPER_MAGIC 0x65735546 @@ -109,6 +112,7 @@ static int fuse_remount_fs(struct super_ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) { + struct fuse_conn *fc = get_fuse_conn(inode); if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) invalidate_inode_pages(inode->i_mapping); @@ -117,7 +121,9 @@ void fuse_change_attributes(struct inode inode->i_nlink = attr->nlink; inode->i_uid = attr->uid; inode->i_gid = attr->gid; + spin_lock(&fc->lock); i_size_write(inode, attr->size); + spin_unlock(&fc->lock); inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; @@ -131,7 +137,7 @@ void fuse_change_attributes(struct inode static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; - i_size_write(inode, attr->size); + inode->i_size = attr->size; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -661,6 +667,41 @@ static void fuse_sysfs_cleanup(void) subsystem_unregister(&fuse_subsys); } +#ifdef CONFIG_VE +static int 
fuse_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type != NULL) + return -EBUSY; + + INIT_LIST_HEAD(&ve->_fuse_conn_list); + return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL); +} + +static void fuse_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_fs_type, NULL); + kfree(ve->fuse_fs_type); + ve->fuse_fs_type = NULL; + BUG_ON(!list_empty(&ve->_fuse_conn_list)); +} + +static struct ve_hook fuse_ve_hook = { + .init = fuse_start, + .fini = fuse_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + static int __init fuse_init(void) { int res; @@ -685,6 +726,7 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook); return 0; err_sysfs_cleanup: @@ -701,6 +743,7 @@ static void __exit fuse_exit(void) { printk(KERN_DEBUG "fuse exit\n"); + ve_hook_unregister(&fuse_ve_hook); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff -uprN linux-2.6.18/fs/hfs/super.c linux-2.6.18.ovz/fs/hfs/super.c --- linux-2.6.18/fs/hfs/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/hfs/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -391,11 +391,13 @@ static int hfs_fill_super(struct super_b hfs_find_exit(&fd); goto bail_no_root; } + res = -EINVAL; root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); if (!root_inode) goto bail_no_root; + res = -ENOMEM; sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto bail_iput; diff -uprN linux-2.6.18/fs/hugetlbfs/inode.c linux-2.6.18.ovz/fs/hugetlbfs/inode.c --- linux-2.6.18/fs/hugetlbfs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/hugetlbfs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -747,7 +747,7 @@ struct file *hugetlb_zero_setup(size_t s struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - char buf[16]; + char buf[64]; static atomic_t counter; if (!can_do_hugetlb_shm()) @@ -757,7 +757,8 @@ struct file *hugetlb_zero_setup(size_t s return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; - snprintf(buf, 16, "%u", atomic_inc_return(&counter)); + snprintf(buf, sizeof(buf), "VE%d-%u", VEID(get_exec_env()), + atomic_inc_return(&counter)); quick_string.name = buf; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; diff -uprN linux-2.6.18/fs/inode.c linux-2.6.18.ovz/fs/inode.c --- linux-2.6.18/fs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -97,13 +98,15 @@ static DEFINE_MUTEX(iprune_mutex); */ struct inodes_stat_t inodes_stat; -static kmem_cache_t * inode_cachep __read_mostly; +kmem_cache_t * inode_cachep __read_mostly; + +static struct address_space_operations vfs_empty_aops; +struct inode_operations vfs_empty_iops; +static struct file_operations vfs_empty_fops; +EXPORT_SYMBOL(vfs_empty_iops); static struct inode *alloc_inode(struct super_block *sb) { - static const struct address_space_operations empty_aops; - static struct inode_operations empty_iops; - static const struct file_operations empty_fops; struct inode *inode; if (sb->s_op->alloc_inode) @@ -118,8 +121,8 @@ static struct inode *alloc_inode(struct inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops;
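The hugetlbfs hunk just above grows buf to 64 bytes precisely because the VE-prefixed name can exceed the old 16-byte buffer; bounding the snprintf() with sizeof(buf) keeps the limit in step with the declaration. A quick worst-case length check (plain userspace C, values chosen to maximize the output):

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		/* worst case: "VE" + 11-char negative veid + '-' + 10-digit counter */
		int n = snprintf(buf, sizeof(buf), "VE%d-%u",
				 -2147483647 - 1, 4294967295u);

		printf("\"%s\" needs %d bytes; a bound of 16 would truncate it\n",
		       buf, n + 1);
		return 0;
	}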
+ inode->i_op = &vfs_empty_iops; + inode->i_fop = &vfs_empty_fops; inode->i_nlink = 1; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; @@ -143,7 +146,7 @@ static struct inode *alloc_inode(struct return NULL; } - mapping->a_ops = &empty_aops; + mapping->a_ops = &vfs_empty_aops; mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER); @@ -298,13 +301,57 @@ static void dispose_list(struct list_hea spin_unlock(&inode_lock); } +static void show_header(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + printk("VFS: Busy inodes after unmount. " + "sb = %p, fs type = %s, sb count = %d, " + "sb->s_root = %s\n", sb, + (sb->s_type != NULL) ? sb->s_type->name : "", + sb->s_count, + (sb->s_root != NULL) ? + (char *)sb->s_root->d_name.name : ""); +} + +static void show_inode(struct inode *inode) +{ + struct dentry *d; + int i; + + printk("inode = %p, inode->i_count = %d, " + "inode->i_nlink = %d, " + "inode->i_mode = %d, " + "inode->i_state = %ld, " + "inode->i_flags = %d, " + "inode->i_devices.next = %p, " + "inode->i_devices.prev = %p, " + "inode->i_ino = %ld\n", + inode, + atomic_read(&inode->i_count), + inode->i_nlink, + inode->i_mode, + inode->i_state, + inode->i_flags, + inode->i_devices.next, + inode->i_devices.prev, + inode->i_ino); + printk("inode dump: "); + for (i = 0; i < sizeof(*inode); i++) + printk("%2.2x ", *((u_char *)inode + i)); + printk("\n"); + list_for_each_entry(d, &inode->i_dentry, d_alias) + printk(" d_alias %s\n", + d->d_name.name); +} + /* * Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0, count = 0, once = 1; next = head->next; for (;;) { @@ -331,6 +378,14 @@ static int invalidate_list(struct list_h continue; } busy = 1; + + if (check) { + if (once) { + once = 0; + show_header(inode); + } + show_inode(inode); + } } /* only unused inodes may be cached with i_count zero */ inodes_stat.nr_unused -= count; @@ -345,7 +400,7 @@ static int invalidate_list(struct list_h * fails because there are busy inodes then a non zero value is returned. * If the discard is successful all the inodes have been discarded. */ -int invalidate_inodes(struct super_block * sb) +int invalidate_inodes(struct super_block * sb, int check) { int busy; LIST_HEAD(throw_away); @@ -353,7 +408,7 @@ int invalidate_inodes(struct super_block mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away, check); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -377,7 +432,7 @@ int __invalidate_device(struct block_dev * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, 0); drop_super(sb); } invalidate_bdev(bdev, 0); @@ -472,6 +527,7 @@ static void prune_icache(int nr_to_scan) */ static int shrink_icache_memory(int nr, gfp_t gfp_mask) { + KSTAT_PERF_ENTER(shrink_icache) if (nr) { /* * Nasty deadlock avoidance. 
We may hold various FS locks, @@ -482,6 +538,7 @@ static int shrink_icache_memory(int nr, return -1; prune_icache(nr); } + KSTAT_PERF_LEAVE(shrink_icache) return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -546,7 +603,8 @@ repeat: */ struct inode *new_inode(struct super_block *sb) { - static unsigned long last_ino; + /* 32 bits for compatibility mode stat calls */ + static unsigned int last_ino; struct inode * inode; spin_lock_prefetch(&inode_lock); @@ -705,7 +763,8 @@ static inline unsigned long hash(struct */ ino_t iunique(struct super_block *sb, ino_t max_reserved) { - static ino_t counter; + /* 32 bits for compatibility mode stat calls */ + static unsigned int counter; struct inode *inode; struct hlist_head * head; ino_t res; diff -uprN linux-2.6.18/fs/ioprio.c linux-2.6.18.ovz/fs/ioprio.c --- linux-2.6.18/fs/ioprio.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ioprio.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include static int set_task_ioprio(struct task_struct *task, int ioprio) { @@ -62,6 +63,9 @@ asmlinkage long sys_ioprio_set(int which struct user_struct *user; int ret; + if (!ve_is_super(get_exec_env())) + return -EPERM; + switch (class) { case IOPRIO_CLASS_RT: if (!capable(CAP_SYS_ADMIN)) @@ -87,18 +91,18 @@ asmlinkage long sys_ioprio_set(int which if (!who) p = current; else - p = find_task_by_pid(who); + p = find_task_by_pid_all(who); if (p) ret = set_task_ioprio(p, ioprio); break; case IOPRIO_WHO_PGRP: if (!who) who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + do_each_task_pid_all(who, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_task_pid_all(who, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -109,17 +113,23 @@ asmlinkage long sys_ioprio_set(int which if (!user) break; - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } while_each_thread_all(g, p); free_uid: if (who) free_uid(user); break; + case IOPRIO_WHO_UBC: + if (class != IOPRIO_CLASS_BE) + return -ERANGE; + + ret = bc_set_ioprio(who, data); + break; default: ret = -EINVAL; } @@ -176,14 +186,14 @@ asmlinkage long sys_ioprio_get(int which if (!who) p = current; else - p = find_task_by_pid(who); + p = find_task_by_pid_ve(who); if (p) ret = get_task_ioprio(p); break; case IOPRIO_WHO_PGRP: if (!who) who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + do_each_task_pid_ve(who, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -191,7 +201,7 @@ asmlinkage long sys_ioprio_get(int which ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_task_pid_ve(who, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -202,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which if (!user) break; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (p->uid != user->uid) continue; tmpio = get_task_ioprio(p); @@ -212,7 +222,7 @@ asmlinkage long sys_ioprio_get(int which ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who) free_uid(user); diff -uprN linux-2.6.18/fs/jbd/commit.c linux-2.6.18.ovz/fs/jbd/commit.c --- linux-2.6.18/fs/jbd/commit.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/jbd/commit.c 2007-06-13 06:55:07.000000000 -0400
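The new_inode()/iunique() hunks above shrink the inode-number counters to 32 bits on purpose: with a 64-bit unsigned long the counter can pass 2^32, and a 32-bit compatibility-mode stat() would then see a truncated st_ino. A small sketch of the truncation (assuming a 64-bit unsigned long):

	#include <stdio.h>

	int main(void)
	{
		unsigned long last_ino = 4294967295UL + 2;	/* counter past 2^32 */
		unsigned int st_ino32 = (unsigned int)last_ino;	/* what 32-bit
								   compat stat() sees */

		printf("kernel i_ino = %lu, truncated st_ino = %u\n",
		       last_ino, st_ino32);
		return 0;
	}

Two different files could thus appear to share an inode number, which is why the patch wraps the counter at 32 bits instead of letting the truncation happen silently.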
@@ -160,6 +160,117 @@ static int journal_write_commit_record(j return (ret == -EIO); } +void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* We use-up our safety reference in submit_bh() */ + submit_bh(WRITE, wbuf[i]); + } +} + +/* + * Submit all the data buffers to disk + */ +static void journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). + */ + if (buffer_dirty(bh)) { + if (test_set_buffer_locked(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + put_bh(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); + bufs = 0; + goto write_out_data; + } + } + else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + journal_remove_journal_head(bh); + /* Once for our safety reference, once for + * journal_remove_journal_head() */ + put_bh(bh); + put_bh(bh); + } + + if (lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); +} + /* * journal_commit_transaction * @@ -313,80 +424,13 @@ void journal_commit_transaction(journal_ * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. 
- */ - bufs = 0; - /* - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - struct buffer_head *bh; - - jh = commit_transaction->t_sync_datalist; - commit_transaction->t_sync_datalist = jh->b_tnext; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - BUFFER_TRACE(bh, "locked"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_temp_unlink_buffer(jh); - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } else { - if (buffer_dirty(bh)) { - BUFFER_TRACE(bh, "start journal writeout"); - get_bh(bh); - wbuf[bufs++] = bh; - if (bufs == journal->j_wbufsize) { - jbd_debug(2, "submit %d writes\n", - bufs); - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - } - } - - if (bufs) { - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - spin_lock(&journal->j_list_lock); - } + journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. */ + spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; diff -uprN linux-2.6.18/fs/jbd/journal.c linux-2.6.18.ovz/fs/jbd/journal.c --- linux-2.6.18/fs/jbd/journal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/jbd/journal.c 2007-06-13 06:55:07.000000000 -0400 @@ -211,10 +211,16 @@ end_loop: return 0; } -static void journal_start_thread(journal_t *journal) +static int journal_start_thread(journal_t *journal) { - kthread_run(kjournald, journal, "kjournald"); + struct task_struct *t; + + t = kthread_run(kjournald, journal, "kjournald"); + if (IS_ERR(t)) + return PTR_ERR(t); + wait_event(journal->j_wait_done_commit, journal->j_task != 0); + return 0; } static void journal_kill_thread(journal_t *journal) @@ -840,8 +846,7 @@ static int journal_reset(journal_t *jour /* Add the dynamic fields and write it to disk. 
*/ journal_update_superblock(journal, 1); - journal_start_thread(journal); - return 0; + return journal_start_thread(journal); } /** diff -uprN linux-2.6.18/fs/jfs/jfs_imap.c linux-2.6.18.ovz/fs/jfs/jfs_imap.c --- linux-2.6.18/fs/jfs/jfs_imap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/jfs/jfs_imap.c 2007-06-13 06:55:07.000000000 -0400 @@ -318,7 +318,7 @@ int diRead(struct inode *ip) struct inomap *imap; int block_offset; int inodes_left; - uint pageno; + unsigned long pageno; int rel_inode; jfs_info("diRead: ino = %ld", ip->i_ino); @@ -606,7 +606,7 @@ int diWrite(tid_t tid, struct inode *ip) int block_offset; int inodes_left; struct metapage *mp; - uint pageno; + unsigned long pageno; int rel_inode; int dioffset; struct inode *ipimap; diff -uprN linux-2.6.18/fs/lockd/clntproc.c linux-2.6.18.ovz/fs/lockd/clntproc.c --- linux-2.6.18/fs/lockd/clntproc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/clntproc.c 2007-06-13 06:55:07.000000000 -0400 @@ -129,11 +129,11 @@ static void nlmclnt_setlockargs(struct n nlmclnt_next_cookie(&argp->cookie); argp->state = nsm_local_state; memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); - lock->caller = system_utsname.nodename; + lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - system_utsname.nodename); + utsname()->nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; @@ -156,6 +156,7 @@ nlmclnt_proc(struct inode *inode, int cm sigset_t oldset; unsigned long flags; int status, proto, vers; + struct ve_struct *ve; vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1; if (NFS_PROTO(inode)->version > 3) { @@ -165,14 +166,17 @@ nlmclnt_proc(struct inode *inode, int cm /* Retrieve transport protocol from NFS client */ proto = NFS_CLIENT(inode)->cl_xprt->prot; + ve = set_exec_env(NFS_CLIENT(inode)->cl_xprt->owner_env); host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers); + status = -ENOLCK; if (host == NULL) - return -ENOLCK; + goto fail; call = nlm_alloc_call(host); + status = -ENOMEM; if (call == NULL) - return -ENOMEM; + goto fail; nlmclnt_locks_init_private(fl, host); /* Set up the argument struct */ @@ -214,6 +218,8 @@ nlmclnt_proc(struct inode *inode, int cm spin_unlock_irqrestore(&current->sighand->siglock, flags); dprintk("lockd: clnt proc returns %d\n", status); +fail: + (void)set_exec_env(ve); return status; } EXPORT_SYMBOL(nlmclnt_proc); diff -uprN linux-2.6.18/fs/lockd/host.c linux-2.6.18.ovz/fs/lockd/host.c --- linux-2.6.18/fs/lockd/host.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/host.c 2007-06-13 06:55:07.000000000 -0400 @@ -65,6 +65,7 @@ nlm_lookup_host(int server, struct socka struct nlm_host *host, **hp; u32 addr; int hash; + struct ve_struct *ve; dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n", (unsigned)(sin?
ntohl(sin->sin_addr.s_addr) : 0), proto, version); @@ -77,6 +78,7 @@ nlm_lookup_host(int server, struct socka if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); + ve = get_exec_env(); for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) { if (host->h_proto != proto) continue; @@ -84,6 +86,8 @@ nlm_lookup_host(int server, struct socka continue; if (host->h_server != server) continue; + if (!ve_accessible_strict(host->owner_env, ve)) + continue; if (nlm_cmp_addr(&host->h_addr, sin)) { if (hp != nlm_hosts + hash) { @@ -127,6 +131,7 @@ nlm_lookup_host(int server, struct socka spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); + host->owner_env = ve; if (++nrhosts > NLM_HOST_MAX) next_gc = 0; @@ -143,10 +148,15 @@ nlm_find_client(void) * and return it */ int hash; + struct ve_struct *ve; + + ve = get_exec_env(); mutex_lock(&nlm_host_mutex); for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) { struct nlm_host *host, **hp; for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) { + if (!ve_accessible_strict(host->owner_env, ve)) + continue; if (host->h_server && host->h_killed == 0) { nlm_get_host(host); diff -uprN linux-2.6.18/fs/lockd/mon.c linux-2.6.18.ovz/fs/lockd/mon.c --- linux-2.6.18/fs/lockd/mon.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/mon.c 2007-06-13 06:55:07.000000000 -0400 @@ -152,7 +152,7 @@ xdr_encode_common(struct rpc_rqst *rqstp */ sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); if (!(p = xdr_encode_string(p, buffer)) - || !(p = xdr_encode_string(p, system_utsname.nodename))) + || !(p = xdr_encode_string(p, utsname()->nodename))) return ERR_PTR(-EIO); *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); diff -uprN linux-2.6.18/fs/lockd/svc.c linux-2.6.18.ovz/fs/lockd/svc.c --- linux-2.6.18/fs/lockd/svc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/svc.c 2007-06-13 06:55:07.000000000 -0400 @@ -44,10 +44,11 @@ struct nlmsvc_binding * nlmsvc_ops; EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); -static unsigned int nlmsvc_users; -static pid_t nlmsvc_pid; -int nlmsvc_grace_period; -unsigned long nlmsvc_timeout; +static unsigned int _nlmsvc_users; +static pid_t _nlmsvc_pid; +int _nlmsvc_grace_period; +unsigned long _nlmsvc_timeout; + static DECLARE_COMPLETION(lockd_start_done); static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); @@ -162,8 +163,13 @@ lockd(struct svc_rqst *rqstp) * recvfrom routine. 
*/ err = svc_recv(serv, rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) + if (err == -EAGAIN || err == -EINTR) { +#ifdef CONFIG_VE + if (!get_exec_env()->is_running) + break; +#endif continue; + } if (err < 0) { printk(KERN_WARNING "lockd: terminating on error %d\n", diff -uprN linux-2.6.18/fs/lockd/svclock.c linux-2.6.18.ovz/fs/lockd/svclock.c --- linux-2.6.18/fs/lockd/svclock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/svclock.c 2007-06-13 06:55:07.000000000 -0400 @@ -325,7 +325,7 @@ static int nlmsvc_setgrantargs(struct nl { locks_copy_lock(&call->a_args.lock.fl, &lock->fl); memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); - call->a_args.lock.caller = system_utsname.nodename; + call->a_args.lock.caller = utsname()->nodename; call->a_args.lock.oh.len = lock->oh.len; /* set default data area */ diff -uprN linux-2.6.18/fs/lockd/xdr.c linux-2.6.18.ovz/fs/lockd/xdr.c --- linux-2.6.18/fs/lockd/xdr.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/lockd/xdr.c 2007-06-13 06:55:07.000000000 -0400 @@ -515,7 +515,7 @@ nlmclt_decode_res(struct rpc_rqst *req, */ #define NLM_void_sz 0 #define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(system_utsname.nodename)) +#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename)) #define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) /* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */ #define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) diff -uprN linux-2.6.18/fs/locks.c linux-2.6.18.ovz/fs/locks.c --- linux-2.6.18/fs/locks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/locks.c 2007-06-13 06:55:07.000000000 -0400 @@ -129,6 +129,8 @@ #include #include +#include + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) @@ -145,9 +147,25 @@ static LIST_HEAD(blocked_list); static kmem_cache_t *filelock_cache __read_mostly; /* Allocate an empty lock structure. */ -static struct file_lock *locks_alloc_lock(void) +static struct file_lock *locks_alloc_lock(int charge) { - return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); + struct file_lock *fl; + + fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL); +#ifdef CONFIG_USER_RESOURCE + if (fl == NULL) + goto out; + fl->fl_charged = 0; + if (!charge) + goto out; + if (!ub_flock_charge(fl, 1)) + goto out; + + kmem_cache_free(filelock_cache, fl); + fl = NULL; +out: +#endif + return fl; } static void locks_release_private(struct file_lock *fl) @@ -172,6 +190,7 @@ static void locks_free_lock(struct file_ BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!list_empty(&fl->fl_link)); + ub_flock_uncharge(fl); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } @@ -277,7 +296,7 @@ static int flock_make_lock(struct file * if (type < 0) return type; - fl = locks_alloc_lock(); + fl = locks_alloc_lock(type != F_UNLCK); if (fl == NULL) return -ENOMEM; @@ -464,7 +483,7 @@ static int lease_init(struct file *filp, /* Allocate a file_lock initialised to this type of lease */ static int lease_alloc(struct file *filp, int type, struct file_lock **flp) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lock *fl = locks_alloc_lock(1); int error = -ENOMEM; if (fl == NULL) @@ -762,8 +781,15 @@ static int flock_lock_file(struct file * goto out; } + /* + * A non-F_UNLCK request must already be charged in + * flock_make_lock().
+ * + * Strictly speaking, it is new_fl that must be charged, not the + * request; charging the request simply lets us fail earlier. + */ error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(0); if (new_fl == NULL) goto out; /* @@ -789,6 +815,10 @@ find_conflict: } if (request->fl_flags & FL_ACCESS) goto out; + + set_flock_charged(new_fl); + unset_flock_charged(request); + locks_copy_lock(new_fl, request); locks_insert_lock(&inode->i_flock, new_fl); new_fl = NULL; @@ -820,8 +850,11 @@ static int __posix_lock_file_conf(struct if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); + if (request->fl_type != F_UNLCK) + new_fl = locks_alloc_lock(1); + else + new_fl = NULL; + new_fl2 = locks_alloc_lock(0); } lock_kernel(); @@ -955,7 +988,7 @@ static int __posix_lock_file_conf(struct * bail out. */ error = -ENOLCK; /* "no luck" */ - if (right && left == right && !new_fl2) + if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2)) goto out; error = 0; @@ -966,23 +999,32 @@ static int __posix_lock_file_conf(struct goto out; } - if (!new_fl) { - error = -ENOLCK; + error = -ENOLCK; + if (!new_fl) + goto out; + if (right && (left == right) && ub_flock_charge(new_fl, 1)) goto out; - } locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; + error = 0; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. */ + error = -ENOLCK; + if (added && ub_flock_charge(new_fl2, + request->fl_type != F_UNLCK)) + goto out; + /* FIXME move all fl_charged manipulations in ub code */ + set_flock_charged(new_fl2); left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock(before, left); + error = 0; } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1422,7 +1464,7 @@ static int __setlease(struct file *filp, goto out; error = -ENOMEM; - fl = locks_alloc_lock(); + fl = locks_alloc_lock(1); if (fl == NULL) goto out; @@ -1610,6 +1652,7 @@ asmlinkage long sys_flock(unsigned int f out: return error; } +EXPORT_SYMBOL_GPL(sys_flock); /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). @@ -1645,7 +1688,7 @@ int fcntl_getlk(struct file *filp, struc flock.l_type = F_UNLCK; if (fl != NULL) { - flock.l_pid = fl->fl_pid; + flock.l_pid = pid_to_vpid(fl->fl_pid); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1677,7 +1720,7 @@ out: int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock flock; struct inode *inode; int error; @@ -1799,7 +1842,7 @@ int fcntl_getlk64(struct file *filp, str flock.l_type = F_UNLCK; if (fl != NULL) { - flock.l_pid = fl->fl_pid; + flock.l_pid = pid_to_vpid(fl->fl_pid); flock.l_start = fl->fl_start; flock.l_len = fl->fl_end == OFFSET_MAX ?
0 : fl->fl_end - fl->fl_start + 1; @@ -1820,7 +1863,7 @@ out: int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock64 flock; struct inode *inode; int error; @@ -2018,7 +2061,9 @@ EXPORT_SYMBOL(posix_unblock_lock); static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) { struct inode *inode = NULL; + unsigned int fl_pid; + fl_pid = pid_to_vpid(fl->fl_pid); if (fl->fl_file != NULL) inode = fl->fl_file->f_dentry->d_inode; @@ -2060,16 +2105,16 @@ static void lock_get_status(char* out, s } if (inode) { #ifdef WE_CAN_BREAK_LSLK_NOW - out += sprintf(out, "%d %s:%ld ", fl->fl_pid, + out += sprintf(out, "%d %s:%ld ", fl_pid, inode->i_sb->s_id, inode->i_ino); #else /* userspace relies on this representation of dev_t ;-( */ - out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, + out += sprintf(out, "%d %02x:%02x:%ld ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); #endif } else { - out += sprintf(out, "%d :0 ", fl->fl_pid); + out += sprintf(out, "%d :0 ", fl_pid); } if (IS_POSIX(fl)) { if (fl->fl_end == OFFSET_MAX) @@ -2118,11 +2163,18 @@ int get_locks_status(char *buffer, char char *q = buffer; off_t pos = 0; int i = 0; + struct ve_struct *env; lock_kernel(); + env = get_exec_env(); list_for_each(tmp, &file_lock_list) { struct list_head *btmp; - struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); + struct file_lock *fl; + + fl = list_entry(tmp, struct file_lock, fl_link); + if (!ve_accessible(fl->fl_file->owner_env, env)) + continue; + lock_get_status(q, fl, ++i, ""); move_lock_status(&q, &pos, offset); @@ -2228,7 +2280,7 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, + sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_UBC, init_once, NULL); return 0; } diff -uprN linux-2.6.18/fs/mpage.c linux-2.6.18.ovz/fs/mpage.c --- linux-2.6.18/fs/mpage.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/mpage.c 2007-06-13 06:55:07.000000000 -0400 @@ -26,6 +26,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. 
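Stepping back from the fs/locks.c hunks above: every granted lock is now accounted to a user beancounter. locks_alloc_lock() takes a charge flag, F_UNLCK requests are never charged (they only release resources), and locks_free_lock() uncharges unconditionally. The following consolidated sketch restates that allocation path; the 0-on-success convention for ub_flock_charge() is inferred from the hunk, and alloc_charged_lock() itself is not a function the patch defines:

	/*
	 * Sketch of the charged allocation path, mirroring
	 * locks_alloc_lock(charge) above. ub_flock_charge() and
	 * ub_flock_uncharge() come from this patch's user-beancounter
	 * code (CONFIG_USER_RESOURCE).
	 */
	static struct file_lock *alloc_charged_lock(void)
	{
		struct file_lock *fl;

		fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
		if (fl == NULL)
			return NULL;
		fl->fl_charged = 0;
		if (ub_flock_charge(fl, 1) != 0) {	/* beancounter limit exceeded */
			kmem_cache_free(filelock_cache, fl);
			return NULL;
		}
		return fl;
	}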
@@ -699,6 +700,7 @@ mpage_writepages(struct address_space *m struct writeback_control *wbc, get_block_t get_block) { struct backing_dev_info *bdi = mapping->backing_dev_info; + struct user_beancounter *old_ub; struct bio *bio = NULL; sector_t last_block_in_bio = 0; int ret = 0; @@ -772,6 +774,8 @@ retry: continue; } + old_ub = bc_io_switch_context(page); + if (writepage) { ret = (*writepage)(page, wbc); if (ret) { @@ -787,6 +791,9 @@ retry: &last_block_in_bio, &ret, wbc, page->mapping->a_ops->writepage); } + + bc_io_restore_context(old_ub); + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) unlock_page(page); if (ret || (--(wbc->nr_to_write) <= 0)) diff -uprN linux-2.6.18/fs/namei.c linux-2.6.18.ovz/fs/namei.c --- linux-2.6.18/fs/namei.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/namei.c 2007-06-13 06:55:07.000000000 -0400 @@ -141,6 +141,7 @@ char * getname(const char __user * filen { char *tmp, *result; + ub_dentry_checkup(); result = ERR_PTR(-ENOMEM); tmp = __getname(); if (tmp) { @@ -386,6 +387,21 @@ static struct dentry * cached_lookup(str if (!dentry) dentry = d_lookup(parent, name); + /* + * The revalidation rules are simple: + * d_revalidate operation is called when we're about to use a cached + * dentry rather than call d_lookup. + * d_revalidate method may unhash the dentry itself or return FALSE, in + * which case if the dentry can be released d_lookup will be called. + * + * Additionally, by request of NFS people + * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) + * d_revalidate is called when `/', `.' or `..' are looked up. + * Since re-lookup is impossible on them, we introduce a hack and + * return an error in this case. + * + * 2003/02/19 SAW + */ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) { dput(dentry); @@ -448,6 +464,7 @@ static struct dentry * real_lookup(struc struct dentry * result; struct inode *dir = parent->d_inode; +repeat: mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created @@ -486,7 +503,7 @@ static struct dentry * real_lookup(struc if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); - result = ERR_PTR(-ENOENT); + goto repeat; } } return result; @@ -714,7 +731,14 @@ static __always_inline void follow_dotdo read_unlock(&current->fs->lock); break; } - read_unlock(&current->fs->lock); +#ifdef CONFIG_VE + if (nd->dentry == get_exec_env()->fs_root && + nd->mnt == get_exec_env()->fs_rootmnt) { + read_unlock(&current->fs->lock); + break; + } +#endif + read_unlock(&current->fs->lock); spin_lock(&dcache_lock); if (nd->dentry != nd->mnt->mnt_root) { nd->dentry = dget(nd->dentry->d_parent); @@ -755,6 +779,10 @@ static int do_lookup(struct nameidata *n if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: + if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; __follow_mount(path); @@ -790,6 +818,7 @@ static fastcall int __link_path_walk(con { struct path next; struct inode *inode; + int real_components = 0; int err; unsigned int lookup_flags = nd->flags; @@ -861,6 +890,7 @@ static fastcall int __link_path_walk(con break; } /* This does the actual lookups..
*/ + real_components++; err = do_lookup(nd, &this, &next); if (err) break; @@ -874,6 +904,9 @@ static fastcall int __link_path_walk(con goto out_dput; if (inode->i_op->follow_link) { + err = -ENOENT; + if (lookup_flags & LOOKUP_STRICT) + goto out_dput; err = do_follow_link(&next, nd); if (err) goto return_err; @@ -921,6 +954,7 @@ last_component: break; inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) + && !(lookup_flags & LOOKUP_STRICT) && inode && inode->i_op && inode->i_op->follow_link) { err = do_follow_link(&next, nd); if (err) @@ -942,26 +976,40 @@ lookup_parent: nd->last_type = LAST_NORM; if (this.name[0] != '.') goto return_base; - if (this.len == 1) + if (this.len == 1) { nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') + goto return_reval; + } else if (this.len == 2 && this.name[1] == '.') { nd->last_type = LAST_DOTDOT; - else - goto return_base; + goto return_reval; + } +return_base: + if (!(nd->flags & LOOKUP_NOAREACHECK)) { + err = check_area_access_ve(nd->dentry, nd->mnt); + if (err) + break; + } + return 0; return_reval: /* * We bypassed the ordinary revalidation routines. * We may need to check the cached dentry for staleness. */ - if (nd->dentry && nd->dentry->d_sb && + if (!real_components && nd->dentry && nd->dentry->d_sb && (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) + /* + * This lookup is for `/' or `.' or `..'. + * The filesystem unhashed the dentry itself + * inside d_revalidate (otherwise, d_invalidate + * wouldn't succeed). As a special courtesy to + * NFS we return an error. 2003/02/19 SAW + */ break; } -return_base: - return 0; + goto return_base; out_dput: dput_path(&next, nd); break; @@ -1879,6 +1927,7 @@ asmlinkage long sys_mknod(const char __u { return sys_mknodat(AT_FDCWD, filename, mode, dev); } +EXPORT_SYMBOL_GPL(sys_mknod); int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { @@ -1937,6 +1986,7 @@ asmlinkage long sys_mkdir(const char __u { return sys_mkdirat(AT_FDCWD, pathname, mode); } +EXPORT_SYMBOL_GPL(sys_mkdir); /* * We try to drop the dentry early: we should have @@ -1965,6 +2015,7 @@ void dentry_unhash(struct dentry *dentry spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(sys_symlink); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -2044,6 +2095,7 @@ asmlinkage long sys_rmdir(const char __u { return do_rmdir(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_rmdir); int vfs_unlink(struct inode *dir, struct dentry *dentry) { @@ -2143,6 +2195,7 @@ asmlinkage long sys_unlink(const char __ { return do_unlinkat(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_unlink); int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) { @@ -2299,6 +2352,7 @@ asmlinkage long sys_link(const char __us { return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +EXPORT_SYMBOL(sys_rename); /* * The worst of all namespace operations - renaming directory. 
"Perverted" @@ -2410,6 +2464,9 @@ int vfs_rename(struct inode *old_dir, st int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); const char *old_name; + if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) + return -EXDEV; + if (old_dentry->d_inode == new_dentry->d_inode) return 0; diff -uprN linux-2.6.18/fs/namespace.c linux-2.6.18.ovz/fs/namespace.c --- linux-2.6.18/fs/namespace.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/namespace.c 2007-06-13 06:55:07.000000000 -0400 @@ -39,13 +39,15 @@ static inline int sysfs_init(void) /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +EXPORT_SYMBOL(vfsmount_lock); static int event; static struct list_head *mount_hashtable __read_mostly; static int hash_mask __read_mostly, hash_bits __read_mostly; static kmem_cache_t *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +struct rw_semaphore namespace_sem; +EXPORT_SYMBOL(namespace_sem); /* /sys/fs */ decl_subsys(fs, NULL, NULL); @@ -64,6 +66,7 @@ struct vfsmount *alloc_vfsmnt(const char struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); if (mnt) { memset(mnt, 0, sizeof(struct vfsmount)); + mnt->owner = VEID(get_exec_env()); atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -75,7 +78,7 @@ struct vfsmount *alloc_vfsmnt(const char INIT_LIST_HEAD(&mnt->mnt_slave); if (name) { int size = strlen(name) + 1; - char *newname = kmalloc(size, GFP_KERNEL); + char *newname = kmalloc(size, GFP_KERNEL_UBC); if (newname) { memcpy(newname, name, size); mnt->mnt_devname = newname; @@ -141,7 +144,7 @@ struct vfsmount *lookup_mnt(struct vfsmo static inline int check_mnt(struct vfsmount *mnt) { - return mnt->mnt_namespace == current->namespace; + return mnt->mnt_namespace == current->nsproxy->namespace; } static void touch_namespace(struct namespace *ns) @@ -357,10 +360,33 @@ static inline void mangle(struct seq_fil seq_escape(m, s, " \t\n\\"); } +static int prepare_mnt_root_mangle(struct vfsmount *mnt, + char **path_buf, char **path) +{ + /* skip FS_NOMOUNT mounts (rootfs) */ + if (mnt->mnt_sb->s_flags & MS_NOUSER) + return -EACCES; + + *path_buf = (char *)__get_free_page(GFP_KERNEL); + if (!*path_buf) + return -ENOMEM; + + *path = d_path(mnt->mnt_root, mnt, *path_buf, PAGE_SIZE); + if (IS_ERR(*path)) { + free_page((unsigned long)*path_buf); + /* + * This means that the file position will be incremented, i.e. + * the total number of "invisible" vfsmnt will leak. + */ + return -EACCES; + } + return 0; +} + static int show_vfsmnt(struct seq_file *m, void *v) { struct vfsmount *mnt = v; - int err = 0; + int err; static struct proc_fs_info { int flag; char *str; @@ -379,10 +405,19 @@ static int show_vfsmnt(struct seq_file * { 0, NULL } }; struct proc_fs_info *fs_infop; + char *path_buf, *path; + + err = prepare_mnt_root_mangle(mnt, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + if (ve_is_super(get_exec_env())) + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + else + mangle(m, mnt->mnt_sb->s_type->name); seq_putc(m, ' '); - seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + mangle(m, path); + free_page((unsigned long) path_buf); seq_putc(m, ' '); mangle(m, mnt->mnt_sb->s_type->name); seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); @@ -410,18 +445,27 @@ struct seq_operations mounts_op = { static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = v; - int err = 0; + char *path_buf, *path; + int err; + + err = prepare_mnt_root_mangle(mnt, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); /* device */ if (mnt->mnt_devname) { seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); + if (ve_is_super(get_exec_env())) + mangle(m, mnt->mnt_devname); + else + mangle(m, mnt->mnt_sb->s_type->name); } else seq_puts(m, "no device"); /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + mangle(m, path); + free_page((unsigned long)path_buf); seq_putc(m, ' '); /* file system type */ @@ -520,6 +564,7 @@ void release_mounts(struct list_head *he mntput(mnt); } } +EXPORT_SYMBOL(release_mounts); void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { @@ -542,6 +587,7 @@ void umount_tree(struct vfsmount *mnt, i change_mnt_propagation(p, MS_PRIVATE); } } +EXPORT_SYMBOL(umount_tree); static int do_umount(struct vfsmount *mnt, int flags) { @@ -629,6 +675,34 @@ static int do_umount(struct vfsmount *mn return retval; } +#ifdef CONFIG_VE +void umount_ve_fs_type(struct file_system_type *local_fs_type) +{ + struct vfsmount *mnt; + struct list_head *p, *q; + LIST_HEAD(kill); + LIST_HEAD(umount_list); + + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); + list_for_each_safe(p, q, ¤t->nsproxy->namespace->list) { + mnt = list_entry(p, struct vfsmount, mnt_list); + if (mnt->mnt_sb->s_type != local_fs_type) + continue; + list_del(p); + list_add(p, &kill); + } + + while (!list_empty(&kill)) { + mnt = list_entry(kill.next, struct vfsmount, mnt_list); + umount_tree(mnt, 1, &umount_list); + } + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} +#endif + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -652,7 +726,7 @@ asmlinkage long sys_umount(char __user * goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) goto dput_and_out; retval = do_umount(nd.mnt, flags); @@ -676,7 +750,7 @@ asmlinkage long sys_oldumount(char __use static int mount_is_safe(struct nameidata *nd) { - if (capable(CAP_SYS_ADMIN)) + if (capable(CAP_VE_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -838,7 +912,7 @@ static int attach_recursive_mnt(struct v if (parent_nd) { detach_mnt(source_mnt, parent_nd); attach_mnt(source_mnt, nd); - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -892,6 +966,8 @@ static int do_change_type(struct nameida if (nd->dentry != nd->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) + return -EPERM; down_write(&namespace_sem); spin_lock(&vfsmount_lock); @@ -905,7 +981,8 @@ static int do_change_type(struct nameida /* * do loopback mount. 
*/ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, int recurse, + int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; @@ -935,6 +1012,7 @@ static int do_loopback(struct nameidata if (!mnt) goto out; + mnt->mnt_flags |= mnt_flags; err = graft_tree(mnt, nd); if (err) { LIST_HEAD(umount_list); @@ -960,8 +1038,9 @@ static int do_remount(struct nameidata * { int err; struct super_block *sb = nd->mnt->mnt_sb; + int bind; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!check_mnt(nd->mnt)) @@ -970,12 +1049,23 @@ static int do_remount(struct nameidata * if (nd->dentry != nd->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) + return -EPERM; + + /* do not allow to remount bind-mounts with another mountpoint flags */ + bind = 0; + if (nd->dentry != sb->s_root) { + if ((flags & ~(MS_BIND|MS_POSIXACL|MS_NOUSER)) != 0) + return -EINVAL; + bind = 1; + } + down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); + err = bind ? 0 : do_remount_sb(sb, flags, data, 0); if (!err) nd->mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); - if (!err) + if (!err && !bind) security_sb_post_remount(nd->mnt, flags, data); return err; } @@ -995,7 +1085,7 @@ static int do_move_mount(struct nameidat struct nameidata old_nd, parent_nd; struct vfsmount *p; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1003,6 +1093,10 @@ static int do_move_mount(struct nameidat if (err) return err; + err = -EPERM; + if (!ve_accessible_veid(old_nd.mnt->owner, get_exec_env()->veid)) + goto out_nosem; + down_write(&namespace_sem); while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) ; @@ -1058,6 +1152,7 @@ out: up_write(&namespace_sem); if (!err) path_release(&parent_nd); +out_nosem: path_release(&old_nd); return err; } @@ -1075,7 +1170,7 @@ static int do_new_mount(struct nameidata return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; mnt = do_kern_mount(type, flags, name, data); @@ -1113,6 +1208,11 @@ int do_add_mount(struct vfsmount *newmnt goto unlock; newmnt->mnt_flags = mnt_flags; + + /* make this before graft_tree reveals mnt_root to the world... 
*/ + if (nd->dentry->d_flags & DCACHE_VIRTUAL) + newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; + if ((err = graft_tree(newmnt, nd))) goto unlock; @@ -1430,7 +1530,7 @@ long do_mount(char *dev_name, char *dir_ retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags & MS_REC, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) @@ -1449,7 +1549,7 @@ dput_out: */ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct vfsmount *p, *q; @@ -1516,7 +1616,7 @@ struct namespace *dup_namespace(struct t int copy_namespace(int flags, struct task_struct *tsk) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; int err = 0; @@ -1539,7 +1639,7 @@ int copy_namespace(int flags, struct tas goto out; } - tsk->namespace = new_ns; + tsk->nsproxy->namespace = new_ns; out: put_namespace(namespace); @@ -1587,6 +1687,7 @@ out1: free_page(type_page); return retval; } +EXPORT_SYMBOL_GPL(sys_mount); /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. @@ -1638,7 +1739,7 @@ static void chroot_fs_refs(struct nameid struct fs_struct *fs; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_ve(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -1653,7 +1754,7 @@ static void chroot_fs_refs(struct nameid put_fs_struct(fs); } else task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); read_unlock(&tasklist_lock); } @@ -1762,7 +1863,7 @@ asmlinkage long sys_pivot_root(const cha detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1788,7 +1889,6 @@ static void __init init_mount_tree(void) { struct vfsmount *mnt; struct namespace *namespace; - struct task_struct *g, *p; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) @@ -1804,13 +1904,8 @@ static void __init init_mount_tree(void) namespace->root = mnt; mnt->mnt_namespace = namespace; - init_task.namespace = namespace; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - get_namespace(namespace); - p->namespace = namespace; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + init_task.nsproxy->namespace = namespace; + get_namespace(namespace); set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); @@ -1825,7 +1920,8 @@ void __init mnt_init(unsigned long mempa init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, + NULL, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); @@ -1881,3 +1977,4 @@ void __put_namespace(struct namespace *n release_mounts(&umount_list); kfree(namespace); } 
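The fs/namespace.c hunks above all follow one pattern: a vfsmount records the VEID of the environment that created it (mnt->owner, set in alloc_vfsmnt()), and every mutating mount operation replaces the plain CAP_SYS_ADMIN test with CAP_VE_SYS_ADMIN plus an ownership check, so a container may only umount, remount, or move mounts it created itself. A sketch of the combined test; may_modify_mnt() is an illustrative name, not a helper the patch defines:

	/*
	 * Sketch: the permission pattern used by do_umount(),
	 * do_remount(), do_change_type() and do_move_mount() in the
	 * hunks above; ve_accessible_veid() is the patch's helper.
	 */
	static int may_modify_mnt(struct vfsmount *mnt)
	{
		if (!capable(CAP_VE_SYS_ADMIN))
			return -EPERM;	/* not even admin inside the VE */
		if (!ve_accessible_veid(mnt->owner, get_exec_env()->veid))
			return -EPERM;	/* mount belongs to another VE */
		return 0;
	}

Keeping the VEID rather than a ve_struct pointer on the vfsmount avoids pinning the owning environment for the lifetime of the mount.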
+EXPORT_SYMBOL_GPL(__put_namespace); diff -uprN linux-2.6.18/fs/nfs/dir.c linux-2.6.18.ovz/fs/nfs/dir.c --- linux-2.6.18/fs/nfs/dir.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/dir.c 2007-06-13 06:55:07.000000000 -0400 @@ -902,9 +902,15 @@ static struct dentry *nfs_lookup(struct lock_kernel(); - /* If we're doing an exclusive create, optimize away the lookup */ - if (nfs_is_exclusive_create(dir, nd)) - goto no_entry; + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, nd)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out_unlock; + } error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error == -ENOENT) @@ -1156,6 +1162,8 @@ int nfs_instantiate(struct dentry *dentr if (IS_ERR(inode)) goto out_err; d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); return 0; out_err: d_drop(dentry); diff -uprN linux-2.6.18/fs/nfs/direct.c linux-2.6.18.ovz/fs/nfs/direct.c --- linux-2.6.18/fs/nfs/direct.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/direct.c 2007-06-13 06:55:07.000000000 -0400 @@ -122,9 +122,9 @@ ssize_t nfs_direct_IO(int rw, struct kio return -EINVAL; } -static void nfs_direct_dirty_pages(struct page **pages, int npages) +static void nfs_direct_dirty_pages(struct page **pages, unsigned int npages) { - int i; + unsigned int i; for (i = 0; i < npages; i++) { struct page *page = pages[i]; if (!PageCompound(page)) @@ -132,9 +132,9 @@ static void nfs_direct_dirty_pages(struc } } -static void nfs_direct_release_pages(struct page **pages, int npages) +static void nfs_direct_release_pages(struct page **pages, unsigned int npages) { - int i; + unsigned int i; for (i = 0; i < npages; i++) page_cache_release(pages[i]); } @@ -162,7 +162,7 @@ static inline struct nfs_direct_req *nfs return dreq; } -static void nfs_direct_req_release(struct kref *kref) +static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); @@ -171,6 +171,11 @@ static void nfs_direct_req_release(struc kmem_cache_free(nfs_direct_cachep, dreq); } +static void nfs_direct_req_release(struct nfs_direct_req *dreq) +{ + kref_put(&dreq->kref, nfs_direct_req_free); +} + /* * Collects and returns the final error value/byte-count. 
*/ @@ -190,7 +195,6 @@ static ssize_t nfs_direct_wait(struct nf result = dreq->count; out: - kref_put(&dreq->kref, nfs_direct_req_release); return (ssize_t) result; } @@ -208,7 +212,7 @@ static void nfs_direct_complete(struct n } complete_all(&dreq->completion); - kref_put(&dreq->kref, nfs_direct_req_release); + nfs_direct_req_release(dreq); } /* @@ -279,7 +283,7 @@ static ssize_t nfs_direct_read_schedule( result = get_user_pages(current, current->mm, user_addr, data->npages, 1, 0, data->pagevec, NULL); up_read(&current->mm->mmap_sem); - if (unlikely(result < data->npages)) { + if (result < 0 || result < data->npages) { if (result > 0) nfs_direct_release_pages(data->pagevec, result); nfs_readdata_release(data); @@ -360,6 +364,7 @@ static ssize_t nfs_direct_read(struct ki if (!result) result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); + nfs_direct_req_release(dreq); return result; } @@ -606,7 +611,7 @@ static ssize_t nfs_direct_write_schedule result = get_user_pages(current, current->mm, user_addr, data->npages, 0, 0, data->pagevec, NULL); up_read(&current->mm->mmap_sem); - if (unlikely(result < data->npages)) { + if (result < 0 || result < data->npages) { if (result > 0) nfs_direct_release_pages(data->pagevec, result); nfs_writedata_release(data); @@ -700,6 +705,7 @@ static ssize_t nfs_direct_write(struct k if (!result) result = nfs_direct_wait(dreq); rpc_clnt_sigunmask(clnt, &oldset); + nfs_direct_req_release(dreq); return result; } diff -uprN linux-2.6.18/fs/nfs/file.c linux-2.6.18.ovz/fs/nfs/file.c --- linux-2.6.18/fs/nfs/file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/file.c 2007-06-13 06:55:07.000000000 -0400 @@ -312,13 +312,13 @@ static void nfs_invalidate_page(struct p static int nfs_release_page(struct page *page, gfp_t gfp) { - if (gfp & __GFP_FS) - return !nfs_wb_page(page->mapping->host, page); - else - /* - * Avoid deadlock on nfs_wait_on_request().
- */ - return 0; + /* If PagePrivate() is set, then the page is not freeable */ + return 0; +} + +static int nfs_launder_page(struct page *page) +{ + return nfs_wb_page(page->mapping->host, page); } const struct address_space_operations nfs_file_aops = { @@ -334,6 +334,7 @@ const struct address_space_operations nf #ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, #endif + .launder_page = nfs_launder_page, }; /* @@ -426,8 +427,9 @@ static int do_vfs_lock(struct file *file BUG(); } if (res < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", - __FUNCTION__); + dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" + " - error %d!\n", + __FUNCTION__, res); return res; } diff -uprN linux-2.6.18/fs/nfs/inode.c linux-2.6.18.ovz/fs/nfs/inode.c --- linux-2.6.18/fs/nfs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -1144,6 +1144,10 @@ static int __init init_nfs_fs(void) { int err; + err = rpciod_up(); + if (err) + goto out5; + err = nfs_init_nfspagecache(); if (err) goto out4; @@ -1184,11 +1188,15 @@ out2: out3: nfs_destroy_nfspagecache(); out4: + rpciod_down(); +out5: return err; } static void __exit exit_nfs_fs(void) { + rpciod_down(); + nfs_destroy_directcache(); nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); diff -uprN linux-2.6.18/fs/nfs/nfsroot.c linux-2.6.18.ovz/fs/nfs/nfsroot.c --- linux-2.6.18/fs/nfs/nfsroot.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/nfsroot.c 2007-06-13 06:55:07.000000000 -0400 @@ -312,7 +312,7 @@ static int __init root_nfs_name(char *na /* Override them by options set on kernel command-line */ root_nfs_parse(name, buf); - cp = system_utsname.nodename; + cp = utsname()->nodename; if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); return -1; diff -uprN linux-2.6.18/fs/nfs/super.c linux-2.6.18.ovz/fs/nfs/super.c --- linux-2.6.18/fs/nfs/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfs/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -40,6 +40,9 @@ #include #include #include +#include +#include +#include #include #include @@ -120,7 +123,7 @@ static struct file_system_type nfs_fs_ty .name = "nfs", .get_sb = nfs_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; struct file_system_type clone_nfs_fs_type = { @@ -128,7 +131,8 @@ struct file_system_type clone_nfs_fs_typ .name = "nfs", .get_sb = nfs_clone_nfs_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; static struct super_operations nfs_sops = { @@ -221,6 +225,74 @@ module_param_call(idmap_cache_timeout, p &nfs_idmap_cache_timeout, 0644); #endif +#ifdef CONFIG_VE +static int ve_nfs_start(void *data) +{ + int err; + struct ve_struct *ve; + struct ve_nfs_context *ctx; + + ve = (struct ve_struct *)data; + if (!(ve->features & VE_FEATURE_NFS)) + return 0; + ctx = kzalloc(sizeof(struct ve_nfs_context), GFP_KERNEL); + if (ctx == NULL) + return -ENOMEM; + + err = register_ve_fs_type(ve, &nfs_fs_type, &ctx->fstype, NULL); + if (err < 0) + goto fail; + ve->nfs_context = ctx; + return 0; + +fail: + kfree(ctx); + return err; +} + +static void ve_nfs_stop(void *data) +{ + struct ve_struct *ve; + struct super_block *sb; + + ve = (struct ve_struct *)data; + if (ve->nfs_context 
== NULL) + return; + + /* Basically, on a valid stop we can be here iff NFS was mounted + read-only. In such a case client force-stop is not a problem. + If we are here and NFS is read-write, we are in a FORCE stop, so + force the client to stop. + Lock daemon is already dead. + Only superblock client remains. Den */ + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + struct rpc_clnt *clnt; + if (sb->s_type != ve->nfs_context->fstype) + continue; + clnt = NFS_SB(sb)->client; + clnt->cl_dead = 1; + clnt->cl_xprt->ops->close(clnt->cl_xprt); + rpc_killall_tasks(clnt); + } + spin_unlock(&sb_lock); + + unregister_ve_fs_type(ve->nfs_context->fstype, NULL); + kfree(ve->nfs_context->fstype); + kfree(ve->nfs_context); + + ve->nfs_context = NULL; +} + +static struct ve_hook nfs_hook = { + .init = ve_nfs_start, + .fini = ve_nfs_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + /* * Register the NFS filesystems */ @@ -240,6 +312,7 @@ int __init register_nfs_fs(void) if (ret < 0) goto error_2; #endif + ve_hook_register(VE_SS_CHAIN, &nfs_hook); return 0; #ifdef CONFIG_NFS_V4 @@ -257,6 +330,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { + ve_hook_unregister(&nfs_hook); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); nfs_unregister_sysctl(); @@ -514,11 +588,15 @@ static void nfs_umount_begin(struct vfsm /* -EIO all pending I/O */ server = NFS_SB(vfsmnt->mnt_sb); rpc = server->client; - if (!IS_ERR(rpc)) + if (!IS_ERR(rpc)) { + rpc->cl_dead = 1; rpc_killall_tasks(rpc); + } rpc = server->client_acl; - if (!IS_ERR(rpc)) + if (!IS_ERR(rpc)) { + rpc->cl_dead = 1; rpc_killall_tasks(rpc); + } } /* diff -uprN linux-2.6.18/fs/nfsd/nfs2acl.c linux-2.6.18.ovz/fs/nfsd/nfs2acl.c --- linux-2.6.18/fs/nfsd/nfs2acl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/nfsd/nfs2acl.c 2007-06-13 06:55:07.000000000 -0400 @@ -287,13 +287,20 @@ static int nfsaclsvc_release_getacl(stru return 1; } -static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p, - struct nfsd_fhandle *resp) +static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, u32 *p, + struct nfsd_attrstat *resp) { fh_put(&resp->fh); return 1; } +static int nfsaclsvc_release_access(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_accessres *resp) +{ + fh_put(&resp->fh); + return 1; +} + #define nfsaclsvc_decode_voidargs NULL #define nfsaclsvc_encode_voidres NULL #define nfsaclsvc_release_void NULL @@ -322,9 +329,9 @@ struct nfsd3_voidargs { int dummy; }; static struct svc_procedure nfsd_acl_procedures2[] = { PROC(null, void, void, void, RC_NOCACHE, ST), PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, fhandle, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), - PROC(access, access, access, fhandle, RC_NOCACHE, ST+AT+1), + PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), }; struct svc_version nfsd_acl_version2 = { diff -uprN linux-2.6.18/fs/ntfs/super.c linux-2.6.18.ovz/fs/ntfs/super.c --- linux-2.6.18/fs/ntfs/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/ntfs/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -3076,7 +3076,7 @@ iput_tmp_ino_err_out_now: * method again... FIXME: Do we need to do this twice now because of * attribute inodes? I think not, so leave as is for now... 
(AIA) */ - if (invalidate_inodes(sb)) { + if (invalidate_inodes(sb, 0)) { ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " "driver bug."); /* Copied from fs/super.c. I just love this message. (-; */ diff -uprN linux-2.6.18/fs/open.c linux-2.6.18.ovz/fs/open.c --- linux-2.6.18/fs/open.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/open.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,21 @@ int vfs_statfs(struct dentry *dentry, st EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +int faudit_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct faudit_statfs_arg arg; + + arg.sb = sb; + arg.stat = buf; + + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) + != NOTIFY_DONE) + return arg.err; + return 0; +} + +static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt, + struct statfs *buf) { struct kstatfs st; int retval; @@ -61,6 +76,10 @@ static int vfs_statfs_native(struct dent if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -95,7 +114,8 @@ static int vfs_statfs_native(struct dent return 0; } -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt, + struct statfs64 *buf) { struct kstatfs st; int retval; @@ -104,6 +124,10 @@ static int vfs_statfs64(struct dentry *d if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -130,7 +154,7 @@ asmlinkage long sys_statfs(const char __ error = user_path_walk(path, &nd); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.dentry, &tmp); + error = vfs_statfs_native(nd.dentry, nd.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -149,7 +173,7 @@ asmlinkage long sys_statfs64(const char error = user_path_walk(path, &nd); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.dentry, &tmp); + error = vfs_statfs64(nd.dentry, nd.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -168,7 +192,7 @@ asmlinkage long sys_fstatfs(unsigned int file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_dentry, &tmp); + error = vfs_statfs_native(file->f_dentry, file->f_vfsmnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -189,7 +213,7 @@ asmlinkage long sys_fstatfs64(unsigned i file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_dentry, &tmp); + error = vfs_statfs64(file->f_dentry, file->f_vfsmnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -368,52 +392,16 @@ asmlinkage long sys_ftruncate64(unsigned */ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times) { - int error; - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; - - error = user_path_walk(filename, &nd); - if (error) - goto out; - inode = nd.dentry->d_inode; - - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; + struct timeval tv[2]; - /* Don't worry, the checks are done in inode_change_ok() */ - newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { - error = 
-EPERM; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - goto dput_and_out; - - error = get_user(newattrs.ia_atime.tv_sec, &times->actime); - newattrs.ia_atime.tv_nsec = 0; - if (!error) - error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime); - newattrs.ia_mtime.tv_nsec = 0; - if (error) - goto dput_and_out; - - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; - } else { - error = -EACCES; - if (IS_IMMUTABLE(inode)) - goto dput_and_out; - - if (current->fsuid != inode->i_uid && - (error = vfs_permission(&nd, MAY_WRITE)) != 0) - goto dput_and_out; + if (get_user(tv[0].tv_sec, &times->actime) || + get_user(tv[1].tv_sec, &times->modtime)) + return -EFAULT; + tv[0].tv_usec = 0; + tv[1].tv_usec = 0; } - mutex_lock(&inode->i_mutex); - error = notify_change(nd.dentry, &newattrs); - mutex_unlock(&inode->i_mutex); -dput_and_out: - path_release(&nd); -out: - return error; + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); } #endif @@ -422,14 +410,19 @@ out: * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -long do_utimes(int dfd, char __user *filename, struct timeval *times) +long do_utimes(int dfd, char __user *filename, struct timeval *times, int flags) { - int error; + int error = -EINVAL; struct nameidata nd; struct inode * inode; struct iattr newattrs; + int follow; - error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + follow = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + error = __user_walk_fd(dfd, filename, follow, &nd); if (error) goto out; @@ -475,7 +468,7 @@ asmlinkage long sys_futimesat(int dfd, c if (utimes && copy_from_user(&times, utimes, sizeof(times))) return -EFAULT; - return do_utimes(dfd, filename, utimes ? times : NULL); + return do_utimes(dfd, filename, utimes ? times : NULL, 0); } asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes) @@ -655,15 +648,20 @@ out: return err; } -asmlinkage long sys_fchmodat(int dfd, const char __user *filename, - mode_t mode) +static long do_fchmodat(int dfd, const char __user *filename, mode_t mode, + int flags) { struct nameidata nd; struct inode * inode; - int error; + int error = -EINVAL; struct iattr newattrs; + int follow; - error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + follow = (flags & AT_SYMLINK_NOFOLLOW) ?
0 : LOOKUP_FOLLOW; + error = __user_walk_fd(dfd, filename, follow, &nd); if (error) goto out; inode = nd.dentry->d_inode; @@ -690,6 +688,12 @@ out: return error; } +asmlinkage long sys_fchmodat(int dfd, const char __user *filename, + mode_t mode) +{ + return do_fchmodat(dfd, filename, mode, 0); +} + asmlinkage long sys_chmod(const char __user *filename, mode_t mode) { return sys_fchmodat(AT_FDCWD, filename, mode); @@ -742,6 +746,7 @@ asmlinkage long sys_chown(const char __u } return error; } +EXPORT_SYMBOL_GPL(sys_chown); asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) @@ -1232,3 +1237,23 @@ int nonseekable_open(struct inode *inode } EXPORT_SYMBOL(nonseekable_open); + +long sys_lchmod(char __user * filename, mode_t mode) +{ + return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW); +} + +long sys_lutime(char __user * filename, + struct utimbuf __user * times) +{ + struct timeval tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, &times->actime) || + get_user(tv[1].tv_sec, &times->modtime)) + return -EFAULT; + tv[0].tv_usec = 0; + tv[1].tv_usec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW); +} diff -uprN linux-2.6.18/fs/partitions/check.c linux-2.6.18.ovz/fs/partitions/check.c --- linux-2.6.18/fs/partitions/check.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/partitions/check.c 2007-06-13 06:55:07.000000000 -0400 @@ -126,6 +126,7 @@ char *disk_name(struct gendisk *hd, int return buf; } +EXPORT_SYMBOL(disk_name); const char *bdevname(struct block_device *bdev, char *buf) { diff -uprN linux-2.6.18/fs/pipe.c linux-2.6.18.ovz/fs/pipe.c --- linux-2.6.18/fs/pipe.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/pipe.c 2007-06-13 06:55:07.000000000 -0400 @@ -20,6 +20,8 @@ #include #include +#include + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -423,7 +425,7 @@ redo1: int error, atomic = 1; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_UBC); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } @@ -812,7 +814,7 @@ struct pipe_inode_info * alloc_pipe_info { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC); if (pipe) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; @@ -972,6 +974,7 @@ close_f1: no_files: return error; } +EXPORT_SYMBOL_GPL(do_pipe); /* * pipefs should _never_ be mounted by userland - too much of security hassle, diff -uprN linux-2.6.18/fs/proc/array.c linux-2.6.18.ovz/fs/proc/array.c --- linux-2.6.18/fs/proc/array.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/array.c 2007-06-13 06:55:07.000000000 -0400 @@ -75,6 +75,9 @@ #include #include #include +#include + +#include #include #include @@ -161,8 +164,14 @@ static inline char * task_state(struct t struct group_info *group_info; int g; struct fdtable *fdt = NULL; + pid_t pid, ppid, tgid, vpid; + + pid = get_task_pid(p); + tgid = get_task_tgid(p); read_lock(&tasklist_lock); + ppid = get_task_ppid(p); + vpid = (pid_alive(p) ?
virt_pid(p) : 0); buffer += sprintf(buffer, "State:\t%s\n" "SleepAVG:\t%lu%%\n" @@ -170,13 +179,19 @@ static inline char * task_state(struct t "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" +#ifdef CONFIG_FAIRSCHED + "FNid:\t%d\n" +#endif "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, - p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, - pid_alive(p) && p->ptrace ? p->parent->pid : 0, + tgid, + pid, ppid, + pid_alive(p) && p->ptrace ? get_task_pid(p->parent) : 0, +#ifdef CONFIG_FAIRSCHED + task_fairsched_node_id(p), +#endif p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); read_unlock(&tasklist_lock); @@ -199,6 +214,16 @@ static inline char * task_state(struct t put_group_info(group_info); buffer += sprintf(buffer, "\n"); + +#ifdef CONFIG_VE + buffer += sprintf(buffer, + "envID:\t%d\n" + "VPid:\t%d\n" + "PNState:\t%u\n" + "StopState:\t%u\n", + VE_TASK_INFO(p)->owner_env->veid, + vpid, p->pn_state, p->stopped_state); +#endif return buffer; } @@ -244,7 +269,7 @@ static void collect_sigign_sigcatch(stru static inline char * task_sig(struct task_struct *p, char *buffer) { - sigset_t pending, shpending, blocked, ignored, caught; + sigset_t pending, shpending, blocked, ignored, caught, saved; int num_threads = 0; unsigned long qsize = 0; unsigned long qlim = 0; @@ -254,6 +279,7 @@ static inline char * task_sig(struct tas sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); + sigemptyset(&saved); /* Gather all the data with the appropriate locks held */ read_lock(&tasklist_lock); @@ -262,6 +288,7 @@ static inline char * task_sig(struct tas pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; + saved = p->saved_sigmask; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&p->user->sigpending); @@ -279,6 +306,7 @@ static inline char * task_sig(struct tas buffer = render_sigset_t("SigBlk:\t", &blocked, buffer); buffer = render_sigset_t("SigIgn:\t", &ignored, buffer); buffer = render_sigset_t("SigCgt:\t", &caught, buffer); + buffer = render_sigset_t("SigSvd:\t", &saved, buffer); return buffer; } @@ -293,10 +321,27 @@ static inline char *task_cap(struct task cap_t(p->cap_effective)); } +#ifdef CONFIG_USER_RESOURCE +static inline void ub_dump_task_info(struct task_struct *tsk, + char *stsk, int ltsk, char *smm, int lmm) +{ + print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); + task_lock(tsk); + if (tsk->mm) + print_ub_uid(tsk->mm->mm_ub, smm, lmm); + else + strncpy(smm, "N/A", lmm); + task_unlock(tsk); +} +#endif + int proc_pid_status(struct task_struct *task, char * buffer) { char * orig = buffer; struct mm_struct *mm = get_task_mm(task); +#ifdef CONFIG_USER_RESOURCE + char tsk_ub_info[64], mm_ub_info[64]; +#endif buffer = task_name(task, buffer); buffer = task_state(task, buffer); @@ -311,6 +356,14 @@ int proc_pid_status(struct task_struct * #if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif +#ifdef CONFIG_USER_RESOURCE + ub_dump_task_info(task, + tsk_ub_info, sizeof(tsk_ub_info), + mm_ub_info, sizeof(mm_ub_info)); + + buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info); + buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info); +#endif return buffer - orig; } @@ -332,6 +385,10 @@ static int do_task_stat(struct task_stru unsigned long rsslim = 0; struct task_struct *t; char tcomm[sizeof(task->comm)]; +#ifdef CONFIG_USER_RESOURCE + char ub_task_info[64]; + 
char ub_mm_info[64]; +#endif state = *get_task_state(task); vsize = eip = esp = 0; @@ -369,11 +426,11 @@ static int do_task_stat(struct task_stru } if (task->signal) { if (task->signal->tty) { - tty_pgrp = task->signal->tty->pgrp; + tty_pgrp = pid_to_vpid(task->signal->tty->pgrp); tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); } - pgid = process_group(task); - sid = task->signal->session; + pgid = get_task_pgid(task); + sid = get_task_sid(task); cmin_flt = task->signal->cmin_flt; cmaj_flt = task->signal->cmaj_flt; cutime = task->signal->cutime; @@ -386,7 +443,7 @@ static int do_task_stat(struct task_stru stime = cputime_add(stime, task->signal->stime); } } - ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; + ppid = get_task_ppid(task); read_unlock(&tasklist_lock); if (!whole || num_threads<2) @@ -403,17 +460,34 @@ static int do_task_stat(struct task_stru priority = task_prio(task); nice = task_nice(task); +#ifndef CONFIG_VE /* Temporary variable needed for gcc-2.96 */ /* convert timespec -> nsec*/ start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + task->start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); +#else + start_time = ve_relative_clock(&task->start_time); +#endif + +#ifdef CONFIG_USER_RESOURCE + ub_dump_task_info(task, + ub_task_info, sizeof(ub_task_info), + ub_mm_info, sizeof(ub_mm_info)); +#endif res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n", - task->pid, +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu" +#ifdef CONFIG_VE + " 0 0 0 0 0 0 0 %d %u" +#endif +#ifdef CONFIG_USER_RESOURCE + " %s %s" +#endif + "\n", + get_task_pid(task), tcomm, state, ppid, @@ -457,7 +531,16 @@ static int do_task_stat(struct task_stru task_cpu(task), task->rt_priority, task->policy, - (unsigned long long)delayacct_blkio_ticks(task)); + (unsigned long long)delayacct_blkio_ticks(task) +#ifdef CONFIG_VE + , virt_pid(task), + VEID(VE_TASK_INFO(task)->owner_env) +#endif +#ifdef CONFIG_USER_RESOURCE + , ub_task_info, + ub_mm_info +#endif + ); if(mm) mmput(mm); return res; diff -uprN linux-2.6.18/fs/proc/base.c linux-2.6.18.ovz/fs/proc/base.c --- linux-2.6.18/fs/proc/base.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/base.c 2007-06-13 06:55:07.000000000 -0400 @@ -71,6 +71,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: @@ -136,6 +137,9 @@ enum pid_directory_inos { #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + PROC_TGID_DISK_IO, +#endif PROC_TGID_OOM_SCORE, PROC_TGID_OOM_ADJUST, PROC_TID_INO, @@ -240,6 +244,9 @@ static struct pid_entry tgid_base_stuff[ #ifdef CONFIG_AUDITSYSCALL E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + E(PROC_TGID_DISK_IO, "io", S_IRUGO), +#endif {0,0,NULL,0} }; static struct pid_entry tid_base_stuff[] = { @@ -314,6 +321,9 @@ static int proc_fd_link(struct inode *in struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); + int err; + + err = -ENOENT; if (task) { files = get_files_struct(task); @@ -327,16 +337,18 @@ static int proc_fd_link(struct inode *in spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { - *mnt = mntget(file->f_vfsmnt); - *dentry = dget(file->f_dentry); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; + if 
(d_root_check(file->f_dentry, file->f_vfsmnt)) { + err = -EACCES; + } else { + *mnt = mntget(file->f_vfsmnt); + *dentry = dget(file->f_dentry); + err = 0; + } } spin_unlock(&files->file_lock); put_files_struct(files); } - return -ENOENT; + return err; } static struct fs_struct *get_fs_struct(struct task_struct *task) @@ -375,10 +387,12 @@ static int proc_cwd_link(struct inode *i } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); + result = d_root_check(fs->pwd, fs->pwdmnt); + if (!result) { + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); + } read_unlock(&fs->lock); - result = 0; put_fs_struct(fs); } return result; @@ -526,10 +540,25 @@ static int proc_oom_score(struct task_st struct timespec uptime; do_posix_clock_monotonic_gettime(&uptime); + read_lock(&tasklist_lock); points = badness(task, uptime.tv_sec); + read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int proc_pid_io_accounting(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)task->ioac.read_bytes, + (unsigned long long)task->ioac.write_bytes, + (unsigned long long)task->ioac.cancelled_write_bytes); +} +#endif + /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ @@ -586,11 +615,7 @@ static int mounts_open(struct inode *ino int ret = -EINVAL; if (task) { - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + namespace = get_task_mnt_ns(task); put_task_struct(task); } @@ -657,11 +682,7 @@ static int mountstats_open(struct inode struct task_struct *task = get_proc_task(inode); if (task) { - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + namespace = get_task_mnt_ns(task); put_task_struct(task); } @@ -907,6 +928,8 @@ static ssize_t oom_adjust_write(struct f oom_adjust = simple_strtol(buffer, &end, 0); if ((oom_adjust < -16 || oom_adjust > 15) && oom_adjust != OOM_DISABLE) return -EINVAL; + if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env())) + return -EPERM; if (*end == '\n') end++; task = get_proc_task(file->f_dentry->d_inode); @@ -1297,6 +1320,10 @@ static struct inode *proc_pid_make_inode struct inode * inode; struct proc_inode *ei; + if (!ve_accessible(VE_TASK_INFO(task)->owner_env, + sb->s_type->owner_env)) + return NULL; + /* We need a new inode */ inode = new_inode(sb); @@ -1429,6 +1456,14 @@ static int tid_fd_revalidate(struct dent static int pid_delete_dentry(struct dentry * dentry) { +#ifdef CONFIG_VE + /* + * Don't hash dentries from VE0 that may hold VE's pids + */ + if (ve_is_super(dentry->d_sb->s_type->owner_env) && + proc_pid(dentry->d_inode)->veid != 0) + return 1; +#endif /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. @@ -1548,10 +1583,33 @@ static struct file_operations proc_task_ }; /* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). 
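+ * generic_permission() can start refusing once the directory's owner no
+ * longer matches the task's new credentials; the handler below therefore
+ * falls back to granting access when the inode's task is current itself,
+ * so e.g. a later opendir("/proc/self/fd") by the task still succeeds.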
+ */ +static int proc_fd_permission(struct inode *inode, int mask, + struct nameidata *nd) +{ + struct task_struct *tsk; + int rv; + + rv = generic_permission(inode, mask, NULL); + if (rv == 0) + return 0; + tsk = get_proc_task(inode); + if (tsk) { + if (tsk == current) + rv = 0; + put_task_struct(tsk); + } + return rv; +} + +/* * proc directories can do almost nothing.. */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, + .permission = proc_fd_permission, .setattr = proc_setattr, }; @@ -1834,6 +1892,12 @@ static struct dentry *proc_pident_lookup inode->i_fop = &proc_loginuid_operations; break; #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + case PROC_TGID_DISK_IO: + inode->i_fop = &proc_info_file_operations; + ei->op.proc_read = proc_pid_io_accounting; + break; +#endif default: printk("procfs: impossible type (%d)",p->type); iput(inode); @@ -1938,14 +2002,14 @@ static int proc_self_readlink(struct den int buflen) { char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); + sprintf(tmp, "%d", get_task_tgid(current)); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); + sprintf(tmp, "%d", get_task_tgid(current)); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1975,15 +2039,16 @@ static struct inode_operations proc_self * that no dcache entries will exist at process exit time it * just makes it very unlikely that any will persist. */ -void proc_flush_task(struct task_struct *task) +static void __proc_flush_task(struct task_struct *task, + int pid, int tgid, struct dentry *root) { struct dentry *dentry, *leader, *dir; char buf[PROC_NUMBUF]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->pid); - dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(root, &name); if (dentry) { shrink_dcache_parent(dentry); d_drop(dentry); @@ -1994,8 +2059,8 @@ void proc_flush_task(struct task_struct goto out; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); - leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(root, &name); if (!leader) goto out; @@ -2006,7 +2071,7 @@ void proc_flush_task(struct task_struct goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { shrink_dcache_parent(dentry); @@ -2021,6 +2086,19 @@ out: return; } +void proc_flush_task(struct task_struct *task) +{ + __proc_flush_task(task, task->pid, task->tgid, + proc_mnt->mnt_root); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return; + + __proc_flush_task(task, virt_pid(task), virt_tgid(task), + task->ve_task_info.owner_env->proc_mnt->mnt_root); +#endif +} + /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { @@ -2050,7 +2128,19 @@ struct dentry *proc_pid_lookup(struct in goto out; rcu_read_lock(); - task = find_task_by_pid(tgid); + task = find_task_by_pid_ve(tgid); + /* In theory we are allowed to lookup both /proc/VIRT_PID and + * /proc/GLOBAL_PID inside VE. However, current /proc implementation + * cannot maintain two references to one task, so that we have + * to prohibit /proc/GLOBAL_PID. 
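+ * Example (pid values illustrative): a task with global pid 4567 whose
+ * virtual pid inside the VE is 123 can be looked up there only as
+ * /proc/123, never as /proc/4567.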
+ */ + if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tgid)) { + /* However, VE_ENTERed tasks are exception, they use global + * pids. + */ + if (virt_pid(task) != tgid) + task = NULL; + } if (task) get_task_struct(task); rcu_read_unlock(); @@ -2101,7 +2191,12 @@ static struct dentry *proc_task_lookup(s goto out; rcu_read_lock(); - task = find_task_by_pid(tid); + task = find_task_by_pid_ve(tid); + /* See comment above in similar place. */ + if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tid)) { + if (virt_pid(task) != tid) + task = NULL; + } if (task) get_task_struct(task); rcu_read_unlock(); @@ -2152,12 +2247,17 @@ out_no_task: * In the case of a seek we start with &init_task and walk nr * threads past it. */ -static struct task_struct *first_tgid(int tgid, unsigned int nr) +static struct task_struct *first_tgid(int tgid, unsigned int nr, + struct ve_struct *ve) { struct task_struct *pos; rcu_read_lock(); if (tgid && nr) { - pos = find_task_by_pid(tgid); + struct ve_struct *oldve; + + oldve = set_exec_env(ve); + pos = find_task_by_pid_ve(tgid); + (void)set_exec_env(oldve); if (pos && thread_group_leader(pos)) goto found; } @@ -2169,12 +2269,14 @@ static struct task_struct *first_tgid(in /* If we haven't found our starting place yet start with * the init_task and walk nr tasks forward. */ - for (pos = next_task(&init_task); nr > 0; --nr) { - pos = next_task(pos); - if (pos == &init_task) { - pos = NULL; + pos = __first_task_ve(ve); + if (pos == NULL) + goto done; + + for ( ; nr > 0; --nr) { + pos = __next_task_ve(ve, pos); + if (pos == NULL) goto done; - } } found: get_task_struct(pos); @@ -2189,14 +2291,15 @@ done: * * The reference to the input task_struct is released. */ -static struct task_struct *next_tgid(struct task_struct *start) +static struct task_struct *next_tgid(struct task_struct *start, + struct ve_struct *ve) { struct task_struct *pos; rcu_read_lock(); pos = start; if (pid_alive(start)) - pos = next_task(start); - if (pid_alive(pos) && (pos != &init_task)) { + pos = __next_task_ve(ve, start); + if (pos != NULL && pid_alive(pos)) { get_task_struct(pos); goto done; } @@ -2214,6 +2317,7 @@ int proc_pid_readdir(struct file * filp, unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; struct task_struct *task; int tgid; + struct ve_struct *ve; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2229,12 +2333,13 @@ int proc_pid_readdir(struct file * filp, */ tgid = filp->f_version; filp->f_version = 0; - for (task = first_tgid(tgid, nr); + ve = filp->f_dentry->d_sb->s_type->owner_env; + for (task = first_tgid(tgid, nr, ve); task; - task = next_tgid(task), filp->f_pos++) { + task = next_tgid(task, ve), filp->f_pos++) { int len; ino_t ino; - tgid = task->pid; + tgid = get_task_pid_ve(task, ve); len = snprintf(buf, sizeof(buf), "%d", tgid); ino = fake_ino(tgid, PROC_TGID_INO); if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { @@ -2261,14 +2366,18 @@ int proc_pid_readdir(struct file * filp, * threads past it. 
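 * (With CONFIG_VE the starting pid is resolved via find_task_by_pid_ve()
 * with the target VE temporarily set as the execution environment, so the
 * tid is interpreted in that VE's pid space.)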
*/ static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr) + int tid, int nr, struct ve_struct *ve) { struct task_struct *pos; rcu_read_lock(); /* Attempt to start with the pid of a thread */ if (tid && (nr > 0)) { - pos = find_task_by_pid(tid); + struct ve_struct *old_ve; + + old_ve = set_exec_env(ve); + pos = find_task_by_pid_ve(tid); + (void) set_exec_env(old_ve); if (pos && (pos->group_leader == leader)) goto found; } @@ -2354,11 +2463,12 @@ static int proc_task_readdir(struct file */ tid = filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2); + for (task = first_tid(leader, tid, pos - 2, + filp->f_dentry->d_sb->s_type->owner_env); task; task = next_tid(task), pos++) { int len; - tid = task->pid; + tid = get_task_pid(task); len = snprintf(buf, sizeof(buf), "%d", tid); ino = fake_ino(tid, PROC_TID_INO); if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) { diff -uprN linux-2.6.18/fs/proc/generic.c linux-2.6.18.ovz/fs/proc/generic.c --- linux-2.6.18/fs/proc/generic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/generic.c 2007-06-13 06:55:07.000000000 -0400 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -238,6 +239,10 @@ static int proc_notify_change(struct den struct proc_dir_entry *de = PDE(inode); int error; + if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) && + LPDE(inode) == GPDE(inode)) + return -EPERM; + error = inode_change_ok(inode, iattr); if (error) goto out; @@ -274,7 +279,7 @@ static struct inode_operations proc_file * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. */ -static int xlate_proc_name(const char *name, +static int __xlate_proc_name(struct proc_dir_entry *root, const char *name, struct proc_dir_entry **ret, const char **residual) { const char *cp = name, *next; @@ -282,8 +287,13 @@ static int xlate_proc_name(const char *n int len; int rtn = 0; + if (*ret) { + de_get(*ret); + return 0; + } + spin_lock(&proc_subdir_lock); - de = &proc_root; + de = root; while (1) { next = strchr(cp, '/'); if (!next) @@ -301,12 +311,29 @@ static int xlate_proc_name(const char *n cp += len + 1; } *residual = cp; - *ret = de; + *ret = de_get(de); out: spin_unlock(&proc_subdir_lock); return rtn; } +#ifndef CONFIG_VE +#define xlate_proc_loc_name xlate_proc_name +#else +static int xlate_proc_loc_name(const char *name, + struct proc_dir_entry **ret, const char **residual) +{ + return __xlate_proc_name(get_exec_env()->proc_root, + name, ret, residual); +} +#endif + +static int xlate_proc_name(const char *name, + struct proc_dir_entry **ret, const char **residual) +{ + return __xlate_proc_name(&proc_root, name, ret, residual); +} + static DEFINE_IDR(proc_inum_idr); static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ @@ -378,6 +405,20 @@ static struct dentry_operations proc_den .d_delete = proc_delete_dentry, }; +static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, + struct dentry *d) +{ + struct proc_dir_entry *de; + + for (de = dir->subdir; de; de = de->next) { + if (de->namelen != d->d_name.len) + continue; + if (!memcmp(d->d_name.name, de->name, de->namelen)) + break; + } + return de_get(de); +} + /* * Don't create negative dentries here, return -ENOENT by hand * instead.
@@ -385,36 +426,118 @@ static struct dentry_operations proc_den struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; - struct proc_dir_entry * de; + struct proc_dir_entry *lde, *gde; int error = -ENOENT; lock_kernel(); spin_lock(&proc_subdir_lock); - de = PDE(dir); - if (de) { - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino = de->low_ino; - - spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); - spin_lock(&proc_subdir_lock); - break; - } - } - } + lde = LPDE(dir); + if (lde) + lde = __proc_lookup(lde, dentry); + if (lde && !try_module_get(lde->owner)) { + de_put(lde); + lde = NULL; + } +#ifdef CONFIG_VE + gde = GPDE(dir); + if (gde) + gde = __proc_lookup(gde, dentry); + if (!lde && gde && !try_module_get(gde->owner)) { + de_put(gde); + gde = NULL; + } +#else + gde = NULL; +#endif spin_unlock(&proc_subdir_lock); + + /* + * There are following possible cases after lookup: + * + * lde gde + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * NULL NULL ENOENT + * loc NULL found in local tree + * loc glob found in both trees + * NULL glob found in global tree + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * We initialized inode as follows after lookup: + * + * inode->lde inode->gde + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * loc NULL in local tree + * loc glob both trees + * glob glob global tree + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * i.e. inode->lde is always initialized + */ + + if (lde == NULL && gde == NULL) + goto out; + + if (lde != NULL) + inode = proc_get_inode(dir->i_sb, lde->low_ino, lde); + else + inode = proc_get_inode(dir->i_sb, gde->low_ino, gde); + + /* + * We can sleep in proc_get_inode(), but since we have i_sem + * being taken, no one can setup GPDE/LPDE on this inode. + */ + if (!inode) + goto out_put; + +#ifdef CONFIG_VE + GPDE(inode) = de_get(gde); + if (gde) + __module_get(gde->owner); + + /* if dentry is found in both trees and it is a directory + * then inode's nlink count must be altered, because local + * and global subtrees may differ. + * on the other hand, they may intersect, so actual nlink + * value is difficult to calculate - upper estimate is used + * instead of it. + * dentry found in global tree only must not be writable + * in non-super ve. 
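+ *
+ * Example (illustrative counts): lde->nlink == 5 and gde->nlink == 4
+ * give i_nlink == 5 + (4 - 2) == 7; the subtracted 2 accounts for the
+ * global directory's own "." and ".." links, which would otherwise be
+ * counted twice.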
+ */ + if (lde && gde && lde != gde && gde->nlink > 1) + inode->i_nlink += gde->nlink - 2; + if (lde == NULL && !ve_is_super(dir->i_sb->s_type->owner_env)) + inode->i_mode &= ~S_IWUGO; +#endif + unlock_kernel(); + dentry->d_op = &proc_dentry_operations; + d_add(dentry, inode); + de_put(lde); + de_put(gde); + return NULL; + +out_put: + if (lde) + module_put(lde->owner); + else + module_put(gde->owner); + de_put(lde); + de_put(gde); +out: unlock_kernel(); + return ERR_PTR(error); +} - if (inode) { - dentry->d_op = &proc_dentry_operations; - d_add(dentry, inode); - return NULL; +static inline int in_tree(struct proc_dir_entry *de, struct proc_dir_entry *dir) +{ + struct proc_dir_entry *gde; + + for (gde = dir->subdir; gde; gde = gde->next) { + if (de->namelen != gde->namelen) + continue; + if (memcmp(de->name, gde->name, gde->namelen)) + continue; + return 1; } - return ERR_PTR(error); + return 0; } /* @@ -429,7 +552,7 @@ struct dentry *proc_lookup(struct inode int proc_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct proc_dir_entry * de; + struct proc_dir_entry *de, *tmp; unsigned int ino; int i; struct inode *inode = filp->f_dentry->d_inode; @@ -464,11 +587,8 @@ int proc_readdir(struct file * filp, de = de->subdir; i -= 2; for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } + if (!de) + goto chk_global; if (!i) break; de = de->next; @@ -477,14 +597,56 @@ int proc_readdir(struct file * filp, do { /* filldir passes info to user space */ + de_get(de); spin_unlock(&proc_subdir_lock); - if (filldir(dirent, de->name, de->namelen, filp->f_pos, - de->low_ino, de->mode >> 12) < 0) + if (filldir(dirent, de->name, de->namelen, + filp->f_pos, de->low_ino, + de->mode >> 12) < 0) { + de_put(de); goto out; + } spin_lock(&proc_subdir_lock); + tmp = de->next; + de_put(de); filp->f_pos++; - de = de->next; + de = tmp; } while (de); +chk_global: +#ifdef CONFIG_VE + de = GPDE(inode); + if (de == NULL) + goto done; + + de = de->subdir; + while (de) { + /* skip local names */ + if (in_tree(de, LPDE(inode))) { + de = de->next; + continue; + } + + if (i > 0) { + i--; + de = de->next; + continue; + } + + de_get(de); + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, + filp->f_pos, de->low_ino, + de->mode >> 12) < 0) { + de_put(de); + goto out; + } + spin_lock(&proc_subdir_lock); + tmp = de->next; + de_put(de); + filp->f_pos++; + de = tmp; + } +done: +#endif spin_unlock(&proc_subdir_lock); } ret = 1; @@ -521,8 +683,13 @@ static int proc_register(struct proc_dir dp->low_ino = i; spin_lock(&proc_subdir_lock); + if (dir->deleted) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + dp->next = dir->subdir; - dp->parent = dir; + dp->parent = de_get(dir); dir->subdir = dp; spin_unlock(&proc_subdir_lock); @@ -586,17 +753,18 @@ static struct proc_dir_entry *proc_creat /* make sure name is valid */ if (!name || !strlen(name)) goto out; - if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) + if (xlate_proc_loc_name(name, parent, &fn) != 0) goto out; /* At this point there must not be any '/' characters beyond *fn */ if (strchr(fn, '/')) - goto out; + goto out_put; len = strlen(fn); ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); - if (!ent) goto out; + if (!ent) + goto out_put; memset(ent, 0, sizeof(struct proc_dir_entry)); memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); @@ -604,8 +772,13 @@ static struct proc_dir_entry *proc_creat ent->namelen = len; ent->mode = mode; ent->nlink = nlink; 
- out: + atomic_set(&ent->count, 1); return ent; + +out_put: + de_put(*parent); +out: + return NULL; } struct proc_dir_entry *proc_symlink(const char *name, @@ -629,6 +802,7 @@ struct proc_dir_entry *proc_symlink(cons kfree(ent); ent = NULL; } + de_put(parent); } return ent; } @@ -647,6 +821,7 @@ struct proc_dir_entry *proc_mkdir_mode(c kfree(ent); ent = NULL; } + de_put(parent); } return ent; } @@ -685,9 +860,28 @@ struct proc_dir_entry *create_proc_entry kfree(ent); ent = NULL; } + de_put(parent); } return ent; } +EXPORT_SYMBOL(remove_proc_glob_entry); + +struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + const char *path; + struct proc_dir_entry *ent; + + path = name; + if (xlate_proc_name(path, &parent, &name) != 0) + return NULL; + + ent = create_proc_entry(name, mode, parent); + de_put(parent); + return ent; +} + +EXPORT_SYMBOL(create_proc_glob_entry); void free_proc_entry(struct proc_dir_entry *de) { @@ -707,15 +901,13 @@ void free_proc_entry(struct proc_dir_ent * Remove a /proc entry and free it if it's not currently in use. * If it is in use, we set the 'deleted' flag. */ -void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry **p; struct proc_dir_entry *de; const char *fn = name; int len; - if (!parent && xlate_proc_name(name, &parent, &fn) != 0) - goto out; len = strlen(fn); spin_lock(&proc_subdir_lock); @@ -730,16 +922,43 @@ void remove_proc_entry(const char *name, proc_kill_inodes(de); de->nlink = 0; WARN_ON(de->subdir); - if (!atomic_read(&de->count)) - free_proc_entry(de); - else { - de->deleted = 1; - printk("remove_proc_entry: %s/%s busy, count=%d\n", - parent->name, de->name, atomic_read(&de->count)); - } + de->deleted = 1; + de_put(parent); + de_put(de); break; } spin_unlock(&proc_subdir_lock); -out: - return; +} + +void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent) +{ + const char *path; + + path = name; + if (xlate_proc_loc_name(path, &parent, &name) != 0) + return; + + __remove_proc_entry(name, parent); + de_put(parent); +} + +void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent) +{ + const char *path; + + path = name; + if (xlate_proc_name(path, &parent, &name) != 0) + return; + + __remove_proc_entry(name, parent); + de_put(parent); +} + +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + remove_proc_loc_entry(name, parent); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + remove_proc_glob_entry(name, parent); +#endif } diff -uprN linux-2.6.18/fs/proc/inode.c linux-2.6.18.ovz/fs/proc/inode.c --- linux-2.6.18/fs/proc/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,35 +21,27 @@ #include "internal.h" -static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) -{ - if (de) - atomic_inc(&de->count); - return de; -} - /* * Decrements the use count and checks for deferred deletion. 
*/ -static void de_put(struct proc_dir_entry *de) +void de_put(struct proc_dir_entry *de) { - if (de) { - lock_kernel(); - if (!atomic_read(&de->count)) { - printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); - return; - } + if (de) { + if (unlikely(!atomic_read(&de->count))) + goto out_bad; if (atomic_dec_and_test(&de->count)) { - if (de->deleted) { - printk("de_put: deferred delete of %s\n", - de->name); - free_proc_entry(de); - } - } - unlock_kernel(); + if (unlikely(!de->deleted)) + goto out_bad; + + free_proc_entry(de); + } } + return; + +out_bad: + printk("de_put: bad dentry %s count:%d deleted:%d\n", + de->name, atomic_read(&de->count), de->deleted); } /* @@ -65,12 +57,19 @@ static void proc_delete_inode(struct ino put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ - de = PROC_I(inode)->pde; + de = LPDE(inode); if (de) { if (de->owner) module_put(de->owner); de_put(de); } +#ifdef CONFIG_VE + de = GPDE(inode); + if (de) { + module_put(de->owner); + de_put(de); + } +#endif clear_inode(inode); } @@ -97,6 +96,9 @@ static struct inode *proc_alloc_inode(st ei->pde = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; +#ifdef CONFIG_VE + GPDE(inode) = NULL; +#endif return inode; } @@ -154,12 +156,9 @@ struct inode *proc_get_inode(struct supe WARN_ON(de && de->deleted); - if (de != NULL && !try_module_get(de->owner)) - goto out_mod; - inode = iget(sb, ino); if (!inode) - goto out_ino; + goto out_mod; PROC_I(inode)->pde = de; if (de) { @@ -180,9 +179,6 @@ struct inode *proc_get_inode(struct supe return inode; -out_ino: - if (de != NULL) - module_put(de->owner); out_mod: de_put(de); return NULL; @@ -198,7 +194,9 @@ int proc_fill_super(struct super_block * s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; - + + /* proc_root.owner == NULL, just a formal call */ + __module_get(proc_root.owner); root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); if (!root_inode) goto out_no_root; @@ -207,6 +205,12 @@ int proc_fill_super(struct super_block * s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; +#ifdef CONFIG_VE + LPDE(root_inode) = de_get(get_exec_env()->proc_root); + GPDE(root_inode) = &proc_root; +#else + LPDE(root_inode) = &proc_root; +#endif return 0; out_no_root: diff -uprN linux-2.6.18/fs/proc/kmsg.c linux-2.6.18.ovz/fs/proc/kmsg.c --- linux-2.6.18/fs/proc/kmsg.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/kmsg.c 2007-06-13 06:55:07.000000000 -0400 @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -40,7 +42,7 @@ static ssize_t kmsg_read(struct file *fi static unsigned int kmsg_poll(struct file *file, poll_table *wait) { - poll_wait(file, &log_wait, wait); + poll_wait(file, &ve_log_wait, wait); if (do_syslog(9, NULL, 0)) return POLLIN | POLLRDNORM; return 0; @@ -53,3 +55,4 @@ const struct file_operations proc_kmsg_o .open = kmsg_open, .release = kmsg_release, }; +EXPORT_SYMBOL(proc_kmsg_operations); diff -uprN linux-2.6.18/fs/proc/proc_misc.c linux-2.6.18.ovz/fs/proc/proc_misc.c --- linux-2.6.18/fs/proc/proc_misc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/proc_misc.c 2007-06-13 06:55:07.000000000 -0400 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -52,8 +56,10 @@ #include #include "internal.h" -#define LOAD_INT(x) ((x) >> 
FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +#ifdef CONFIG_FAIRSCHED +#include +#endif + /* * Warning: stuff below (imported functions) assumes that its output will fit * into one page. For some of those functions it may be wrong. Moreover, we @@ -83,15 +89,33 @@ static int loadavg_read_proc(char *page, { int a, b, c; int len; - - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); + unsigned long __nr_running; + int __nr_threads; + unsigned long *__avenrun; + struct ve_struct *ve; + + ve = get_exec_env(); + + if (ve_is_super(ve)) { + __avenrun = &avenrun[0]; + __nr_running = nr_running(); + __nr_threads = nr_threads; + } +#ifdef CONFIG_VE + else { + __avenrun = &ve->avenrun[0]; + __nr_running = nr_running_ve(ve); + __nr_threads = atomic_read(&ve->pcounter); + } +#endif + a = __avenrun[0] + (FIXED_1/200); + b = __avenrun[1] + (FIXED_1/200); + c = __avenrun[2] + (FIXED_1/200); len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, last_pid); + __nr_running, __nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -104,6 +128,13 @@ static int uptime_read_proc(char *page, cputime_t idletime = cputime_add(init_task.utime, init_task.stime); do_posix_clock_monotonic_gettime(&uptime); +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + set_normalized_timespec(&uptime, + uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, + uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + } +#endif cputime_to_timespec(idletime, &idle); len = sprintf(page,"%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, @@ -117,34 +148,48 @@ static int uptime_read_proc(char *page, static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct sysinfo i; + struct meminfo mi; int len; - unsigned long inactive; - unsigned long active; - unsigned long free; - unsigned long committed; - unsigned long allowed; + unsigned long dummy; struct vmalloc_info vmi; - long cached; - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&mi.active, &mi.inactive, &dummy); /* * display in kilobytes. 
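 * (with 4 KB pages K(x) shifts left by 2: e.g. 25600 pages are printed
 * as 102400 kB; figures illustrative)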
*/ #define K(x) ((x) << (PAGE_SHIFT - 10)) - si_meminfo(&i); - si_swapinfo(&i); - committed = atomic_read(&vm_committed_space); - allowed = ((totalram_pages - hugetlb_total_pages()) + si_meminfo(&mi.si); + si_swapinfo(&mi.si); + mi.committed_space = atomic_read(&vm_committed_space); + mi.swapcache = total_swapcache_pages; + mi.allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; - cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; - if (cached < 0) - cached = 0; + mi.cache = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - mi.si.bufferram; + if (mi.cache < 0) + mi.cache = 0; get_vmalloc_info(&vmi); + mi.vmalloc_used = vmi.used >> PAGE_SHIFT; + mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; + mi.vmalloc_total = VMALLOC_TOTAL >> PAGE_SHIFT; + + mi.pi.nr_file_dirty = global_page_state(NR_FILE_DIRTY); + mi.pi.nr_writeback = global_page_state(NR_WRITEBACK); + mi.pi.nr_anon_pages = global_page_state(NR_ANON_PAGES); + mi.pi.nr_file_mapped = global_page_state(NR_FILE_MAPPED); + mi.pi.nr_slab = global_page_state(NR_SLAB); + mi.pi.nr_pagetable = global_page_state(NR_PAGETABLE); + mi.pi.nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); + mi.pi.nr_bounce = global_page_state(NR_BOUNCE); + +#ifdef CONFIG_USER_RESOURCE + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * Tagged format, for easy grepping and expansion. @@ -176,32 +221,32 @@ static int meminfo_read_proc(char *page, "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n", - K(i.totalram), - K(i.freeram), - K(i.bufferram), - K(cached), - K(total_swapcache_pages), - K(active), - K(inactive), - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), - K(i.totalswap), - K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SLAB)), - K(global_page_state(NR_PAGETABLE)), - K(global_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(allowed), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 + K(mi.si.totalram), + K(mi.si.freeram), + K(mi.si.bufferram), + K(mi.cache), + K(mi.swapcache), + K(mi.active), + K(mi.inactive), + K(mi.si.totalhigh), + K(mi.si.freehigh), + K(mi.si.totalram - mi.si.totalhigh), + K(mi.si.freeram - mi.si.freehigh), + K(mi.si.totalswap), + K(mi.si.freeswap), + K(mi.pi.nr_file_dirty), + K(mi.pi.nr_writeback), + K(mi.pi.nr_anon_pages), + K(mi.pi.nr_file_mapped), + K(mi.pi.nr_slab), + K(mi.pi.nr_pagetable), + K(mi.pi.nr_unstable_nfs), + K(mi.pi.nr_bounce), + K(mi.allowed), + K(mi.committed_space), + K(mi.vmalloc_total), + K(mi.vmalloc_used), + K(mi.vmalloc_largest) ); len += hugetlb_report_meminfo(page + len); @@ -241,8 +286,17 @@ static int version_read_proc(char *page, int count, int *eof, void *data) { int len; + struct new_utsname *utsname; - strcpy(page, linux_banner); + if (ve_is_super(get_exec_env())) + strcpy(page, linux_banner); + else { + utsname = &current->nsproxy->uts_ns->name; + sprintf(page, "Linux version %s (" + LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" + LINUX_COMPILER ") %s\n", + utsname->release, utsname->version); + } len = strlen(page); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -434,18 +488,14 @@ static struct file_operations proc_slabs #endif #endif -static int
show_stat(struct seq_file *p, void *v) +static void show_stat_ve0(struct seq_file *p) { int i; - unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; u64 sum = 0; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; - jif = - wall_to_monotonic.tv_sec; - if (wall_to_monotonic.tv_nsec) - --jif; for_each_possible_cpu(i) { int j; @@ -499,9 +549,90 @@ static int show_stat(struct seq_file *p, for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", kstat_irqs(i)); #endif +#ifdef CONFIG_VM_EVENT_COUNTERS + seq_printf(p, "\nswap %lu %lu\n", + vm_events(PSWPIN), vm_events(PSWPOUT)); +#else + seq_printf(p, "\nswap 0 0\n"); +#endif +} + +#ifdef CONFIG_VE +static void show_stat_ve(struct seq_file *p, struct ve_struct *env) +{ + int i; + u64 user, nice, system; + cycles_t idle, iowait; + cpumask_t ve_cpus; + + ve_cpu_online_map(env, &ve_cpus); + + user = nice = system = idle = iowait = 0; + for_each_cpu_mask(i, ve_cpus) { + user += VE_CPU_STATS(env, i)->user; + nice += VE_CPU_STATS(env, i)->nice; + system += VE_CPU_STATS(env, i)->system; + + idle += ve_sched_get_idle_time(i); + iowait += ve_sched_get_iowait_time(i); + } + + seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + + for_each_cpu_mask(i, ve_cpus) { + user = VE_CPU_STATS(env, i)->user; + nice = VE_CPU_STATS(env, i)->nice; + system = VE_CPU_STATS(env, i)->system; + + idle = ve_sched_get_idle_time(i); + iowait = ve_sched_get_iowait_time(i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + } + seq_printf(p, "intr 0\nswap 0 0\n"); +} +#endif + +int show_stat(struct seq_file *p, void *v) +{ + extern unsigned long total_forks; + unsigned long seq, jif; + struct ve_struct *env; + unsigned long __nr_running, __nr_iowait; + + do { + seq = read_seqbegin(&xtime_lock); + jif = - wall_to_monotonic.tv_sec; + if (wall_to_monotonic.tv_nsec) + --jif; + } while (read_seqretry(&xtime_lock, seq)); + + env = get_exec_env(); + if (ve_is_super(env)) { + show_stat_ve0(p); + __nr_running = nr_running(); + __nr_iowait = nr_iowait(); + } +#ifdef CONFIG_VE + else { + show_stat_ve(p, env); + __nr_running = nr_running_ve(env); + __nr_iowait = nr_iowait_ve(); + } +#endif seq_printf(p, - "\nctxt %llu\n" + "ctxt %llu\n" "btime %lu\n" "processes %lu\n" "procs_running %lu\n" @@ -509,8 +640,8 @@ static int show_stat(struct seq_file *p, nr_context_switches(), (unsigned long)jif, total_forks, - nr_running(), - nr_iowait()); + __nr_running, + __nr_iowait); return 0; } @@ -599,7 +730,8 @@ static int cmdline_read_proc(char *page, { int len; - len = sprintf(page, "%s\n", saved_command_line); + len = sprintf(page, "%s\n", + ve_is_super(get_exec_env()) ? 
saved_command_line : "quiet"); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -628,11 +760,15 @@ static ssize_t write_sysrq_trigger(struc size_t count, loff_t *ppos) { if (count) { - char c; + int i, cnt; + char c[32]; - if (get_user(c, buf)) + cnt = min(count, sizeof(c)); + if (copy_from_user(c, buf, cnt)) return -EFAULT; - __handle_sysrq(c, NULL, NULL, 0); + + for (i = 0; i < cnt; i++) + __handle_sysrq(c[i], NULL, NULL, 0); } return count; } diff -uprN linux-2.6.18/fs/proc/proc_tty.c linux-2.6.18.ovz/fs/proc/proc_tty.c --- linux-2.6.18/fs/proc/proc_tty.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/proc_tty.c 2007-06-13 06:55:07.000000000 -0400 @@ -106,24 +106,35 @@ static int show_tty_driver(struct seq_fi /* iterator */ static void *t_start(struct seq_file *m, loff_t *pos) { - struct list_head *p; + struct tty_driver *drv; + loff_t l = *pos; - list_for_each(p, &tty_drivers) + read_lock(&tty_driver_guard); + list_for_each_entry(drv, &tty_drivers, tty_drivers) { + if (!ve_accessible_strict(drv->owner_env, get_exec_env())) + continue; if (!l--) - return list_entry(p, struct tty_driver, tty_drivers); + return drv; + } return NULL; } static void *t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; + struct tty_driver *drv; + (*pos)++; - return p==&tty_drivers ? NULL : - list_entry(p, struct tty_driver, tty_drivers); + drv = (struct tty_driver *)v; + list_for_each_entry_continue(drv, &tty_drivers, tty_drivers) { + if (ve_accessible_strict(drv->owner_env, get_exec_env())) + return drv; + } + return NULL; } static void t_stop(struct seq_file *m, void *v) { + read_unlock(&tty_driver_guard); } static struct seq_operations tty_drivers_op = { diff -uprN linux-2.6.18/fs/proc/root.c linux-2.6.18.ovz/fs/proc/root.c --- linux-2.6.18/fs/proc/root.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/root.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,7 +19,10 @@ #include "internal.h" -struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; +#ifndef CONFIG_VE +struct proc_dir_entry *proc_net, *proc_net_stat; +#endif +struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; #ifdef CONFIG_SYSCTL struct proc_dir_entry *proc_sys_root; @@ -31,12 +34,14 @@ static int proc_get_sb(struct file_syste return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); } -static struct file_system_type proc_fs_type = { +struct file_system_type proc_fs_type = { .name = "proc", .get_sb = proc_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(proc_fs_type); + void __init proc_root_init(void) { int err = proc_init_inodecache(); @@ -156,7 +161,9 @@ EXPORT_SYMBOL(create_proc_entry); EXPORT_SYMBOL(remove_proc_entry); EXPORT_SYMBOL(proc_root); EXPORT_SYMBOL(proc_root_fs); +#ifndef CONFIG_VE EXPORT_SYMBOL(proc_net); EXPORT_SYMBOL(proc_net_stat); +#endif EXPORT_SYMBOL(proc_bus); EXPORT_SYMBOL(proc_root_driver); diff -uprN linux-2.6.18/fs/proc/task_mmu.c linux-2.6.18.ovz/fs/proc/task_mmu.c --- linux-2.6.18/fs/proc/task_mmu.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/task_mmu.c 2007-06-13 06:55:07.000000000 -0400 @@ -94,9 +94,12 @@ int proc_exe_link(struct inode *inode, s } if (vma) { - *mnt = mntget(vma->vm_file->f_vfsmnt); - *dentry = dget(vma->vm_file->f_dentry); - result = 0; + result = d_root_check(vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + if (!result) { + *mnt = mntget(vma->vm_file->f_vfsmnt); + *dentry = 
dget(vma->vm_file->f_dentry); + } } up_read(&mm->mmap_sem); diff -uprN linux-2.6.18/fs/proc/task_nommu.c linux-2.6.18.ovz/fs/proc/task_nommu.c --- linux-2.6.18/fs/proc/task_nommu.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/proc/task_nommu.c 2007-06-13 06:55:07.000000000 -0400 @@ -126,9 +126,12 @@ int proc_exe_link(struct inode *inode, s } if (vma) { - *mnt = mntget(vma->vm_file->f_vfsmnt); - *dentry = dget(vma->vm_file->f_dentry); - result = 0; + result = d_root_check(vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + if (!result) { + *mnt = mntget(vma->vm_file->f_vfsmnt); + *dentry = dget(vma->vm_file->f_dentry); + } } up_read(&mm->mmap_sem); diff -uprN linux-2.6.18/fs/quota.c linux-2.6.18.ovz/fs/quota.c --- linux-2.6.18/fs/quota.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/quota.c 2007-06-13 06:55:07.000000000 -0400 @@ -81,11 +81,11 @@ static int generic_quotactl_valid(struct if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current->euid != id) || (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; @@ -132,10 +132,10 @@ static int xqm_quotactl_valid(struct sup if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current->euid != id) || (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; } @@ -180,7 +180,8 @@ static void quota_sync_sb(struct super_b continue; if (!sb_has_quota_enabled(sb, cnt)) continue; - discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]); + if (sb_dqopt(sb)->files[cnt]) + discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]); } mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -216,7 +217,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_qcop->quota_sync) + if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) quota_sync_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); @@ -337,6 +338,235 @@ static int do_quotactl(struct super_bloc return 0; } +static struct super_block *quota_get_sb(const char __user *special) +{ + struct super_block *sb; + struct block_device *bdev; + char *tmp; + + tmp = getname(special); + if (IS_ERR(tmp)) + return (struct super_block *)tmp; + bdev = lookup_bdev(tmp, FMODE_QUOTACTL); + putname(tmp); + if (IS_ERR(bdev)) + return (struct super_block *)bdev; + sb = get_super(bdev); + bdput(bdev); + if (!sb) + return ERR_PTR(-ENODEV); + return sb; +} + +#ifdef CONFIG_QUOTA_COMPAT + +#define QC_QUOTAON 0x0100 /* enable quotas */ +#define QC_QUOTAOFF 0x0200 /* disable quotas */ +/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 has now other parameteres */ +#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ +#define QC_SETQLIM 0x0700 /* set limits */ +/* GETSTATS at 0x0800 is now longer... */ +#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... 
*/ +#define QC_SETINFO 0x0A00 /* set info about quotas */ +#define QC_SETGRACE 0x0B00 /* set inode and block grace */ +#define QC_SETFLAGS 0x0C00 /* set flags for quota */ +#define QC_GETQUOTA 0x0D00 /* get limits and usage */ +#define QC_SETQUOTA 0x0E00 /* set limits and usage */ +#define QC_SETUSE 0x0F00 /* set usage */ +/* 0x1000 used by old RSQUASH */ +#define QC_GETSTATS 0x1100 /* get collected stats */ + +struct compat_dqblk { + unsigned int dqb_ihardlimit; + unsigned int dqb_isoftlimit; + unsigned int dqb_curinodes; + unsigned int dqb_bhardlimit; + unsigned int dqb_bsoftlimit; + qsize_t dqb_curspace; + __kernel_time_t dqb_btime; + __kernel_time_t dqb_itime; +}; + +struct compat_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + unsigned int dqi_flags; + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +struct compat_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; + __u32 version; +}; + +asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); +static long compat_quotactl(unsigned int cmds, unsigned int type, + const char __user *special, qid_t id, + void __user *addr) +{ + struct super_block *sb; + long ret; + + sb = NULL; + switch (cmds) { + case QC_QUOTAON: + return sys_quotactl(QCMD(Q_QUOTAON, type), + special, id, addr); + + case QC_QUOTAOFF: + return sys_quotactl(QCMD(Q_QUOTAOFF, type), + special, id, addr); + + case QC_SYNC: + return sys_quotactl(QCMD(Q_SYNC, type), + special, id, addr); + + case QC_GETQUOTA: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quota_get_sb(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + break; + cdq.dqb_ihardlimit = idq.dqb_ihardlimit; + cdq.dqb_isoftlimit = idq.dqb_isoftlimit; + cdq.dqb_curinodes = idq.dqb_curinodes; + cdq.dqb_bhardlimit = idq.dqb_bhardlimit; + cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; + cdq.dqb_curspace = idq.dqb_curspace; + cdq.dqb_btime = idq.dqb_btime; + cdq.dqb_itime = idq.dqb_itime; + ret = 0; + if (copy_to_user(addr, &cdq, sizeof(cdq))) + ret = -EFAULT; + break; + } + + case QC_SETQUOTA: + case QC_SETUSE: + case QC_SETQLIM: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quota_get_sb(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cdq, addr, sizeof(cdq))) + break; + idq.dqb_ihardlimit = cdq.dqb_ihardlimit; + idq.dqb_isoftlimit = cdq.dqb_isoftlimit; + idq.dqb_curinodes = cdq.dqb_curinodes; + idq.dqb_bhardlimit = cdq.dqb_bhardlimit; + idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; + idq.dqb_curspace = cdq.dqb_curspace; + idq.dqb_valid = 0; + if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) + idq.dqb_valid |= QIF_LIMITS; + if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) + idq.dqb_valid |= QIF_USAGE; + ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); + break; + } + + case QC_GETINFO: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quota_get_sb(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_info(sb, type, &iinf); + if (ret) + break; + cinf.dqi_bgrace = iinf.dqi_bgrace; + cinf.dqi_igrace = iinf.dqi_igrace; + 
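/* of the if_dqinfo flags only the dirty bit has a compat encoding (0x0010) */
+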
cinf.dqi_flags = 0; + if (iinf.dqi_flags & DQF_INFO_DIRTY) + cinf.dqi_flags |= 0x0010; + cinf.dqi_blocks = 0; + cinf.dqi_free_blk = 0; + cinf.dqi_free_entry = 0; + ret = 0; + if (copy_to_user(addr, &cinf, sizeof(cinf))) + ret = -EFAULT; + break; + } + + case QC_SETINFO: + case QC_SETGRACE: + case QC_SETFLAGS: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quota_get_sb(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETINFO, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cinf, addr, sizeof(cinf))) + break; + iinf.dqi_bgrace = cinf.dqi_bgrace; + iinf.dqi_igrace = cinf.dqi_igrace; + iinf.dqi_flags = cinf.dqi_flags; + iinf.dqi_valid = 0; + if (cmds == QC_SETINFO || cmds == QC_SETGRACE) + iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; + if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) + iinf.dqi_valid |= IIF_FLAGS; + ret = sb->s_qcop->set_info(sb, type, &iinf); + break; + } + + case QC_GETSTATS: { + struct compat_dqstats stat; + + memset(&stat, 0, sizeof(stat)); + stat.version = 6*10000+5*100+0; + ret = 0; + if (copy_to_user(addr, &stat, sizeof(stat))) + ret = -EFAULT; + break; + } + + default: + ret = -ENOSYS; + break; + } + if (sb && !IS_ERR(sb)) + drop_super(sb); + return ret; +} + +#endif + /* * This is the system call interface. This communicates with * the user-level programs. Currently this only supports diskquota @@ -347,25 +577,20 @@ asmlinkage long sys_quotactl(unsigned in { uint cmds, type; struct super_block *sb = NULL; - struct block_device *bdev; - char *tmp; int ret; cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; +#ifdef CONFIG_QUOTA_COMPAT + if (cmds >= 0x0100 && cmds < 0x3000) + return compat_quotactl(cmds, type, special, id, addr); +#endif + if (cmds != Q_SYNC || special) { - tmp = getname(special); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - bdev = lookup_bdev(tmp); - putname(tmp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - sb = get_super(bdev); - bdput(bdev); - if (!sb) - return -ENODEV; + sb = quota_get_sb(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); } ret = check_quotactl_valid(sb, type, cmds, id); diff -uprN linux-2.6.18/fs/read_write.c linux-2.6.18.ovz/fs/read_write.c --- linux-2.6.18/fs/read_write.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/read_write.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,8 @@ #include #include +#include + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = generic_file_read, @@ -339,6 +341,29 @@ static inline void file_pos_write(struct file->f_pos = pos; } +static inline void bc_acct_write(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, write); + ub_percpu_add(ub, wchar, bytes); + } +} + +static inline void bc_acct_read(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, read); + ub_percpu_add(ub, rchar, bytes); + } +} + + asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; @@ -351,6 +376,8 @@ asmlinkage ssize_t sys_read(unsigned int ret = vfs_read(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -369,6 +396,8 @@ asmlinkage ssize_t sys_write(unsigned in ret = vfs_write(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -390,6 +419,8 @@ asmlinkage ssize_t sys_pread64(unsigned if 
(file->f_mode & FMODE_PREAD) ret = vfs_read(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -411,6 +442,8 @@ asmlinkage ssize_t sys_pwrite64(unsigned if (file->f_mode & FMODE_PWRITE) ret = vfs_write(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -607,6 +640,8 @@ sys_readv(unsigned long fd, const struct ret = vfs_readv(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } if (ret > 0) @@ -628,6 +663,8 @@ sys_writev(unsigned long fd, const struc ret = vfs_writev(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } if (ret > 0) diff -uprN linux-2.6.18/fs/reiserfs/namei.c linux-2.6.18.ovz/fs/reiserfs/namei.c --- linux-2.6.18/fs/reiserfs/namei.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/reiserfs/namei.c 2007-06-13 06:55:07.000000000 -0400 @@ -863,6 +863,9 @@ static int reiserfs_rmdir(struct inode * INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + inode = dentry->d_inode; + DQUOT_INIT(inode); + /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. * The quota structure is possibly deleted only on last iput => outside @@ -887,8 +890,6 @@ static int reiserfs_rmdir(struct inode * goto end_rmdir; } - inode = dentry->d_inode; - reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -951,6 +952,7 @@ static int reiserfs_unlink(struct inode unsigned long savelink; inode = dentry->d_inode; + DQUOT_INIT(inode); /* in this transaction we can be doing at max two balancings and update * two stat datas, we change quotas of the owner of the directory and of @@ -1258,6 +1260,8 @@ static int reiserfs_rename(struct inode old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; + if (new_dentry_inode) + DQUOT_INIT(new_dentry_inode); // make sure, that oldname still exists and points to an object we // are going to rename diff -uprN linux-2.6.18/fs/select.c linux-2.6.18.ovz/fs/select.c --- linux-2.6.18/fs/select.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/select.c 2007-06-13 06:55:07.000000000 -0400 @@ -24,6 +24,8 @@ #include #include +#include + #include #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) @@ -339,7 +341,8 @@ static int core_sys_select(int n, fd_set if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ? 
+ GFP_KERNEL_UBC : GFP_KERNEL); if (!bits) goto out_nofds; } @@ -658,6 +661,7 @@ int do_sys_poll(struct pollfd __user *uf unsigned int i; struct poll_list *head; struct poll_list *walk; + int flags; struct fdtable *fdt; int max_fdset; /* Allocate small arguments on the stack to save memory and be @@ -680,9 +684,14 @@ int do_sys_poll(struct pollfd __user *uf walk = NULL; i = nfds; err = -ENOMEM; + + flags = GFP_KERNEL_UBC; while(i!=0) { struct poll_list *pp; int num, size; + if (i <= POLLFD_PER_PAGE) + flags = GFP_KERNEL; + if (stack_pp == NULL) num = N_STACK_PPS; else @@ -693,7 +702,7 @@ int do_sys_poll(struct pollfd __user *uf if (!stack_pp) stack_pp = pp = (struct poll_list *)stack_pps; else { - pp = kmalloc(size, GFP_KERNEL); + pp = kmalloc(size, flags); if (!pp) goto out_fds; } diff -uprN linux-2.6.18/fs/seq_file.c linux-2.6.18.ovz/fs/seq_file.c --- linux-2.6.18/fs/seq_file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/seq_file.c 2007-06-13 06:55:07.000000000 -0400 @@ -177,21 +177,23 @@ EXPORT_SYMBOL(seq_read); static int traverse(struct seq_file *m, loff_t offset) { - loff_t pos = 0; + loff_t pos = 0, index; int error = 0; void *p; m->version = 0; - m->index = 0; + index = 0; m->count = m->from = 0; - if (!offset) + if (!offset) { + m->index = index; return 0; + } if (!m->buf) { m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); if (!m->buf) return -ENOMEM; } - p = m->op->start(m, &m->index); + p = m->op->start(m, &index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) @@ -204,15 +206,17 @@ static int traverse(struct seq_file *m, if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; + m->index = index; break; } pos += m->count; m->count = 0; if (pos == offset) { - m->index++; + index++; + m->index = index; break; } - p = m->op->next(m, p, &m->index); + p = m->op->next(m, p, &index); } m->op->stop(m, p); return error; @@ -345,6 +349,8 @@ int seq_path(struct seq_file *m, if (m->count < m->size) { char *s = m->buf + m->count; char *p = d_path(dentry, mnt, s, m->size - m->count); + if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) + return 0; if (!IS_ERR(p)) { while (s <= p) { char c = *p++; diff -uprN linux-2.6.18/fs/simfs.c linux-2.6.18.ovz/fs/simfs.c --- linux-2.6.18/fs/simfs.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/fs/simfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,326 @@ +/* + * fs/simfs.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
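+ *
+ * simfs re-exports an existing host directory as a separate superblock
+ * for use inside a VE: sim_fill_super() borrows the underlying mount and
+ * dentry, sim_getattr()/sim_statfs() fold per-VE disk quota figures into
+ * the reported values, and sim_init_blkdev() fakes a block device so an
+ * unmodified sys_quotactl() keeps working (see the NOTE below).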
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb + +static struct super_operations sim_super_ops; + +static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct super_block *sb; + struct inode *inode; + + inode = dentry->d_inode; + if (!inode->i_op->getattr) { + generic_fillattr(inode, stat); + if (!stat->blksize) { + unsigned blocks; + + sb = inode->i_sb; + blocks = (stat->size + sb->s_blocksize-1) >> + sb->s_blocksize_bits; + stat->blocks = (sb->s_blocksize / 512) * blocks; + stat->blksize = sb->s_blocksize; + } + } else { + int err; + + err = inode->i_op->getattr(mnt, dentry, stat); + if (err) + return err; + } + + sb = mnt->mnt_sb; + if (sb->s_op == &sim_super_ops) + stat->dev = sb->s_dev; + return 0; +} + +static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct dq_stat qstat; + struct virt_info_quota q; + long free_file, adj_file; + s64 blk, free_blk, adj_blk; + int bsize_bits; + + q.super = sb; + q.qstat = &qstat; + err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); + if (err != NOTIFY_OK) + return; + + bsize_bits = ffs(buf->f_bsize) - 1; + free_blk = (s64)(qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; + if (free_blk < 0) + free_blk = 0; + /* + * In the regular case, we always set buf->f_bfree and buf->f_blocks to + * the values reported by quota. In case of real disk space shortage, + * we adjust the values. We want this adjustment to look as if the + * total disk space were reduced, not as if the usage were increased. + * -- SAW + */ + adj_blk = 0; + if (buf->f_bfree < free_blk) + adj_blk = free_blk - buf->f_bfree; + buf->f_bfree = (long)(free_blk - adj_blk); + + if (free_blk < buf->f_bavail) + buf->f_bavail = (long)free_blk; /* min(f_bavail, free_blk) */ + + blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; + buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; + + free_file = qstat.isoftlimit - qstat.icurrent; + if (free_file < 0) + free_file = 0; + if (buf->f_type == REISERFS_SUPER_MAGIC) + /* + * reiserfs doesn't initialize f_ffree and f_files values of + * kstatfs because it doesn't have an inode limit. 
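A worked illustration of the adjustment above, with invented numbers already converted to f_bsize units: suppose the quota allows free_blk = 400 blocks (soft limit 1000, usage 600), but the underlying filesystem reports only f_bfree = 100 blocks really free. Then adj_blk = 400 - 100 = 300, and the caller sees f_bfree = 400 - 300 = 100 and f_blocks = 1000 - 300 = 700. Usage still computes as 700 - 100 = 600, matching the quota, so the shortage appears as a smaller disk rather than as inflated usage, exactly as the comment promises.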
+ */ + buf->f_ffree = free_file; + adj_file = 0; + if (buf->f_ffree < free_file) + adj_file = free_file - buf->f_ffree; + buf->f_ffree = free_file - adj_file; + buf->f_files = qstat.isoftlimit - adj_file; +} + +static int sim_statfs(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct super_block *lsb; + struct kstatfs statbuf; + + err = 0; + if (sb->s_op != &sim_super_ops) + return 0; + + memset(&statbuf, 0, sizeof(statbuf)); + lsb = SIMFS_GET_LOWER_FS_SB(sb); + + err = -ENOSYS; + if (lsb && lsb->s_op && lsb->s_op->statfs) + err = lsb->s_op->statfs(lsb->s_root, &statbuf); + if (err) + return err; + + quota_get_stat(sb, &statbuf); + + buf->f_files = statbuf.f_files; + buf->f_ffree = statbuf.f_ffree; + buf->f_blocks = statbuf.f_blocks; + buf->f_bfree = statbuf.f_bfree; + buf->f_bavail = statbuf.f_bavail; + return 0; +} + +static int sim_systemcall(struct vnotifier_block *me, unsigned long n, + void *d, int old_ret) +{ + int err; + + switch (n) { + case VIRTINFO_FAUDIT_STAT: { + struct faudit_stat_arg *arg; + + arg = (struct faudit_stat_arg *)d; + err = sim_getattr(arg->mnt, arg->dentry, arg->stat); + arg->err = err; + } + break; + case VIRTINFO_FAUDIT_STATFS: { + struct faudit_statfs_arg *arg; + + arg = (struct faudit_statfs_arg *)d; + err = sim_statfs(arg->sb, arg->stat); + arg->err = err; + } + break; + default: + return old_ret; + } + return (err ? NOTIFY_BAD : NOTIFY_OK); +} + +static struct inode *sim_quota_root(struct super_block *sb) +{ + return sb->s_root->d_inode; +} + +/* + * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() + * does lookup_bdev() and get_super() which are comparing sb->s_bdev. + * so this is a MUST if we want unmodified sys_quotactl + * to work correctly on /dev/simfs inside VE + */ +static int sim_init_blkdev(struct super_block *sb) +{ + static struct hd_struct fake_hd; + struct block_device *blkdev; + + blkdev = bdget(sb->s_dev); + if (blkdev == NULL) + return -ENOMEM; + + blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */ + sb->s_bdev = blkdev; + + return 0; +} + +static void sim_free_blkdev(struct super_block *sb) +{ + /* set bd_part back to NULL */ + sb->s_bdev->bd_part = NULL; + bdput(sb->s_bdev); +} + +static void sim_quota_init(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); +} + +static void sim_quota_free(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); +} + +void sim_put_super(struct super_block *sb) +{ + mntput((struct vfsmount *)(sb->s_fs_info)); + sim_quota_free(sb); + sim_free_blkdev(sb); +} + +static struct super_operations sim_super_ops = { + .get_quota_root = sim_quota_root, + .put_super = sim_put_super, +}; + +static int sim_fill_super(struct super_block *s, void *data) +{ + int err; + struct nameidata *nd; + + err = set_anon_super(s, NULL); + if (err) + goto out; + + err = 0; + nd = (struct nameidata *)data; + s->s_fs_info = mntget(nd->mnt); + s->s_root = dget(nd->dentry); + s->s_op = &sim_super_ops; +out: + return err; +} + +static int sim_get_sb(struct file_system_type *type, int flags, + const char *dev_name, void *opt, struct vfsmount *mnt) +{ + int err; + struct nameidata nd; + struct super_block *sb; + + err = -EINVAL; + if (opt == NULL) + goto out; + + err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (err) + goto out; + + sb = sget(type, NULL, sim_fill_super, &nd); + err = PTR_ERR(sb); 
+ if (IS_ERR(sb)) + goto out_path; + + err = sim_init_blkdev(sb); + if (err) + goto out_killsb; + + sim_quota_init(sb); + + path_release(&nd); + return simple_set_mnt(mnt, sb); + +out_killsb: + up_write(&sb->s_umount); + deactivate_super(sb); +out_path: + path_release(&nd); +out: + return err; +} + +static struct file_system_type sim_fs_type = { + .owner = THIS_MODULE, + .name = "simfs", + .get_sb = sim_get_sb, + .kill_sb = kill_anon_super, +}; + +static struct vnotifier_block sim_syscalls = { + .notifier_call = sim_systemcall, +}; + +static int __init init_simfs(void) +{ + int err; + + err = register_filesystem(&sim_fs_type); + if (err) + return err; + + virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); + return 0; +} + +static void __exit exit_simfs(void) +{ + virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); + unregister_filesystem(&sim_fs_type); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); +MODULE_LICENSE("GPL v2"); + +module_init(init_simfs); +module_exit(exit_simfs); diff -uprN linux-2.6.18/fs/smbfs/inode.c linux-2.6.18.ovz/fs/smbfs/inode.c --- linux-2.6.18/fs/smbfs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/smbfs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -233,7 +233,7 @@ smb_invalidate_inodes(struct smb_sb_info { VERBOSE("\n"); shrink_dcache_sb(SB_of(server)); - invalidate_inodes(SB_of(server)); + invalidate_inodes(SB_of(server), 0); } /* diff -uprN linux-2.6.18/fs/smbfs/sock.c linux-2.6.18.ovz/fs/smbfs/sock.c --- linux-2.6.18/fs/smbfs/sock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/smbfs/sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -100,6 +100,7 @@ smb_close_socket(struct smb_sb_info *ser VERBOSE("closing socket %p\n", sock); sock->sk->sk_data_ready = server->data_ready; + sock->sk->sk_user_data = NULL; server->sock_file = NULL; fput(file); } diff -uprN linux-2.6.18/fs/splice.c linux-2.6.18.ovz/fs/splice.c --- linux-2.6.18/fs/splice.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/splice.c 2007-06-13 06:55:07.000000000 -0400 @@ -607,7 +607,7 @@ find_page: ret = -ENOMEM; page = page_cache_alloc_cold(mapping); if (unlikely(!page)) - goto out_nomem; + goto out_ret; /* * This will also lock the page @@ -666,7 +666,7 @@ find_page: if (sd->pos + this_len > isize) vmtruncate(mapping->host, isize); - goto out; + goto out_ret; } if (buf->page != page) { @@ -698,7 +698,7 @@ find_page: out: page_cache_release(page); unlock_page(page); -out_nomem: +out_ret: return ret; } diff -uprN linux-2.6.18/fs/stat.c linux-2.6.18.ovz/fs/stat.c --- linux-2.6.18/fs/stat.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/stat.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, st { struct inode *inode = dentry->d_inode; int retval; + struct faudit_stat_arg arg; retval = security_inode_getattr(mnt, dentry); if (retval) return retval; + arg.mnt = mnt; + arg.dentry = dentry; + arg.stat = stat; + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) + != NOTIFY_DONE) + return arg.err; + if (inode->i_op->getattr) return inode->i_op->getattr(mnt, dentry, stat); diff -uprN linux-2.6.18/fs/super.c linux-2.6.18.ovz/fs/super.c --- linux-2.6.18/fs/super.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/super.c 2007-06-13 06:55:07.000000000 -0400 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -45,7 +46,9 @@ void 
put_filesystem(struct file_system_t struct file_system_type *get_fs_type(const char *name); LIST_HEAD(super_blocks); +EXPORT_SYMBOL_GPL(super_blocks); DEFINE_SPINLOCK(sb_lock); +EXPORT_SYMBOL_GPL(sb_lock); /** * alloc_super - create new superblock @@ -73,13 +76,15 @@ static struct super_block *alloc_super(s INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); + lockdep_set_class(&s->s_umount, + &type->proto->s_umount_key); /* * The locking rules for s_lock are up to the * filesystem. For example ext3fs has different * lock ordering than usbfs: */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); + lockdep_set_class(&s->s_lock, + &type->proto->s_lock_key); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); @@ -244,7 +249,7 @@ void generic_shutdown_super(struct super lock_super(sb); sb->s_flags &= ~MS_ACTIVE; /* bad name - it should be evict_inodes() */ - invalidate_inodes(sb); + invalidate_inodes(sb, 0); lock_kernel(); if (sop->write_super && sb->s_dirt) @@ -253,7 +258,7 @@ void generic_shutdown_super(struct super sop->put_super(sb); /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (invalidate_inodes(sb, 1)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); @@ -482,17 +487,26 @@ rescan: spin_unlock(&sb_lock); return NULL; } +EXPORT_SYMBOL(user_get_super); asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) { + dev_t kdev; struct super_block *s; struct ustat tmp; struct kstatfs sbuf; - int err = -EINVAL; + int err; + + kdev = new_decode_dev(dev); + err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); + if (err) + goto out; + + err = -EINVAL; + s = user_get_super(kdev); + if (s == NULL) + goto out; - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) @@ -606,6 +620,13 @@ void emergency_remount(void) static struct idr unnamed_dev_idr; static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +/* for compatibility with coreutils still unaware of new minor sizes */ +int unnamed_dev_majors[] = { + 0, 144, 145, 146, 242, 243, 244, 245, + 246, 247, 248, 249, 250, 251, 252, 253 +}; +EXPORT_SYMBOL(unnamed_dev_majors); + int set_anon_super(struct super_block *s, void *data) { int dev; @@ -623,13 +644,13 @@ int set_anon_super(struct super_block *s else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, dev); spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_dev = make_unnamed_dev(dev); return 0; } @@ -637,8 +658,9 @@ EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { - int slot = MINOR(sb->s_dev); + int slot; + slot = unnamed_dev_idx(sb->s_dev); generic_shutdown_super(sb); spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, slot); diff -uprN linux-2.6.18/fs/sync.c linux-2.6.18.ovz/fs/sync.c --- linux-2.6.18/fs/sync.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sync.c 2007-06-13 06:55:07.000000000 -0400 @@ -11,6 +11,8 @@ #include #include +#include + #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -130,13 +132,17 @@ int do_sync_file_range(struct file *file { int ret; struct address_space *mapping; + struct user_beancounter *ub; 
mapping = file->f_mapping; if (!mapping) { ret = -EINVAL; - goto out; + goto out_noacct; } + ub = get_exec_ub(); + ub_percpu_inc(ub, frsync); + ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = wait_on_page_writeback_range(mapping, @@ -159,6 +165,8 @@ int do_sync_file_range(struct file *file endbyte >> PAGE_CACHE_SHIFT); } out: + ub_percpu_inc(ub, frsync_done); +out_noacct: return ret; } EXPORT_SYMBOL_GPL(do_sync_file_range); diff -uprN linux-2.6.18/fs/sysfs/bin.c linux-2.6.18.ovz/fs/sysfs/bin.c --- linux-2.6.18/fs/sysfs/bin.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/bin.c 2007-06-13 06:55:07.000000000 -0400 @@ -120,6 +120,9 @@ static int open(struct inode * inode, st struct bin_attribute * attr = to_bin_attr(file->f_dentry); int error = -EINVAL; + if (!ve_sysfs_alowed()) + return 0; + if (!kobj || !attr) goto Done; @@ -196,6 +199,9 @@ int sysfs_create_bin_file(struct kobject int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; + sysfs_hash_and_remove(kobj->dentry,attr->attr.name); return 0; } diff -uprN linux-2.6.18/fs/sysfs/dir.c linux-2.6.18.ovz/fs/sysfs/dir.c --- linux-2.6.18/fs/sysfs/dir.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/dir.c 2007-06-13 06:55:07.000000000 -0400 @@ -175,6 +175,9 @@ int sysfs_create_dir(struct kobject * ko struct dentry * parent; int error = 0; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj); if (kobj->parent) @@ -309,10 +312,14 @@ void sysfs_remove_subdir(struct dentry * void sysfs_remove_dir(struct kobject * kobj) { - struct dentry * dentry = dget(kobj->dentry); + struct dentry * dentry; struct sysfs_dirent * parent_sd; struct sysfs_dirent * sd, * tmp; + if (!ve_sysfs_alowed()) + return; + + dentry = dget(kobj->dentry); if (!dentry) return; @@ -341,6 +348,9 @@ int sysfs_rename_dir(struct kobject * ko int error = 0; struct dentry * new_dentry, * parent; + if (!ve_sysfs_alowed()) + return 0; + if (!strcmp(kobject_name(kobj), new_name)) return -EINVAL; diff -uprN linux-2.6.18/fs/sysfs/file.c linux-2.6.18.ovz/fs/sysfs/file.c --- linux-2.6.18/fs/sysfs/file.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/file.c 2007-06-13 06:55:07.000000000 -0400 @@ -457,6 +457,9 @@ int sysfs_add_file(struct dentry * dir, int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->dentry || !attr); return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); @@ -475,6 +478,9 @@ int sysfs_update_file(struct kobject * k struct dentry * victim; int res = -ENOENT; + if (!ve_sysfs_alowed()) + return 0; + mutex_lock(&dir->d_inode->i_mutex); victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { @@ -483,11 +489,6 @@ int sysfs_update_file(struct kobject * k (victim->d_parent->d_inode == dir->d_inode)) { victim->d_inode->i_mtime = CURRENT_TIME; fsnotify_modify(victim); - - /** - * Drop reference from initial sysfs_get_dentry(). 
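All of the sysfs hunks in this patch follow one idiom: inside a VE that has no sysfs superblock, creation-style calls succeed as no-ops and removal-style calls return quietly, so subsystem code that registers kobjects keeps working unmodified inside containers (the ve_sysfs_alowed() macro itself is defined in fs/sysfs/sysfs.h further below). A minimal sketch of the idiom; example_register() is a hypothetical caller, not patch code:

        /* Sketch: the per-VE sysfs guard used throughout fs/sysfs. */
        static int example_register(struct kobject *kobj)
        {
                if (!ve_sysfs_alowed())
                        return 0;       /* no sysfs in this VE: silent no-op */
                return sysfs_create_dir(kobj);
        }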
- */ - dput(victim); res = 0; } else d_drop(victim); @@ -550,6 +551,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return; + sysfs_hash_and_remove(kobj->dentry,attr->name); } diff -uprN linux-2.6.18/fs/sysfs/group.c linux-2.6.18.ovz/fs/sysfs/group.c --- linux-2.6.18/fs/sysfs/group.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/group.c 2007-06-13 06:55:07.000000000 -0400 @@ -46,6 +46,9 @@ int sysfs_create_group(struct kobject * struct dentry * dir; int error; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->dentry); if (grp->name) { @@ -68,6 +71,9 @@ void sysfs_remove_group(struct kobject * { struct dentry * dir; + if (!ve_sysfs_alowed()) + return; + if (grp->name) dir = lookup_one_len(grp->name, kobj->dentry, strlen(grp->name)); diff -uprN linux-2.6.18/fs/sysfs/inode.c linux-2.6.18.ovz/fs/sysfs/inode.c --- linux-2.6.18/fs/sysfs/inode.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/inode.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,8 +14,6 @@ #include #include "sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .prepare_write = simple_prepare_write, diff -uprN linux-2.6.18/fs/sysfs/mount.c linux-2.6.18.ovz/fs/sysfs/mount.c --- linux-2.6.18/fs/sysfs/mount.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/mount.c 2007-06-13 06:55:07.000000000 -0400 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -14,8 +15,11 @@ /* Random magic number */ #define SYSFS_MAGIC 0x62656572 +#ifndef CONFIG_VE struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; +#endif + kmem_cache_t *sysfs_dir_cachep; static struct super_operations sysfs_ops = { @@ -31,6 +35,15 @@ static struct sysfs_dirent sysfs_root = .s_iattr = NULL, }; +#ifdef CONFIG_VE +static void init_ve0_sysfs_root(void) +{ + get_ve0()->sysfs_root = &sysfs_root; +} + +#define sysfs_root (*(get_exec_env()->sysfs_root)) +#endif + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; @@ -72,16 +85,21 @@ static int sysfs_get_sb(struct file_syst return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } -static struct file_system_type sysfs_fs_type = { +struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, .kill_sb = kill_litter_super, }; +EXPORT_SYMBOL(sysfs_fs_type); + int __init sysfs_init(void) { int err = -ENOMEM; +#ifdef CONFIG_VE + init_ve0_sysfs_root(); +#endif sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), 0, 0, NULL, NULL); diff -uprN linux-2.6.18/fs/sysfs/symlink.c linux-2.6.18.ovz/fs/sysfs/symlink.c --- linux-2.6.18/fs/sysfs/symlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/fs/sysfs/symlink.c 2007-06-13 06:55:07.000000000 -0400 @@ -85,6 +85,9 @@ int sysfs_create_link(struct kobject * k struct dentry * dentry = kobj->dentry; int error = -EEXIST; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->dentry || !name); mutex_lock(&dentry->d_inode->i_mutex); @@ -103,6 +106,9 @@ int sysfs_create_link(struct kobject * k void sysfs_remove_link(struct kobject * kobj, const char * name) { + if(!ve_sysfs_alowed()) + return; + sysfs_hash_and_remove(kobj->dentry,name); } diff -uprN linux-2.6.18/fs/sysfs/sysfs.h linux-2.6.18.ovz/fs/sysfs/sysfs.h --- linux-2.6.18/fs/sysfs/sysfs.h 2006-09-19 23:42:06.000000000 
-0400 +++ linux-2.6.18.ovz/fs/sysfs/sysfs.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,5 +1,14 @@ -extern struct vfsmount * sysfs_mount; +#ifndef CONFIG_VE +extern struct vfsmount *sysfs_mount; +extern struct super_block *sysfs_sb; +#define ve_sysfs_alowed() (1) +#else +#define sysfs_mount (get_exec_env()->sysfs_mnt) +#define sysfs_sb (get_exec_env()->sysfs_sb) +#define ve_sysfs_alowed() (sysfs_sb != NULL) +#endif + extern kmem_cache_t *sysfs_dir_cachep; extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); @@ -21,7 +30,6 @@ extern void sysfs_drop_dentry(struct sys extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern struct rw_semaphore sysfs_rename_sem; -extern struct super_block * sysfs_sb; extern const struct file_operations sysfs_dir_operations; extern const struct file_operations sysfs_file_operations; extern const struct file_operations bin_fops; diff -uprN linux-2.6.18/fs/vzdq_file.c linux-2.6.18.ovz/fs/vzdq_file.c --- linux-2.6.18/fs/vzdq_file.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/fs/vzdq_file.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,893 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo quota files as proc entry implementation. + * It is required for std quota tools to work correctly as they are expecting + * aquota.user and aquota.group files. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * File read operation + * + * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, + * perhaps) abuse vz_quota_sem. + * Taking a global semaphore for lengthy and user-controlled operations inside + * VPSs is not a good idea in general. + * In this case, the reasons for taking this semaphore are completely unclear, + * especially taking into account that the only function that has comments + * about the necessity to be called under this semaphore + * (create_proc_quotafile) is actually called OUTSIDE it. 
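One finer-grained alternative hinted at by the FIXME above (a sketch of the idea only, not what the patch does) would be to hold just the per-master dq_sem around the tree walk, as read_proc_quotafile() below already does in addition to the global semaphore, and to reserve vz_quota_sem for hash-table changes:

        /* Sketch: serialize against this master block only, not globally. */
        down(&qtd->qmblk->dq_sem);
        /* ... walk QUGID_TREE(qtd->qmblk, qtd->type) and emit blocks ... */
        up(&qtd->qmblk->dq_sem);

Whether that is actually safe depends on what else vz_quota_sem protects on this path, which, as the comment notes, is not documented.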
+ * + * --------------------------------------------------------------------- */ + +#define DQBLOCK_SIZE 1024 +#define DQUOTBLKNUM 21U +#define DQTREE_DEPTH 4 +#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) +#define ISINDBLOCK(num) ((num)%2 != 0) +#define FIRST_DATABLK 2 /* first even number */ +#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) +#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) +#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ + & QUOTATREE_BMASK) + +#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) +#error xBITS and DQTREE_DEPTH does not correspond +#endif + +#define BLOCK_NOT_FOUND 1 + +/* data for quota file -- one per proc entry */ +struct quotatree_data { + struct list_head list; + struct vz_quota_master *qmblk; + int type; /* type of the tree */ +}; + +/* serialized by vz_quota_sem */ +static LIST_HEAD(qf_data_head); + +static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; +static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; + +static inline loff_t get_depoff(int depth) +{ + loff_t res = 1; + while (depth) { + res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); + depth--; + } + return res; +} + +static inline loff_t get_blknum(loff_t num, int depth) +{ + loff_t res; + res = (num << 1) + get_depoff(depth); + return res; +} + +static int get_depth(loff_t num) +{ + int i; + for (i = 0; i < DQTREE_DEPTH; i++) { + if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 + || num < get_depoff(i + 1))) + return i; + } + return -1; +} + +static inline loff_t get_offset(loff_t num) +{ + loff_t res, tmp; + + tmp = get_depth(num); + if (tmp < 0) + return -1; + num -= get_depoff(tmp); + BUG_ON(num < 0); + res = num >> 1; + + return res; +} + +static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) +{ + /* return maximum available block num */ + return tree->levels[level].freenum; +} + +static inline loff_t get_block_num(struct quotatree_tree *tree) +{ + loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; + + quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); + max_quot = TREENUM_2_BLKNUM(quot_blk_num); + ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); + max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) + : get_blknum(ind_blk_num, 0); + + return (max_ind > max_quot) ? 
max_ind + 1 : max_quot + 1; +} + +/* Write quota file header */ +static int read_header(void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int type) +{ + struct v2_disk_dqheader *dqh; + struct v2_disk_dqinfo *dq_disk_info; + + dqh = buf; + dq_disk_info = buf + sizeof(struct v2_disk_dqheader); + + dqh->dqh_magic = vzquota_magics[type]; + dqh->dqh_version = vzquota_versions[type]; + + dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; + dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; + dq_disk_info->dqi_flags = 0; /* no flags */ + dq_disk_info->dqi_blocks = get_block_num(tree); + dq_disk_info->dqi_free_blk = 0; /* first block in the file */ + dq_disk_info->dqi_free_entry = FIRST_DATABLK; + + return 0; +} + +static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) +{ + int i, j, lev_num; + + lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; + for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { + struct quotatree_node *next, *parent; + + parent = p; + next = p; + for (j = lev_num; j >= 0; j--) { + if (!next->blocks[GETLEVINDX(i,j)]) { + buf[i] = 0; + goto bad_branch; + } + parent = next; + next = next->blocks[GETLEVINDX(i,j)]; + } + buf[i] = (depth == DQTREE_DEPTH - 1) ? + TREENUM_2_BLKNUM(parent->num) + : get_blknum(next->num, depth + 1); + + bad_branch: + ; + } + + return 0; +} + +/* + * Write index block to disk (or buffer) + * @buf has length 256*sizeof(u_int32_t) bytes + */ +static int read_index_block(int num, u_int32_t *buf, + struct quotatree_tree *tree) +{ + struct quotatree_node *p; + u_int32_t index; + loff_t off; + int depth, res; + + res = BLOCK_NOT_FOUND; + index = 0; + depth = get_depth(num); + off = get_offset(num); + if (depth < 0 || off < 0) + return -EINVAL; + + list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, + list) { + if (p->num >= off) + res = 0; + if (p->num != off) + continue; + get_block_child(depth, p, buf); + break; + } + + return res; +} + +static inline void convert_quot_format(struct v2_disk_dqblk *dq, + struct vz_quota_ugid *vzq) +{ + dq->dqb_id = vzq->qugid_id; + dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; + dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; + dq->dqb_curinodes = vzq->qugid_stat.icurrent; + dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; + dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; + dq->dqb_curspace = vzq->qugid_stat.bcurrent; + dq->dqb_btime = vzq->qugid_stat.btime; + dq->dqb_itime = vzq->qugid_stat.itime; +} + +static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) +{ + int res, i, entries = 0; + struct v2_disk_dqdbheader *dq_header; + struct quotatree_node *p; + struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); + + res = BLOCK_NOT_FOUND; + dq_header = buf; + memset(dq_header, 0, sizeof(*dq_header)); + + list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), + list) { + if (TREENUM_2_BLKNUM(p->num) >= num) + res = 0; + if (TREENUM_2_BLKNUM(p->num) != num) + continue; + + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (!p->blocks[i]) + continue; + convert_quot_format(blk + entries, + (struct vz_quota_ugid *)p->blocks[i]); + entries++; + res = 0; + } + break; + } + dq_header->dqdh_entries = entries; + + return res; +} + +static int read_block(int num, void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int magic) +{ + int res; + + memset(buf, 0, DQBLOCK_SIZE); + if (!num) + res = read_header(buf, tree, dq_ugid_info, magic); + else if (ISINDBLOCK(num)) + res = 
read_index_block(num, (u_int32_t*)buf, tree); + else + res = read_dquot(num, buf, tree); + + return res; +} + +/* + * FIXME: this function can handle quota files up to 2GB only. + */ +static int read_proc_quotafile(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + off_t blk_num, blk_off, buf_off; + char *tmp; + size_t buf_size; + struct quotatree_data *qtd; + struct quotatree_tree *tree; + struct dq_info *dqi; + int res; + + *start = NULL; + tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + qtd = data; + down(&vz_quota_sem); + down(&qtd->qmblk->dq_sem); + + res = 0; + tree = QUGID_TREE(qtd->qmblk, qtd->type); + if (!tree) { + *eof = 1; + goto out_dq; + } + + dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; + + buf_off = 0; + buf_size = count; + blk_num = off / DQBLOCK_SIZE; + blk_off = off % DQBLOCK_SIZE; + + while (buf_size > 0) { + off_t len; + + len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); + res = read_block(blk_num, tmp, tree, dqi, qtd->type); + if (res < 0) + goto out_err; + if (res == BLOCK_NOT_FOUND) { + *eof = 1; + break; + } + memcpy(page + buf_off, tmp + blk_off, len); + + blk_num++; + buf_size -= len; + blk_off = 0; + buf_off += len; + } + res = buf_off; + +out_err: + *start += count; +out_dq: + up(&qtd->qmblk->dq_sem); + up(&vz_quota_sem); + kfree(tmp); + + return res; +} + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID/aquota.* files + * + * FIXME: this code lacks serialization of read/readdir/lseek. + * However, this problem should be fixed after the mainstream issue of what + * appears to be non-atomic read and update of file position in sys_read. + * + * --------------------------------------------------------------------- */ + +static inline unsigned long vzdq_aquot_getino(dev_t dev) +{ + return 0xec000000UL + dev; +} + +static inline dev_t vzdq_aquot_getidev(struct inode *inode) +{ + return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; +} + +static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) +{ + PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; +} + +static ssize_t vzdq_aquotf_read(struct file *file, + char __user *buf, size_t size, loff_t *ppos) +{ + char *page; + size_t bufsize; + ssize_t l, l2, copied; + char *start; + struct inode *inode; + struct block_device *bdev; + struct super_block *sb; + struct quotatree_data data; + int eof, err; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (page == NULL) + goto out_err; + + err = -ENODEV; + inode = file->f_dentry->d_inode; + bdev = bdget(vzdq_aquot_getidev(inode)); + if (bdev == NULL) + goto out_err; + sb = get_super(bdev); + bdput(bdev); + if (sb == NULL) + goto out_err; + data.qmblk = vzquota_find_qmblk(sb); + data.type = PROC_I(inode)->fd - 1; + drop_super(sb); + if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) + goto out_err; + + copied = 0; + l = l2 = 0; + while (1) { + bufsize = min(size, (size_t)PAGE_SIZE); + if (bufsize <= 0) + break; + + l = read_proc_quotafile(page, &start, *ppos, bufsize, + &eof, &data); + if (l <= 0) + break; + + l2 = copy_to_user(buf, page, l); + copied += l - l2; + if (l2) + break; + + buf += l; + size -= l; + *ppos += (unsigned long)start; + l = l2 = 0; + } + + qmblk_put(data.qmblk); + free_page((unsigned long)page); + if (copied) + return copied; + else if (l2) /* last copy_to_user failed */ + return -EFAULT; + else /* read error or EOF */ + return l; + +out_err: + if (page != NULL) + free_page((unsigned 
long)page); + return err; +} + +static struct file_operations vzdq_aquotf_file_operations = { + .read = &vzdq_aquotf_read, +}; + +static struct inode_operations vzdq_aquotf_inode_operations = { +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID directory + * + * --------------------------------------------------------------------- */ + +static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) +{ + loff_t n; + int err; + + n = file->f_pos; + for (err = 0; !err; n++) { + /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */ + switch ((unsigned long)n) { + case 0: + err = (*filler)(data, ".", 1, n, + file->f_dentry->d_inode->i_ino, + DT_DIR); + break; + case 1: + err = (*filler)(data, "..", 2, n, + parent_ino(file->f_dentry), DT_DIR); + break; + case 2: + err = (*filler)(data, "aquota.user", 11, n, + file->f_dentry->d_inode->i_ino + + USRQUOTA + 1, + DT_REG); + break; + case 3: + err = (*filler)(data, "aquota.group", 12, n, + file->f_dentry->d_inode->i_ino + + GRPQUOTA + 1, + DT_REG); + break; + default: + goto out; + } + } +out: + file->f_pos = n; + return err; +} + +struct vzdq_aquotq_lookdata { + dev_t dev; + int type; + struct vz_quota_master *qmblk; +}; + +static int vzdq_aquotq_looktest(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + + d = data; + return inode->i_op == &vzdq_aquotf_inode_operations && + vzdq_aquot_getidev(inode) == d->dev && + PROC_I(inode)->fd == d->type + 1; +} + +static int vzdq_aquotq_lookset(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + struct quotatree_tree *tree; + + d = data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 1; + inode->i_op = &vzdq_aquotf_inode_operations; + inode->i_fop = &vzdq_aquotf_file_operations; + PROC_I(inode)->fd = d->type + 1; + vzdq_aquot_setidev(inode, d->dev); + + /* Setting size */ + tree = QUGID_TREE(d->qmblk, d->type); + inode->i_size = get_block_num(tree) * 1024; + return 0; +} + +static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd) +{ + return 0; +} + +static struct dentry_operations vzdq_aquotq_dentry_operations = { + .d_revalidate = &vzdq_aquotq_revalidate, +}; + +static struct vz_quota_master *find_qmblk_by_dev(dev_t dev) +{ + struct super_block *sb; + struct vz_quota_master *qmblk; + + qmblk = NULL; + sb = user_get_super(dev); + if (sb != NULL) { + qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qmblk == VZ_QUOTA_BAD) + qmblk = NULL; + } + + return qmblk; +} + +static struct dentry *vzdq_aquotq_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + struct vzdq_aquotq_lookdata d; + int k; + + if (dentry->d_name.len == 11) { + if (memcmp(dentry->d_name.name, "aquota.user", 11)) + goto out; + k = USRQUOTA; + } else if (dentry->d_name.len == 12) { + if (memcmp(dentry->d_name.name, "aquota.group", 12)) + goto out; + k = GRPQUOTA; + } else + goto out; + d.dev = vzdq_aquot_getidev(dir); + d.type = k; + d.qmblk = find_qmblk_by_dev(d.dev); + if (d.qmblk == NULL) + goto out; + + inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, + vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); + + /* qmlbk ref is not needed, we used it for i_size calculation only */ + qmblk_put(d.qmblk); + if (inode == NULL) + goto out; + + unlock_new_inode(inode); + 
dentry->d_op = &vzdq_aquotq_dentry_operations; + d_add(dentry, inode); + return NULL; + +out: + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotq_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotq_readdir, +}; + +static struct inode_operations vzdq_aquotq_inode_operations = { + .lookup = &vzdq_aquotq_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota directory + * + * --------------------------------------------------------------------- */ + +struct vzdq_aquot_de { + struct list_head list; + struct vfsmount *mnt; +}; + +static int vzdq_aquot_buildmntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vfsmount *rmnt, *mnt; + struct vzdq_aquot_de *p; + int err; + +#ifdef CONFIG_VE + rmnt = mntget(ve->fs_rootmnt); +#else + read_lock(¤t->fs->lock); + rmnt = mntget(current->fs->rootmnt); + read_unlock(¤t->fs->lock); +#endif + mnt = rmnt; + spin_lock(&vfsmount_lock); + while (1) { + list_for_each_entry(p, head, list) { + if (p->mnt->mnt_sb == mnt->mnt_sb) + goto skip; + } + + err = -ENOMEM; + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (p == NULL) + goto out; + p->mnt = mntget(mnt); + list_add_tail(&p->list, head); + +skip: + err = 0; + if (list_empty(&mnt->mnt_mounts)) { + while (1) { + if (mnt == rmnt) + goto out; + if (mnt->mnt_child.next != + &mnt->mnt_parent->mnt_mounts) + break; + mnt = mnt->mnt_parent; + } + mnt = list_entry(mnt->mnt_child.next, + struct vfsmount, mnt_child); + } else + mnt = list_entry(mnt->mnt_mounts.next, + struct vfsmount, mnt_child); + } +out: + spin_unlock(&vfsmount_lock); + mntput(rmnt); + return err; +} + +static void vzdq_aquot_releasemntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vzdq_aquot_de *p; + + while (!list_empty(head)) { + p = list_entry(head->next, typeof(*p), list); + mntput(p->mnt); + list_del(&p->list); + kfree(p); + } +} + +static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) +{ + struct ve_struct *ve, *old_ve; + struct list_head mntlist; + struct vzdq_aquot_de *de; + struct super_block *sb; + struct vz_quota_master *qmblk; + loff_t i, n; + char buf[24]; + int l, err; + + i = 0; + n = file->f_pos; + ve = file->f_dentry->d_sb->s_type->owner_env; + old_ve = set_exec_env(ve); + + INIT_LIST_HEAD(&mntlist); +#ifdef CONFIG_VE + /* + * The only reason of disabling readdir for the host system is that + * this readdir can be slow and CPU consuming with large number of VPSs + * (or just mount points). 
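The inode numbering used by these proc entries is worth spelling out with an invented device number: for a block device whose new_encode_dev() value is 0x801, the /proc/vz/vzaquota/00000801 directory gets inode 0xec000000 + 0x801 = 0xec000801 from vzdq_aquot_getino(), aquota.user gets 0xec000802 (type USRQUOTA = 0, plus one) and aquota.group gets 0xec000803 (type GRPQUOTA = 1, plus one), matching what vzdq_aquotq_lookset() and the readdir callbacks above compute.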
+ */ + err = ve_is_super(ve); +#else + err = 0; +#endif + if (!err) { + err = vzdq_aquot_buildmntlist(ve, &mntlist); + if (err) + goto out_err; + } + + if (i >= n) { + if ((*filler)(data, ".", 1, i, + file->f_dentry->d_inode->i_ino, DT_DIR)) + goto out_fill; + } + i++; + + if (i >= n) { + if ((*filler)(data, "..", 2, i, + parent_ino(file->f_dentry), DT_DIR)) + goto out_fill; + } + i++; + + list_for_each_entry (de, &mntlist, list) { + sb = de->mnt->mnt_sb; + if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) + continue; + + qmblk = vzquota_find_qmblk(sb); + if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) + continue; + + qmblk_put(qmblk); + i++; + if (i <= n) + continue; + + l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); + if ((*filler)(data, buf, l, i - 1, + vzdq_aquot_getino(sb->s_dev), DT_DIR)) + break; + } + +out_fill: + err = 0; + file->f_pos = i; +out_err: + vzdq_aquot_releasemntlist(ve, &mntlist); + (void)set_exec_env(old_ve); + return err; +} + +static int vzdq_aquotd_looktest(struct inode *inode, void *data) +{ + return inode->i_op == &vzdq_aquotq_inode_operations && + vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; +} + +static int vzdq_aquotd_lookset(struct inode *inode, void *data) +{ + dev_t dev; + + dev = (dev_t)(unsigned long)data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(dev); + inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 2; + inode->i_op = &vzdq_aquotq_inode_operations; + inode->i_fop = &vzdq_aquotq_file_operations; + vzdq_aquot_setidev(inode, dev); + return 0; +} + +static struct dentry *vzdq_aquotd_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct ve_struct *ve, *old_ve; + const unsigned char *s; + int l; + dev_t dev; + struct inode *inode; + + ve = dir->i_sb->s_type->owner_env; + old_ve = set_exec_env(ve); +#ifdef CONFIG_VE + /* + * Lookup is much lighter than readdir, so it can be allowed for the + * host system. But it would be strange to be able to do lookup only + * without readdir... + */ + if (ve_is_super(ve)) + goto out; +#endif + + dev = 0; + l = dentry->d_name.len; + if (l <= 0) + goto out; + for (s = dentry->d_name.name; l > 0; s++, l--) { + if (!isxdigit(*s)) + goto out; + if (dev & ~(~0UL >> 4)) + goto out; + dev <<= 4; + if (isdigit(*s)) + dev += *s - '0'; + else if (islower(*s)) + dev += *s - 'a' + 10; + else + dev += *s - 'A' + 10; + } + dev = new_decode_dev(dev); + + if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) + goto out; + + inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), + vzdq_aquotd_looktest, vzdq_aquotd_lookset, + (void *)(unsigned long)dev); + if (inode == NULL) + goto out; + unlock_new_inode(inode); + + d_add(dentry, inode); + (void)set_exec_env(old_ve); + return NULL; + +out: + (void)set_exec_env(old_ve); + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotd_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotd_readdir, +}; + +static struct inode_operations vzdq_aquotd_inode_operations = { + .lookup = &vzdq_aquotd_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * Initialization and deinitialization + * + * --------------------------------------------------------------------- */ + +/* + * FIXME: creation of proc entries here is unsafe with respect to module + * unloading. 
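For /proc/vz/vzquota, fs/vzdq_mgmt.c below avoids the problem noted in this FIXME by registering through create_proc_entry_mod(..., THIS_MODULE); vzaquota_init() here has no such protection. A hedged sketch of the difference, assuming create_proc_entry_mod() ties the entry's callbacks to the owning module as its name and THIS_MODULE argument suggest:

        /* Unsafe wrt unloading: nothing pins the module while the
         * entry's callbacks can still be invoked (the case above). */
        de = create_proc_glob_entry("vz/vzaquota",
                                    S_IFDIR | S_IRUSR | S_IXUSR, NULL);

        /* Module-aware variant used by vzquota_proc_init() below. */
        de = create_proc_entry_mod("vz/vzquota", S_IFREG | S_IRUSR,
                                   NULL, THIS_MODULE);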
+ */ +void vzaquota_init(void) +{ + struct proc_dir_entry *de; + + de = create_proc_glob_entry("vz/vzaquota", + S_IFDIR | S_IRUSR | S_IXUSR, NULL); + if (de != NULL) { + de->proc_iops = &vzdq_aquotd_inode_operations; + de->proc_fops = &vzdq_aquotd_file_operations; + } else + printk("VZDQ: vz/vzaquota creation failed\n"); +#if defined(CONFIG_SYSCTL) + de = create_proc_glob_entry("sys/fs/quota", + S_IFDIR | S_IRUSR | S_IXUSR, NULL); + if (de == NULL) + printk("VZDQ: sys/fs/quota creation failed\n"); +#endif +} + +void vzaquota_fini(void) +{ + remove_proc_entry("vz/vzaquota", NULL); +} diff -uprN linux-2.6.18/fs/vzdq_mgmt.c linux-2.6.18.ovz/fs/vzdq_mgmt.c --- linux-2.6.18/fs/vzdq_mgmt.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/fs/vzdq_mgmt.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,758 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Switching quota on. + * --------------------------------------------------------------------- */ + +/* + * check limits copied from user + */ +int vzquota_check_sane_limits(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* softlimit must be less than hardlimit */ + if (qstat->bsoftlimit > qstat->bhardlimit) + goto out; + + if (qstat->isoftlimit > qstat->ihardlimit) + goto out; + + err = 0; +out: + return err; +} + +/* + * check usage values copied from user + */ +int vzquota_check_sane_values(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* expiration time must not be set if softlimit was not exceeded */ + if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0) + goto out; + + if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0) + goto out; + + err = vzquota_check_sane_limits(qstat); +out: + return err; +} + +/* + * create new quota master block + * this function should: + * - copy limits and usage parameters from user buffer; + * - allocate, initialize quota block and insert it into the hash; + */ +static int vzquota_create(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + err = -EINVAL; + if (quota_id == 0) + goto out; + + if (vzquota_check_sane_values(&qstat.dq_stat)) + goto out; + err = 0; + qmblk = vzquota_alloc_master(quota_id, &qstat); + + if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ + err = PTR_ERR(qmblk); +out: + up(&vz_quota_sem); + + return err; +} + +/** + * vzquota_on - turn quota on + * + * This function should: + * - find and get refcnt of directory entry for quota root and corresponding + * mountpoint; + * - find corresponding quota block and mark it with given path; + * - check quota tree; + * - initialize quota for the tree root.
+ */ +static int vzquota_on(unsigned int quota_id, const char __user *quota_root) +{ + int err; + struct nameidata nd; + struct vz_quota_master *qmblk; + struct super_block *dqsb; + + dqsb = NULL; + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; + + err = user_path_walk(quota_root, &nd); + if (err) + goto out; + /* init path must be a directory */ + err = -ENOTDIR; + if (!S_ISDIR(nd.dentry->d_inode->i_mode)) + goto out_path; + + qmblk->dq_root_dentry = nd.dentry; + qmblk->dq_root_mnt = nd.mnt; + qmblk->dq_sb = nd.dentry->d_inode->i_sb; + err = vzquota_get_super(qmblk->dq_sb); + if (err) + goto out_super; + + /* + * Serialization with quota initialization and operations is performed + * through generation check: generation is memorized before qmblk is + * found and compared under inode_qmblk_lock with assignment. + * + * Note that the dentry tree is shrunk only for high-level logical + * serialization, purely as a courtesy to the user: to have consistent + * quota statistics, files should be closed etc. on quota on. + */ + err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode, + qmblk); + if (err) + goto out_init; + qmblk->dq_state = VZDQ_WORKING; + + up(&vz_quota_sem); + return 0; + +out_init: + dqsb = qmblk->dq_sb; +out_super: + /* clear for qmblk_put/quota_free_master */ + qmblk->dq_sb = NULL; + qmblk->dq_root_dentry = NULL; + qmblk->dq_root_mnt = NULL; +out_path: + path_release(&nd); +out: + if (dqsb) + vzquota_put_super(dqsb); + up(&vz_quota_sem); + return err; +} + + +/* ---------------------------------------------------------------------- + * Switching quota off. + * --------------------------------------------------------------------- */ + +/* + * destroy quota block by ID + */ +static int vzquota_destroy(unsigned int quota_id) +{ + int err; + struct vz_quota_master *qmblk; + struct dentry *dentry; + struct vfsmount *mnt; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state == VZDQ_WORKING) + goto out; /* quota_off first */ + + list_del_init(&qmblk->dq_hash); + dentry = qmblk->dq_root_dentry; + qmblk->dq_root_dentry = NULL; + mnt = qmblk->dq_root_mnt; + qmblk->dq_root_mnt = NULL; + + if (qmblk->dq_sb) + vzquota_put_super(qmblk->dq_sb); + up(&vz_quota_sem); + + qmblk_put(qmblk); + dput(dentry); + mntput(mnt); + return 0; + +out: + up(&vz_quota_sem); + return err; +} + +/** + * vzquota_off - turn quota off + */ + +static int __vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk, + enum writeback_sync_modes sync_mode) +{ + struct writeback_control wbc; + LIST_HEAD(list); + struct vz_quota_ilink *qlnk; + struct inode *inode; + int err, ret; + + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = sync_mode; + + err = ret = 0; + while (!list_empty(lh)) { + if (need_resched()) { + inode_qmblk_unlock(qmblk->dq_sb); + schedule(); + inode_qmblk_lock(qmblk->dq_sb); + continue; + } + + qlnk = list_first_entry(lh, struct vz_quota_ilink, list); + list_move(&qlnk->list, &list); + + inode = igrab(QLNK_INODE(qlnk)); + if (!inode) + continue; + + inode_qmblk_unlock(qmblk->dq_sb); + + wbc.nr_to_write = LONG_MAX; + ret = sync_inode(inode, &wbc); + if (ret) + err = ret; + iput(inode); + + inode_qmblk_lock(qmblk->dq_sb); + } + + list_splice(&list, lh); + return err; +} + +static int vzquota_sync_list(struct list_head *lh, + struct 
vz_quota_master *qmblk) +{ + (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); + return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); +} + +static int vzquota_sync_inodes(struct vz_quota_master *qmblk) +{ + int err; + LIST_HEAD(qlnk_list); + + list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); + err = vzquota_sync_list(&qlnk_list, qmblk); + if (!err && !list_empty(&qmblk->dq_ilink_list)) + err = -EBUSY; + list_splice(&qlnk_list, &qmblk->dq_ilink_list); + + return err; +} + +static int vzquota_off(unsigned int quota_id) +{ + int err, ret; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EALREADY; + if (qmblk->dq_state != VZDQ_WORKING) + goto out; + + inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ + ret = vzquota_sync_inodes(qmblk); + inode_qmblk_unlock(qmblk->dq_sb); + + err = vzquota_off_qmblk(qmblk->dq_sb, qmblk); + if (err) + goto out; + + err = ret; + /* vzquota_destroy will free resources */ + qmblk->dq_state = VZDQ_STOPING; +out: + up(&vz_quota_sem); + + return err; +} + + +/* ---------------------------------------------------------------------- + * Other VZQUOTA ioctl's. + * --------------------------------------------------------------------- */ + +/* + * this function should: + * - set new limits/buffer under quota master block lock + * - if the new softlimit is less than the usage, set the expiration time + * - no need to alloc ugid hash table - we'll do that on demand + */ +int vzquota_update_limit(struct dq_stat *_qstat, + struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + if (vzquota_check_sane_limits(qstat)) + goto out; + + err = 0; + + /* limits */ + _qstat->bsoftlimit = qstat->bsoftlimit; + _qstat->bhardlimit = qstat->bhardlimit; + /* + * If the soft limit is exceeded, the administrator can override the + * moment when the grace period for limit exceeding ends. + * Specifying the moment may be useful if the soft limit is set to be + * lower than the current usage. In the latter case, if the grace + * period end isn't specified, the grace period will start from the + * moment of the first write operation. + * There is a race with the user level. The soft limit may already be + * exceeded before the limit change, and the grace period end + * calculated by the kernel will be overridden. The user level may + * check if the limit is already exceeded, but check and set calls are + * not atomic. + * This race isn't dangerous. Under normal circumstances, the + * difference between the grace period end calculated by the kernel + * and by the user level should be no greater than the difference + * between the moments of the check and set calls, i.e. no bigger than + * the quota timer resolution - 1 sec. + */ + if (qstat->btime != (time_t)0 && + _qstat->bcurrent >= _qstat->bsoftlimit) + _qstat->btime = qstat->btime; + + _qstat->isoftlimit = qstat->isoftlimit; + _qstat->ihardlimit = qstat->ihardlimit; + if (qstat->itime != (time_t)0 && + _qstat->icurrent >= _qstat->isoftlimit) + _qstat->itime = qstat->itime; + +out: + return err; +} + +/* + * set new quota limits. + * this function should: + * - copy new limits from user level + * - find quota block + * - set new limits and flags.
+ */ +static int vzquota_setlimit(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); /* for hash list protection */ + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + qmblk_data_write_lock(qmblk); + err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); + if (err == 0) + qmblk->dq_info = qstat.dq_info; + qmblk_data_write_unlock(qmblk); + +out: + up(&vz_quota_sem); + return err; +} + +/* + * get quota limits. + * very simple - just return stat buffer to user + */ +static int vzquota_getstat(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + qmblk_data_read_lock(qmblk); + /* copy whole buffer under lock */ + memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); + memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); + qmblk_data_read_unlock(qmblk); + + if (!compat) + err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); + else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat); + dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info); + err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat)); +#endif + } + if (err) + err = -EFAULT; + +out: + up(&vz_quota_sem); + return err; +} + +/* + * This is a system call to turn per-VE disk quota on. 
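do_vzquotactl() below dispatches these commands, and the dq_state checks in the handlers above enforce a strict order: vzquota_on() requires VZDQ_STARTING, vzquota_off() requires VZDQ_WORKING, and vzquota_destroy() refuses a master block that is still VZDQ_WORKING. An illustrative user-level sequence, assuming a hypothetical vzquotactl() wrapper with the argument order used here:

        vzquotactl(VZ_DQ_CREATE, qid, &qstat, NULL);    /* register limits */
        vzquotactl(VZ_DQ_ON, qid, NULL, ve_root);       /* attach to a path */
        vzquotactl(VZ_DQ_GETSTAT, qid, &qstat, NULL);   /* read usage back */
        vzquotactl(VZ_DQ_OFF, qid, NULL, NULL);         /* sync and detach */
        vzquotactl(VZ_DQ_DESTROY, qid, NULL, NULL);     /* free the block */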
+ * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat) +{ + int ret; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_CREATE: + ret = vzquota_create(quota_id, qstat, compat); + break; + case VZ_DQ_DESTROY: + ret = vzquota_destroy(quota_id); + break; + case VZ_DQ_ON: + ret = vzquota_on(quota_id, ve_root); + break; + case VZ_DQ_OFF: + ret = vzquota_off(quota_id); + break; + case VZ_DQ_SETLIMIT: + ret = vzquota_setlimit(quota_id, qstat, compat); + break; + case VZ_DQ_GETSTAT: + ret = vzquota_getstat(quota_id, qstat, compat); + break; + + default: + ret = -EINVAL; + goto out; + } + +out: + return ret; +} + + +/* ---------------------------------------------------------------------- + * Proc filesystem routines + * ---------------------------------------------------------------------*/ + +#if defined(CONFIG_PROC_FS) + +#define QUOTA_UINT_LEN 15 +#define QUOTA_TIME_LEN_FMT_UINT "%11u" +#define QUOTA_NUM_LEN_FMT_UINT "%15u" +#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" +#define QUOTA_TIME_LEN_FMT_STR "%11s" +#define QUOTA_NUM_LEN_FMT_STR "%15s" +#define QUOTA_PROC_MAX_LINE_LEN 2048 + +/* + * prints /proc/ve_dq header line + */ +static int print_proc_header(char * buffer) +{ + return sprintf(buffer, + "%-11s" + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + "\n", + "qid: path", + "usage", "softlimit", "hardlimit", "time", "expire"); +} + +/* + * prints proc master record id, dentry path + */ +static int print_proc_master_id(char * buffer, char * path_buf, + struct vz_quota_master * qp) +{ + char *path; + int over; + + path = NULL; + switch (qp->dq_state) { + case VZDQ_WORKING: + if (!path_buf) { + path = ""; + break; + } + path = d_path(qp->dq_root_dentry, + qp->dq_root_mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + path = ""; + break; + } + /* do not print large path, truncate it */ + over = strlen(path) - + (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - + QUOTA_UINT_LEN); + if (over > 0) { + path += over - 3; + path[0] = path[1] = path[3] = '.'; + } + break; + case VZDQ_STARTING: + path = "-- started --"; + break; + case VZDQ_STOPING: + path = "-- stopped --"; + break; + } + + return sprintf(buffer, "%u: %s\n", qp->dq_id, path); +} + +/* + * prints struct vz_quota_stat data + */ +static int print_proc_stat(char * buffer, struct dq_stat *qs, + struct dq_info *qi) +{ + return sprintf(buffer, + "%11s" + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n" + "%11s" + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n", + "1k-blocks", + (unsigned long long)qs->bcurrent >> 10, + (unsigned long long)qs->bsoftlimit >> 10, + (unsigned long long)qs->bhardlimit >> 10, + (unsigned int)qs->btime, + (unsigned int)qi->bexpire, + "inodes", + qs->icurrent, + qs->isoftlimit, + qs->ihardlimit, + (unsigned int)qs->itime, + (unsigned int)qi->iexpire); +} + + +/* + * for /proc filesystem output + */ +static int vzquota_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len, i; + off_t printed = 0; + char *p = page; + struct vz_quota_master *qp; + struct vz_quota_ilink *ql2; + struct list_head *listp; + char 
*path_buf; + + path_buf = (char*)__get_free_page(GFP_KERNEL); + if (path_buf == NULL) + return -ENOMEM; + + len = print_proc_header(p); + printed += len; + if (off < printed) /* keep header in output */ { + *start = p + off; + p += len; + } + + down(&vz_quota_sem); + + /* traverse master hash table for all records */ + for (i = 0; i < vzquota_hash_size; i++) { + list_for_each(listp, &vzquota_hash_table[i]) { + qp = list_entry(listp, + struct vz_quota_master, dq_hash); + + /* Skip other VE's information if not root of VE0 */ + if ((!capable(CAP_SYS_ADMIN) || + !capable(CAP_SYS_RESOURCE))) { + ql2 = INODE_QLNK(current->fs->root->d_inode); + if (ql2 == NULL || qp != ql2->qmblk) + continue; + } + /* + * Now print the next record + */ + len = 0; + /* we print quotaid and path only in VE0 */ + if (capable(CAP_SYS_ADMIN)) + len += print_proc_master_id(p+len,path_buf, qp); + len += print_proc_stat(p+len, &qp->dq_stat, + &qp->dq_info); + printed += len; + /* skip unnecessary lines */ + if (printed <= off) + continue; + p += len; + /* provide start offset */ + if (*start == NULL) + *start = p + (off - printed); + /* have we printed all requested size? */ + if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || + (p - *start) >= count) + goto out; + } + } + + *eof = 1; /* checked all hash */ +out: + up(&vz_quota_sem); + + len = 0; + if (*start != NULL) { + len = (p - *start); + if (len > count) + len = count; + } + + if (path_buf) + free_page((unsigned long) path_buf); + + return len; +} + +/* + * Register procfs read callback + */ +int vzquota_proc_init(void) +{ + struct proc_dir_entry *de; + + de = create_proc_entry_mod("vz/vzquota", S_IFREG|S_IRUSR, NULL, + THIS_MODULE); + if (de == NULL) { + /* create "vz" subdirectory, if not exist */ + de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + if (de == NULL) + goto out_err; + de = create_proc_entry_mod("vzquota", S_IFREG|S_IRUSR, de, + THIS_MODULE); + if (de == NULL) + goto out_err; + } + de->read_proc = vzquota_read_proc; + de->data = NULL; + return 0; +out_err: + return -EBUSY; +} + +void vzquota_proc_release(void) +{ + /* Unregister procfs read callback */ + remove_proc_entry("vz/vzquota", NULL); +} + +#endif diff -uprN linux-2.6.18/fs/vzdq_ops.c linux-2.6.18.ovz/fs/vzdq_ops.c --- linux-2.6.18/fs/vzdq_ops.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/fs/vzdq_ops.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,634 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Quota superblock operations - helper functions. 
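As a worked illustration of the limit semantics the helpers below implement (numbers invented): with bsoftlimit = 100 MB, bhardlimit = 110 MB and bcurrent = 90 MB, an allocation of 25 MB would cross the hard limit and is refused with NO_QUOTA outright; an allocation of 15 MB only crosses the soft limit, so it succeeds and btime is set to CURRENT_TIME_SECONDS + bexpire. Once that grace moment passes, further over-soft-limit allocations are refused until usage falls back below the soft limit, at which point vzquota_decr_space() resets btime to zero.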
+ * --------------------------------------------------------------------- */
+
+static inline void vzquota_incr_inodes(struct dq_stat *dqstat,
+		unsigned long number)
+{
+	dqstat->icurrent += number;
+}
+
+static inline void vzquota_incr_space(struct dq_stat *dqstat,
+		__u64 number)
+{
+	dqstat->bcurrent += number;
+}
+
+static inline void vzquota_decr_inodes(struct dq_stat *dqstat,
+		unsigned long number)
+{
+	if (dqstat->icurrent > number)
+		dqstat->icurrent -= number;
+	else
+		dqstat->icurrent = 0;
+	if (dqstat->icurrent < dqstat->isoftlimit)
+		dqstat->itime = (time_t) 0;
+}
+
+static inline void vzquota_decr_space(struct dq_stat *dqstat,
+		__u64 number)
+{
+	if (dqstat->bcurrent > number)
+		dqstat->bcurrent -= number;
+	else
+		dqstat->bcurrent = 0;
+	if (dqstat->bcurrent < dqstat->bsoftlimit)
+		dqstat->btime = (time_t) 0;
+}
+
+/*
+ * TODO: a better printk() message, or a /proc/vzquotamsg interface
+ * similar to /proc/kmsg
+ */
+static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag,
+		const char *fmt)
+{
+	if (dq_info->flags & flag)
+		/* the warning has already been printed for this masterblock */
+		return;
+	printk(fmt, dq_id);
+	dq_info->flags |= flag;
+}
+
+/*
+ * ignore_hardlimit -
+ *
+ * Intended to allow the superuser of VE0 to override hardlimits.
+ *
+ * ignore_hardlimit() has a serious flaw:
+ *
+ * a writepage() operation on a writable mapping of a file with holes
+ * may trigger get_block() with the wrong current and, as a consequence,
+ * opens up a way to overcommit hardlimits
+ */
+/* for the reason above, it is disabled now */
+static inline int ignore_hardlimit(struct dq_info *dqstat)
+{
+#if 0
+	return ve_is_super(get_exec_env()) &&
+		capable(CAP_SYS_RESOURCE) &&
+		(dqstat->options & VZ_QUOTA_OPT_RSQUASH);
+#else
+	return 0;
+#endif
+}
+
+static int vzquota_check_inodes(struct dq_info *dq_info,
+		struct dq_stat *dqstat,
+		unsigned long number, int dq_id)
+{
+	if (number == 0)
+		return QUOTA_OK;
+
+	if (dqstat->icurrent + number > dqstat->ihardlimit &&
+	    !ignore_hardlimit(dq_info)) {
+		vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+				"VZ QUOTA: file hardlimit reached for id=%d\n");
+		return NO_QUOTA;
+	}
+
+	if (dqstat->icurrent + number > dqstat->isoftlimit) {
+		if (dqstat->itime == (time_t)0) {
+			vzquota_warn(dq_info, dq_id, 0,
+				"VZ QUOTA: file softlimit exceeded "
+				"for id=%d\n");
+			dqstat->itime = CURRENT_TIME_SECONDS +
+				dq_info->iexpire;
+		} else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
+			   !ignore_hardlimit(dq_info)) {
+			vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+				"VZ QUOTA: file softlimit expired "
+				"for id=%d\n");
+			return NO_QUOTA;
+		}
+	}
+
+	return QUOTA_OK;
+}
+
+static int vzquota_check_space(struct dq_info *dq_info,
+		struct dq_stat *dqstat,
+		__u64 number, int dq_id, char prealloc)
+{
+	if (number == 0)
+		return QUOTA_OK;
+
+	if (prealloc == DQUOT_CMD_FORCE)
+		return QUOTA_OK;
+
+	if (dqstat->bcurrent + number > dqstat->bhardlimit &&
+	    !ignore_hardlimit(dq_info)) {
+		if (!prealloc)
+			vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+				"VZ QUOTA: disk hardlimit reached "
+				"for id=%d\n");
+		return NO_QUOTA;
+	}
+
+	if (dqstat->bcurrent + number > dqstat->bsoftlimit) {
+		if (dqstat->btime == (time_t)0) {
+			if (!prealloc) {
+				vzquota_warn(dq_info, dq_id, 0,
+					"VZ QUOTA: disk softlimit exceeded "
+					"for id=%d\n");
+				dqstat->btime = CURRENT_TIME_SECONDS +
+					dq_info->bexpire;
+			} else {
+				/*
+				 * Original Linux quota doesn't allow a
+				 * preallocation to exceed the softlimit, so
+				 * such an allocation is always refused
+				 */
+				return NO_QUOTA;
+			}
+		} else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
+			   !ignore_hardlimit(dq_info)) {
+			if (!prealloc)
+				vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+					"VZ QUOTA: disk quota "
+					"softlimit expired "
+					"for id=%d\n");
+			return NO_QUOTA;
+		}
+	}
+
+	return QUOTA_OK;
+}
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		int type, unsigned long number)
+{
+	struct dq_info *dqinfo;
+	struct dq_stat *dqstat;
+
+	if (qugid[type] == NULL)
+		return QUOTA_OK;
+	if (qugid[type] == VZ_QUOTA_UGBAD)
+		return NO_QUOTA;
+
+	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+		return QUOTA_OK;
+	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+		return QUOTA_OK;
+	if (number == 0)
+		return QUOTA_OK;
+
+	dqinfo = &qmblk->dq_ugid_info[type];
+	dqstat = &qugid[type]->qugid_stat;
+
+	if (dqstat->ihardlimit != 0 &&
+	    dqstat->icurrent + number > dqstat->ihardlimit)
+		return NO_QUOTA;
+
+	if (dqstat->isoftlimit != 0 &&
+	    dqstat->icurrent + number > dqstat->isoftlimit) {
+		if (dqstat->itime == (time_t)0)
+			dqstat->itime = CURRENT_TIME_SECONDS +
+				dqinfo->iexpire;
+		else if (CURRENT_TIME_SECONDS >= dqstat->itime)
+			return NO_QUOTA;
+	}
+
+	return QUOTA_OK;
+}
+
+static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		int type, __u64 number, char prealloc)
+{
+	struct dq_info *dqinfo;
+	struct dq_stat *dqstat;
+
+	if (prealloc == DQUOT_CMD_FORCE)
+		return QUOTA_OK;
+
+	if (qugid[type] == NULL)
+		return QUOTA_OK;
+	if (qugid[type] == VZ_QUOTA_UGBAD)
+		return NO_QUOTA;
+
+	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+		return QUOTA_OK;
+	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+		return QUOTA_OK;
+	if (number == 0)
+		return QUOTA_OK;
+
+	dqinfo = &qmblk->dq_ugid_info[type];
+	dqstat = &qugid[type]->qugid_stat;
+
+	if (dqstat->bhardlimit != 0 &&
+	    dqstat->bcurrent + number > dqstat->bhardlimit)
+		return NO_QUOTA;
+
+	if (dqstat->bsoftlimit != 0 &&
+	    dqstat->bcurrent + number > dqstat->bsoftlimit) {
+		if (dqstat->btime == (time_t)0) {
+			if (!prealloc)
+				dqstat->btime = CURRENT_TIME_SECONDS +
+					dqinfo->bexpire;
+			else
+				/*
+				 * Original Linux quota doesn't allow a
+				 * preallocation to exceed the softlimit, so
+				 * such an allocation is always refused
+				 */
+				return NO_QUOTA;
+		} else if (CURRENT_TIME_SECONDS >= dqstat->btime)
+			return NO_QUOTA;
+	}
+
+	return QUOTA_OK;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Quota superblock operations
+ * --------------------------------------------------------------------- */
+
+/*
+ * S_NOQUOTA note.
+ * In the current kernel (2.6.8.1), the S_NOQUOTA flag is set only for
+ *  - the quota file (absent in our case)
+ *  - after an explicit DQUOT_DROP (earlier than clear_inode) in functions
+ *    like the filesystem-specific new_inode, before the inode gets outside
+ *    links.
+ * For the latter case, the only quota operation where care about S_NOQUOTA
+ * might be required is vzquota_drop, but there S_NOQUOTA has already been
+ * checked in DQUOT_DROP().
+ * So, S_NOQUOTA may be ignored for now in the VZDQ code.
+ *
+ * The above note is not entirely correct.
+ * For both the ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
+ * delete_inode if new_inode fails (for example, because of inode quota
+ * limits), so an S_NOQUOTA check is needed in free_inode.
+ * This seems to be a dark corner of the current quota API.
+ */
+
+/*
+ * Initialize quota operations for the specified inode.
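+ *
+ * A sketch of the call site, quoted for orientation only: in the stock
+ * 2.6.18 tree the DQUOT_INIT() hook in include/linux/quotaops.h does
+ * roughly
+ *
+ *	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode))
+ *		inode->i_sb->dq_op->initialize(inode, -1);
+ *
+ * and ignores the result, which is why the return value below is
+ * marked "ignored by caller".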
+ */ +static int vzquota_initialize(struct inode *inode, int type) +{ + vzquota_inode_init_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Release quota for the specified inode. + */ +static int vzquota_drop(struct inode *inode) +{ + vzquota_inode_drop_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Allocate block callback. + * + * If (prealloc) disk quota exceeding warning is not printed. + * See Linux quota to know why. + * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_space(struct inode *inode, + qsize_t number, int prealloc) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id, prealloc); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_space(qmblk, qugid, + cnt, number, prealloc); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_space(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_space(&qmblk->dq_stat, number); + vzquota_data_unlock(inode, &data); + } + + inode_add_bytes(inode, number); + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock(inode, &data); + return NO_QUOTA; +} + +/* + * Allocate inodes callback. + * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid *qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_inodes(qmblk, qugid, + cnt, number); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_inodes(&qmblk->dq_stat, number); + vzquota_data_unlock((struct inode *)inode, &data); + } + + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock((struct inode *)inode, &data); + return NO_QUOTA; +} + +/* + * Free space callback. 
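+ *
+ * Note that, unlike the allocation callbacks above, the return value
+ * here is not checked by the DQUOT_FREE_SPACE() callers; this is why
+ * vzquota_decr_space() simply clamps the usage counters at zero
+ * instead of reporting an error.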
+ */ +static int vzquota_free_space(struct inode *inode, qsize_t number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; /* isn't checked by the caller */ + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_space(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_space(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock(inode, &data); + } + inode_sub_bytes(inode, number); + might_sleep(); + return QUOTA_OK; +} + +/* + * Free inodes callback. + */ +static int vzquota_free_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_inodes(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_inodes(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock((struct inode *)inode, &data); + } + might_sleep(); + return QUOTA_OK; +} + +void vzquota_inode_off(struct inode * inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* The call is made through virtinfo, it can be an inode + * not controlled by vzquota. + */ + if (inode->i_sb->dq_op != &vz_quota_operations) + return; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return; + + if (qmblk == NULL) { + /* Tricky place. If qmblk == NULL, it means that this inode + * is not in area controlled by vzquota (except for rare + * case of already set S_NOQUOTA). But we have to set + * S_NOQUOTA in any case because vzquota can be turned + * on later, when this inode is invalid from viewpoint + * of vzquota. + * + * To be safe, we reacquire vzquota lock. 
+		 */
+		inode_qmblk_lock(inode->i_sb);
+		inode->i_flags |= S_NOQUOTA;
+		inode_qmblk_unlock(inode->i_sb);
+		return;
+	} else {
+		loff_t bytes = inode_get_bytes(inode);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		int cnt;
+		struct vz_quota_ugid * qugid;
+#endif
+
+		inode->i_flags |= S_NOQUOTA;
+
+		vzquota_decr_space(&qmblk->dq_stat, bytes);
+		vzquota_decr_inodes(&qmblk->dq_stat, 1);
+#ifdef CONFIG_VZ_QUOTA_UGID
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			qugid = INODE_QLNK(inode)->qugid[cnt];
+			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+				continue;
+			vzquota_decr_space(&qugid->qugid_stat, bytes);
+			vzquota_decr_inodes(&qugid->qugid_stat, 1);
+		}
+#endif
+
+		vzquota_data_unlock(inode, &data);
+
+		vzquota_inode_drop_call(inode);
+	}
+}
+
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+
+/*
+ * helper function for quota_transfer:
+ * check that we can add the inode to this quota_id
+ */
+static int vzquota_transfer_check(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid[],
+		unsigned int type, __u64 size)
+{
+	if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
+	    vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
+		return -1;
+	return 0;
+}
+
+int vzquota_transfer_usage(struct inode *inode,
+		int mask,
+		struct vz_quota_ilink *qlnk)
+{
+	struct vz_quota_ugid *qugid_old;
+	__u64 space;
+	int i;
+
+	space = inode_get_bytes(inode);
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		/*
+		 * Do not permit chowning a file if its owner does not have
+		 * a ugid record.  This might happen if we somehow exceeded
+		 * the UID/GID limit (e.g. uglimit was set to less than the
+		 * number of users).
+		 */
+		if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
+			return -1;
+		if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
+			return -1;
+	}
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (!(mask & (1 << i)))
+			continue;
+		qugid_old = INODE_QLNK(inode)->qugid[i];
+		vzquota_decr_space(&qugid_old->qugid_stat, space);
+		vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
+		vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
+		vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
+	}
+	return 0;
+}
+
+/*
+ * Transfer the inode between different user/group quotas.
+ */
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return vzquota_inode_transfer_call(inode, iattr) ?
+					NO_QUOTA : QUOTA_OK;
+}
+
+#else /* CONFIG_VZ_QUOTA_UGID */
+
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+	return QUOTA_OK;
+}
+
+#endif
+
+/*
+ * Called under the following semaphores:
+ *	old_d->d_inode->i_sb->s_vfs_rename_sem
+ *	old_d->d_inode->i_sem
+ *	new_d->d_inode->i_sem
+ * [not verified --SAW]
+ */
+static int vzquota_rename(struct inode *inode,
+		struct inode *old_dir, struct inode *new_dir)
+{
+	return vzquota_rename_check(inode, old_dir, new_dir) ?
+					NO_QUOTA : QUOTA_OK;
+}
+
+/*
+ * Structure of superblock diskquota operations.
+ */
+struct dquot_operations vz_quota_operations = {
+	.initialize	= vzquota_initialize,
+	.drop		= vzquota_drop,
+	.alloc_space	= vzquota_alloc_space,
+	.alloc_inode	= vzquota_alloc_inode,
+	.free_space	= vzquota_free_space,
+	.free_inode	= vzquota_free_inode,
+	.transfer	= vzquota_transfer,
+	.rename		= vzquota_rename,
+};
diff -uprN linux-2.6.18/fs/vzdq_tree.c linux-2.6.18.ovz/fs/vzdq_tree.c
--- linux-2.6.18/fs/vzdq_tree.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/fs/vzdq_tree.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,286 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo quota tree implementation + */ + +#include +#include +#include + +struct quotatree_tree *quotatree_alloc(void) +{ + int l; + struct quotatree_tree *tree; + + tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); + if (tree == NULL) + goto out; + + for (l = 0; l < QUOTATREE_DEPTH; l++) { + INIT_LIST_HEAD(&tree->levels[l].usedlh); + INIT_LIST_HEAD(&tree->levels[l].freelh); + tree->levels[l].freenum = 0; + } + tree->root = NULL; + tree->leaf_num = 0; +out: + return tree; +} + +static struct quotatree_node * +quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, + struct quotatree_find_state *st) +{ + void **block; + struct quotatree_node *parent; + int l, index; + + parent = NULL; + block = (void **)&tree->root; + l = 0; + while (l < level && *block != NULL) { + index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; + parent = *block; + block = parent->blocks + index; + l++; + } + if (st != NULL) { + st->block = block; + st->level = l; + } + + return parent; +} + +void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st) +{ + quotatree_follow(tree, id, QUOTATREE_DEPTH, st); + if (st->level == QUOTATREE_DEPTH) + return *st->block; + else + return NULL; +} + +void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) +{ + int i, count; + struct quotatree_node *p; + void *leaf; + + if (QTREE_LEAFNUM(tree) <= index) + return NULL; + + count = 0; + list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { + for (i = 0; i < QUOTATREE_BSIZE; i++) { + leaf = p->blocks[i]; + if (leaf == NULL) + continue; + if (count == index) + return leaf; + count++; + } + } + return NULL; +} + +/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) + * in the tree... 
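+ * (together with quotatree_leaf_byindex() this lets callers walk all
+ * leaves in id order, which is how the ugid enumeration code in
+ * fs/vzdq_ugid.c uses it)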
 */
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
+{
+	int off;
+	struct quotatree_node *parent, *p;
+	struct list_head *lh;
+
+	/* get the parent referring to the correct quota tree node of the
+	 * last level */
+	parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
+	if (!parent)
+		return NULL;
+
+	off = (id & QUOTATREE_BMASK) + 1; /* next ugid */
+	lh = &parent->list;
+	do {
+		p = list_entry(lh, struct quotatree_node, list);
+		for ( ; off < QUOTATREE_BSIZE; off++)
+			if (p->blocks[off])
+				return p->blocks[off];
+		off = 0;
+		lh = lh->next;
+	} while (lh != &QTREE_LEAFLVL(tree)->usedlh);
+
+	return NULL;
+}
+
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st, void *data)
+{
+	struct quotatree_node *p;
+	int l, index;
+
+	while (st->level < QUOTATREE_DEPTH) {
+		l = st->level;
+		if (!list_empty(&tree->levels[l].freelh)) {
+			p = list_entry(tree->levels[l].freelh.next,
+					struct quotatree_node, list);
+			list_del(&p->list);
+		} else {
+			p = kmalloc(sizeof(struct quotatree_node),
+					GFP_NOFS | __GFP_NOFAIL);
+			if (p == NULL)
+				return -ENOMEM;
+			/* save the block number in the l-level;
+			 * it is used for quota file generation */
+			p->num = tree->levels[l].freenum++;
+		}
+		list_add(&p->list, &tree->levels[l].usedlh);
+		memset(p->blocks, 0, sizeof(p->blocks));
+		*st->block = p;
+
+		index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+		st->block = p->blocks + index;
+		st->level++;
+	}
+	tree->leaf_num++;
+	*st->block = data;
+
+	return 0;
+}
+
+static struct quotatree_node *
+quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
+		int level)
+{
+	struct quotatree_node *parent;
+	struct quotatree_find_state st;
+
+	parent = quotatree_follow(tree, id, level, &st);
+	if (st.level == QUOTATREE_DEPTH)
+		tree->leaf_num--;
+	*st.block = NULL;
+	return parent;
+}
+
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
+{
+	struct quotatree_node *p;
+	int level, i;
+
+	p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
+	for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++)
+			if (p->blocks[i] != NULL)
+				return;
+		list_move(&p->list, &tree->levels[level].freelh);
+		p = quotatree_remove_ptr(tree, id, level);
+	}
+}
+
+#if 0
+static void quotatree_walk(struct quotatree_tree *tree,
+		struct quotatree_node *node_start,
+		quotaid_t id_start,
+		int level_start, int level_end,
+		int (*callback)(struct quotatree_tree *,
+				quotaid_t id,
+				int level,
+				void *ptr,
+				void *data),
+		void *data)
+{
+	struct quotatree_node *p;
+	int l, shift, index;
+	quotaid_t id;
+	struct quotatree_find_state st;
+
+	p = node_start;
+	l = level_start;
+	shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	id = id_start;
+	index = 0;
+
+	/*
+	 * Invariants:
+	 *	shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+	 *	id & ((1 << shift) - 1) == 0
+	 *	p is the l-level node corresponding to id
+	 */
+	do {
+		if (!p)
+			break;
+
+		if (l < level_end) {
+			for (; index < QUOTATREE_BSIZE; index++)
+				if (p->blocks[index] != NULL)
+					break;
+			if (index < QUOTATREE_BSIZE) {
+				/* descend */
+				p = p->blocks[index];
+				l++;
+				shift -= QUOTAID_BBITS;
+				id += (quotaid_t)index << shift;
+				index = 0;
+				continue;
+			}
+		}
+
+		if ((*callback)(tree, id, l, p, data))
+			break;
+
+		/* ascend and move to the next node */
+		p = quotatree_follow(tree, id, l, &st);
+
+		index = ((id >> shift) & QUOTATREE_BMASK) + 1;
+		l--;
+		shift += QUOTAID_BBITS;
+		id &= ~(((quotaid_t)1 << shift) - 1);
+	} while (l >= level_start);
+}
+#endif
+
+static void free_list(struct list_head *node_list)
+{
+	struct quotatree_node *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, node_list, list) {
+		list_del(&p->list);
+		kfree(p);
+	}
+}
+
+static inline void quotatree_free_nodes(struct quotatree_tree *tree)
+{
+	int i;
+
+	for (i = 0; i < QUOTATREE_DEPTH; i++) {
+		free_list(&tree->levels[i].usedlh);
+		free_list(&tree->levels[i].freelh);
+	}
+}
+
+static void quotatree_free_leafs(struct quotatree_tree *tree,
+		void (*dtor)(void *))
+{
+	int i;
+	struct quotatree_node *p;
+
+	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+		for (i = 0; i < QUOTATREE_BSIZE; i++) {
+			if (p->blocks[i] == NULL)
+				continue;
+
+			dtor(p->blocks[i]);
+		}
+	}
+}
+
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
+{
+	quotatree_free_leafs(tree, dtor);
+	quotatree_free_nodes(tree);
+	kfree(tree);
+}
diff -uprN linux-2.6.18/fs/vzdq_ugid.c linux-2.6.18.ovz/fs/vzdq_ugid.c
--- linux-2.6.18/fs/vzdq_ugid.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/fs/vzdq_ugid.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,1224 @@
+/*
+ * Copyright (C) 2002 SWsoft
+ * All rights reserved.
+
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+
+ * This file contains the Virtuozzo UID/GID disk quota implementation
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/*
+ * XXX
+ * maybe something is needed for sb->s_dquot->info[]?
+ */
+
+#define USRQUOTA_MASK (1 << USRQUOTA)
+#define GRPQUOTA_MASK (1 << GRPQUOTA)
+#define QTYPE2MASK(type) (1 << (type))
+
+static kmem_cache_t *vz_quota_ugid_cachep;
+
+/* guard to protect vz_quota_master from destruction in quota_on/off;
+ * also protects the lists in the hash table */
+extern struct semaphore vz_quota_sem;
+
+inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
+{
+	if (qugid != VZ_QUOTA_UGBAD)
+		atomic_inc(&qugid->qugid_count);
+	return qugid;
+}
+
+/* we don't limit users with zero limits */
+static inline int vzquota_fake_stat(struct dq_stat *stat)
+{
+	return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
+		stat->ihardlimit == 0 && stat->isoftlimit == 0;
+}
+
+/* callback function for quotatree_free() */
+static inline void vzquota_free_qugid(void *ptr)
+{
+	kmem_cache_free(vz_quota_ugid_cachep, ptr);
+}
+
+/*
+ * destroy a ugid if it has zero refcount, limits and usage;
+ * must be called under qmblk->dq_sem
+ */
+void vzquota_put_ugid(struct vz_quota_master *qmblk,
+		struct vz_quota_ugid *qugid)
+{
+	if (qugid == VZ_QUOTA_UGBAD)
+		return;
+	qmblk_data_read_lock(qmblk);
+	if (atomic_dec_and_test(&qugid->qugid_count) &&
+	    (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
+	    vzquota_fake_stat(&qugid->qugid_stat) &&
+	    qugid->qugid_stat.bcurrent == 0 &&
+	    qugid->qugid_stat.icurrent == 0) {
+		quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
+				qugid->qugid_id);
+		qmblk->dq_ugid_count--;
+		vzquota_free_qugid(qugid);
+	}
+	qmblk_data_read_unlock(qmblk);
+}
+
+/*
+ * Get a ugid block by its index, as if the leaves formed an array.
+ * In reality there is no array: the "index" walks the chain of leaf
+ * blocks of the tree.  Returns NULL if the index is out of range.
+ * The qmblk semaphore is required to protect the tree.
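+ *
+ * A typical enumeration over the "array" (this is the pattern used by
+ * do_quota_ugid_getstat() below):
+ *
+ *	for (ugid = vzquota_get_byindex(qmblk, index, type);
+ *	     ugid != NULL;
+ *	     ugid = vzquota_get_next(qmblk, ugid))
+ *		...;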
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
+{
+	return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
+}
+
+/*
+ * get the next element from the ugid "virtual array";
+ * the ugid must be in the current array, and the array may not be changed
+ * between the two accesses (guaranteed by the "stopped" quota state and
+ * the quota semaphore);
+ * the qmblk semaphore is required to protect the tree
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
+{
+	return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
+			qugid->qugid_id);
+}
+
+/*
+ * requires dq_sem
+ */
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+	struct quotatree_tree *tree;
+	struct quotatree_find_state st;
+
+	tree = QUGID_TREE(qmblk, type);
+	qugid = quotatree_find(tree, quota_id, &st);
+	if (qugid)
+		goto success;
+
+	/* caller does not want an allocation */
+	if (flags & VZDQUG_FIND_DONT_ALLOC)
+		goto fail;
+
+	if (flags & VZDQUG_FIND_FAKE)
+		goto doit;
+
+	/* check the limit */
+	if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
+		goto fail;
+
+	/* see the comment at the VZDQUG_FIXED_SET define */
+	if (qmblk->dq_flags & VZDQUG_FIXED_SET)
+		goto fail;
+
+doit:
+	/* alloc a new structure */
+	qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
+			SLAB_NOFS | __GFP_NOFAIL);
+	if (qugid == NULL)
+		goto fail;
+
+	/* initialize the new structure */
+	qugid->qugid_id = quota_id;
+	memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
+	qugid->qugid_type = type;
+	atomic_set(&qugid->qugid_count, 0);
+
+	/* insert into the tree */
+	if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
+		goto fail_insert;
+	qmblk->dq_ugid_count++;
+
+success:
+	vzquota_get_ugid(qugid);
+	return qugid;
+
+fail_insert:
+	vzquota_free_qugid(qugid);
+fail:
+	return VZ_QUOTA_UGBAD;
+}
+
+/*
+ * takes dq_sem, may schedule
+ */
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+		unsigned int quota_id, int type, int flags)
+{
+	struct vz_quota_ugid *qugid;
+
+	down(&qmblk->dq_sem);
+	qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
+	up(&qmblk->dq_sem);
+
+	return qugid;
+}
+
+/*
+ * destroy all ugid records on the given quota master
+ */
+void vzquota_kill_ugid(struct vz_quota_master *qmblk)
+{
+	BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
+		(qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
+
+	if (qmblk->dq_uid_tree != NULL) {
+		quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
+		quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
+	}
+}
+
+
+/* ----------------------------------------------------------------------
+ * Management interface to ugid quota for (super)users.
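+ *
+ * These callbacks implement the standard quotactl(2) operations (they
+ * are wired up through vz_quotactl_operations below), so the usual
+ * userspace quota tools should be able to query and set VZ ugid quotas.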
+ * --------------------------------------------------------------------- */ + +static int vzquota_initialize2(struct inode *inode, int type) +{ + return QUOTA_OK; +} + +static int vzquota_drop2(struct inode *inode) +{ + return QUOTA_OK; +} + +static int vzquota_alloc_space2(struct inode *inode, + qsize_t number, int prealloc) +{ + inode_add_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_free_space2(struct inode *inode, qsize_t number) +{ + inode_sub_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_free_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) +{ + return QUOTA_OK; +} + +struct dquot_operations vz_quota_operations2 = { + .initialize = vzquota_initialize2, + .drop = vzquota_drop2, + .alloc_space = vzquota_alloc_space2, + .alloc_inode = vzquota_alloc_inode2, + .free_space = vzquota_free_space2, + .free_inode = vzquota_free_inode2, + .transfer = vzquota_transfer2, +}; + + +asmlinkage long sys_unlink(const char __user * pathname); +asmlinkage long sys_rename(const char __user * oldname, + const char __user * newname); +asmlinkage long sys_symlink(const char __user * oldname, + const char __user * newname); + +/* called under sb->s_umount semaphore */ +static int vz_restore_symlink(struct super_block *sb, char *path, int type) +{ + mm_segment_t oldfs; + char *newpath; + char dest[64]; + const char *names[] = { + [USRQUOTA] "aquota.user", + [GRPQUOTA] "aquota.group" + }; + int err; + + newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL); + if (newpath == NULL) + return -ENOMEM; + + strcpy(newpath, path); + strcat(newpath, ".new"); + + sprintf(dest, "/proc/vz/vzaquota/%08x/%s", + new_encode_dev(sb->s_dev), names[type]); + + /* + * Lockdep will learn unneeded dependency while unlink(2): + * ->s_umount => ->i_mutex/1 => ->i_mutex + * Reverse dependency is, + * open_namei() => ->i_mutex => lookup_hash() => __lookup_hash() + * => ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev() + * => user_get_super() => ->s_umount + * + * However, first set of ->i_mutex'es belong to /, second to /proc . + * Right fix is to get rid of vz_restore_symlink(), of course. + */ + up_read(&sb->s_umount); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_unlink(newpath); + if (err < 0 && err != -ENOENT) + goto out_restore; + err = sys_symlink(dest, newpath); + if (err < 0) + goto out_restore; + err = sys_rename(newpath, path); +out_restore: + set_fs(oldfs); + + down_read(&sb->s_umount); + /* umounted meanwhile? 
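+	 * If so, s_root was cleared while we ran without s_umount;
+	 * the check below then turns an apparent success into -ENODEV.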
*/ + if (err == 0 && !sb->s_root) + err = -ENODEV; + + kfree(newpath); + return err; +} + +/* called under sb->s_umount semaphore */ +static int vz_quota_on(struct super_block *sb, int type, + int format_id, char *path) +{ + struct vz_quota_master *qmblk; + int mask, mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = vz_restore_symlink(sb, path, type); + if (err < 0) + goto out_put; + + down(&vz_quota_sem); + mask = 0; + mask2 = 0; + sb->dq_op = &vz_quota_operations2; + sb->s_qcop = &vz_quotactl_operations; + if (type == USRQUOTA) { + mask = DQUOT_USR_ENABLED; + mask2 = VZDQ_USRQUOTA; + } + if (type == GRPQUOTA) { + mask = DQUOT_GRP_ENABLED; + mask2 = VZDQ_GRPQUOTA; + } + err = -EBUSY; + if (qmblk->dq_flags & mask2) + goto out_sem; + + err = 0; + qmblk->dq_flags |= mask2; + sb->s_dquot.flags |= mask; + +out_sem: + up(&vz_quota_sem); +out_put: + qmblk_put(qmblk); +out: + return err; +} + +static int vz_quota_off(struct super_block *sb, int type) +{ + struct vz_quota_master *qmblk; + int mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + mask2 = 0; + if (type == USRQUOTA) + mask2 = VZDQ_USRQUOTA; + if (type == GRPQUOTA) + mask2 = VZDQ_GRPQUOTA; + err = -EINVAL; + if (!(qmblk->dq_flags & mask2)) + goto out; + + qmblk->dq_flags &= ~mask2; + err = 0; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_quota_sync(struct super_block *sb, int type) +{ + return 0; /* vz quota is always uptodate */ +} + +static int vz_get_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid *ugid; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); + if (ugid != VZ_QUOTA_UGBAD) { + qmblk_data_read_lock(qmblk); + di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10; + di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; + di->dqb_curspace = ugid->qugid_stat.bcurrent; + di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; + di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; + di->dqb_curinodes = ugid->qugid_stat.icurrent; + di->dqb_btime = ugid->qugid_stat.btime; + di->dqb_itime = ugid->qugid_stat.itime; + qmblk_data_read_unlock(qmblk); + di->dqb_valid = QIF_ALL; + vzquota_put_ugid(qmblk, ugid); + } else { + memset(di, 0, sizeof(*di)); + di->dqb_valid = QIF_ALL; + } + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqblk(struct vz_quota_master *qmblk, + int type, qid_t id, struct if_dqblk *di) +{ + struct vz_quota_ugid *ugid; + + ugid = vzquota_find_ugid(qmblk, id, type, 0); + if (ugid == VZ_QUOTA_UGBAD) + return -ESRCH; + + qmblk_data_write_lock(qmblk); + /* + * Subtle compatibility breakage. + * + * Some old non-vz kernel quota didn't start grace period + * if the new soft limit happens to be below the usage. + * Non-vz kernel quota in 2.4.20 starts the grace period + * (if it hasn't been started). + * Current non-vz kernel performs even more complicated + * manipulations... 
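+ *
+ * A concrete illustration of the behaviour implemented below, with
+ * hypothetical numbers: if current usage is 120MB and set_dqblk()
+ * lowers bsoftlimit to 100MB without QIF_BTIME, the grace period is
+ * (re)started at now + bexpire; raising bsoftlimit above the usage,
+ * or setting it to 0, cancels the grace period (btime = 0).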
+ * + * Also, current non-vz kernels have inconsistency related to + * the grace time start. In regular operations the grace period + * is started if the usage is greater than the soft limit (and, + * strangely, is cancelled if the usage is less). + * However, set_dqblk starts the grace period if the usage is greater + * or equal to the soft limit. + * + * Here we try to mimic the behavior of the current non-vz kernel. + */ + if (di->dqb_valid & QIF_BLIMITS) { + ugid->qugid_stat.bhardlimit = + (__u64)di->dqb_bhardlimit << 10; + ugid->qugid_stat.bsoftlimit = + (__u64)di->dqb_bsoftlimit << 10; + if (di->dqb_bsoftlimit == 0 || + ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) + ugid->qugid_stat.btime = 0; + else if (!(di->dqb_valid & QIF_BTIME)) + ugid->qugid_stat.btime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].bexpire; + else + ugid->qugid_stat.btime = di->dqb_btime; + } + if (di->dqb_valid & QIF_ILIMITS) { + ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; + ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; + if (di->dqb_isoftlimit == 0 || + ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) + ugid->qugid_stat.itime = 0; + else if (!(di->dqb_valid & QIF_ITIME)) + ugid->qugid_stat.itime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].iexpire; + else + ugid->qugid_stat.itime = di->dqb_itime; + } + qmblk_data_write_unlock(qmblk); + vzquota_put_ugid(qmblk, ugid); + + return 0; +} + +static int vz_set_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqblk(qmblk, type, id, di); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_get_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; + ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; + ii->dqi_flags = 0; + ii->dqi_valid = IIF_ALL; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqinfo(struct vz_quota_master *qmblk, + int type, struct if_dqinfo *ii) +{ + if (ii->dqi_valid & IIF_FLAGS) + if (ii->dqi_flags & DQF_MASK) + return -EINVAL; + + if (ii->dqi_valid & IIF_BGRACE) + qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; + return 0; +} + +static int vz_set_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqinfo(qmblk, type, ii); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +#ifdef CONFIG_QUOTA_COMPAT + +#define Q_GETQUOTI_SIZE 1024 + +#define UGID2DQBLK(dst, src) \ + do { \ + (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ + (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ + 
(dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \ + /* in 1K blocks */ \ + (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ + /* in 1K blocks */ \ + (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ + /* in bytes, 64 bit */ \ + (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \ + (dst)->dqb_btime = (src)->qugid_stat.btime; \ + (dst)->dqb_itime = (src)->qugid_stat.itime; \ + } while (0) + +static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, + struct v2_disk_dqblk __user *dqblk) +{ + struct vz_quota_master *qmblk; + struct v2_disk_dqblk *data, *kbuf; + struct vz_quota_ugid *ugid; + int count; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = -ENOMEM; + kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf)); + if (!kbuf) + goto out; + + down(&vz_quota_sem); + down(&qmblk->dq_sem); + for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; + ugid != NULL && count < Q_GETQUOTI_SIZE; + count++) + { + data = kbuf + count; + qmblk_data_read_lock(qmblk); + UGID2DQBLK(data, ugid); + qmblk_data_read_unlock(qmblk); + data->dqb_id = ugid->qugid_id; + + /* Find next entry */ + ugid = vzquota_get_next(qmblk, ugid); + BUG_ON(ugid != NULL && ugid->qugid_type != type); + } + up(&qmblk->dq_sem); + up(&vz_quota_sem); + + err = count; + if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) + err = -EFAULT; + + vfree(kbuf); +out: + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + + return err; +} + +#endif + +struct quotactl_ops vz_quotactl_operations = { + .quota_on = vz_quota_on, + .quota_off = vz_quota_off, + .quota_sync = vz_quota_sync, + .get_info = vz_get_dqinfo, + .set_info = vz_set_dqinfo, + .get_dqblk = vz_get_dqblk, + .set_dqblk = vz_set_dqblk, +#ifdef CONFIG_QUOTA_COMPAT + .get_quoti = vz_get_quoti, +#endif +}; + + +/* ---------------------------------------------------------------------- + * Management interface for host system admins. 
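+ *
+ * The expected calling sequence from the host-side tools (an
+ * assumption inferred from the VZDQ_STARTING checks below) is:
+ * VZ_DQ_UGID_SETCONFIG and VZ_DQ_UGID_ADDSTAT while the master block
+ * is still in the VZDQ_STARTING state, then quota on;
+ * VZ_DQ_UGID_GETSTAT may be issued at any time.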
+ * --------------------------------------------------------------------- */ + +static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, + struct vz_quota_iface __user *u_ugid_buf, int compat) +{ + struct vz_quota_master *qmblk; + int ret; + + down(&vz_quota_sem); + + ret = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + ret = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept new ugids */ + + ret = 0; + /* start to add ugids */ + for (ret = 0; ret < ugid_size; ret++) { + struct vz_quota_iface ugid_buf; + struct vz_quota_ugid *ugid; + + if (!compat) { + if (copy_from_user(&ugid_buf, u_ugid_buf, + sizeof(ugid_buf))) + break; + u_ugid_buf++; /* next user buffer */ + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_iface oqif; + if (copy_from_user(&oqif, u_ugid_buf, + sizeof(oqif))) + break; + ugid_buf.qi_id = oqif.qi_id; + ugid_buf.qi_type = oqif.qi_type; + compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat); + u_ugid_buf = (struct vz_quota_iface __user *) + (((void *)u_ugid_buf) + sizeof(oqif)); +#endif + } + + if (ugid_buf.qi_type >= MAXQUOTAS) + break; /* bad quota type - this is the only check */ + + ugid = vzquota_find_ugid(qmblk, + ugid_buf.qi_id, ugid_buf.qi_type, 0); + if (ugid == VZ_QUOTA_UGBAD) { + qmblk->dq_flags |= VZDQUG_FIXED_SET; + break; /* limit reached */ + } + + /* update usage/limits + * we can copy the data without the lock, because the data + * cannot be modified in VZDQ_STARTING state */ + ugid->qugid_stat = ugid_buf.qi_stat; + + vzquota_put_ugid(qmblk, ugid); + } +out: + up(&vz_quota_sem); + + return ret; +} + +static int quota_ugid_setgrace(unsigned int quota_id, + struct dq_info __user u_dq_info[], int compat) +{ + struct vz_quota_master *qmblk; + struct dq_info dq_info[MAXQUOTAS]; + struct dq_info *target; + int err, type; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept changing options */ + + err = -EFAULT; + if (!compat) { + if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_dq_info odqi[MAXQUOTAS]; + if (copy_from_user(odqi, u_dq_info, sizeof(odqi))) + goto out; + for (type = 0; type < MAXQUOTAS; type++) + compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]); +#endif + } + + err = 0; + + /* update in qmblk */ + for (type = 0; type < MAXQUOTAS; type++) { + target = &qmblk->dq_ugid_info[type]; + target->bexpire = dq_info[type].bexpire; + target->iexpire = dq_info[type].iexpire; + } +out: + up(&vz_quota_sem); + + return err; +} + +static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, + struct vz_quota_iface *u_ugid_buf) +{ + int type, count; + struct vz_quota_ugid *ugid; + + if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + + QTREE_LEAFNUM(qmblk->dq_gid_tree) + <= index) + return 0; + + count = 0; + + type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? 
USRQUOTA : GRPQUOTA;
+	if (type == GRPQUOTA)
+		index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
+
+	/* loop through the uid quota and then the gid quota */
+repeat:
+	for (ugid = vzquota_get_byindex(qmblk, index, type);
+	     ugid != NULL && count < size;
+	     ugid = vzquota_get_next(qmblk, ugid), count++)
+	{
+		struct vz_quota_iface ugid_buf;
+
+		/* form the interface buffer and send it to user level */
+		qmblk_data_read_lock(qmblk);
+		memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat,
+				sizeof(ugid_buf.qi_stat));
+		qmblk_data_read_unlock(qmblk);
+		ugid_buf.qi_id = ugid->qugid_id;
+		ugid_buf.qi_type = ugid->qugid_type;
+
+		memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf));
+		u_ugid_buf++; /* next portion of the user buffer */
+	}
+
+	if (type == USRQUOTA && count < size) {
+		type = GRPQUOTA;
+		index = 0;
+		goto repeat;
+	}
+
+	return count;
+}
+
+static int quota_ugid_getstat(unsigned int quota_id,
+	int index, int size, struct vz_quota_iface __user *u_ugid_buf,
+	int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_iface *k_ugid_buf;
+	int err;
+
+	if (index < 0 || size < 0)
+		return -EINVAL;
+
+	if (size > INT_MAX / sizeof(struct vz_quota_iface))
+		return -EINVAL;
+
+	k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
+	if (k_ugid_buf == NULL)
+		return -ENOMEM;
+
+	down(&vz_quota_sem);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	down(&qmblk->dq_sem);
+	err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
+	up(&qmblk->dq_sem);
+	if (err < 0)
+		goto out;
+
+	if (!compat) {
+		if (copy_to_user(u_ugid_buf, k_ugid_buf,
+				err * sizeof(struct vz_quota_iface)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_vz_quota_iface oqif;
+		int i;
+		for (i = 0; i < err; i++) {
+			oqif.qi_id = k_ugid_buf[i].qi_id;
+			oqif.qi_type = k_ugid_buf[i].qi_type;
+			dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat,
+					&oqif.qi_stat);
+			if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif)))
+				err = -EFAULT;
+			u_ugid_buf = (struct vz_quota_iface __user *)
+				(((void *)u_ugid_buf) + sizeof(oqif));
+		}
+#endif
+	}
+
+out:
+	up(&vz_quota_sem);
+	vfree(k_ugid_buf);
+	return err;
+}
+
+static int quota_ugid_getgrace(unsigned int quota_id,
+		struct dq_info __user u_dq_info[], int compat)
+{
+	struct vz_quota_master *qmblk;
+	struct dq_info dq_info[MAXQUOTAS];
+	struct dq_info *target;
+	int err, type;
+
+	down(&vz_quota_sem);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = 0;
+	/* update from qmblk */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		target = &qmblk->dq_ugid_info[type];
+		dq_info[type].bexpire = target->bexpire;
+		dq_info[type].iexpire = target->iexpire;
+		dq_info[type].flags = target->flags;
+	}
+
+	if (!compat) {
+		if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
+			err = -EFAULT;
+	} else {
+#ifdef CONFIG_COMPAT
+		struct compat_dq_info odqi[MAXQUOTAS];
+		for (type = 0; type < MAXQUOTAS; type++)
+			dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]);
+		if (copy_to_user(u_dq_info, odqi, sizeof(odqi)))
+			err = -EFAULT;
+#endif
+	}
+out:
+	up(&vz_quota_sem);
+
+	return err;
+}
+
+static int quota_ugid_getconfig(unsigned int quota_id,
+		struct vz_quota_ugid_stat __user *info)
+{
+	struct vz_quota_master *qmblk;
+	struct vz_quota_ugid_stat kinfo;
+	int err;
+
+	down(&vz_quota_sem);
+
+	err = -ENOENT;
+	qmblk = vzquota_find_master(quota_id);
+	if (qmblk == NULL)
+		goto out;
+
+	err = 0;
+	kinfo.limit = qmblk->dq_ugid_max;
+	kinfo.count = qmblk->dq_ugid_count;
+	kinfo.flags = qmblk->dq_flags;
+
+	if
(copy_to_user(info, &kinfo, sizeof(kinfo))) + err = -EFAULT; +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setconfig(unsigned int quota_id, + struct vz_quota_ugid_stat __user *info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_stat kinfo; + int err; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&kinfo, info, sizeof(kinfo))) + goto out; + + err = 0; + qmblk->dq_ugid_max = kinfo.limit; + if (qmblk->dq_state == VZDQ_STARTING) { + qmblk->dq_flags = kinfo.flags; + if (qmblk->dq_flags & VZDQUG_ON) + qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; + } + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setlimit(unsigned int quota_id, + struct vz_quota_ugid_setlimit __user *u_lim) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setlimit lim; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&lim, u_lim, sizeof(lim))) + goto out; + + err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setinfo(unsigned int quota_id, + struct vz_quota_ugid_setinfo __user *u_info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setinfo info; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&info, u_info, sizeof(info))) + goto out; + + err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); + +out: + up(&vz_quota_sem); + + return err; +} + +/* + * This is a system call to maintain UGID quotas + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat) +{ + int ret; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_UGID_GETSTAT: + ret = quota_ugid_getstat(quota_id, + ugid_index, ugid_size, + (struct vz_quota_iface __user *)addr, + compat); + break; + case VZ_DQ_UGID_ADDSTAT: + ret = quota_ugid_addstat(quota_id, ugid_size, + (struct vz_quota_iface __user *) addr, + compat); + break; + case VZ_DQ_UGID_GETGRACE: + ret = quota_ugid_getgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_SETGRACE: + ret = quota_ugid_setgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_GETCONFIG: + ret = quota_ugid_getconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETCONFIG: + ret = quota_ugid_setconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETLIMIT: + ret = quota_ugid_setlimit(quota_id, + (struct vz_quota_ugid_setlimit __user *) + addr); + break; + case VZ_DQ_UGID_SETINFO: + ret = quota_ugid_setinfo(quota_id, + (struct vz_quota_ugid_setinfo __user *) + addr); + break; + default: + ret = -EINVAL; + goto out; + } +out: + return ret; +} + +static void ugid_quota_on_sb(struct super_block *sb) +{ + struct super_block *real_sb; + struct vz_quota_master *qmblk; + + if (!sb->s_op->get_quota_root) + return; + + real_sb = sb->s_op->get_quota_root(sb)->i_sb; + if (real_sb->dq_op != &vz_quota_operations) + return; + + sb->dq_op = &vz_quota_operations2; + 
sb->s_qcop = &vz_quotactl_operations; + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + + qmblk = vzquota_find_qmblk(sb); + if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) + return; + down(&vz_quota_sem); + if (qmblk->dq_flags & VZDQ_USRQUOTA) + sb->s_dquot.flags |= DQUOT_USR_ENABLED; + if (qmblk->dq_flags & VZDQ_GRPQUOTA) + sb->s_dquot.flags |= DQUOT_GRP_ENABLED; + up(&vz_quota_sem); + qmblk_put(qmblk); +} + +static void ugid_quota_off_sb(struct super_block *sb) +{ + /* can't make quota off on mounted super block */ + BUG_ON(sb->s_root != NULL); +} + +static int ugid_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int old_ret) +{ + struct virt_info_quota *viq; + + viq = (struct virt_info_quota *)data; + + switch (n) { + case VIRTINFO_QUOTA_ON: + ugid_quota_on_sb(viq->super); + break; + case VIRTINFO_QUOTA_OFF: + ugid_quota_off_sb(viq->super); + break; + case VIRTINFO_QUOTA_GETSTAT: + break; + default: + return old_ret; + } + return NOTIFY_OK; +} + +static struct vnotifier_block ugid_notifier_block = { + .notifier_call = ugid_notifier_call, +}; + +/* ---------------------------------------------------------------------- + * Init/exit. + * --------------------------------------------------------------------- */ + +int vzquota_ugid_init(void) +{ + int err; + + vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", + sizeof(struct vz_quota_ugid), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (vz_quota_ugid_cachep == NULL) + goto err_slab; + + err = register_quota_format(&vz_quota_empty_v2_format); + if (err) + goto err_reg; + + virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); + return 0; + +err_reg: + kmem_cache_destroy(vz_quota_ugid_cachep); + return err; + +err_slab: + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + return -ENOMEM; +} + +void vzquota_ugid_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); + unregister_quota_format(&vz_quota_empty_v2_format); + + if (kmem_cache_destroy(vz_quota_ugid_cachep)) + printk(KERN_ERR "VZQUOTA: kmem_cache_destroy failed\n"); +} diff -uprN linux-2.6.18/fs/vzdquot.c linux-2.6.18.ovz/fs/vzdquot.c --- linux-2.6.18/fs/vzdquot.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/fs/vzdquot.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1831 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains the core of Virtuozzo disk quota implementation: + * maintenance of VZDQ information in inodes, + * external interfaces, + * module entry. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * Locking + * + * ---------------------------------------------------------------------- */ + +/* + * Serializes on/off and all other do_vzquotactl operations. + * Protects qmblk hash. 
+ */
+struct semaphore vz_quota_sem;
+
+/*
+ * Data access locks
+ *   inode_qmblk
+ *     protects qmblk pointers in all inodes and qlnk content in general
+ *     (but not qmblk content);
+ *     also protects related qmblk invalidation procedures;
+ *     can't be per-inode because of vzquota_dtree_qmblk complications
+ *     and problems with serialization with quota_on,
+ *     but can be per-superblock;
+ *   qmblk_data
+ *     protects qmblk fields (such as current usage)
+ *   quota_data
+ *     protects charge/uncharge operations, thus, implies
+ *     qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock
+ *     (to protect ugid pointers).
+ *
+ * Lock order:
+ *   inode_qmblk_lock -> dcache_lock
+ *   inode_qmblk_lock -> qmblk_data
+ */
+static spinlock_t vzdq_qmblk_lock = SPIN_LOCK_UNLOCKED;
+
+inline void inode_qmblk_lock(struct super_block *sb)
+{
+	spin_lock(&vzdq_qmblk_lock);
+}
+
+inline void inode_qmblk_unlock(struct super_block *sb)
+{
+	spin_unlock(&vzdq_qmblk_lock);
+}
+
+inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
+{
+	spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
+{
+	spin_unlock(&qmblk->dq_data_lock);
+}
+
+struct quota_format_type vz_quota_empty_v2_format = {
+	.qf_fmt_id	= QFMT_VFS_V0,
+	.qf_ops		= NULL,
+	.qf_owner	= THIS_MODULE,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Master hash table handling.
+ *
+ * SMP not safe, serialized by vz_quota_sem within quota syscalls
+ *
+ * --------------------------------------------------------------------- */
+
+static kmem_cache_t *vzquota_cachep;
+
+/*
+ * Hash function.
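+ *
+ * Folds the high bits of the id into the low QHASH_BITS bits.  A
+ * worked example with QHASH_BITS = 6: for qid = 77 (1001101b),
+ * (77 >> 6) ^ 77 = 1 ^ 77 = 76, and 76 & 63 = 12, i.e. bucket 12.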
+ */ +#define QHASH_BITS 6 +#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) +#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) + +struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; +int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; + +static inline int vzquota_hash_func(unsigned int qid) +{ + return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); +} + +/** + * vzquota_alloc_master - alloc and instantiate master quota record + * + * Returns: + * pointer to newly created record if SUCCESS + * -ENOMEM if out of memory + * -EEXIST if record with given quota_id already exist + */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat) +{ + int err; + struct vz_quota_master *qmblk; + + err = -EEXIST; + if (vzquota_find_master(quota_id) != NULL) + goto out; + + err = -ENOMEM; + qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); + if (qmblk == NULL) + goto out; +#ifdef CONFIG_VZ_QUOTA_UGID + qmblk->dq_uid_tree = quotatree_alloc(); + if (!qmblk->dq_uid_tree) + goto out_free; + + qmblk->dq_gid_tree = quotatree_alloc(); + if (!qmblk->dq_gid_tree) + goto out_free_tree; +#endif + + qmblk->dq_state = VZDQ_STARTING; + init_MUTEX(&qmblk->dq_sem); + spin_lock_init(&qmblk->dq_data_lock); + + qmblk->dq_id = quota_id; + qmblk->dq_stat = qstat->dq_stat; + qmblk->dq_info = qstat->dq_info; + qmblk->dq_root_dentry = NULL; + qmblk->dq_root_mnt = NULL; + qmblk->dq_sb = NULL; + qmblk->dq_ugid_count = 0; + qmblk->dq_ugid_max = 0; + qmblk->dq_flags = 0; + memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + + atomic_set(&qmblk->dq_count, 1); + + /* insert in hash chain */ + list_add(&qmblk->dq_hash, + &vzquota_hash_table[vzquota_hash_func(quota_id)]); + + /* success */ + return qmblk; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_free_tree: + quotatree_free(qmblk->dq_uid_tree, NULL); +out_free: + kmem_cache_free(vzquota_cachep, qmblk); +#endif +out: + return ERR_PTR(err); +} + +static struct vz_quota_master *vzquota_alloc_fake(void) +{ + struct vz_quota_master *qmblk; + + qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); + if (qmblk == NULL) + return NULL; + memset(qmblk, 0, sizeof(*qmblk)); + qmblk->dq_state = VZDQ_STOPING; + qmblk->dq_flags = VZDQ_NOQUOT; + spin_lock_init(&qmblk->dq_data_lock); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + atomic_set(&qmblk->dq_count, 1); + return qmblk; +} + +/** + * vzquota_find_master - find master record with given id + * + * Returns qmblk without touching its refcounter. + * Called under vz_quota_sem. + */ +struct vz_quota_master *vzquota_find_master(unsigned int quota_id) +{ + int i; + struct vz_quota_master *qp; + + i = vzquota_hash_func(quota_id); + list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { + if (qp->dq_id == quota_id) + return qp; + } + return NULL; +} + +/** + * vzquota_free_master - release resources taken by qmblk, freeing memory + * + * qmblk is assumed to be already taken out from the hash. + * Should be called outside vz_quota_sem. + */ +void vzquota_free_master(struct vz_quota_master *qmblk) +{ +#ifdef CONFIG_VZ_QUOTA_UGID + vzquota_kill_ugid(qmblk); +#endif + BUG_ON(!list_empty(&qmblk->dq_ilink_list)); + kmem_cache_free(vzquota_cachep, qmblk); +} + + +/* ---------------------------------------------------------------------- + * + * Passing quota information through current + * + * Used in inode -> qmblk lookup at inode creation stage (since at that + * time there are no links between the inode being created and its parent + * directory). 
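+ *
+ * The intended protocol (the producers live outside this hunk, so this
+ * is an inference from the helpers below): the caller stashes the
+ * parent directory's inode with vzquota_cur_qmblk_set() just before
+ * the filesystem allocates the new inode; the inode-to-qmblk lookup
+ * then recognizes the stashed value via vzquota_cur_qmblk_check() and
+ * retrieves it with vzquota_cur_qmblk_fetch().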
+ *
+ * --------------------------------------------------------------------- */
+
+#define VZDQ_CUR_MAGIC	0x57d0fee2
+
+static inline int vzquota_cur_qmblk_check(void)
+{
+	return current->magic == VZDQ_CUR_MAGIC;
+}
+
+static inline struct inode *vzquota_cur_qmblk_fetch(void)
+{
+	return current->ino;
+}
+
+static inline void vzquota_cur_qmblk_set(struct inode *data)
+{
+	struct task_struct *tsk;
+
+	tsk = current;
+	tsk->magic = VZDQ_CUR_MAGIC;
+	tsk->ino = data;
+}
+
+#if 0
+static inline void vzquota_cur_qmblk_reset(void)
+{
+	current->magic = 0;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Superblock quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/*
+ * Kernel structure abuse.
+ * We use files[0] pointer as an int variable:
+ * reference counter of how many quota blocks use this superblock.
+ * files[1] is used for the generations structure, which helps us track
+ * when traversal of dentries is really required.
+ */
+#define __VZ_QUOTA_NOQUOTA(sb)	sb->s_dquot.vzdq_master
+#define __VZ_QUOTA_TSTAMP(sb)	((struct timeval *)\
+		&sb->s_dquot.dqio_mutex)
+
+#if defined(VZ_QUOTA_UNLOAD)
+
+#define __VZ_QUOTA_SBREF(sb)	sb->s_dquot.vzdq_count
+
+struct dquot_operations *orig_dq_op;
+struct quotactl_ops *orig_dq_cop;
+
+/**
+ * vzquota_get_super - account for a new quoted tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas. We keep a counter of such subtrees and set the VZ quota
+ * operations or restore the default ones.
+ *
+ * Called under vz_quota_sem (from quota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	if (sb->dq_op != &vz_quota_operations) {
+		down(&sb->s_dquot.dqonoff_sem);
+		if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
+			up(&sb->s_dquot.dqonoff_sem);
+			return -EEXIST;
+		}
+		if (orig_dq_op == NULL && sb->dq_op != NULL)
+			orig_dq_op = sb->dq_op;
+		sb->dq_op = &vz_quota_operations;
+		if (orig_dq_cop == NULL && sb->s_qcop != NULL)
+			orig_dq_cop = sb->s_qcop;
+		/* XXX this may race with sys_quotactl */
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+		/*
+		 * To get quotaops.h to call us we need to mark superblock
+		 * as having quota. These flags mark the moment when
+		 * our dq_op starts to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
+		__module_get(THIS_MODULE);
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+	/* protected by vz_quota_sem */
+	__VZ_QUOTA_SBREF(sb)++;
+	return 0;
+}
+
+/**
+ * vzquota_put_super - release superblock when one quota tree goes away
+ *
+ * Called under vz_quota_sem.
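+ *
+ * (Illustrative pairing, an editor's note rather than original patch text:
+ * each successful vzquota_get_super() above bumps __VZ_QUOTA_SBREF(sb), and
+ * this function undoes the superblock takeover only when that counter drops
+ * back to zero.)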
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	int count;
+
+	count = --__VZ_QUOTA_SBREF(sb);
+	if (count == 0) {
+		down(&sb->s_dquot.dqonoff_sem);
+		sb->s_dquot.flags = 0;
+		wmb(); synchronize_sched();
+		sema_init(&sb->s_dquot.dqio_sem, 1);
+		sb->s_qcop = orig_dq_cop;
+		sb->dq_op = orig_dq_op;
+		inode_qmblk_lock(sb);
+		quota_gen_put(SB_QGEN(sb));
+		SB_QGEN(sb) = NULL;
+		/* release qlnk's without qmblk */
+		remove_inode_quota_links_list(&non_vzquota_inodes_lh,
+				sb, NULL);
+		/*
+		 * Races with quota initialization:
+		 * after this inode_qmblk_unlock all inode's generations are
+		 * invalidated, quota_inode_qmblk checks superblock operations.
+		 */
+		inode_qmblk_unlock(sb);
+		/*
+		 * Module refcounting: in theory, this is the best place
+		 * to call module_put(THIS_MODULE).
+		 * In reality, it can't be done because we can't be sure that
+		 * other CPUs do not enter our code segment through dq_op
+		 * cached a long time ago. Quotaops interface isn't supposed
+		 * to go into modules currently (that is, into unloadable
+		 * modules). By omitting module_put, our module isn't
+		 * unloadable.
+		 */
+		up(&sb->s_dquot.dqonoff_sem);
+	}
+}
+
+#else
+
+struct vzquota_new_sop {
+	struct super_operations new_op;
+	struct super_operations *old_op;
+};
+
+/**
+ * vzquota_shutdown_super - callback on umount
+ */
+void vzquota_shutdown_super(struct super_block *sb)
+{
+	struct vz_quota_master *qmblk;
+	struct vzquota_new_sop *sop;
+
+	qmblk = __VZ_QUOTA_NOQUOTA(sb);
+	__VZ_QUOTA_NOQUOTA(sb) = NULL;
+	if (qmblk != NULL)
+		qmblk_put(qmblk);
+	sop = container_of(sb->s_op, struct vzquota_new_sop, new_op);
+	sb->s_op = sop->old_op;
+	kfree(sop);
+	if (sb->s_op->put_super != NULL)
+		(*sb->s_op->put_super)(sb);
+}
+
+/**
+ * vzquota_get_super - account for a new quoted tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas.
+ *
+ * Called under vz_quota_sem (from vzquota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+	struct vz_quota_master *qnew;
+	struct vzquota_new_sop *sop;
+	int err;
+
+	mutex_lock(&sb->s_dquot.dqonoff_mutex);
+	err = -EEXIST;
+	if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) &&
+	    sb->dq_op != &vz_quota_operations)
+		goto out_up;
+
+	/*
+	 * This allocation code should be under sb->dq_op check below, but
+	 * it doesn't really matter...
+	 */
+	if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
+		qnew = vzquota_alloc_fake();
+		if (qnew == NULL)
+			goto out_up;
+		__VZ_QUOTA_NOQUOTA(sb) = qnew;
+	}
+
+	if (sb->dq_op != &vz_quota_operations) {
+		sop = kmalloc(sizeof(*sop), GFP_KERNEL);
+		if (sop == NULL) {
+			vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb));
+			__VZ_QUOTA_NOQUOTA(sb) = NULL;
+			goto out_up;
+		}
+		memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op));
+		sop->new_op.put_super = &vzquota_shutdown_super;
+		sop->old_op = sb->s_op;
+		sb->s_op = &sop->new_op;
+
+		sb->dq_op = &vz_quota_operations;
+#ifdef CONFIG_VZ_QUOTA_UGID
+		sb->s_qcop = &vz_quotactl_operations;
+#else
+		sb->s_qcop = NULL;
+#endif
+		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+
+		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+		/* these 2 list heads are checked in sync_dquots() */
+		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+		sb->s_dquot.info[USRQUOTA].dqi_format =
+			&vz_quota_empty_v2_format;
+		sb->s_dquot.info[GRPQUOTA].dqi_format =
+			&vz_quota_empty_v2_format;
+
+		/*
+		 * To get quotaops.h to call us we need to mark superblock
+		 * as having quota.
These flags mark the moment when
+		 * our dq_op starts to be called.
+		 *
+		 * The ordering of dq_op and s_dquot.flags assignment
+		 * needs to be enforced, but other CPUs do not do rmb()
+		 * between s_dquot.flags and dq_op accesses.
+		 */
+		wmb(); synchronize_sched();
+		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
+	}
+	err = 0;
+
+out_up:
+	mutex_unlock(&sb->s_dquot.dqonoff_mutex);
+	return err;
+}
+
+/**
+ * vzquota_put_super - one quota tree less on this superblock
+ *
+ * Called under vz_quota_sem.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+	/*
+	 * Even if this put is the last one,
+	 * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
+	 * won't be called and the remaining qmblk references won't be put.
+	 */
+}
+
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Helpers for inode -> qmblk link maintenance
+ *
+ * --------------------------------------------------------------------- */
+
+#define __VZ_QUOTA_EMPTY	((void *)0xbdbdbdbd)
+#define VZ_QUOTA_IS_NOQUOTA(qm, sb)	((qm)->dq_flags & VZDQ_NOQUOT)
+#define VZ_QUOTA_EMPTY_IOPS	(&vfs_empty_iops)
+extern struct inode_operations vfs_empty_iops;
+
+static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
+{
+	struct vz_quota_master *qmblk;
+
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk == VZ_QUOTA_BAD)
+		return 1;
+	if (qmblk == __VZ_QUOTA_EMPTY)
+		return 0;
+	if (qmblk->dq_flags & VZDQ_NOACT)
+		/* not actual (invalidated) qmblk */
+		return 0;
+	return 1;
+}
+
+static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
+{
+	return qlnk->qmblk == __VZ_QUOTA_EMPTY;
+}
+
+static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk,
+		unsigned char origin)
+{
+	qlnk->origin[0] = qlnk->origin[1];
+	qlnk->origin[1] = origin;
+}
+
+static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
+{
+	qlnk->qmblk = __VZ_QUOTA_EMPTY;
+	set_qlnk_origin(qlnk, VZ_QUOTAO_SETE);
+}
+
+void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
+{
+	memset(qlnk, 0, sizeof(*qlnk));
+	INIT_LIST_HEAD(&qlnk->list);
+	vzquota_qlnk_set_empty(qlnk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_INIT);
+}
+
+void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
+{
+	might_sleep();
+	if (vzquota_qlnk_is_empty(qlnk))
+		return;
+#if defined(CONFIG_VZ_QUOTA_UGID)
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
+		struct vz_quota_master *qmblk;
+		struct vz_quota_ugid *quid, *qgid;
+		qmblk = qlnk->qmblk;
+		quid = qlnk->qugid[USRQUOTA];
+		qgid = qlnk->qugid[GRPQUOTA];
+		if (quid != NULL || qgid != NULL) {
+			down(&qmblk->dq_sem);
+			if (qgid != NULL)
+				vzquota_put_ugid(qmblk, qgid);
+			if (quid != NULL)
+				vzquota_put_ugid(qmblk, quid);
+			up(&qmblk->dq_sem);
+		}
+	}
+#endif
+	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
+		qmblk_put(qlnk->qmblk);
+	set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR);
+}
+
+/**
+ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
+ * @qlt: temporary
+ * @qli: inode's
+ *
+ * Locking is provided by the caller (depending on the context).
+ * After swap, @qli is inserted into the corresponding dq_ilink_list,
+ * @qlt list is reinitialized.
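+ *
+ * (Typical usage, an illustrative sketch based on vzquota_inode_drop()
+ * below:
+ *	vzquota_qlnk_init(&qlnk);
+ *	inode_qmblk_lock(inode->i_sb);
+ *	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
+ *	inode_qmblk_unlock(inode->i_sb);
+ *	vzquota_qlnk_destroy(&qlnk);
+ * so the potentially sleeping destroy runs outside the spinlocks.)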
+ */ +static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, + struct vz_quota_ilink *qli) +{ + struct vz_quota_master *qb; + struct vz_quota_ugid *qu; + int i; + + qb = qlt->qmblk; + qlt->qmblk = qli->qmblk; + qli->qmblk = qb; + list_del_init(&qli->list); + if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) + list_add(&qli->list, &qb->dq_ilink_list); + INIT_LIST_HEAD(&qlt->list); + set_qlnk_origin(qli, VZ_QUOTAO_SWAP); + + for (i = 0; i < MAXQUOTAS; i++) { + qu = qlt->qugid[i]; + qlt->qugid[i] = qli->qugid[i]; + qli->qugid[i] = qu; + } +} + +/** + * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + */ +static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, + struct inode *inode) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + if (qlnk->qmblk == VZ_QUOTA_BAD) { + vzquota_qlnk_set_empty(qlnk); + set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK); + return 0; + } + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + return 1; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content + * + * Similar to vzquota_qlnk_reinit_locked, called under different locks. + */ +static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + /* may be optimized if qlnk->qugid all NULLs */ + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); + return 1; +} +#endif + +/** + * vzquota_qlnk_fill - fill vz_quota_ilink content + * @qlnk: vz_quota_ilink to fill + * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) + * @qmblk: qmblk to which this @qlnk will belong + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + * @qlnk is expected to be empty. + */ +static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (qmblk != VZ_QUOTA_BAD) + qmblk_get(qmblk); + qlnk->qmblk = qmblk; + +#if defined(CONFIG_VZ_QUOTA_UGID) + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + (qmblk->dq_flags & VZDQUG_ON)) { + struct vz_quota_ugid *quid, *qgid; + + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + + down(&qmblk->dq_sem); + quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); + qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); + up(&qmblk->dq_sem); + + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + qlnk->qugid[USRQUOTA] = quid; + qlnk->qugid[GRPQUOTA] = qgid; + return 1; + } +#endif + + return 0; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid + * + * This function is a helper for vzquota_transfer, and differs from + * vzquota_qlnk_fill only by locking. 
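+ *
+ * (Mask semantics, illustrative: a chown changing only the owner passes
+ * mask == (1 << USRQUOTA), so the group pointer is copied from the inode's
+ * current qlnk while a fresh uid entry is looked up under dq_sem.)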
+ */
+static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
+		struct inode *inode,
+		struct iattr *iattr,
+		int mask,
+		struct vz_quota_master *qmblk)
+{
+	qmblk_get(qmblk);
+	qlnk->qmblk = qmblk;
+
+	if (mask) {
+		struct vz_quota_ugid *quid, *qgid;
+
+		quid = qgid = NULL; /* to make gcc happy */
+		if (!(mask & (1 << USRQUOTA)))
+			quid = vzquota_get_ugid(INODE_QLNK(inode)->
+					qugid[USRQUOTA]);
+		if (!(mask & (1 << GRPQUOTA)))
+			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
+					qugid[GRPQUOTA]);
+
+		qmblk_data_write_unlock(qmblk);
+		inode_qmblk_unlock(inode->i_sb);
+
+		down(&qmblk->dq_sem);
+		if (mask & (1 << USRQUOTA))
+			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
+					USRQUOTA, 0);
+		if (mask & (1 << GRPQUOTA))
+			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
+					GRPQUOTA, 0);
+		up(&qmblk->dq_sem);
+
+		inode_qmblk_lock(inode->i_sb);
+		qmblk_data_write_lock(qmblk);
+		qlnk->qugid[USRQUOTA] = quid;
+		qlnk->qugid[GRPQUOTA] = qgid;
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
+/**
+ * __vzquota_inode_init - make sure inode's qlnk is initialized
+ *
+ * May be called if qlnk is already initialized, detects this situation itself.
+ * Called under inode_qmblk_lock.
+ */
+static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
+{
+	if (inode->i_dquot[USRQUOTA] == NODQUOT) {
+		vzquota_qlnk_init(INODE_QLNK(inode));
+		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT;
+	}
+	set_qlnk_origin(INODE_QLNK(inode), origin);
+}
+
+/**
+ * vzquota_inode_drop - destroy VZ quota information in the inode
+ *
+ * Inode must not be externally accessible or dirty.
+ */
+static void vzquota_inode_drop(struct inode *inode)
+{
+	struct vz_quota_ilink qlnk;
+
+	vzquota_qlnk_init(&qlnk);
+	inode_qmblk_lock(inode->i_sb);
+	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
+	inode->i_dquot[USRQUOTA] = NODQUOT;
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+}
+
+/**
+ * vzquota_inode_qmblk_set - initialize inode's qlnk
+ * @inode: inode to be initialized
+ * @qmblk: quota master block to which this inode should belong (may be BAD)
+ * @qlnk: placeholder to store data to resolve locking issues
+ *
+ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
+ * Called under dcache_lock and inode_qmblk locks.
+ * @qlnk will be destroyed in the caller chain.
+ *
+ * It is not mandatory to restart parent checks since quota on/off currently
+ * shrinks the dentry tree and checks that there are no outside references.
+ * But if at some time that shrink is removed, restarts will be required.
+ * Additionally, the restarts prevent inconsistencies if the dentry tree
+ * changes (inode is moved). This is not a big deal, but anyway...
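+ *
+ * (Caller pattern, an illustrative sketch mirroring vzquota_on_qmblk()
+ * below:
+ *	while (1) {
+ *		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk))
+ *			break;
+ *	}
+ * where a return of 1 means the locks were dropped inside and the caller's
+ * checks must be redone before retrying.)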
+ */ +static int vzquota_inode_qmblk_set(struct inode *inode, + struct vz_quota_master *qmblk, + struct vz_quota_ilink *qlnk) +{ + if (qmblk == NULL) { + printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, " + "dev %s, inode %lu, fs %s\n", + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "current %d (%s), VE %d\n", + current->pid, current->comm, + VEID(get_exec_env())); + dump_stack(); + qmblk = VZ_QUOTA_BAD; + } + while (1) { + if (vzquota_qlnk_is_empty(qlnk) && + vzquota_qlnk_fill(qlnk, inode, qmblk)) + return 1; + if (qlnk->qmblk == qmblk) + break; + if (vzquota_qlnk_reinit_locked(qlnk, inode)) + return 1; + } + vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * vzquota_inode_qmblk (inode -> qmblk lookup) parts + * + * --------------------------------------------------------------------- */ + +static int vzquota_dparents_check_attach(struct inode *inode) +{ + if (!list_empty(&inode->i_dentry)) + return 0; + printk(KERN_ERR "VZDQ: no parent for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + return -1; +} + +static struct inode *vzquota_dparents_check_actual(struct inode *inode) +{ + struct dentry *de; + + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + /* first access to parent, make sure its qlnk initialized */ + __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); + if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) + return de->d_parent->d_inode; + } + return NULL; +} + +static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) +{ + struct dentry *de; + struct vz_quota_master *qmblk; + + qmblk = NULL; + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + if (qmblk == NULL) { + qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; + continue; + } + if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { + printk(KERN_WARNING "VZDQ: multiple quotas for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + break; + } + } + if (qmblk == NULL) { + printk(KERN_WARNING "VZDQ: not attached to tree, " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + } + return qmblk; +} + +static void vzquota_dbranch_actualize(struct inode *inode, + struct inode *refinode) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + +start: + if (inode == inode->i_sb->s_root->d_inode) { + /* filesystem root */ + atomic_inc(&inode->i_count); + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); + goto out; + } + + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + inode = pinode; + goto start; + } + } + + atomic_inc(&inode->i_count); + while (1) { + if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ + break; + /* + * Need to check parents again if we have slept inside + * vzquota_inode_qmblk_set() in the loop. 
+		 * If the state of parents is different, just return and repeat
+		 * the actualizing process from the inode passed to
+		 * vzquota_inode_qmblk_recalc().
+		 */
+		if (!vzquota_dparents_check_attach(inode)) {
+			if (vzquota_dparents_check_actual(inode) != NULL)
+				break;
+			qmblk = vzquota_dparents_check_same(inode);
+		} else
+			qmblk = VZ_QUOTA_BAD;
+		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */
+			set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT);
+			break;
+		}
+	}
+
+out:
+	spin_unlock(&dcache_lock);
+	inode_qmblk_unlock(refinode->i_sb);
+	vzquota_qlnk_destroy(&qlnk);
+	iput(inode);
+	inode_qmblk_lock(refinode->i_sb);
+	spin_lock(&dcache_lock);
+}
+
+static void vzquota_dtree_qmblk_recalc(struct inode *inode,
+		struct vz_quota_ilink *qlnk)
+{
+	struct inode *pinode;
+	struct vz_quota_master *qmblk;
+
+	if (inode == inode->i_sb->s_root->d_inode) {
+		/* filesystem root */
+		do {
+			qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+		} while (vzquota_inode_qmblk_set(inode, qmblk, qlnk));
+		return;
+	}
+
+start:
+	if (VZ_QUOTA_IS_ACTUAL(inode))
+		return;
+	/*
+	 * Here qmblk is (re-)initialized for all ancestors.
+	 * This is not a very efficient procedure, but it guarantees that
+	 * the quota tree is consistent (that is, the inode doesn't have two
+	 * ancestors with different qmblk).
+	 */
+	if (!vzquota_dparents_check_attach(inode)) {
+		pinode = vzquota_dparents_check_actual(inode);
+		if (pinode != NULL) {
+			vzquota_dbranch_actualize(pinode, inode);
+			goto start;
+		}
+		qmblk = vzquota_dparents_check_same(inode);
+	} else
+		qmblk = VZ_QUOTA_BAD;
+
+	if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
+		goto start;
+	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE);
+}
+
+static void vzquota_det_qmblk_recalc(struct inode *inode,
+		struct vz_quota_ilink *qlnk)
+{
+	struct inode *parent;
+	struct vz_quota_master *qmblk;
+	char *msg;
+	int cnt;
+	time_t timeout;
+
+	cnt = 0;
+	parent = NULL;
+start:
+	/*
+	 * The qmblk of detached inodes shouldn't be considered not actual.
+	 * They are not in any dentry tree, so quota on/off shouldn't affect
+	 * them.
+	 */
+	if (!vzquota_qlnk_is_empty(INODE_QLNK(inode)))
+		return;
+
+	timeout = 3;
+	qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+	/*
+	 * Scenario:
+	 *	open
+	 *	unlink
+	 *	quotaon
+	 *	generic_delete_inode
+	 *
+	 * This is the first time vzquota sees this inode. The inode is
+	 * outside of vzquota's area of interest; otherwise quotaon would
+	 * have got -EBUSY due to shrink_dcache_parent().
+	 * The inode is almost completely destroyed, so don't intervene.
+	 *
+	 * dev@:
+	 * However, there is a small race here...
+	 * dput() first removes itself from all the lists,
+	 * so shrink_dcache_parent() can succeed while dentry_iput is not
+	 * done yet.
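	 *
	 * (Race timeline, an illustrative reading of the note above:
	 *	CPU0: dput() drops the dentry from the lists
	 *	CPU1: quotaon -> shrink_dcache_parent() succeeds, quota is on
	 *	CPU0: dentry_iput() -> iput() -> generic_delete_inode()
	 * so vzquota can meet a freeing inode it has never seen; the
	 * I_FREEING check below covers exactly this window.)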
+ */ + if (inode->i_state & I_FREEING) + goto set; + + msg = "detached inode not in creation"; + if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) + goto fail; + qmblk = VZ_QUOTA_BAD; + msg = "unexpected creation context"; + if (!vzquota_cur_qmblk_check()) + goto fail; + timeout = 0; + parent = vzquota_cur_qmblk_fetch(); + msg = "uninitialized parent"; + if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) + goto fail; + msg = "parent not in tree"; + if (list_empty(&parent->i_dentry)) + goto fail; + msg = "parent has 0 refcount"; + if (!atomic_read(&parent->i_count)) + goto fail; + msg = "parent has different sb"; + if (parent->i_sb != inode->i_sb) + goto fail; + if (!VZ_QUOTA_IS_ACTUAL(parent)) { + vzquota_dbranch_actualize(parent, inode); + goto start; + } + + qmblk = INODE_QLNK(parent)->qmblk; +set: + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET); + return; + +fail: + { + struct timeval tv, tvo; + do_gettimeofday(&tv); + memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); + tv.tv_sec -= tvo.tv_sec; + if (tv.tv_usec < tvo.tv_usec) { + tv.tv_sec--; + tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; + } else + tv.tv_usec -= tvo.tv_usec; + if (tv.tv_sec < timeout) + goto set; + printk(KERN_ERR "VZDQ: %s, orig {%u, %u}," + " dev %s, inode %lu, fs %s\n", + msg, + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count)); + printk(KERN_ERR "i_mode %o, ", inode->i_mode); + printk(KERN_ERR "i_state %lx, ", inode->i_state); + printk(KERN_ERR "i_flags %x\n", inode->i_flags); + printk(KERN_ERR "i_op %p, vfs_empty_iops %p, " + "i_fop %p, i_mapping %p\n", + inode->i_op, &vfs_empty_iops, + inode->i_fop, inode->i_mapping); + if (!cnt++) { + printk(KERN_ERR "current %d (%s), VE %d," + " time %ld.%06ld\n", + current->pid, current->comm, + VEID(get_exec_env()), + tv.tv_sec, (long)tv.tv_usec); + dump_stack(); + } + if (parent != NULL) + printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", + inode->i_ino, parent->i_ino); + } + goto set; +} + +static void vzquota_inode_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_dtree_qmblk_recalc(inode, qlnk); + else + vzquota_det_qmblk_recalc(inode, qlnk); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_qmblk - obtain inode's qmblk + * + * Returns qmblk with refcounter taken, %NULL if not under + * VZ quota or %VZ_QUOTA_BAD. + * + * FIXME: This function should be removed when vzquota_find_qmblk / + * get_quota_root / vzquota_dstat code is cleaned up. 
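+ *
+ * (Caller obligation, illustrative; see vzquota_dstat() below: a result
+ * other than %NULL and %VZ_QUOTA_BAD carries a reference that the caller
+ * must eventually drop with qmblk_put().)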
+ */ +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + might_sleep(); + + if (inode->i_sb->dq_op != &vz_quota_operations) + return NULL; +#if defined(VZ_QUOTA_UNLOAD) +#error Make sure qmblk does not disappear +#endif + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) + qmblk_get(qmblk); + else + qmblk = NULL; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); + return qmblk; +} + +/** + * vzquota_find_qmblk - helper to emulate quota on virtual filesystems + * + * This function finds a quota master block corresponding to the root of + * a virtual filesystem. + * Returns a quota master block with reference taken, or %NULL if not under + * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation + * operations will fail). + * + * Note: this function uses vzquota_inode_qmblk(). + * The latter is a rather confusing function: it returns qmblk that used to be + * on the inode some time ago (without guarantee that it still has any + * relations to the inode). So, vzquota_find_qmblk() leaves it up to the + * caller to think whether the inode could have changed its qmblk and what to + * do in that case. + * Currently, the callers appear to not care :( + */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) +{ + struct inode *qrinode; + struct vz_quota_master *qmblk; + + qmblk = NULL; + qrinode = NULL; + if (sb->s_op->get_quota_root != NULL) + qrinode = sb->s_op->get_quota_root(sb); + if (qrinode != NULL) + qmblk = vzquota_inode_qmblk(qrinode); + return qmblk; +} + +/* ---------------------------------------------------------------------- + * + * Calls from quota operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_inode_init_call - call from DQUOT_INIT + */ +void vzquota_inode_init_call(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* initializes inode's quota inside */ + qmblk = vzquota_inode_data(inode, &data); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + vzquota_data_unlock(inode, &data); + + /* + * The check is needed for repeated new_inode() calls from a single + * ext3 call like create or mkdir in case of -ENOSPC. + */ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_cur_qmblk_set(inode); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_drop_call - call from DQUOT_DROP + */ +void vzquota_inode_drop_call(struct inode *inode) +{ + vzquota_inode_drop(inode); +} + +/** + * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs + * @inode: the inode + * @data: storage space + * + * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. + * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: + * qmblk in inode's qlnk is the same as returned, + * ugid pointers inside inode's qlnk are valid, + * some locks are taken (and should be released by vzquota_data_unlock). + * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. 
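+ *
+ * (Usage sketch, illustrative, mirroring vzquota_inode_init_call() above:
+ *	qmblk = vzquota_inode_data(inode, &data);
+ *	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) {
+ *		... charge or uncharge under the locks taken here ...
+ *		vzquota_data_unlock(inode, &data);
+ *	}
+ * )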
+ */ +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *data) +{ + struct vz_quota_master *qmblk; + + might_sleep(); + + vzquota_qlnk_init(&data->qlnk); + inode_qmblk_lock(inode->i_sb); + if (unlikely(inode->i_flags & S_NOQUOTA)) { + inode_qmblk_unlock(inode->i_sb); + return NULL; + } + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &data->qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { + /* + * Note that in the current implementation, + * inode_qmblk_lock can theoretically be dropped here. + * This place is serialized with quota_off because + * quota_off fails when there are extra dentry + * references and syncs inodes before removing quota + * information from them. + * However, quota usage information should stop being + * updated immediately after vzquota_off. + */ + qmblk_data_write_lock(qmblk); + } else { + inode_qmblk_unlock(inode->i_sb); + qmblk = NULL; + } + } else { + inode_qmblk_unlock(inode->i_sb); + } + return qmblk; +} + +void vzquota_data_unlock(struct inode *inode, + struct vz_quota_datast *data) +{ + qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&data->qlnk); +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_inode_transfer_call - call from vzquota_transfer + */ +int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + struct vz_quota_ilink qlnew; + int mask; + int ret; + + might_sleep(); + vzquota_qlnk_init(&qlnew); +start: + qmblk = vzquota_inode_data(inode, &data); + ret = NO_QUOTA; + if (qmblk == VZ_QUOTA_BAD) + goto out_destr; + ret = QUOTA_OK; + if (qmblk == NULL) + goto out_destr; + qmblk_get(qmblk); + + ret = QUOTA_OK; + if (!(qmblk->dq_flags & VZDQUG_ON)) + /* no ugid quotas */ + goto out_unlock; + + mask = 0; + if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) + mask |= 1 << USRQUOTA; + if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) + mask |= 1 << GRPQUOTA; + while (1) { + if (vzquota_qlnk_is_empty(&qlnew) && + vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) + break; + if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && + qlnew.qmblk == qmblk) + goto finish; + if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) + break; + } + + /* prepare for restart */ + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); + goto start; + +finish: + /* all references obtained successfully */ + ret = vzquota_transfer_usage(inode, mask, &qlnew); + if (!ret) { + vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS); + } +out_unlock: + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); +out_destr: + vzquota_qlnk_destroy(&qlnew); + return ret; +} +#endif + +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk1, qlnk2; + int c, ret; + + if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) + return -1; + + might_sleep(); + + vzquota_qlnk_init(&qlnk1); + vzquota_qlnk_init(&qlnk2); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); + __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL); + + do { + c = 0; + if 
(vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+		    !VZ_QUOTA_IS_ACTUAL(inode)) {
+			vzquota_inode_qmblk_recalc(inode, &qlnk1);
+			c++;
+		}
+		if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
+		    !VZ_QUOTA_IS_ACTUAL(new_dir)) {
+			vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
+			c++;
+		}
+	} while (c);
+
+	ret = 0;
+	qmblk = INODE_QLNK(inode)->qmblk;
+	if (qmblk != INODE_QLNK(new_dir)->qmblk) {
+		ret = -1;
+		if (qmblk != VZ_QUOTA_BAD &&
+		    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+		    qmblk->dq_root_dentry->d_inode == inode &&
+		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
+					inode->i_sb) &&
+		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
+					inode->i_sb))
+			/* quota root rename is allowed */
+			ret = 0;
+	}
+
+	inode_qmblk_unlock(inode->i_sb);
+	vzquota_qlnk_destroy(&qlnk2);
+	vzquota_qlnk_destroy(&qlnk1);
+	return ret;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * qmblk-related parts of on/off operations
+ *
+ * --------------------------------------------------------------------- */
+
+/**
+ * vzquota_check_dtree - check whether the dentry tree allows quota on/off
+ *
+ * This function doesn't allow quota to be turned on/off if some dentries in
+ * the tree have external references.
+ * In addition to technical reasons, it enforces user-space correctness:
+ * current usage (taken from or reported to the user space) can be meaningful
+ * and accurate only if the tree is not being modified.
+ * Side effect: additional vfsmount structures referencing the tree (bind
+ * mounts of tree nodes to some other places) are not allowed at on/off time.
+ */
+int vzquota_check_dtree(struct vz_quota_master *qmblk, int off)
+{
+	struct dentry *dentry;
+	int err, count;
+
+	err = -EBUSY;
+	dentry = qmblk->dq_root_dentry;
+
+	if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root)
+		goto unhashed;
+
+	/* attempt to shrink */
+	if (!list_empty(&dentry->d_subdirs)) {
+		spin_unlock(&dcache_lock);
+		inode_qmblk_unlock(dentry->d_sb);
+		shrink_dcache_parent(dentry);
+		inode_qmblk_lock(dentry->d_sb);
+		spin_lock(&dcache_lock);
+		if (!list_empty(&dentry->d_subdirs))
+			goto out;
+
+		count = 1;
+		if (dentry == dentry->d_sb->s_root)
+			count += 2;	/* sb and mnt refs */
+		if (atomic_read(&dentry->d_count) < count) {
+			printk(KERN_ERR "%s: too small count %d vs %d.\n",
+					__FUNCTION__,
+					atomic_read(&dentry->d_count), count);
+			goto out;
+		}
+		if (atomic_read(&dentry->d_count) > count)
+			goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+
+unhashed:
+	/*
+	 * Quota root is removed.
+	 * Allow turning quota off, but not on.
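	 *
	 * (Callers, illustrative: vzquota_on_qmblk() below passes off == 0,
	 * so an unhashed quota root leaves err at -EBUSY, while
	 * vzquota_off_qmblk() passes off == 1 and is still allowed to
	 * proceed.)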
+ */ + if (off) + err = 0; + goto out; +} + +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk) +{ + struct vz_quota_ilink qlnk; + struct vz_quota_master *qold, *qnew; + int err; + + might_sleep(); + + qold = NULL; + qnew = vzquota_alloc_fake(); + if (qnew == NULL) + return -ENOMEM; + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + spin_lock(&dcache_lock); + while (1) { + err = vzquota_check_dtree(qmblk, 0); + if (err) + break; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) + break; + } + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON); + spin_unlock(&dcache_lock); + + if (!err) { + qold = __VZ_QUOTA_NOQUOTA(sb); + qold->dq_flags |= VZDQ_NOACT; + __VZ_QUOTA_NOQUOTA(sb) = qnew; + } + + inode_qmblk_unlock(sb); + vzquota_qlnk_destroy(&qlnk); + if (qold != NULL) + qmblk_put(qold); + + return err; +} + +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk) +{ + int ret; + + ret = 0; + inode_qmblk_lock(sb); + + spin_lock(&dcache_lock); + if (vzquota_check_dtree(qmblk, 1)) + ret = -EBUSY; + spin_unlock(&dcache_lock); + + if (!ret) + qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; + inode_qmblk_unlock(sb); + return ret; +} + + +/* ---------------------------------------------------------------------- + * + * External interfaces + * + * ---------------------------------------------------------------------*/ + +static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_QUOTA_NEW_CTL: { + struct vzctl_quotactl qb; + + err = -EFAULT; + if (copy_from_user(&qb, (void __user *)arg, sizeof(qb))) + break; + err = do_vzquotactl(qb.cmd, qb.quota_id, + qb.qstat, qb.ve_root, 0); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_QUOTA_UGID_CTL: { + struct vzctl_quotaugidctl qub; + + err = -EFAULT; + if (copy_from_user(&qub, (void __user *)arg, sizeof(qub))) + break; + err = do_vzquotaugidctl(qub.cmd, qub.quota_id, + qub.ugid_index, qub.ugid_size, qub.addr, 0); + break; + } +#endif + default: + err = -ENOTTY; + } + return err; +} + +#ifdef CONFIG_COMPAT +static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_COMPAT_QUOTA_CTL: { + struct compat_vzctl_quotactl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = do_vzquotactl(cs.cmd, cs.quota_id, + compat_ptr(cs.qstat), + compat_ptr(cs.ve_root), 1); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_COMPAT_QUOTA_UGID_CTL: { + struct compat_vzctl_quotaugidctl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index, + cs.ugid_size, compat_ptr(cs.addr), 1); + break; + } +#endif + default: + err = -ENOIOCTLCMD; + } + return err; +} +#endif + +static struct vzioctlinfo vzdqcalls = { + .type = VZDQCTLTYPE, + .ioctl = vzquota_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzquota_ioctl, +#endif + .owner = THIS_MODULE, +}; + +/** + * vzquota_dstat - get quota usage info for virtual superblock + */ +static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) +{ + struct vz_quota_master *qmblk; + + qmblk = vzquota_find_qmblk(super); + if (qmblk == NULL) + return -ENOENT; + if (qmblk == VZ_QUOTA_BAD) { + memset(qstat, 0, sizeof(*qstat)); + return 0; + } + + qmblk_data_read_lock(qmblk); + memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); + 
qmblk_data_read_unlock(qmblk); + qmblk_put(qmblk); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * Init/exit helpers + * + * ---------------------------------------------------------------------*/ + +static int vzquota_cache_init(void) +{ + int i; + + vzquota_cachep = kmem_cache_create("vz_quota_master", + sizeof(struct vz_quota_master), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (vzquota_cachep == NULL) { + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + goto nomem2; + } + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + INIT_LIST_HEAD(&vzquota_hash_table[i]); + + return 0; + +nomem2: + return -ENOMEM; +} + +static void vzquota_cache_release(void) +{ + int i; + + /* sanity check */ + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + if (!list_empty(&vzquota_hash_table[i])) + BUG(); + + /* release caches */ + if (kmem_cache_destroy(vzquota_cachep)) + printk(KERN_ERR + "VZQUOTA: vz_quota_master kmem_cache_destroy failed\n"); + vzquota_cachep = NULL; +} + +static int quota_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int err) +{ + struct virt_info_quota *viq; + struct super_block *sb; + + viq = (struct virt_info_quota *)data; + switch (n) { + case VIRTINFO_QUOTA_ON: + err = NOTIFY_BAD; + if (!try_module_get(THIS_MODULE)) + break; + sb = viq->super; + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_OFF: + module_put(THIS_MODULE); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_GETSTAT: + err = NOTIFY_BAD; + if (vzquota_dstat(viq->super, viq->qstat)) + break; + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_DISABLE: + err = NOTIFY_OK; + vzquota_inode_off((struct inode *)data); + break; + } + return err; +} + +struct vnotifier_block quota_notifier_block = { + .notifier_call = quota_notifier_call, + .priority = INT_MAX, +}; + +/* ---------------------------------------------------------------------- + * + * Init/exit procedures + * + * ---------------------------------------------------------------------*/ + +static int __init vzquota_init(void) +{ + int err; + + if ((err = vzquota_cache_init()) != 0) + goto out_cache; + + if ((err = vzquota_proc_init()) != 0) + goto out_proc; + +#ifdef CONFIG_VZ_QUOTA_UGID + if ((err = vzquota_ugid_init()) != 0) + goto out_ugid; +#endif + + init_MUTEX(&vz_quota_sem); + vzioctl_register(&vzdqcalls); + virtinfo_notifier_register(VITYPE_QUOTA, "a_notifier_block); +#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) + vzaquota_init(); +#endif + + return 0; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_ugid: + vzquota_proc_release(); +#endif +out_proc: + vzquota_cache_release(); +out_cache: + return err; +} + +#if defined(VZ_QUOTA_UNLOAD) +static void __exit vzquota_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, "a_notifier_block); + vzioctl_unregister(&vzdqcalls); +#ifdef CONFIG_VZ_QUOTA_UGID +#ifdef CONFIG_PROC_FS + vzaquota_fini(); +#endif + vzquota_ugid_release(); +#endif + vzquota_proc_release(); + vzquota_cache_release(); +} +#endif + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Disk Quota"); +MODULE_LICENSE("GPL v2"); + +module_init(vzquota_init) +#if defined(VZ_QUOTA_UNLOAD) +module_exit(vzquota_release) +#endif diff -uprN linux-2.6.18/include/Kbuild linux-2.6.18.ovz/include/Kbuild --- linux-2.6.18/include/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ 
linux-2.6.18.ovz/include/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,9 @@ -header-y += asm-generic/ linux/ scsi/ sound/ mtd/ rdma/ video/ -header-y += asm-$(ARCH)/ +header-y += asm-generic/ +header-y += linux/ +header-y += scsi/ +header-y += sound/ +header-y += mtd/ +header-y += rdma/ +header-y += video/ + +header-y += asm-$(ARCH)/ diff -uprN linux-2.6.18/include/asm-alpha/Kbuild linux-2.6.18.ovz/include/asm-alpha/Kbuild --- linux-2.6.18/include/asm-alpha/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-alpha/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,5 +1,11 @@ include include/asm-generic/Kbuild.asm -unifdef-y += console.h fpu.h sysinfo.h compiler.h +header-y += gentrap.h +header-y += regdef.h +header-y += pal.h +header-y += reg.h -header-y += gentrap.h regdef.h pal.h reg.h +unifdef-y += console.h +unifdef-y += fpu.h +unifdef-y += sysinfo.h +unifdef-y += compiler.h diff -uprN linux-2.6.18/include/asm-arm/elf.h linux-2.6.18.ovz/include/asm-arm/elf.h --- linux-2.6.18/include/asm-arm/elf.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-arm/elf.h 2007-06-13 06:55:07.000000000 -0400 @@ -8,9 +8,6 @@ #include #include -#ifdef __KERNEL -#include -#endif typedef unsigned long elf_greg_t; typedef unsigned long elf_freg_t[3]; @@ -32,11 +29,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGR typedef struct user_fp elf_fpregset_t; /* - * This is used to ensure we don't load something for the wrong architecture. - */ -#define elf_check_arch(x) ( ((x)->e_machine == EM_ARM) && (ELF_PROC_OK((x))) ) - -/* * These are used to set parameters in the core dumps. */ #define ELF_CLASS ELFCLASS32 @@ -47,6 +39,14 @@ typedef struct user_fp elf_fpregset_t; #endif #define ELF_ARCH EM_ARM +#ifdef __KERNEL__ +#include + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) ( ((x)->e_machine == EM_ARM) && (ELF_PROC_OK((x))) ) + #define USE_ELF_CORE_DUMP #define ELF_EXEC_PAGESIZE 4096 @@ -83,8 +83,6 @@ typedef struct user_fp elf_fpregset_t; extern char elf_platform[]; #define ELF_PLATFORM (elf_platform) -#ifdef __KERNEL__ - /* * 32-bit code is always OK. Some cpus can do 26-bit, some can't. 
*/ diff -uprN linux-2.6.18/include/asm-arm/page.h linux-2.6.18.ovz/include/asm-arm/page.h --- linux-2.6.18/include/asm-arm/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-arm/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -11,13 +11,13 @@ #define _ASMARM_PAGE_H +#ifdef __KERNEL__ + /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#ifdef __KERNEL__ - /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) diff -uprN linux-2.6.18/include/asm-arm/unistd.h linux-2.6.18.ovz/include/asm-arm/unistd.h --- linux-2.6.18/include/asm-arm/unistd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-arm/unistd.h 2007-06-13 06:55:07.000000000 -0400 @@ -347,6 +347,19 @@ #define __NR_mbind (__NR_SYSCALL_BASE+319) #define __NR_get_mempolicy (__NR_SYSCALL_BASE+320) #define __NR_set_mempolicy (__NR_SYSCALL_BASE+321) +#define __NR_openat (__NR_SYSCALL_BASE+322) +#define __NR_mkdirat (__NR_SYSCALL_BASE+323) +#define __NR_mknodat (__NR_SYSCALL_BASE+324) +#define __NR_fchownat (__NR_SYSCALL_BASE+325) +#define __NR_futimesat (__NR_SYSCALL_BASE+326) +#define __NR_fstatat64 (__NR_SYSCALL_BASE+327) +#define __NR_unlinkat (__NR_SYSCALL_BASE+328) +#define __NR_renameat (__NR_SYSCALL_BASE+329) +#define __NR_linkat (__NR_SYSCALL_BASE+330) +#define __NR_symlinkat (__NR_SYSCALL_BASE+331) +#define __NR_readlinkat (__NR_SYSCALL_BASE+332) +#define __NR_fchmodat (__NR_SYSCALL_BASE+333) +#define __NR_faccessat (__NR_SYSCALL_BASE+334) /* * The following SWIs are ARM private. diff -uprN linux-2.6.18/include/asm-arm26/tlbflush.h linux-2.6.18.ovz/include/asm-arm26/tlbflush.h --- linux-2.6.18/include/asm-arm26/tlbflush.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-arm26/tlbflush.h 2007-06-13 06:55:07.000000000 -0400 @@ -25,7 +25,7 @@ static inline void memc_update_all(void) { struct task_struct *p; cpu_memc_update_all(init_mm.pgd); - for_each_process(p) { + for_each_process_all(p) { if (!p->mm) continue; cpu_memc_update_all(p->mm->pgd); diff -uprN linux-2.6.18/include/asm-cris/Kbuild linux-2.6.18.ovz/include/asm-cris/Kbuild --- linux-2.6.18/include/asm-cris/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1,5 @@ include include/asm-generic/Kbuild.asm + +header-y += arch-v10/ arch-v32/ + +unifdef-y += rs485.h diff -uprN linux-2.6.18/include/asm-cris/arch-v10/Kbuild linux-2.6.18.ovz/include/asm-cris/arch-v10/Kbuild --- linux-2.6.18/include/asm-cris/arch-v10/Kbuild 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/asm-cris/arch-v10/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,2 @@ +header-y += ptrace.h +header-y += user.h diff -uprN linux-2.6.18/include/asm-cris/arch-v32/Kbuild linux-2.6.18.ovz/include/asm-cris/arch-v32/Kbuild --- linux-2.6.18/include/asm-cris/arch-v32/Kbuild 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/asm-cris/arch-v32/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,2 @@ +header-y += ptrace.h +header-y += user.h diff -uprN linux-2.6.18/include/asm-cris/byteorder.h linux-2.6.18.ovz/include/asm-cris/byteorder.h --- linux-2.6.18/include/asm-cris/byteorder.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/byteorder.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,14 +3,15 @@ #ifdef __GNUC__ +#ifdef __KERNEL__ #include /* defines are necessary because the other 
files detect the presence * of a defined __arch_swab32, not an inline */ - #define __arch__swab32(x) ___arch__swab32(x) #define __arch__swab16(x) ___arch__swab16(x) +#endif /* __KERNEL__ */ #if !defined(__STRICT_ANSI__) || defined(__KERNEL__) # define __BYTEORDER_HAS_U64__ diff -uprN linux-2.6.18/include/asm-cris/elf.h linux-2.6.18.ovz/include/asm-cris/elf.h --- linux-2.6.18/include/asm-cris/elf.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/elf.h 2007-06-13 06:55:07.000000000 -0400 @@ -5,7 +5,6 @@ * ELF register definitions.. */ -#include #include #define R_CRIS_NONE 0 @@ -46,6 +45,9 @@ typedef unsigned long elf_fpregset_t; #define ELF_DATA ELFDATA2LSB #define ELF_ARCH EM_CRIS +#ifdef __KERNEL__ +#include + /* The master for these definitions is {binutils}/include/elf/cris.h: */ /* User symbols in this file have a leading underscore. */ #define EF_CRIS_UNDERSCORE 0x00000001 @@ -87,8 +89,8 @@ typedef unsigned long elf_fpregset_t; #define ELF_PLATFORM (NULL) -#ifdef __KERNEL__ #define SET_PERSONALITY(ex, ibcs2) set_personality((ibcs2)?PER_SVR4:PER_LINUX) -#endif + +#endif /* __KERNEL__ */ #endif diff -uprN linux-2.6.18/include/asm-cris/page.h linux-2.6.18.ovz/include/asm-cris/page.h --- linux-2.6.18/include/asm-cris/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,8 @@ #ifndef _CRIS_PAGE_H #define _CRIS_PAGE_H +#ifdef __KERNEL__ + #include /* PAGE_SHIFT determines the page size */ @@ -12,8 +14,6 @@ #endif #define PAGE_MASK (~(PAGE_SIZE-1)) -#ifdef __KERNEL__ - #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) @@ -73,10 +73,10 @@ typedef struct { unsigned long pgprot; } #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ + #endif /* _CRIS_PAGE_H */ diff -uprN linux-2.6.18/include/asm-cris/posix_types.h linux-2.6.18.ovz/include/asm-cris/posix_types.h --- linux-2.6.18/include/asm-cris/posix_types.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/posix_types.h 2007-06-13 06:55:07.000000000 -0400 @@ -6,8 +6,6 @@ #ifndef __ARCH_CRIS_POSIX_TYPES_H #define __ARCH_CRIS_POSIX_TYPES_H -#include - /* * This file is generally used by user-level software, so you need to * be a little careful about namespace pollution etc. Also, we cannot @@ -53,9 +51,8 @@ typedef struct { #endif /* !defined(__KERNEL__) && !defined(__USE_ALL) */ } __kernel_fsid_t; -/* should this ifdef be here ? */ - -#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) +#ifdef __KERNEL__ +#include #undef __FD_SET #define __FD_SET(fd,fdsetp) set_bit(fd, (void *)(fdsetp)) @@ -69,6 +66,6 @@ typedef struct { #undef __FD_ZERO #define __FD_ZERO(fdsetp) memset((void *)(fdsetp), 0, __FDSET_LONGS << 2) -#endif /* defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) */ +#endif /* __KERNEL__ */ #endif /* __ARCH_CRIS_POSIX_TYPES_H */ diff -uprN linux-2.6.18/include/asm-cris/unistd.h linux-2.6.18.ovz/include/asm-cris/unistd.h --- linux-2.6.18/include/asm-cris/unistd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-cris/unistd.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,8 +1,6 @@ #ifndef _ASM_CRIS_UNISTD_H_ #define _ASM_CRIS_UNISTD_H_ -#include - /* * This file contains the system call numbers, and stub macros for libc. 
*/ @@ -299,6 +297,7 @@ #define NR_syscalls 289 +#include #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR @@ -322,7 +321,6 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#endif #ifdef __KERNEL_SYSCALLS__ diff -uprN linux-2.6.18/include/asm-generic/Kbuild linux-2.6.18.ovz/include/asm-generic/Kbuild --- linux-2.6.18/include/asm-generic/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-generic/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,3 +1,12 @@ -header-y += atomic.h errno-base.h errno.h fcntl.h ioctl.h ipc.h mman.h \ - signal.h statfs.h -unifdef-y := resource.h siginfo.h +header-y += atomic.h +header-y += errno-base.h +header-y += errno.h +header-y += fcntl.h +header-y += ioctl.h +header-y += ipc.h +header-y += mman.h +header-y += signal.h +header-y += statfs.h + +unifdef-y += resource.h +unifdef-y += siginfo.h diff -uprN linux-2.6.18/include/asm-generic/Kbuild.asm linux-2.6.18.ovz/include/asm-generic/Kbuild.asm --- linux-2.6.18/include/asm-generic/Kbuild.asm 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-generic/Kbuild.asm 2007-06-13 06:55:07.000000000 -0400 @@ -1,8 +1,34 @@ -unifdef-y += a.out.h auxvec.h byteorder.h errno.h fcntl.h ioctl.h \ - ioctls.h ipcbuf.h mman.h msgbuf.h param.h poll.h \ - posix_types.h ptrace.h resource.h sembuf.h shmbuf.h shmparam.h \ - sigcontext.h siginfo.h signal.h socket.h sockios.h stat.h \ - statfs.h termbits.h termios.h types.h unistd.h user.h +unifdef-y += a.out.h +unifdef-y += auxvec.h +unifdef-y += byteorder.h +unifdef-y += errno.h +unifdef-y += fcntl.h +unifdef-y += ioctl.h +unifdef-y += ioctls.h +unifdef-y += ipcbuf.h +unifdef-y += mman.h +unifdef-y += msgbuf.h +unifdef-y += param.h +unifdef-y += poll.h +unifdef-y += posix_types.h +unifdef-y += ptrace.h +unifdef-y += resource.h +unifdef-y += sembuf.h +unifdef-y += shmbuf.h +unifdef-y += sigcontext.h +unifdef-y += siginfo.h +unifdef-y += signal.h +unifdef-y += socket.h +unifdef-y += sockios.h +unifdef-y += stat.h +unifdef-y += statfs.h +unifdef-y += termbits.h +unifdef-y += termios.h +unifdef-y += types.h +unifdef-y += unistd.h +unifdef-y += user.h # These probably shouldn't be exported -unifdef-y += elf.h page.h +unifdef-y += shmparam.h +unifdef-y += elf.h +unifdef-y += page.h diff -uprN linux-2.6.18/include/asm-generic/audit_change_attr.h linux-2.6.18.ovz/include/asm-generic/audit_change_attr.h --- linux-2.6.18/include/asm-generic/audit_change_attr.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-generic/audit_change_attr.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,16 +1,20 @@ __NR_chmod, __NR_fchmod, +#ifdef __NR_chown __NR_chown, __NR_fchown, __NR_lchown, +#endif __NR_setxattr, __NR_lsetxattr, __NR_fsetxattr, __NR_removexattr, __NR_lremovexattr, __NR_fremovexattr, +#ifdef __NR_fchownat __NR_fchownat, __NR_fchmodat, +#endif #ifdef __NR_chown32 __NR_chown32, __NR_fchown32, diff -uprN linux-2.6.18/include/asm-generic/audit_dir_write.h linux-2.6.18.ovz/include/asm-generic/audit_dir_write.h --- linux-2.6.18/include/asm-generic/audit_dir_write.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-generic/audit_dir_write.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,14 +1,18 @@ __NR_rename, __NR_mkdir, __NR_rmdir, +#ifdef __NR_creat __NR_creat, +#endif __NR_link, __NR_unlink, __NR_symlink, __NR_mknod, +#ifdef __NR_mkdirat __NR_mkdirat, __NR_mknodat, __NR_unlinkat, __NR_renameat, __NR_linkat, __NR_symlinkat, +#endif diff -uprN 
linux-2.6.18/include/asm-h8300/page.h linux-2.6.18.ovz/include/asm-h8300/page.h --- linux-2.6.18/include/asm-h8300/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-h8300/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,7 @@ #ifndef _H8300_PAGE_H #define _H8300_PAGE_H +#ifdef __KERNEL__ /* PAGE_SHIFT determines the page size */ @@ -8,8 +9,6 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#ifdef __KERNEL__ - #include #ifndef __ASSEMBLY__ @@ -76,9 +75,9 @@ extern unsigned long memory_end; #endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ + #endif /* _H8300_PAGE_H */ diff -uprN linux-2.6.18/include/asm-i386/Kbuild linux-2.6.18.ovz/include/asm-i386/Kbuild --- linux-2.6.18/include/asm-i386/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,5 +1,10 @@ include include/asm-generic/Kbuild.asm -header-y += boot.h debugreg.h ldt.h ucontext.h +header-y += boot.h +header-y += debugreg.h +header-y += ldt.h +header-y += ucontext.h -unifdef-y += mtrr.h setup.h vm86.h +unifdef-y += mtrr.h +unifdef-y += setup.h +unifdef-y += vm86.h diff -uprN linux-2.6.18/include/asm-i386/bug.h linux-2.6.18.ovz/include/asm-i386/bug.h --- linux-2.6.18/include/asm-i386/bug.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/bug.h 2007-06-13 06:55:07.000000000 -0400 @@ -13,7 +13,10 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE #define BUG() \ __asm__ __volatile__( "ud2\n" \ + "\t.byte 0x66\n"\ + "\t.byte 0xb8\n" /* mov $xxx, %ax */\ "\t.word %c0\n" \ + "\t.byte 0xb8\n" /* mov $xxx, %eax */\ "\t.long %c1\n" \ : : "i" (__LINE__), "i" (__FILE__)) #else diff -uprN linux-2.6.18/include/asm-i386/bugs.h linux-2.6.18.ovz/include/asm-i386/bugs.h --- linux-2.6.18/include/asm-i386/bugs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/bugs.h 2007-06-13 06:55:07.000000000 -0400 @@ -189,6 +189,6 @@ static void __init check_bugs(void) check_fpu(); check_hlt(); check_popad(); - system_utsname.machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); } diff -uprN linux-2.6.18/include/asm-i386/elf.h linux-2.6.18.ovz/include/asm-i386/elf.h --- linux-2.6.18/include/asm-i386/elf.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/elf.h 2007-06-13 06:55:07.000000000 -0400 @@ -112,7 +112,7 @@ typedef struct user_fxsr_struct elf_fpxr For the moment, we have only optimizations for the Intel generations, but that could change... 
*/ -#define ELF_PLATFORM (system_utsname.machine) +#define ELF_PLATFORM (utsname()->machine) #define SET_PERSONALITY(ex, ibcs2) do { } while (0) @@ -164,7 +164,7 @@ extern int arch_setup_additional_pages(s extern unsigned int vdso_enabled; #define ARCH_DLINFO \ -do if (vdso_enabled) { \ +do if (vdso_enabled && sysctl_at_vsyscall) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE); \ } while (0) diff -uprN linux-2.6.18/include/asm-i386/mman.h linux-2.6.18.ovz/include/asm-i386/mman.h --- linux-2.6.18/include/asm-i386/mman.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/mman.h 2007-06-13 06:55:07.000000000 -0400 @@ -10,6 +10,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff -uprN linux-2.6.18/include/asm-i386/msr.h linux-2.6.18.ovz/include/asm-i386/msr.h --- linux-2.6.18/include/asm-i386/msr.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/msr.h 2007-06-13 06:55:07.000000000 -0400 @@ -78,6 +78,21 @@ static inline void wrmsrl (unsigned long : "=a" (low), "=d" (high) \ : "c" (counter)) +#ifdef CONFIG_SMP +void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); +void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +#else +static inline void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + rdmsr(msr_no, *l, *h); +} + +static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + wrmsr(msr_no, l, h); +} +#endif + /* symbolic names for some interesting MSRs */ /* Intel defined MSRs. */ #define MSR_IA32_P5_MC_ADDR 0 diff -uprN linux-2.6.18/include/asm-i386/nmi.h linux-2.6.18.ovz/include/asm-i386/nmi.h --- linux-2.6.18/include/asm-i386/nmi.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/nmi.h 2007-06-13 06:55:07.000000000 -0400 @@ -17,6 +17,7 @@ typedef int (*nmi_callback_t)(struct pt_ * set. Return 1 if the NMI was handled. */ void set_nmi_callback(nmi_callback_t callback); +void set_nmi_ipi_callback(nmi_callback_t callback); /** * unset_nmi_callback @@ -24,6 +25,7 @@ void set_nmi_callback(nmi_callback_t cal * Remove the handler previously set. 
*/ void unset_nmi_callback(void); +void unset_nmi_ipi_callback(void); extern void setup_apic_nmi_watchdog (void); extern int reserve_lapic_nmi(void); diff -uprN linux-2.6.18/include/asm-i386/processor.h linux-2.6.18.ovz/include/asm-i386/processor.h --- linux-2.6.18/include/asm-i386/processor.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/processor.h 2007-06-13 06:55:07.000000000 -0400 @@ -214,6 +214,21 @@ static inline unsigned int cpuid_edx(uns return edx; } +#ifdef CONFIG_SMP +void cpuid_on_cpu(unsigned int cpu, u32 op, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +u32 cpuid_eax_on_cpu(unsigned int cpu, u32 op); +#else +static inline void cpuid_on_cpu(unsigned int cpu, u32 op, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + cpuid(op, eax, ebx, ecx, edx); +} + +static inline u32 cpuid_eax_on_cpu(unsigned int cpu, u32 op) +{ + return cpuid_eax(op); +} +#endif + #define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* diff -uprN linux-2.6.18/include/asm-i386/thread_info.h linux-2.6.18.ovz/include/asm-i386/thread_info.h --- linux-2.6.18/include/asm-i386/thread_info.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-i386/thread_info.h 2007-06-13 06:55:07.000000000 -0400 @@ -99,13 +99,13 @@ static inline struct thread_info *curren ({ \ struct thread_info *ret; \ \ - ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \ + ret = kmalloc(THREAD_SIZE, GFP_KERNEL_UBC); \ if (ret) \ memset(ret, 0, THREAD_SIZE); \ ret; \ }) #else -#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) +#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL_UBC) #endif #define free_thread_info(info) kfree(info) @@ -142,6 +142,7 @@ static inline struct thread_info *curren #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ +#define TIF_FREEZE 19 /* Freeze request (atomic PF_FREEZE) */ #define _TIF_SYSCALL_TRACE (1< #include +#include + #include DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist); @@ -37,7 +39,7 @@ static inline long pgtable_quicklist_tot return ql_size; } -static inline void *pgtable_quicklist_alloc(void) +static inline void *pgtable_quicklist_alloc(int charge) { unsigned long *ret = NULL; @@ -45,13 +47,21 @@ static inline void *pgtable_quicklist_al ret = pgtable_quicklist; if (likely(ret != NULL)) { + if (ub_page_charge(virt_to_page(ret), 0, + charge ? __GFP_UBC|__GFP_SOFT_UBC : 0)) { + ret = NULL; + goto out; + } + pgtable_quicklist = (unsigned long *)(*ret); ret[0] = 0; --pgtable_quicklist_size; +out: preempt_enable(); } else { preempt_enable(); - ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO | + (charge ? 
__GFP_UBC | __GFP_SOFT_UBC : 0)); } return ret; @@ -69,6 +79,7 @@ static inline void pgtable_quicklist_fre #endif preempt_disable(); + ub_page_uncharge(virt_to_page(pgtable_entry), 0); *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist; pgtable_quicklist = (unsigned long *)pgtable_entry; ++pgtable_quicklist_size; @@ -77,7 +88,7 @@ static inline void pgtable_quicklist_fre static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return pgtable_quicklist_alloc(); + return pgtable_quicklist_alloc(1); } static inline void pgd_free(pgd_t * pgd) @@ -94,7 +105,7 @@ pgd_populate(struct mm_struct *mm, pgd_t static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return pgtable_quicklist_alloc(); + return pgtable_quicklist_alloc(1); } static inline void pud_free(pud_t * pud) @@ -112,7 +123,7 @@ pud_populate(struct mm_struct *mm, pud_t static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return pgtable_quicklist_alloc(); + return pgtable_quicklist_alloc(1); } static inline void pmd_free(pmd_t * pmd) @@ -137,13 +148,14 @@ pmd_populate_kernel(struct mm_struct *mm static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr) { - return virt_to_page(pgtable_quicklist_alloc()); + void * pg = pgtable_quicklist_alloc(1); + return pg ? virt_to_page(pg) : NULL; } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { - return pgtable_quicklist_alloc(); + return pgtable_quicklist_alloc(0); } static inline void pte_free(struct page *pte) diff -uprN linux-2.6.18/include/asm-ia64/processor.h linux-2.6.18.ovz/include/asm-ia64/processor.h --- linux-2.6.18/include/asm-ia64/processor.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-ia64/processor.h 2007-06-13 06:55:07.000000000 -0400 @@ -305,7 +305,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = current->mm->dumpable; /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!current->mm->dumpable)) { \ + if (unlikely(!current->mm->dumpable || !current->mm->vps_dumpable)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. 
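/*
 * The ia64 pgalloc hunk above threads a "charge" flag into
 * pgtable_quicklist_alloc(): recycled pages are charged to the
 * beancounter via ub_page_charge() before being handed out, fresh pages
 * via __GFP_UBC, and pte_alloc_one() now copes with a NULL return. Below
 * is a toy userspace model of that charge-before-alloc / uncharge-on-free
 * pairing; the counter, limit and all names are invented for
 * illustration and are not the real UBC API.
 */
#include <stdio.h>
#include <stdlib.h>

static long ub_held, ub_limit = 2;		/* toy beancounter */

static int toy_page_charge(void)
{
	if (ub_held >= ub_limit)
		return -1;			/* over limit: fail the alloc */
	ub_held++;
	return 0;
}

static void toy_page_uncharge(void)
{
	ub_held--;
}

static void *toy_pgtable_alloc(int charge)
{
	void *p;

	if (charge && toy_page_charge())
		return NULL;			/* charge first, as in the hunk */
	p = calloc(1, 4096);
	if (!p && charge)
		toy_page_uncharge();		/* never leak a charge */
	return p;
}

static void toy_pgtable_free(void *p, int charged)
{
	if (charged)
		toy_page_uncharge();		/* mirror of ub_page_uncharge() */
	free(p);
}

int main(void)
{
	void *a = toy_pgtable_alloc(1);
	void *b = toy_pgtable_alloc(1);
	void *c = toy_pgtable_alloc(1);		/* third charge exceeds the limit */

	printf("a=%p b=%p c=%p (c is NULL: over limit)\n", a, b, c);
	toy_pgtable_free(a, 1);
	toy_pgtable_free(b, 1);
	return 0;
}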
\ diff -uprN linux-2.6.18/include/asm-ia64/thread_info.h linux-2.6.18.ovz/include/asm-ia64/thread_info.h --- linux-2.6.18/include/asm-ia64/thread_info.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-ia64/thread_info.h 2007-06-13 06:55:07.000000000 -0400 @@ -84,15 +84,18 @@ struct thread_info { #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_TRACE 3 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ +#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ +#define TIF_FREEZE 20 /* Freeze request, atomic version of PF_FREEZE */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEAUDIT (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) @@ -100,7 +103,7 @@ struct thread_info { #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) /* "work to do on user-return" bits */ -#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT) +#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_RESTORE_SIGMASK) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) diff -uprN linux-2.6.18/include/asm-ia64/unistd.h linux-2.6.18.ovz/include/asm-ia64/unistd.h --- linux-2.6.18/include/asm-ia64/unistd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-ia64/unistd.h 2007-06-13 06:55:07.000000000 -0400 @@ -290,13 +290,26 @@ #define __NR_sync_file_range 1300 #define __NR_tee 1301 #define __NR_vmsplice 1302 +#define __NR_fairsched_vcpus 1499 +#define __NR_fairsched_mknod 1500 +#define __NR_fairsched_rmnod 1501 +#define __NR_fairsched_chwt 1502 +#define __NR_fairsched_mvpr 1503 +#define __NR_fairsched_rate 1504 +#define __NR_getluid 1505 +#define __NR_setluid 1506 +#define __NR_setublimit 1507 +#define __NR_ubstat 1508 +#define __NR_lchmod 1509 +#define __NR_lutime 1510 #ifdef __KERNEL__ -#define NR_syscalls 279 /* length of syscall table */ +#define NR_syscalls 487 /* length of syscall table */ #define __ARCH_WANT_SYS_RT_SIGACTION +#define __ARCH_WANT_SYS_RT_SIGSUSPEND #ifdef CONFIG_IA32_SUPPORT # define __ARCH_WANT_SYS_FADVISE64 @@ -307,6 +320,7 @@ # define __ARCH_WANT_SYS_OLDUMOUNT # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK +# define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND # define __ARCH_WANT_COMPAT_SYS_TIME #endif diff -uprN linux-2.6.18/include/asm-m32r/page.h linux-2.6.18.ovz/include/asm-m32r/page.h --- linux-2.6.18/include/asm-m32r/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -87,10 +87,9 @@ typedef struct { unsigned long pgprot; } #define devmem_is_allowed(x) 1 -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ #endif /* _ASM_M32R_PAGE_H */ diff -uprN linux-2.6.18/include/asm-m32r/ptrace.h 
linux-2.6.18.ovz/include/asm-m32r/ptrace.h --- linux-2.6.18/include/asm-m32r/ptrace.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/ptrace.h 2007-06-13 06:55:07.000000000 -0400 @@ -12,8 +12,6 @@ * Copyright (C) 2001-2002, 2004 Hirokazu Takata */ -#include /* M32R_PSW_BSM, M32R_PSW_BPM */ - /* 0 - 13 are integer registers (general purpose registers). */ #define PT_R4 0 #define PT_R5 1 @@ -35,21 +33,10 @@ #define PT_R15 PT_SP /* processor status and miscellaneous context registers. */ -#if defined(CONFIG_ISA_M32R2) && defined(CONFIG_ISA_DSP_LEVEL2) #define PT_ACC0H 15 #define PT_ACC0L 16 -#define PT_ACC1H 17 -#define PT_ACC1L 18 -#define PT_ACCH PT_ACC0H -#define PT_ACCL PT_ACC0L -#elif defined(CONFIG_ISA_M32R2) || defined(CONFIG_ISA_M32R) -#define PT_ACCH 15 -#define PT_ACCL 16 -#define PT_DUMMY_ACC1H 17 -#define PT_DUMMY_ACC1L 18 -#else -#error unknown isa conifiguration -#endif +#define PT_ACC1H 17 /* ISA_DSP_LEVEL2 only */ +#define PT_ACC1L 18 /* ISA_DSP_LEVEL2 only */ #define PT_PSW 19 #define PT_BPC 20 #define PT_BBPSW 21 @@ -105,19 +92,10 @@ struct pt_regs { long syscall_nr; /* Saved main processor status and miscellaneous context registers. */ -#if defined(CONFIG_ISA_M32R2) && defined(CONFIG_ISA_DSP_LEVEL2) unsigned long acc0h; unsigned long acc0l; - unsigned long acc1h; - unsigned long acc1l; -#elif defined(CONFIG_ISA_M32R2) || defined(CONFIG_ISA_M32R) - unsigned long acch; - unsigned long accl; - unsigned long dummy_acc1h; - unsigned long dummy_acc1l; -#else -#error unknown isa configuration -#endif + unsigned long acc1h; /* ISA_DSP_LEVEL2 only */ + unsigned long acc1l; /* ISA_DSP_LEVEL2 only */ unsigned long psw; unsigned long bpc; /* saved PC for TRAP syscalls */ unsigned long bbpsw; @@ -140,6 +118,8 @@ struct pt_regs { #ifdef __KERNEL__ +#include /* M32R_PSW_BSM, M32R_PSW_BPM */ + #define __ARCH_SYS_PTRACE 1 #if defined(CONFIG_ISA_M32R2) || defined(CONFIG_CHIP_VDEC2) diff -uprN linux-2.6.18/include/asm-m32r/sigcontext.h linux-2.6.18.ovz/include/asm-m32r/sigcontext.h --- linux-2.6.18/include/asm-m32r/sigcontext.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/sigcontext.h 2007-06-13 06:55:07.000000000 -0400 @@ -23,19 +23,10 @@ struct sigcontext { unsigned long sc_r12; /* Saved main processor status and miscellaneous context registers. 
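/*
 * Worked check for the ia64 unistd.h hunk above: ia64 syscall numbers
 * start at 1024 (the first slot of the table), so a table that must
 * reach the newly added __NR_lutime == 1510 needs 1510 - 1024 + 1 == 487
 * slots, which is exactly the new NR_syscalls value.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	const int ia64_syscall_base = 1024;	/* first entry of the table */
	const int nr_lutime = 1510;		/* last number added above */
	const int nr_syscalls = 487;		/* value set in the hunk */

	assert(nr_lutime - ia64_syscall_base + 1 == nr_syscalls);
	printf("table covers syscalls %d..%d\n", ia64_syscall_base,
	       ia64_syscall_base + nr_syscalls - 1);
	return 0;
}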
*/ -#if defined(CONFIG_ISA_M32R2) && defined(CONFIG_ISA_DSP_LEVEL2) unsigned long sc_acc0h; unsigned long sc_acc0l; - unsigned long sc_acc1h; - unsigned long sc_acc1l; -#elif defined(CONFIG_ISA_M32R2) || defined(CONFIG_ISA_M32R) - unsigned long sc_acch; - unsigned long sc_accl; - unsigned long sc_dummy_acc1h; - unsigned long sc_dummy_acc1l; -#else -#error unknown isa configuration -#endif + unsigned long sc_acc1h; /* ISA_DSP_LEVEL2 only */ + unsigned long sc_acc1l; /* ISA_DSP_LEVEL2 only */ unsigned long sc_psw; unsigned long sc_bpc; /* saved PC for TRAP syscalls */ unsigned long sc_bbpsw; diff -uprN linux-2.6.18/include/asm-m32r/signal.h linux-2.6.18.ovz/include/asm-m32r/signal.h --- linux-2.6.18/include/asm-m32r/signal.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/signal.h 2007-06-13 06:55:07.000000000 -0400 @@ -6,7 +6,6 @@ /* orig : i386 2.4.18 */ #include -#include #include #include diff -uprN linux-2.6.18/include/asm-m32r/unistd.h linux-2.6.18.ovz/include/asm-m32r/unistd.h --- linux-2.6.18/include/asm-m32r/unistd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/unistd.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,8 +3,6 @@ /* $Id: 0001-2.6.18-openvz-combined-028.035.patch,v 1.1 2007-07-23 23:01:52 niro Exp $ */ -#include /* SYSCALL_* */ - /* * This file contains the system call numbers. */ @@ -303,6 +301,8 @@ * */ +#include /* SYSCALL_* */ + #define __syscall_return(type, res) \ do { \ if ((unsigned long)(res) >= (unsigned long)(-(124 + 1))) { \ diff -uprN linux-2.6.18/include/asm-m32r/user.h linux-2.6.18.ovz/include/asm-m32r/user.h --- linux-2.6.18/include/asm-m32r/user.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m32r/user.h 2007-06-13 06:55:07.000000000 -0400 @@ -8,7 +8,6 @@ */ #include -#include #include #include diff -uprN linux-2.6.18/include/asm-m68knommu/page.h linux-2.6.18.ovz/include/asm-m68knommu/page.h --- linux-2.6.18/include/asm-m68knommu/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-m68knommu/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,7 @@ #ifndef _M68KNOMMU_PAGE_H #define _M68KNOMMU_PAGE_H +#ifdef __KERNEL__ /* PAGE_SHIFT determines the page size */ @@ -8,8 +9,6 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#ifdef __KERNEL__ - #include #ifndef __ASSEMBLY__ @@ -76,8 +75,8 @@ extern unsigned long memory_end; #endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #include +#endif /* __KERNEL__ */ + #endif /* _M68KNOMMU_PAGE_H */ diff -uprN linux-2.6.18/include/asm-powerpc/Kbuild linux-2.6.18.ovz/include/asm-powerpc/Kbuild --- linux-2.6.18/include/asm-powerpc/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,10 +1,41 @@ include include/asm-generic/Kbuild.asm -unifdef-y += a.out.h asm-compat.h bootx.h byteorder.h cputable.h elf.h \ - nvram.h param.h posix_types.h ptrace.h seccomp.h signal.h \ - termios.h types.h unistd.h +header-y += auxvec.h +header-y += ioctls.h +header-y += mman.h +header-y += sembuf.h +header-y += siginfo.h +header-y += stat.h +header-y += errno.h +header-y += ipcbuf.h +header-y += msgbuf.h +header-y += shmbuf.h +header-y += socket.h +header-y += termbits.h +header-y += fcntl.h +header-y += ipc.h +header-y += poll.h +header-y += shmparam.h +header-y += sockios.h +header-y += ucontext.h +header-y += ioctl.h +header-y += linkage.h +header-y += resource.h +header-y += sigcontext.h +header-y += statfs.h -header-y += auxvec.h 
ioctls.h mman.h sembuf.h siginfo.h stat.h errno.h \ - ipcbuf.h msgbuf.h shmbuf.h socket.h termbits.h fcntl.h ipc.h \ - poll.h shmparam.h sockios.h ucontext.h ioctl.h linkage.h \ - resource.h sigcontext.h statfs.h +unifdef-y += a.out.h +unifdef-y += asm-compat.h +unifdef-y += bootx.h +unifdef-y += byteorder.h +unifdef-y += cputable.h +unifdef-y += elf.h +unifdef-y += nvram.h +unifdef-y += param.h +unifdef-y += posix_types.h +unifdef-y += ptrace.h +unifdef-y += seccomp.h +unifdef-y += signal.h +unifdef-y += termios.h +unifdef-y += types.h +unifdef-y += unistd.h diff -uprN linux-2.6.18/include/asm-powerpc/current.h linux-2.6.18.ovz/include/asm-powerpc/current.h --- linux-2.6.18/include/asm-powerpc/current.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/current.h 2007-06-13 06:55:07.000000000 -0400 @@ -13,8 +13,19 @@ struct task_struct; #ifdef __powerpc64__ #include +#include -#define current (get_paca()->__current) +static inline struct task_struct *get_current(void) +{ + struct task_struct *task; + + __asm__ __volatile__("ld %0,%1(13)" + : "=r" (task) + : "i" (offsetof(struct paca_struct, __current))); + + return task; +} +#define current get_current() #else diff -uprN linux-2.6.18/include/asm-powerpc/mman.h linux-2.6.18.ovz/include/asm-powerpc/mman.h --- linux-2.6.18/include/asm-powerpc/mman.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/mman.h 2007-06-13 06:55:07.000000000 -0400 @@ -23,5 +23,6 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ #endif /* _ASM_POWERPC_MMAN_H */ diff -uprN linux-2.6.18/include/asm-powerpc/pgalloc.h linux-2.6.18.ovz/include/asm-powerpc/pgalloc.h --- linux-2.6.18/include/asm-powerpc/pgalloc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/pgalloc.h 2007-06-13 06:55:07.000000000 -0400 @@ -35,7 +35,8 @@ extern kmem_cache_t *pgtable_cache[]; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], + GFP_KERNEL_UBC | __GFP_SOFT_UBC); } static inline void pgd_free(pgd_t *pgd) @@ -50,7 +51,7 @@ static inline void pgd_free(pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], - GFP_KERNEL|__GFP_REPEAT); + GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); } static inline void pud_free(pud_t *pud) @@ -86,7 +87,7 @@ static inline void pmd_populate_kernel(s static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], - GFP_KERNEL|__GFP_REPEAT); + GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); } static inline void pmd_free(pmd_t *pmd) @@ -94,17 +95,21 @@ static inline void pmd_free(pmd_t *pmd) kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } +static inline pte_t *do_pte_alloc(gfp_t flags) +{ + return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], flags); +} + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], - GFP_KERNEL|__GFP_REPEAT); + return do_pte_alloc(GFP_KERNEL | __GFP_REPEAT); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return virt_to_page(pte_alloc_one_kernel(mm, address)); + return virt_to_page(do_pte_alloc(GFP_KERNEL_UBC | 
__GFP_SOFT_UBC)); } static inline void pte_free_kernel(pte_t *pte) diff -uprN linux-2.6.18/include/asm-powerpc/ptrace.h linux-2.6.18.ovz/include/asm-powerpc/ptrace.h --- linux-2.6.18/include/asm-powerpc/ptrace.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/ptrace.h 2007-06-13 06:55:07.000000000 -0400 @@ -215,12 +215,10 @@ do { \ #define PTRACE_GETVRREGS 18 #define PTRACE_SETVRREGS 19 -#ifndef __powerpc64__ /* Get/set all the upper 32-bits of the SPE registers, accumulator, and * spefscr, in one go */ #define PTRACE_GETEVRREGS 20 #define PTRACE_SETEVRREGS 21 -#endif /* __powerpc64__ */ /* * Get or set a debug register. The first 16 are DABR registers and the @@ -235,7 +233,6 @@ do { \ #define PPC_PTRACE_GETFPREGS 0x97 /* Get FPRs 0 - 31 */ #define PPC_PTRACE_SETFPREGS 0x96 /* Set FPRs 0 - 31 */ -#ifdef __powerpc64__ /* Calls to trace a 64bit program from a 32bit program */ #define PPC_PTRACE_PEEKTEXT_3264 0x95 #define PPC_PTRACE_PEEKDATA_3264 0x94 @@ -243,6 +240,5 @@ do { \ #define PPC_PTRACE_POKEDATA_3264 0x92 #define PPC_PTRACE_PEEKUSR_3264 0x91 #define PPC_PTRACE_POKEUSR_3264 0x90 -#endif /* __powerpc64__ */ #endif /* _ASM_POWERPC_PTRACE_H */ diff -uprN linux-2.6.18/include/asm-powerpc/systbl.h linux-2.6.18.ovz/include/asm-powerpc/systbl.h --- linux-2.6.18/include/asm-powerpc/systbl.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/systbl.h 2007-06-13 06:55:07.000000000 -0400 @@ -303,4 +303,20 @@ SYSCALL_SPU(readlinkat) SYSCALL_SPU(fchmodat) SYSCALL_SPU(faccessat) COMPAT_SYS_SPU(get_robust_list) -COMPAT_SYS_SPU(set_robust_list) +COMPAT_SYS_SPU(set_robust_list) /* 300 */ +SYS_SKIP(301, 400) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(fairsched_mknod) /* 400 */ +SYSCALL(fairsched_rmnod) +SYSCALL(fairsched_chwt) +SYSCALL(fairsched_mvpr) +SYSCALL(fairsched_rate) +SYSCALL(fairsched_vcpus) +SYS_SKIP(406, 410) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(getluid) /* 410 */ +SYSCALL(setluid) +SYSCALL(setublimit) +SYSCALL(ubstat) diff -uprN linux-2.6.18/include/asm-powerpc/thread_info.h linux-2.6.18.ovz/include/asm-powerpc/thread_info.h --- linux-2.6.18/include/asm-powerpc/thread_info.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-powerpc/thread_info.h 2007-06-13 06:55:07.000000000 -0400 @@ -122,6 +122,8 @@ static inline struct thread_info *curren #define TIF_RESTOREALL 12 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 14 /* Force successful syscall return */ #define TIF_RESTORE_SIGMASK 15 /* Restore signal mask in do_signal */ +#define TIF_FREEZE 16 /* Freeze request, atomic version + of PF_FREEZE */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< -#endif /* __s390x__ */ - -#endif diff -uprN linux-2.6.18/include/asm-s390/pgalloc.h linux-2.6.18.ovz/include/asm-s390/pgalloc.h --- linux-2.6.18/include/asm-s390/pgalloc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-s390/pgalloc.h 2007-06-13 06:55:07.000000000 -0400 @@ -33,12 +33,12 @@ static inline pgd_t *pgd_alloc(struct mm int i; #ifndef __s390x__ - pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,1); + pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 1); if (pgd != NULL) for (i = 0; i < USER_PTRS_PER_PGD; i++) pmd_clear(pmd_offset(pgd + i, i*PGDIR_SIZE)); #else /* __s390x__ */ - pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,2); + pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); if (pgd != NULL) for (i = 0; i < PTRS_PER_PGD; i++) pgd_clear(pgd + i); @@ -71,7 +71,7 @@ static 
inline pmd_t * pmd_alloc_one(stru pmd_t *pmd; int i; - pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); + pmd = (pmd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); if (pmd != NULL) { for (i=0; i < PTRS_PER_PMD; i++) pmd_clear(pmd+i); @@ -117,16 +117,13 @@ pmd_populate(struct mm_struct *mm, pmd_t pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT)); } -/* - * page table entry allocation/free routines. - */ -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) +static inline pte_t *pte_alloc(struct mm_struct *mm, unsigned long vmaddr, + gfp_t mask) { pte_t *pte; int i; - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + pte = (pte_t *)__get_free_page(mask); if (pte != NULL) { for (i=0; i < PTRS_PER_PTE; i++) { pte_clear(mm, vmaddr, pte+i); @@ -136,10 +133,20 @@ pte_alloc_one_kernel(struct mm_struct *m return pte; } +/* + * page table entry allocation/free routines. + */ +static inline pte_t * +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) +{ + return pte_alloc(mm, vmaddr, GFP_KERNEL | __GFP_REPEAT); +} + static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) { - pte_t *pte = pte_alloc_one_kernel(mm, vmaddr); + pte_t *pte = pte_alloc(mm, vmaddr, GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_REPEAT); if (pte) return virt_to_page(pte); return NULL; diff -uprN linux-2.6.18/include/asm-sh/bugs.h linux-2.6.18.ovz/include/asm-sh/bugs.h --- linux-2.6.18/include/asm-sh/bugs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh/bugs.h 2007-06-13 06:55:07.000000000 -0400 @@ -18,7 +18,7 @@ static void __init check_bugs(void) { extern char *get_cpu_subtype(void); extern unsigned long loops_per_jiffy; - char *p= &system_utsname.machine[2]; /* "sh" */ + char *p= &init_utsname()->machine[2]; /* "sh" */ cpu_data->loops_per_jiffy = loops_per_jiffy; diff -uprN linux-2.6.18/include/asm-sh/page.h linux-2.6.18.ovz/include/asm-sh/page.h --- linux-2.6.18/include/asm-sh/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -112,9 +112,8 @@ typedef struct { unsigned long pgprot; } #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ #endif /* __ASM_SH_PAGE_H */ diff -uprN linux-2.6.18/include/asm-sh/ptrace.h linux-2.6.18.ovz/include/asm-sh/ptrace.h --- linux-2.6.18/include/asm-sh/ptrace.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh/ptrace.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,8 +1,6 @@ #ifndef __ASM_SH_PTRACE_H #define __ASM_SH_PTRACE_H -#include - /* * Copyright (C) 1999, 2000 Niibe Yutaka * diff -uprN linux-2.6.18/include/asm-sh64/page.h linux-2.6.18.ovz/include/asm-sh64/page.h --- linux-2.6.18/include/asm-sh64/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh64/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -112,9 +112,8 @@ typedef struct { unsigned long pgprot; } #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ #endif /* __ASM_SH64_PAGE_H */ diff -uprN linux-2.6.18/include/asm-sh64/shmparam.h linux-2.6.18.ovz/include/asm-sh64/shmparam.h --- linux-2.6.18/include/asm-sh64/shmparam.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh64/shmparam.h 2007-06-13 06:55:07.000000000 -0400 @@ 
-2,19 +2,11 @@ #define __ASM_SH64_SHMPARAM_H /* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * include/asm-sh64/shmparam.h - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * + * Set this to a sensible safe default, we'll work out the specifics for the + * align mask from the cache descriptor at run-time. */ +#define SHMLBA 0x4000 -#include - -/* attach addr a multiple of this */ -#define SHMLBA (cpu_data->dcache.sets * L1_CACHE_BYTES) +#define __ARCH_FORCE_SHMLBA #endif /* __ASM_SH64_SHMPARAM_H */ diff -uprN linux-2.6.18/include/asm-sh64/signal.h linux-2.6.18.ovz/include/asm-sh64/signal.h --- linux-2.6.18/include/asm-sh64/signal.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh64/signal.h 2007-06-13 06:55:07.000000000 -0400 @@ -13,7 +13,6 @@ */ #include -#include /* Avoid too many header ordering problems. */ struct siginfo; diff -uprN linux-2.6.18/include/asm-sh64/user.h linux-2.6.18.ovz/include/asm-sh64/user.h --- linux-2.6.18/include/asm-sh64/user.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sh64/user.h 2007-06-13 06:55:07.000000000 -0400 @@ -13,7 +13,6 @@ */ #include -#include #include #include diff -uprN linux-2.6.18/include/asm-sparc/Kbuild linux-2.6.18.ovz/include/asm-sparc/Kbuild --- linux-2.6.18/include/asm-sparc/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,15 @@ include include/asm-generic/Kbuild.asm -unifdef-y += fbio.h perfctr.h psr.h -header-y += apc.h asi.h auxio.h bpp.h head.h ipc.h jsflash.h \ - openpromio.h pbm.h pconf.h pgtsun4.h reg.h traps.h \ - turbosparc.h vfc_ioctls.h winmacro.h +header-y += apc.h +header-y += asi.h +header-y += bpp.h +header-y += jsflash.h +header-y += openpromio.h +header-y += pconf.h +header-y += reg.h +header-y += traps.h +header-y += vfc_ioctls.h + +unifdef-y += fbio.h +unifdef-y += perfctr.h +unifdef-y += psr.h diff -uprN linux-2.6.18/include/asm-sparc/page.h linux-2.6.18.ovz/include/asm-sparc/page.h --- linux-2.6.18/include/asm-sparc/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -8,6 +8,8 @@ #ifndef _SPARC_PAGE_H #define _SPARC_PAGE_H +#ifdef __KERNEL__ + #ifdef CONFIG_SUN4 #define PAGE_SHIFT 13 #else @@ -21,8 +23,6 @@ #endif #define PAGE_MASK (~(PAGE_SIZE-1)) -#ifdef __KERNEL__ - #include #ifndef __ASSEMBLY__ @@ -160,9 +160,9 @@ extern unsigned long pfn_base; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif /* __KERNEL__ */ - #include #include +#endif /* __KERNEL__ */ + #endif /* _SPARC_PAGE_H */ diff -uprN linux-2.6.18/include/asm-sparc/unistd.h linux-2.6.18.ovz/include/asm-sparc/unistd.h --- linux-2.6.18/include/asm-sparc/unistd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc/unistd.h 2007-06-13 06:55:07.000000000 -0400 @@ -319,6 +319,8 @@ #define __NR_set_robust_list 300 #define __NR_get_robust_list 301 +#define NR_SYSCALLS 302 + #ifdef __KERNEL__ /* WARNING: You MAY NOT add syscall numbers larger than 301, since * all of the syscall tables in the Sparc kernel are diff -uprN linux-2.6.18/include/asm-sparc64/Kbuild linux-2.6.18.ovz/include/asm-sparc64/Kbuild --- linux-2.6.18/include/asm-sparc64/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/Kbuild 2007-06-13 
06:55:07.000000000 -0400 @@ -4,7 +4,23 @@ ALTARCH := sparc ARCHDEF := defined __sparc__ && defined __arch64__ ALTARCHDEF := defined __sparc__ && !defined __arch64__ -unifdef-y += fbio.h perfctr.h -header-y += apb.h asi.h bbc.h bpp.h display7seg.h envctrl.h floppy.h \ - ipc.h kdebug.h mostek.h openprom.h openpromio.h parport.h \ - pconf.h psrcompat.h pstate.h reg.h uctx.h utrap.h watchdog.h +header-y += apb.h +header-y += asi.h +header-y += bbc.h +header-y += bpp.h +header-y += const.h +header-y += display7seg.h +header-y += envctrl.h +header-y += ipc.h +header-y += openprom.h +header-y += openpromio.h +header-y += pconf.h +header-y += psrcompat.h +header-y += pstate.h +header-y += reg.h +header-y += uctx.h +header-y += utrap.h +header-y += watchdog.h + +unifdef-y += fbio.h +unifdef-y += perfctr.h diff -uprN linux-2.6.18/include/asm-sparc64/futex.h linux-2.6.18.ovz/include/asm-sparc64/futex.h --- linux-2.6.18/include/asm-sparc64/futex.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/futex.h 2007-06-13 06:55:07.000000000 -0400 @@ -87,24 +87,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) { __asm__ __volatile__( - "\n1: lduwa [%2] %%asi, %0\n" - "2: casa [%2] %%asi, %0, %1\n" - "3:\n" + "\n1: casa [%3] %%asi, %2, %0\n" + "2:\n" " .section .fixup,#alloc,#execinstr\n" " .align 4\n" - "4: ba 3b\n" - " mov %3, %0\n" + "3: ba 2b\n" + " mov %4, %0\n" " .previous\n" " .section __ex_table,\"a\"\n" " .align 4\n" - " .word 1b, 4b\n" - " .word 2b, 4b\n" + " .word 1b, 3b\n" " .previous\n" - : "=&r" (oldval) - : "r" (newval), "r" (uaddr), "i" (-EFAULT) + : "=r" (newval) + : "0" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) : "memory"); - return oldval; + return newval; } #endif /* !(_SPARC64_FUTEX_H) */ diff -uprN linux-2.6.18/include/asm-sparc64/mman.h linux-2.6.18.ovz/include/asm-sparc64/mman.h --- linux-2.6.18/include/asm-sparc64/mman.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/mman.h 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ /* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system * XXX calls. 
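/*
 * The sparc64 futex hunk above collapses the old lduwa+casa pair into a
 * single casa: the instruction compares the word at uaddr with oldval,
 * stores newval on a match, and in either case leaves the value it found
 * in memory in the destination register, which the function now returns
 * directly. A portable model of those return semantics using GCC's
 * __atomic builtins (the real code additionally needs the __ex_table
 * fixup to return -EFAULT on a faulting user address, which plain C
 * cannot express):
 */
#include <stdio.h>

static int cmpxchg_model(int *uaddr, int oldval, int newval)
{
	/* On failure the builtin writes the observed value into oldval. */
	__atomic_compare_exchange_n(uaddr, &oldval, newval, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return oldval;		/* always the prior memory value */
}

int main(void)
{
	int word = 5;

	printf("%d\n", cmpxchg_model(&word, 5, 7));	/* 5: swap happened */
	printf("%d\n", cmpxchg_model(&word, 5, 9));	/* 7: no swap */
	printf("word = %d\n", word);			/* still 7 */
	return 0;
}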
diff -uprN linux-2.6.18/include/asm-sparc64/page.h linux-2.6.18.ovz/include/asm-sparc64/page.h --- linux-2.6.18/include/asm-sparc64/page.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/page.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,6 +3,8 @@ #ifndef _SPARC64_PAGE_H #define _SPARC64_PAGE_H +#ifdef __KERNEL__ + #include #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) @@ -27,8 +29,6 @@ #define DCACHE_ALIASING_POSSIBLE #endif -#ifdef __KERNEL__ - #if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) #define HPAGE_SHIFT 22 #elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) @@ -141,8 +141,7 @@ typedef unsigned long pgprot_t; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif /* !(__KERNEL__) */ - #include -#endif /* !(_SPARC64_PAGE_H) */ +#endif /* __KERNEL__ */ +#endif /* _SPARC64_PAGE_H */ diff -uprN linux-2.6.18/include/asm-sparc64/pgalloc.h linux-2.6.18.ovz/include/asm-sparc64/pgalloc.h --- linux-2.6.18/include/asm-sparc64/pgalloc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/pgalloc.h 2007-06-13 06:55:07.000000000 -0400 @@ -17,7 +17,7 @@ extern kmem_cache_t *pgtable_cache; static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(pgtable_cache, GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache, GFP_KERNEL_UBC); } static inline void pgd_free(pgd_t *pgd) @@ -30,7 +30,7 @@ static inline void pgd_free(pgd_t *pgd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(pgtable_cache, - GFP_KERNEL|__GFP_REPEAT); + GFP_KERNEL_UBC|__GFP_REPEAT); } static inline void pmd_free(pmd_t *pmd) @@ -48,7 +48,8 @@ static inline pte_t *pte_alloc_one_kerne static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return virt_to_page(pte_alloc_one_kernel(mm, address)); + return virt_to_page(kmem_cache_alloc(pgtable_cache, + GFP_KERNEL_UBC|__GFP_REPEAT)); } static inline void pte_free_kernel(pte_t *pte) diff -uprN linux-2.6.18/include/asm-sparc64/shmparam.h linux-2.6.18.ovz/include/asm-sparc64/shmparam.h --- linux-2.6.18/include/asm-sparc64/shmparam.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/shmparam.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,7 @@ /* $Id: 0001-2.6.18-openvz-combined-028.035.patch,v 1.1 2007-07-23 23:01:52 niro Exp $ */ #ifndef _ASMSPARC64_SHMPARAM_H #define _ASMSPARC64_SHMPARAM_H +#ifdef __KERNEL__ #include @@ -8,4 +9,5 @@ /* attach addr a multiple of this */ #define SHMLBA ((PAGE_SIZE > L1DCACHE_SIZE) ? 
PAGE_SIZE : L1DCACHE_SIZE) +#endif /* __KERNEL__ */ #endif /* _ASMSPARC64_SHMPARAM_H */ diff -uprN linux-2.6.18/include/asm-sparc64/thread_info.h linux-2.6.18.ovz/include/asm-sparc64/thread_info.h --- linux-2.6.18/include/asm-sparc64/thread_info.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-sparc64/thread_info.h 2007-06-13 06:55:07.000000000 -0400 @@ -162,14 +162,14 @@ register struct thread_info *current_thr struct thread_info *ret; \ \ ret = (struct thread_info *) \ - __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER); \ + __get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER);\ if (ret) \ memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER); \ ret; \ }) #else #define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER)) + ((struct thread_info *)__get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER)) #endif #define free_thread_info(ti) \ @@ -236,6 +236,7 @@ register struct thread_info *current_thr #define TIF_ABI_PENDING 12 #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 +#define TIF_FREEZE 15 /* Freeze request (atomic PF_FREEZE) */ #define _TIF_SYSCALL_TRACE (1< @@ -32,7 +34,6 @@ #endif -#ifdef __KERNEL__ #ifndef __ASSEMBLY__ #define STRICT_MM_TYPECHECKS @@ -122,9 +123,9 @@ typedef unsigned long pgprot_t; #define __va(x) ((void *)__phys_to_virt ((unsigned long)(x))) -#endif /* KERNEL */ - #include #include +#endif /* KERNEL */ + #endif /* __V850_PAGE_H__ */ diff -uprN linux-2.6.18/include/asm-v850/param.h linux-2.6.18.ovz/include/asm-v850/param.h --- linux-2.6.18/include/asm-v850/param.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-v850/param.h 2007-06-13 06:55:07.000000000 -0400 @@ -14,8 +14,6 @@ #ifndef __V850_PARAM_H__ #define __V850_PARAM_H__ -#include /* For HZ */ - #define EXEC_PAGESIZE 4096 #ifndef NOGROUP @@ -25,6 +23,8 @@ #define MAXHOSTNAMELEN 64 /* max length of hostname */ #ifdef __KERNEL__ +#include /* For HZ */ + # define USER_HZ 100 # define CLOCKS_PER_SEC USER_HZ #endif diff -uprN linux-2.6.18/include/asm-x86_64/Kbuild linux-2.6.18.ovz/include/asm-x86_64/Kbuild --- linux-2.6.18/include/asm-x86_64/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -4,8 +4,18 @@ ALTARCH := i386 ARCHDEF := defined __x86_64__ ALTARCHDEF := defined __i386__ -header-y += boot.h bootsetup.h cpufeature.h debugreg.h ldt.h \ - msr.h prctl.h setup.h sigcontext32.h ucontext.h \ - vsyscall32.h +header-y += boot.h +header-y += bootsetup.h +header-y += cpufeature.h +header-y += debugreg.h +header-y += ldt.h +header-y += msr.h +header-y += prctl.h +header-y += setup.h +header-y += sigcontext32.h +header-y += ucontext.h +header-y += vsyscall32.h -unifdef-y += mce.h mtrr.h vsyscall.h +unifdef-y += mce.h +unifdef-y += mtrr.h +unifdef-y += vsyscall.h diff -uprN linux-2.6.18/include/asm-x86_64/mman.h linux-2.6.18.ovz/include/asm-x86_64/mman.h --- linux-2.6.18/include/asm-x86_64/mman.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/mman.h 2007-06-13 06:55:07.000000000 -0400 @@ -12,6 +12,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff -uprN linux-2.6.18/include/asm-x86_64/msr.h linux-2.6.18.ovz/include/asm-x86_64/msr.h 
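/*
 * The page.h and param.h hunks scattered through this section (h8300,
 * m32r, m68knommu, sh, sh64, sparc, sparc64, v850) all make the same
 * move: the trailing asm-generic includes and other kernel-only material
 * migrate inside #ifdef __KERNEL__, so headers exported to userspace
 * stop dragging in kernel internals. Schematic shape of the result
 * (a hypothetical header, not any one arch verbatim):
 */
#ifndef _EXAMPLE_PAGE_H
#define _EXAMPLE_PAGE_H
#ifdef __KERNEL__

/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* ... kernel-only types and helpers ... */

/*
 * Before the patch the asm-generic includes sat below the
 * #endif (__KERNEL__) and thus leaked into exported headers:
 *
 *	#include <asm-generic/memory_model.h>
 *	#include <asm-generic/page.h>
 */

#endif /* __KERNEL__ */
#endif /* _EXAMPLE_PAGE_H */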
--- linux-2.6.18/include/asm-x86_64/msr.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/msr.h 2007-06-13 06:55:07.000000000 -0400 @@ -149,6 +149,33 @@ static inline unsigned int cpuid_edx(uns #define MSR_IA32_UCODE_WRITE 0x79 #define MSR_IA32_UCODE_REV 0x8b +#ifdef CONFIG_SMP +void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); +void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); + +void cpuid_on_cpu(unsigned int cpu, u32 op, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +u32 cpuid_eax_on_cpu(unsigned int cpu, u32 op); +#else +static inline void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + rdmsr(msr_no, *l, *h); +} + +static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + wrmsr(msr_no, l, h); +} + +static inline void cpuid_on_cpu(unsigned int cpu, u32 op, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + cpuid(op, eax, ebx, ecx, edx); +} + +static inline u32 cpuid_eax_on_cpu(unsigned int cpu, u32 op) +{ + return cpuid_eax(op); +} +#endif #endif diff -uprN linux-2.6.18/include/asm-x86_64/nmi.h linux-2.6.18.ovz/include/asm-x86_64/nmi.h --- linux-2.6.18/include/asm-x86_64/nmi.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/nmi.h 2007-06-13 06:55:07.000000000 -0400 @@ -26,6 +26,9 @@ void set_nmi_callback(nmi_callback_t cal */ void unset_nmi_callback(void); +void set_nmi_ipi_callback(nmi_callback_t callback); +void unset_nmi_ipi_callback(void); + #ifdef CONFIG_PM /** Replace the PM callback routine for NMI. */ diff -uprN linux-2.6.18/include/asm-x86_64/pgalloc.h linux-2.6.18.ovz/include/asm-x86_64/pgalloc.h --- linux-2.6.18/include/asm-x86_64/pgalloc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/pgalloc.h 2007-06-13 06:55:07.000000000 -0400 @@ -31,12 +31,14 @@ static inline void pmd_free(pmd_t *pmd) static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); } static inline void pud_free (pud_t *pud) @@ -74,7 +76,8 @@ static inline void pgd_list_del(pgd_t *p static inline pgd_t *pgd_alloc(struct mm_struct *mm) { unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); if (!pgd) return NULL; pgd_list_add(pgd); @@ -105,7 +108,8 @@ static inline pte_t *pte_alloc_one_kerne static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); if (!p) return NULL; return virt_to_page(p); diff -uprN linux-2.6.18/include/asm-x86_64/processor.h linux-2.6.18.ovz/include/asm-x86_64/processor.h --- linux-2.6.18/include/asm-x86_64/processor.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/processor.h 2007-06-13 06:55:07.000000000 -0400 @@ -175,7 +175,7 @@ static inline void clear_in_cr4 (unsigne /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 
0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET 0xc0000000 #define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) diff -uprN linux-2.6.18/include/asm-x86_64/segment.h linux-2.6.18.ovz/include/asm-x86_64/segment.h --- linux-2.6.18/include/asm-x86_64/segment.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/segment.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,29 +3,28 @@ #include -#define __KERNEL_CS 0x10 -#define __KERNEL_DS 0x18 - -#define __KERNEL32_CS 0x38 - +#define GDT_ENTRY_BOOT_CS 2 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) +#define GDT_ENTRY_BOOT_DS 3 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) +#define GDT_ENTRY_TSS 4 /* needs two entries */ /* * we cannot use the same code segment descriptor for user and kernel * -- not even in the long flat mode, because of different DPL /kkeil * The segment offset needs to contain a RPL. Grr. -AK * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) */ +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX 8 -#define __USER32_CS 0x23 /* 4*8+3 */ -#define __USER_DS 0x2b /* 5*8+3 */ -#define __USER_CS 0x33 /* 6*8+3 */ +#define GDT_ENTRY_LDT 9 /* needs two entries */ +#define __KERNEL32_CS 0x58 /* 11*8 */ +#define __KERNEL_CS 0x60 /* 12*8 */ +#define __KERNEL_DS 0x68 /* 13*8 */ +#define __USER32_CS 0x73 /* 14*8+3 */ +#define __USER_DS 0x7b /* 15*8+3 */ #define __USER32_DS __USER_DS - -#define GDT_ENTRY_TLS 1 -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 -/* 15 free */ +#define __USER_CS 0x83 /* 16*8+3 */ #define GDT_ENTRY_TLS_ENTRIES 3 @@ -37,7 +36,7 @@ #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #define IDT_ENTRIES 256 -#define GDT_ENTRIES 16 +#define GDT_ENTRIES 32 #define GDT_SIZE (GDT_ENTRIES * 8) #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) diff -uprN linux-2.6.18/include/asm-x86_64/signal.h linux-2.6.18.ovz/include/asm-x86_64/signal.h --- linux-2.6.18/include/asm-x86_64/signal.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/signal.h 2007-06-13 06:55:07.000000000 -0400 @@ -23,11 +23,6 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; - -struct pt_regs; -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); - - #else /* Here we must cater to libcs that poke about in kernel headers. 
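/*
 * Selector arithmetic behind the x86_64 segment.h relayout above: a
 * selector is (GDT index * 8) | RPL, and syscall/sysret constrain the
 * relative placement of the kernel and user entries ("sysret hardcodes
 * gdt offsets", as the retained comment says). A small check that the
 * values written in the hunk's comments are consistent:
 */
#include <assert.h>
#include <stdio.h>

static unsigned int sel(unsigned int idx, unsigned int rpl)
{
	return idx * 8 + rpl;	/* table-indicator bit (GDT) is 0 */
}

int main(void)
{
	assert(sel(11, 0) == 0x58);	/* __KERNEL32_CS */
	assert(sel(12, 0) == 0x60);	/* __KERNEL_CS   */
	assert(sel(13, 0) == 0x68);	/* __KERNEL_DS   */
	assert(sel(14, 3) == 0x73);	/* __USER32_CS   */
	assert(sel(15, 3) == 0x7b);	/* __USER_DS     */
	assert(sel(16, 3) == 0x83);	/* __USER_CS     */
	printf("GDT_SIZE grows to %d bytes (32 entries * 8)\n", 32 * 8);
	return 0;
}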
*/ diff -uprN linux-2.6.18/include/asm-x86_64/system.h linux-2.6.18.ovz/include/asm-x86_64/system.h --- linux-2.6.18/include/asm-x86_64/system.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/system.h 2007-06-13 06:55:07.000000000 -0400 @@ -14,12 +14,13 @@ #define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t" +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" #define __EXTRA_CLOBBER \ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" +/* Save restore flags to clear handle leaking NT */ #define switch_to(prev,next,last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff -uprN linux-2.6.18/include/asm-x86_64/thread_info.h linux-2.6.18.ovz/include/asm-x86_64/thread_info.h --- linux-2.6.18/include/asm-x86_64/thread_info.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/asm-x86_64/thread_info.h 2007-06-13 06:55:07.000000000 -0400 @@ -78,14 +78,15 @@ static inline struct thread_info *stack_ ({ \ struct thread_info *ret; \ \ - ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)); \ + ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,\ + THREAD_ORDER)); \ if (ret) \ memset(ret, 0, THREAD_SIZE); \ ret; \ }) #else #define alloc_thread_info(tsk) \ - ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) + ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,THREAD_ORDER)) #endif #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) @@ -114,11 +115,14 @@ static inline struct thread_info *stack_ #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ /* 16 free */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ #define TIF_ABI_PENDING 19 -#define TIF_MEMDIE 20 +#define TIF_FREEZE 20 +#define TIF_MEMDIE 21 +#define TIF_RESUME 22 #define _TIF_SYSCALL_TRACE (1< + +#ifdef CONFIG_VE + +/* Replacement for CAP_NET_ADMIN: + delegated rights to the Virtual environment of its network administration. + For now the following rights have been delegated: + + Allow setting arbitrary process / process group ownership on sockets + Allow interface configuration + */ +#define CAP_VE_NET_ADMIN CAP_VE_ADMIN + +/* Replacement for CAP_SYS_ADMIN: + delegated rights to the Virtual environment of its administration. 
+ For now the following rights have been delegated: + */ +/* Allow mount/umount/remount */ +/* Allow examination and configuration of disk quotas */ +/* Allow removing semaphores */ +/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ +/* Allow locking/unlocking of shared memory segments */ +/* Allow forged pids on socket credentials passing */ + +#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN +#else +#define CAP_VE_NET_ADMIN CAP_NET_ADMIN +#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN +#endif + /* * Bounding set */ +#ifndef CONFIG_VE extern kernel_cap_t cap_bset; +#else +#define cap_bset get_exec_env()->ve_cap_bset +#endif /* * Internal kernel functions only @@ -352,13 +393,19 @@ static inline kernel_cap_t cap_invert(ke #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) #define cap_clear(c) do { cap_t(c) = 0; } while(0) +#ifndef CONFIG_VE #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) +#else +#define cap_set_full(c) \ + do {cap_t(c) = ve_is_super(get_exec_env()) ? ~0 : \ + cap_bset; } while(0) +#endif #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) - #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) int capable(int cap); int __capable(struct task_struct *t, int cap); +extern spinlock_t task_capability_lock; #endif /* __KERNEL__ */ diff -uprN linux-2.6.18/include/linux/cfq-iosched.h linux-2.6.18.ovz/include/linux/cfq-iosched.h --- linux-2.6.18/include/linux/cfq-iosched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/cfq-iosched.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,200 @@ +#ifndef _LINUX_CFQ_IOSCHED_H +#define _LINUX_CFQ_IOSCHED_H + +#include +#include + +/* + * Each block device managed by the CFQ I/O scheduler is represented + * by a cfq_data structure. Certain members of this structure are + * broken out into cfq_bc_data on a per-UBC basis. Thus a cfq_bc_data + * structure exists per (Device, UBC) pair. + * + * The BC holds a list head of all cfq_bc_data that belong to the UBC, + * and cfq_data holds a list head of all active cfq_bc_data + * for the device (active means that there are requests in-flight). + * cfq_bc_data has pointers to the owning UBC and cfq_data. + * + * For example, if there are two devices and three beancounters: + * + * cfq_data 1 cfq_data 2 + * | | + * | | + * UB1 --- cfq_bc_data ------- cfq_bc_data + * | | + * | | + * UB2 --- cfq_bc_data ------- cfq_bc_data + * | | + * | | + * UB3 --- cfq_bc_data ------- cfq_bc_data + * + * One more basic structure in the CFQ scheduler is cfq_queue, + * which is a queue of requests. For sync queues it's a per-process + * structure. While creating a new cfq_queue we store the cfq_bc_data + * it belongs to, and later use this information in order to add + * the queue to the proper lists.
+ * + */ + +extern kmem_cache_t *cfq_pool; + +#define CFQ_PRIO_LISTS IOPRIO_BE_NR + +/* + * Per (Device, UBC) queue data + */ +struct cfq_bc_data { + /* for ub.iopriv->cfq_bc_head */ + struct list_head cfq_bc_list; + /* for cfqd->act_cfq_bc_head */ + struct list_head act_cfq_bc_list; + + struct cfq_data *cfqd; + struct ub_iopriv *ub_iopriv; + + /* + * rr list of queues with requests + */ + struct list_head rr_list[CFQ_PRIO_LISTS]; + struct list_head cur_rr; + struct list_head idle_rr; + struct list_head busy_rr; + + int cur_prio; + int cur_end_prio; + + unsigned long rqnum; + unsigned long on_dispatch; + struct cfq_queue *async_cfqq[CFQ_PRIO_LISTS]; +}; + +/* + * Per block device queue structure + */ +struct cfq_data { + struct request_queue *queue; + +#ifndef CONFIG_UBC_IO_PRIO + struct cfq_bc_data cfq_bc; +#endif + unsigned int busy_queues; + + /* + * non-ordered list of empty cfqq's + */ + struct list_head empty_list; + + /* + * cfqq lookup hash + */ + struct hlist_head *cfq_hash; + + /* + * global crq hash for all queues + */ + struct hlist_head *crq_hash; + + mempool_t *crq_pool; + + int rq_in_driver; + int hw_tag; + + /* + * schedule slice state info + */ + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + unsigned int dispatch_slice; + + struct timer_list idle_class_timer; + + sector_t last_sector; + unsigned long last_end_request; + + unsigned int rq_starved; + + /* + * tunables, see top of file + */ + unsigned int cfq_quantum; + unsigned int cfq_queued; + unsigned int cfq_fifo_expire[2]; + unsigned int cfq_back_penalty; + unsigned int cfq_back_max; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + + struct list_head cic_list; + + /* list of ub that have requests */ + struct list_head act_cfq_bc_head; + /* ub that owns a timeslice at the moment */ + struct cfq_bc_data *active_cfq_bc; + unsigned int cfq_ub_slice; + unsigned long slice_end; + int virt_mode; + int write_virt_mode; +}; + +/* + * Per process-grouping structure + */ +struct cfq_queue { + /* reference count */ + atomic_t ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* cfqq lookup hash */ + struct hlist_node cfq_hash; + /* hash key */ + unsigned int key; + /* on either rr or empty list of cfqd */ + struct list_head cfq_list; + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct cfq_rq *next_crq; + /* requests queued in sort_list */ + int queued[2]; + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + unsigned long slice_start; + unsigned long slice_end; + unsigned long slice_left; + unsigned long service_last; + + /* number of requests that are on the dispatch list */ + int on_dispatch[2]; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + /* various state flags, see below */ + unsigned int flags; + + struct cfq_bc_data *cfq_bc; +}; + +static void inline cfq_init_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + int i; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + INIT_LIST_HEAD(&cfq_bc->rr_list[i]); + + INIT_LIST_HEAD(&cfq_bc->cur_rr); + INIT_LIST_HEAD(&cfq_bc->idle_rr); + INIT_LIST_HEAD(&cfq_bc->busy_rr); +} +#endif /* _LINUX_CFQ_IOSCHED_H */ diff -uprN linux-2.6.18/include/linux/compat.h linux-2.6.18.ovz/include/linux/compat.h 
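/*
 * A toy model of the linkage the cfq-iosched.h comment and structures
 * above describe: one cfq_bc_data-like node per (device, beancounter)
 * pair, reachable both from its beancounter and from its device's
 * active list. Plain next-pointers stand in for the kernel's struct
 * list_head, and all names here are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_bc_data {
	struct toy_bc_data *next_in_ub;		/* ub->cfq_bc_head analogue */
	struct toy_bc_data *next_in_dev;	/* cfqd->act_cfq_bc_head analogue */
	int ub_id, dev_id;
	unsigned long rqnum;			/* in-flight requests */
};

int main(void)
{
	struct toy_bc_data *ub_head[3] = { NULL }, *dev_head[2] = { NULL };
	struct toy_bc_data *p;
	int ub, dev;

	for (ub = 0; ub < 3; ub++)
		for (dev = 0; dev < 2; dev++) {
			p = calloc(1, sizeof(*p));
			p->ub_id = ub;
			p->dev_id = dev;
			p->next_in_ub = ub_head[ub];	/* link per-UB */
			ub_head[ub] = p;
			p->next_in_dev = dev_head[dev];	/* link per-device */
			dev_head[dev] = p;
		}

	for (dev = 0; dev < 2; dev++) {
		printf("cfq_data %d serves beancounters:", dev);
		for (p = dev_head[dev]; p; p = p->next_in_dev)
			printf(" UB%d", p->ub_id + 1);
		printf("\n");
	}
	return 0;
}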
--- linux-2.6.18/include/linux/compat.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/compat.h 2007-06-13 06:55:07.000000000 -0400 @@ -227,6 +227,9 @@ static inline int compat_timespec_compar asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); +extern int ve_compat_printk(int dst, const char *fmt, ...); + +extern long compat_nanosleep_restart(struct restart_block *restart); #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff -uprN linux-2.6.18/include/linux/console.h linux-2.6.18.ovz/include/linux/console.h --- linux-2.6.18/include/linux/console.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/console.h 2007-06-13 06:55:07.000000000 -0400 @@ -137,4 +137,22 @@ extern void resume_console(void); #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 + +#include +#include +#include + +struct printk_aligned { + int v; +} ____cacheline_aligned; +extern struct printk_aligned printk_no_wake_var[NR_CPUS]; +#define __printk_no_wake (printk_no_wake_var[smp_processor_id()].v) +#define printk_no_wake ({ \ + int v; \ + preempt_disable(); \ + v = __printk_no_wake; \ + preempt_enable_no_resched(); \ + v; \ + }) + #endif /* _LINUX_CONSOLE_H */ diff -uprN linux-2.6.18/include/linux/cpt_image.h linux-2.6.18.ovz/include/linux/cpt_image.h --- linux-2.6.18/include/linux/cpt_image.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/cpt_image.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1586 @@ +/* + * + * include/linux/cpt_image.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __CPT_IMAGE_H_ +#define __CPT_IMAGE_H_ 1 + +#define CPT_NULL (~0ULL) +#define CPT_NOINDEX (~0U) + +/* + * Image file layout. + * + * - major header + * - sections[] + * + * Each section is: + * - section header + * - array of objects + * + * All data records are arch independent, 64 bit aligned. 
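/*
 * The console.h hunk above gives every CPU its own printk_no_wake word,
 * padded to a cache line with ____cacheline_aligned so that one CPU
 * toggling its flag never bounces a line shared with another CPU's
 * flag; the preempt_disable/enable pair in the macro keeps
 * smp_processor_id() stable across the read. A C11 sketch of the same
 * layout, assuming 64-byte cache lines:
 */
#include <stdalign.h>
#include <stdio.h>

#define NR_CPUS 4

struct printk_aligned {
	alignas(64) int v;	/* pads the struct to one cache line */
};

static struct printk_aligned printk_no_wake_var[NR_CPUS];

int main(void)
{
	printf("sizeof(struct printk_aligned) = %zu\n",
	       sizeof(struct printk_aligned));
	printf("cpu0 flag at %p, cpu1 flag at %p (one line apart)\n",
	       (void *)&printk_no_wake_var[0].v,
	       (void *)&printk_no_wake_var[1].v);
	return 0;
}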
+ */ + +enum _cpt_object_type +{ + CPT_OBJ_TASK = 0, + CPT_OBJ_MM, + CPT_OBJ_FS, + CPT_OBJ_FILES, + CPT_OBJ_FILE, + CPT_OBJ_SIGHAND_STRUCT, + CPT_OBJ_SIGNAL_STRUCT, + CPT_OBJ_TTY, + CPT_OBJ_SOCKET, + CPT_OBJ_SYSVSEM_UNDO, + CPT_OBJ_NAMESPACE, + CPT_OBJ_SYSV_SHM, + CPT_OBJ_INODE, + CPT_OBJ_UBC, + CPT_OBJ_SLM_SGREG, + CPT_OBJ_SLM_REGOBJ, + CPT_OBJ_SLM_MM, + CPT_OBJ_MAX, + /* The objects above are stored in memory while checkpointing */ + + CPT_OBJ_VMA = 1024, + CPT_OBJ_FILEDESC, + CPT_OBJ_SIGHANDLER, + CPT_OBJ_SIGINFO, + CPT_OBJ_LASTSIGINFO, + CPT_OBJ_SYSV_SEM, + CPT_OBJ_SKB, + CPT_OBJ_FLOCK, + CPT_OBJ_OPENREQ, + CPT_OBJ_VFSMOUNT, + CPT_OBJ_TRAILER, + CPT_OBJ_SYSVSEM_UNDO_REC, + CPT_OBJ_NET_DEVICE, + CPT_OBJ_NET_IFADDR, + CPT_OBJ_NET_ROUTE, + CPT_OBJ_NET_CONNTRACK, + CPT_OBJ_NET_CONNTRACK_EXPECT, + CPT_OBJ_AIO_CONTEXT, + CPT_OBJ_VEINFO, + CPT_OBJ_EPOLL, + CPT_OBJ_EPOLL_FILE, + CPT_OBJ_SKFILTER, + CPT_OBJ_SIGALTSTACK, + CPT_OBJ_SOCK_MCADDR, + CPT_OBJ_BIND_MNT, + + CPT_OBJ_X86_REGS = 4096, + CPT_OBJ_X86_64_REGS, + CPT_OBJ_PAGES, + CPT_OBJ_COPYPAGES, + CPT_OBJ_REMAPPAGES, + CPT_OBJ_LAZYPAGES, + CPT_OBJ_NAME, + CPT_OBJ_BITS, + CPT_OBJ_REF, + CPT_OBJ_ITERPAGES, + CPT_OBJ_ITERYOUNGPAGES, + CPT_OBJ_VSYSCALL, + CPT_OBJ_IA64_REGS, + CPT_OBJ_INOTIFY, + CPT_OBJ_INOTIFY_WATCH, + CPT_OBJ_INOTIFY_EVENT, + CPT_OBJ_TASK_AUX, +}; + +#define CPT_ALIGN(n) (((n)+7)&~7) + +struct cpt_major_hdr +{ + __u8 cpt_signature[4]; /* Magic number */ + __u16 cpt_hdrlen; /* Length of this header */ + __u16 cpt_image_version; /* Format of this file */ +#define CPT_VERSION_MINOR(a) ((a) & 0xf) +#define CPT_VERSION_8 0 +#define CPT_VERSION_9 0x100 +#define CPT_VERSION_9_1 0x101 +#define CPT_VERSION_16 0x200 +#define CPT_VERSION_18 0x300 + __u16 cpt_os_arch; /* Architecture */ +#define CPT_OS_ARCH_I386 0 +#define CPT_OS_ARCH_EMT64 1 +#define CPT_OS_ARCH_IA64 2 + __u16 __cpt_pad1; + __u32 cpt_ve_features; /* VE features */ + __u32 cpt_ve_features2; /* VE features */ + __u16 cpt_pagesize; /* Page size used by OS */ + __u16 cpt_hz; /* HZ used by OS */ + __u64 cpt_start_jiffies64; /* Jiffies */ + __u32 cpt_start_sec; /* Seconds */ + __u32 cpt_start_nsec; /* Nanoseconds */ + __u32 cpt_cpu_caps[4]; /* CPU capabilities */ + __u32 cpt_kernel_config[4]; /* Kernel config */ + __u64 cpt_iptables_mask; /* Used netfilter modules */ +} __attribute__ ((aligned (8))); + +#define CPT_SIGNATURE0 0x79 +#define CPT_SIGNATURE1 0x1c +#define CPT_SIGNATURE2 0x01 +#define CPT_SIGNATURE3 0x63 + +/* CPU capabilities */ +#define CPT_CPU_X86_CMOV 0 +#define CPT_CPU_X86_FXSR 1 +#define CPT_CPU_X86_SSE 2 +#define CPT_CPU_X86_SSE2 3 +#define CPT_CPU_X86_MMX 4 +#define CPT_CPU_X86_3DNOW 5 +#define CPT_CPU_X86_3DNOW2 6 +#define CPT_CPU_X86_SEP 7 +#define CPT_CPU_X86_EMT64 8 +#define CPT_CPU_X86_IA64 9 + +/* Unsupported features */ +#define CPT_EXTERNAL_PROCESS 16 +#define CPT_NAMESPACES 17 +#define CPT_SCHEDULER_POLICY 18 +#define CPT_PTRACED_FROM_VE0 19 +#define CPT_UNSUPPORTED_FSTYPE 20 +#define CPT_BIND_MOUNT 21 +#define CPT_UNSUPPORTED_NETDEV 22 +#define CPT_UNSUPPORTED_MISC 23 + +/* This mask is used to determine whether VE + has some unsupported features or not */ +#define CPT_UNSUPPORTED_MASK 0xffff0000UL + +#define CPT_KERNEL_CONFIG_PAE 0 + +struct cpt_section_hdr +{ + __u64 cpt_next; + __u32 cpt_section; + __u16 cpt_hdrlen; + __u16 cpt_align; +} __attribute__ ((aligned (8))); + +enum +{ + CPT_SECT_ERROR, /* Error section, content is string */ + CPT_SECT_VEINFO, + CPT_SECT_FILES, /* Files. 
Content is array of file objects */ + CPT_SECT_TASKS, + CPT_SECT_MM, + CPT_SECT_FILES_STRUCT, + CPT_SECT_FS, + CPT_SECT_SIGHAND_STRUCT, + CPT_SECT_TTY, + CPT_SECT_SOCKET, + CPT_SECT_NAMESPACE, + CPT_SECT_SYSVSEM_UNDO, + CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and + * deleted dentries with inodes not + * referenced inside dumped process. + */ + CPT_SECT_SYSV_SHM, + CPT_SECT_SYSV_SEM, + CPT_SECT_ORPHANS, + CPT_SECT_NET_DEVICE, + CPT_SECT_NET_IFADDR, + CPT_SECT_NET_ROUTE, + CPT_SECT_NET_IPTABLES, + CPT_SECT_NET_CONNTRACK, + CPT_SECT_NET_CONNTRACK_VE0, + CPT_SECT_UTSNAME, + CPT_SECT_TRAILER, + CPT_SECT_UBC, + CPT_SECT_SLM_SGREGS, + CPT_SECT_SLM_REGOBJS, +/* Due to a silly mistake we cannot index sections beyond this value */ +#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) + CPT_SECT_EPOLL, + CPT_SECT_VSYSCALL, + CPT_SECT_MAX +}; + +struct cpt_major_tail +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_lazypages; + __u32 cpt_64bit; + __u64 cpt_sections[CPT_SECT_MAX_INDEX]; + __u32 cpt_nsect; + __u8 cpt_signature[4]; /* Magic number */ +} __attribute__ ((aligned (8))); + + +/* Common object header. */ +struct cpt_object_hdr +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; +} __attribute__ ((aligned (8))); + +enum _cpt_content_type { + CPT_CONTENT_VOID, + CPT_CONTENT_ARRAY, + CPT_CONTENT_DATA, + CPT_CONTENT_NAME, + + CPT_CONTENT_STACK, + CPT_CONTENT_X86_FPUSTATE_OLD, + CPT_CONTENT_X86_FPUSTATE, + CPT_CONTENT_MM_CONTEXT, + CPT_CONTENT_SEMARRAY, + CPT_CONTENT_SEMUNDO, + CPT_CONTENT_NLMARRAY, + CPT_CONTENT_MAX +}; + +/* CPT_OBJ_BITS: encode array of bytes */ +struct cpt_obj_bits +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_REF: a reference to another object */ +struct cpt_obj_ref +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_pos; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VEINFO: various ve specific data */ +struct cpt_veinfo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + /* ipc ctls */ + __u32 shm_ctl_max; + __u32 shm_ctl_all; + __u32 shm_ctl_mni; + __u32 msg_ctl_max; + __u32 msg_ctl_mni; + __u32 msg_ctl_mnb; + __u32 sem_ctl_arr[4]; + + /* start time */ + __u64 start_timespec_delta; + __u64 start_jiffies_delta; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILE: one struct file */ +struct cpt_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_flags; + __u32 cpt_mode; + __u64 cpt_pos; + __u32 cpt_uid; + __u32 cpt_gid; + + __u32 cpt_i_mode; + __u32 cpt_lflags; +#define CPT_DENTRY_DELETED 1 +#define CPT_DENTRY_ROOT 2 +#define CPT_DENTRY_CLONING 4 +#define CPT_DENTRY_PROC 8 +#define CPT_DENTRY_EPOLL 0x10 +#define CPT_DENTRY_REPLACED 0x20 + __u64 cpt_inode; + __u64 cpt_priv; + + __u32 cpt_fown_fd; + __u32 cpt_fown_pid; + __u32 cpt_fown_uid; + __u32 cpt_fown_euid; + __u32 cpt_fown_signo; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by file name, encoded as CPT_OBJ_NAME */ + +struct cpt_epoll_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_epoll_file */ + +struct cpt_epoll_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32
cpt_fd; + __u32 cpt_events; + __u64 cpt_data; + __u32 cpt_revents; + __u32 cpt_ready; +} __attribute__ ((aligned (8))); + + +/* CPT_OBJ_FILEDESC: one file descriptor */ +struct cpt_fd_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_fd; + __u32 cpt_flags; +#define CPT_FD_FLAG_CLOSEEXEC 1 + __u64 cpt_file; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILES: one files_struct */ +struct cpt_files_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_max_fds; + __u32 cpt_next_fd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of cpt_fd_image */ + +/* CPT_OBJ_FS: one fs_struct */ +struct cpt_fs_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_umask; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ + +/* CPT_OBJ_INODE: one struct inode */ +struct cpt_inode_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_dev; + __u64 cpt_ino; + __u32 cpt_mode; + __u32 cpt_nlink; + __u32 cpt_uid; + __u32 cpt_gid; + __u64 cpt_rdev; + __u64 cpt_size; + __u64 cpt_blksize; + __u64 cpt_atime; + __u64 cpt_mtime; + __u64 cpt_ctime; + __u64 cpt_blocks; + __u32 cpt_sb; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VFSMOUNT: one vfsmount */ +struct cpt_vfsmount_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_mntflags; +#define CPT_MNT_BIND 0x80000000 +#define CPT_MNT_EXT 0x40000000 + __u32 cpt_flags; +} __attribute__ ((aligned (8))); + + +struct cpt_flock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_pid; + __u64 cpt_start; + __u64 cpt_end; + __u32 cpt_flags; + __u32 cpt_type; +} __attribute__ ((aligned (8))); + + +struct cpt_tty_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_flags; + __u32 cpt_link; + __u32 cpt_index; + __u32 cpt_drv_type; + __u32 cpt_drv_subtype; + __u32 cpt_drv_flags; + __u8 cpt_packet; + __u8 cpt_stopped; + __u8 cpt_hw_stopped; + __u8 cpt_flow_stopped; + + __u32 cpt_canon_data; + __u32 cpt_canon_head; + __u32 cpt_canon_column; + __u32 cpt_column; + __u8 cpt_ctrl_status; + __u8 cpt_erasing; + __u8 cpt_lnext; + __u8 cpt_icanon; + __u8 cpt_raw; + __u8 cpt_real_raw; + __u8 cpt_closing; + __u8 __cpt_pad1; + __u16 cpt_minimum_to_wake; + __u16 __cpt_pad2; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_c_line; + __u8 cpt_name[64]; + __u16 cpt_ws_row; + __u16 cpt_ws_col; + __u16 cpt_ws_prow; + __u16 cpt_ws_pcol; + __u8 cpt_c_cc[32]; + __u32 cpt_c_iflag; + __u32 cpt_c_oflag; + __u32 cpt_c_cflag; + __u32 cpt_c_lflag; + __u32 cpt_read_flags[4096/32]; +} __attribute__ ((aligned (8))); + +struct cpt_sock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_parent; + __u32 cpt_index; + + __u64 cpt_ssflags; + __u16 cpt_type; + __u16 cpt_family; + __u8 cpt_sstate; + __u8 cpt_passcred; + __u8 cpt_state; + __u8 cpt_reuse; + + __u8 cpt_zapped; + __u8 cpt_shutdown; + __u8 cpt_userlocks; + __u8 cpt_no_check; + __u8 cpt_debug; + __u8 cpt_rcvtstamp; + __u8 cpt_localroute; + __u8 cpt_protocol; + + __u32 cpt_err; + __u32 cpt_err_soft; + + __u16 cpt_max_ack_backlog; + __u16 __cpt_pad1; + __u32 
cpt_priority; + + __u32 cpt_rcvlowat; + __u32 cpt_bound_dev_if; + + __u64 cpt_rcvtimeo; + __u64 cpt_sndtimeo; + __u32 cpt_rcvbuf; + __u32 cpt_sndbuf; + __u64 cpt_flags; + __u64 cpt_lingertime; + __u32 cpt_peer_pid; + __u32 cpt_peer_uid; + + __u32 cpt_peer_gid; + __u32 cpt_laddrlen; + __u32 cpt_laddr[128/4]; + __u32 cpt_raddrlen; + __u32 cpt_raddr[128/4]; + /* AF_UNIX */ + __u32 cpt_peer; + + __u8 cpt_socketpair; + __u8 cpt_deleted; + __u16 __cpt_pad4; + __u32 __cpt_pad5; +/* + struct sk_filter *sk_filter; + */ + + __u64 cpt_stamp; + __u32 cpt_daddr; + __u16 cpt_dport; + __u16 cpt_sport; + + __u32 cpt_saddr; + __u32 cpt_rcv_saddr; + + __u32 cpt_uc_ttl; + __u32 cpt_tos; + + __u32 cpt_cmsg_flags; + __u32 cpt_mc_index; + + __u32 cpt_mc_addr; +/* + struct ip_options *opt; + */ + __u8 cpt_hdrincl; + __u8 cpt_mc_ttl; + __u8 cpt_mc_loop; + __u8 cpt_pmtudisc; + + __u8 cpt_recverr; + __u8 cpt_freebind; + __u16 cpt_idcounter; + __u32 cpt_cork_flags; + + __u32 cpt_cork_fragsize; + __u32 cpt_cork_length; + __u32 cpt_cork_addr; + __u32 cpt_cork_saddr; + __u32 cpt_cork_daddr; + __u32 cpt_cork_oif; + + __u32 cpt_udp_pending; + __u32 cpt_udp_corkflag; + __u16 cpt_udp_encap; + __u16 cpt_udp_len; + __u32 __cpt_pad7; + + __u64 cpt_saddr6[2]; + __u64 cpt_rcv_saddr6[2]; + __u64 cpt_daddr6[2]; + __u32 cpt_flow_label6; + __u32 cpt_frag_size6; + __u32 cpt_hop_limit6; + __u32 cpt_mcast_hops6; + + __u32 cpt_mcast_oif6; + __u8 cpt_rxopt6; + __u8 cpt_mc_loop6; + __u8 cpt_recverr6; + __u8 cpt_sndflow6; + + __u8 cpt_pmtudisc6; + __u8 cpt_ipv6only6; + __u8 cpt_mapped; + __u8 __cpt_pad8; + __u32 cpt_pred_flags; + + __u32 cpt_rcv_nxt; + __u32 cpt_snd_nxt; + + __u32 cpt_snd_una; + __u32 cpt_snd_sml; + + __u32 cpt_rcv_tstamp; + __u32 cpt_lsndtime; + + __u8 cpt_tcp_header_len; + __u8 cpt_ack_pending; + __u8 cpt_quick; + __u8 cpt_pingpong; + __u8 cpt_blocked; + __u8 __cpt_pad9; + __u16 __cpt_pad10; + + __u32 cpt_ato; + __u32 cpt_ack_timeout; + + __u32 cpt_lrcvtime; + __u16 cpt_last_seg_size; + __u16 cpt_rcv_mss; + + __u32 cpt_snd_wl1; + __u32 cpt_snd_wnd; + + __u32 cpt_max_window; + __u32 cpt_pmtu_cookie; + + __u32 cpt_mss_cache; + __u16 cpt_mss_cache_std; + __u16 cpt_mss_clamp; + + __u16 cpt_ext_header_len; + __u16 cpt_ext2_header_len; + __u8 cpt_ca_state; + __u8 cpt_retransmits; + __u8 cpt_reordering; + __u8 cpt_frto_counter; + + __u32 cpt_frto_highmark; + __u8 cpt_adv_cong; + __u8 cpt_defer_accept; + __u8 cpt_backoff; + __u8 __cpt_pad11; + + __u32 cpt_srtt; + __u32 cpt_mdev; + + __u32 cpt_mdev_max; + __u32 cpt_rttvar; + + __u32 cpt_rtt_seq; + __u32 cpt_rto; + + __u32 cpt_packets_out; + __u32 cpt_left_out; + + __u32 cpt_retrans_out; + __u32 cpt_snd_ssthresh; + + __u32 cpt_snd_cwnd; + __u16 cpt_snd_cwnd_cnt; + __u16 cpt_snd_cwnd_clamp; + + __u32 cpt_snd_cwnd_used; + __u32 cpt_snd_cwnd_stamp; + + __u32 cpt_timeout; + __u32 cpt_ka_timeout; + + __u32 cpt_rcv_wnd; + __u32 cpt_rcv_wup; + + __u32 cpt_write_seq; + __u32 cpt_pushed_seq; + + __u32 cpt_copied_seq; + __u8 cpt_tstamp_ok; + __u8 cpt_wscale_ok; + __u8 cpt_sack_ok; + __u8 cpt_saw_tstamp; + + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + __u8 cpt_nonagle; + __u8 cpt_keepalive_probes; + __u32 cpt_rcv_tsval; + + __u32 cpt_rcv_tsecr; + __u32 cpt_ts_recent; + + __u64 cpt_ts_recent_stamp; + __u16 cpt_user_mss; + __u8 cpt_dsack; + __u8 cpt_eff_sacks; + __u32 cpt_sack_array[2*5]; + __u32 cpt_window_clamp; + + __u32 cpt_rcv_ssthresh; + __u8 cpt_probes_out; + __u8 cpt_num_sacks; + __u16 cpt_advmss; + + __u8 cpt_syn_retries; + __u8 cpt_ecn_flags; + __u16 cpt_prior_ssthresh; + __u32 
cpt_lost_out; + + __u32 cpt_sacked_out; + __u32 cpt_fackets_out; + + __u32 cpt_high_seq; + __u32 cpt_retrans_stamp; + + __u32 cpt_undo_marker; + __u32 cpt_undo_retrans; + + __u32 cpt_urg_seq; + __u16 cpt_urg_data; + __u8 cpt_pending; + __u8 cpt_urg_mode; + + __u32 cpt_snd_up; + __u32 cpt_keepalive_time; + + __u32 cpt_keepalive_intvl; + __u32 cpt_linger2; + + __u32 cpt_rcvrtt_rtt; + __u32 cpt_rcvrtt_seq; + + __u32 cpt_rcvrtt_time; + __u32 __cpt_pad12; +} __attribute__ ((aligned (8))); + +struct cpt_sockmc_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u16 cpt_family; + __u16 cpt_mode; + __u32 cpt_ifindex; + __u32 cpt_mcaddr[4]; +} __attribute__ ((aligned (8))); +/* Followed by array of source addresses, each zero padded to 16 bytes */ + +struct cpt_openreq_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_rcv_isn; + __u32 cpt_snt_isn; + + __u16 cpt_rmt_port; + __u16 cpt_mss; + __u8 cpt_family; + __u8 cpt_retrans; + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + + __u8 cpt_tstamp_ok; + __u8 cpt_sack_ok; + __u8 cpt_wscale_ok; + __u8 cpt_ecn_ok; + __u8 cpt_acked; + __u8 __cpt_pad1; + __u16 __cpt_pad2; + + __u32 cpt_window_clamp; + __u32 cpt_rcv_wnd; + __u32 cpt_ts_recent; + __u32 cpt_iif; + __u64 cpt_expires; + + __u64 cpt_loc_addr[2]; + __u64 cpt_rmt_addr[2]; +/* + struct ip_options *opt; + */ + +} __attribute__ ((aligned (8))); + +struct cpt_skb_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_queue; +#define CPT_SKB_NQ 0 +#define CPT_SKB_RQ 1 +#define CPT_SKB_WQ 2 +#define CPT_SKB_OFOQ 3 + + __u64 cpt_stamp; + __u32 cpt_len; + __u32 cpt_hspace; + __u32 cpt_tspace; + __u32 cpt_h; + __u32 cpt_nh; + __u32 cpt_mac; + + __u64 cpt_cb[5]; + __u32 cpt_mac_len; + __u32 cpt_csum; + __u8 cpt_local_df; + __u8 cpt_pkt_type; + __u8 cpt_ip_summed; + __u8 __cpt_pad1; + __u32 cpt_priority; + __u16 cpt_protocol; + __u16 cpt_security; + __u16 cpt_gso_segs; + __u16 cpt_gso_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvshm_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + + __u32 cpt_id; + __u32 cpt_mlockuser; + __u64 cpt_segsz; + __u64 cpt_atime; + __u64 cpt_ctime; + __u64 cpt_dtime; + __u64 cpt_creator; + __u64 cpt_last; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvsem_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_otime; + __u64 cpt_ctime; +} __attribute__ ((aligned (8))); +/* Content is array of pairs semval/sempid */ + +struct cpt_sysvsem_undo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_id; + __u32 cpt_nsem; +} __attribute__ ((aligned (8))); + + +struct cpt_mm_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start_code; + __u64 cpt_end_code; + __u64 cpt_start_data; + __u64 cpt_end_data; + __u64 cpt_start_brk; + __u64 cpt_brk; + __u64 cpt_start_stack; + __u64 cpt_start_arg; + __u64 cpt_end_arg; + __u64 cpt_start_env; + __u64 cpt_end_env; + __u64 cpt_def_flags; + __u64 cpt_mmub; + __u8 cpt_dumpable; + __u8 cpt_vps_dumpable; + __u8 
cpt_used_hugetlb; + __u8 __cpt_pad; +} __attribute__ ((aligned (8))); + +struct cpt_page_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); + +struct cpt_remappage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_copypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_source; +} __attribute__ ((aligned (8))); + +struct cpt_lazypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_index; +} __attribute__ ((aligned (8))); + +struct cpt_iterpage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); +/* Followed by array of PFNs */ + +struct cpt_vma_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_type; +#define CPT_VMA_TYPE_0 0 +#define CPT_VMA_TYPE_SHM 1 + __u32 cpt_anonvma; + __u64 cpt_anonvmaid; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_flags; + __u64 cpt_pgprot; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_aio_ctx_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_max_reqs; + __u32 cpt_ring_pages; + __u32 cpt_tail; + __u32 cpt_nr; + __u64 cpt_mmap_base; + /* Data (io_event's) and struct aio_ring are stored in user space VM */ +} __attribute__ ((aligned (8))); + + +/* Format of the MM section. + * + * It is an array of MM objects (mm_struct). Each MM object is a + * header, encoding mm_struct, followed by an array of VMA objects. + * Each VMA consists of a VMA header, encoding vm_area_struct, and, + * if the VMA contains copied pages, the header is followed by an + * array of start-end tuples, each followed by data. + * + * ATTN: no block/page alignment, only 64-bit alignment. This might not be good? + */
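As an illustrative aside (not from the patch itself), the walk described in the comment above could look roughly like the sketch below. It assumes the image is mapped read-only at `img`, that `pos` and `end` bound the MM section payload, and that each record's cpt_next field holds the total record length, so the cursor advances by that amount; `walk_mm_section` and these assumptions are the editor's, not an OpenVZ API.

    static void walk_mm_section(const char *img, __u64 pos, __u64 end)
    {
    	while (pos < end) {
    		const struct cpt_object_hdr *h =
    			(const struct cpt_object_hdr *)(img + pos);

    		/* payload starts at pos + h->cpt_hdrlen; its layout
    		 * depends on h->cpt_object (CPT_OBJ_MM, CPT_OBJ_VMA,
    		 * CPT_OBJ_PAGES, ...) and on h->cpt_content */
    		pos += h->cpt_next;	/* assumed: total record length */
    	}
    }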
+ +struct cpt_restart_block { + __u64 fn; +#define CPT_RBL_0 0 +#define CPT_RBL_NANOSLEEP 1 +#define CPT_RBL_COMPAT_NANOSLEEP 2 + __u64 arg0; + __u64 arg1; + __u64 arg2; + __u64 arg3; +} __attribute__ ((aligned (8))); + +struct cpt_siginfo_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_qflags; + __u32 cpt_signo; + __u32 cpt_errno; + __u32 cpt_code; + + __u64 cpt_sigval; + __u32 cpt_pid; + __u32 cpt_uid; + __u64 cpt_utime; + __u64 cpt_stime; + + __u64 cpt_user; +} __attribute__ ((aligned (8))); + +/* Portable presentations for segment registers */ + +#define CPT_SEG_ZERO 0 +#define CPT_SEG_TLS1 1 +#define CPT_SEG_TLS2 2 +#define CPT_SEG_TLS3 3 +#define CPT_SEG_USER32_DS 4 +#define CPT_SEG_USER32_CS 5 +#define CPT_SEG_USER64_DS 6 +#define CPT_SEG_USER64_CS 7 +#define CPT_SEG_LDT 256 + +struct cpt_x86_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_debugreg[8]; + __u32 cpt_fs; + __u32 cpt_gs; + + __u32 cpt_ebx; + __u32 cpt_ecx; + __u32 cpt_edx; + __u32 cpt_esi; + __u32 cpt_edi; + __u32 cpt_ebp; + __u32 cpt_eax; + __u32 cpt_xds; + __u32 cpt_xes; + __u32 cpt_orig_eax; + __u32 cpt_eip; + __u32 cpt_xcs; + __u32 cpt_eflags; + __u32 cpt_esp; + __u32 cpt_xss; + __u32 cpt_pad; +}; + +struct cpt_x86_64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_debugreg[8]; + + __u64 cpt_fsbase; + __u64 cpt_gsbase; + __u32 cpt_fsindex; + __u32 cpt_gsindex; + __u32 cpt_ds; + __u32 cpt_es; + + __u64 cpt_r15; + __u64 cpt_r14; + __u64 cpt_r13; + __u64 cpt_r12; + __u64 cpt_rbp; + __u64 cpt_rbx; + __u64 cpt_r11; + __u64 cpt_r10; + __u64 cpt_r9; + __u64 cpt_r8; + __u64 cpt_rax; + __u64 cpt_rcx; + __u64 cpt_rdx; + __u64 cpt_rsi; + __u64 cpt_rdi; + __u64 cpt_orig_rax; + __u64 cpt_rip; + __u64 cpt_cs; + __u64 cpt_eflags; + __u64 cpt_rsp; + __u64 cpt_ss; +}; + +struct cpt_ia64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 gr[128]; + __u64 fr[256]; + __u64 br[8]; + __u64 nat[2]; + + __u64 ar_bspstore; + __u64 num_regs; + __u64 loadrs; + __u64 ar_bsp; + __u64 ar_unat; + __u64 ar_pfs; + __u64 ar_ccv; + __u64 ar_fpsr; + __u64 ar_csd; + __u64 ar_ssd; + __u64 ar_ec; + __u64 ar_lc; + __u64 ar_rsc; + __u64 ar_rnat; + + __u64 cr_iip; + __u64 cr_ipsr; + + __u64 cfm; + __u64 pr; + + __u64 ibr[8]; + __u64 dbr[8]; +}; + + +struct cpt_task_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_state; + __u64 cpt_flags; + __u64 cpt_ptrace; + __u32 cpt_prio; + __u32 cpt_static_prio; + __u32 cpt_policy; + __u32 cpt_rt_priority; + + /* struct thread_info */ + __u64 cpt_exec_domain; + __u64 cpt_thrflags; + __u64 cpt_thrstatus; + __u64 cpt_addr_limit; + + __u64 cpt_personality; + + __u64 cpt_mm; + __u64 cpt_files; + __u64 cpt_fs; + __u64 cpt_signal; + __u64 cpt_sighand; + __u64 cpt_sigblocked; + __u64 cpt_sigrblocked; + __u64 cpt_sigpending; + __u64 cpt_namespace; + __u64 cpt_sysvsem_undo; + __u32 cpt_pid; + __u32 cpt_tgid; + __u32 cpt_ppid; + __u32 cpt_rppid; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_old_pgrp; + __u32 __cpt_pad; + __u32 cpt_leader; + __u8 cpt_pn_state; + __u8 cpt_stopped_state; + __u8 cpt_sigsuspend_state; + __u8 cpt_64bit; + __u64 cpt_set_tid; + __u64 cpt_clear_tid; + __u32 cpt_exit_code; + __u32 cpt_exit_signal; + __u32 cpt_pdeath_signal; + __u32 cpt_user; + __u32 cpt_uid; + __u32 cpt_euid; + __u32 cpt_suid; + __u32 cpt_fsuid; + __u32 cpt_gid; + __u32
cpt_egid; + __u32 cpt_sgid; + __u32 cpt_fsgid; + __u32 cpt_ngids; + __u32 cpt_gids[32]; + __u8 cpt_prctl_uac; + __u8 cpt_prctl_fpemu; + __u16 __cpt_pad1; + __u64 cpt_ecap; + __u64 cpt_icap; + __u64 cpt_pcap; + __u8 cpt_comm[16]; + __u64 cpt_tls[3]; + struct cpt_restart_block cpt_restart; + __u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_prof_value; + __u64 cpt_it_prof_incr; + __u64 cpt_it_virt_value; + __u64 cpt_it_virt_incr; + + __u16 cpt_used_math; + __u8 cpt_keepcap; + __u8 cpt_did_exec; + __u32 cpt_ptrace_message; + + __u64 cpt_utime; + __u64 cpt_stime; + __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */ + __u64 cpt_nvcsw; + __u64 cpt_nivcsw; + __u64 cpt_min_flt; + __u64 cpt_maj_flt; + + __u64 cpt_sigsuspend_blocked; + __u64 cpt_cutime, cpt_cstime; + __u64 cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_cmin_flt, cpt_cmaj_flt; + +#define CPT_RLIM_NLIMITS 16 + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; + + __u64 cpt_task_ub; + __u64 cpt_exec_ub; + __u64 cpt_mm_ub; + __u64 cpt_fork_sub; +} __attribute__ ((aligned (8))); + +struct cpt_sigaltstack_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_stack; + __u32 cpt_stacksize; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_task_aux_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_robust_list; + __u64 __cpt_future[16]; +} __attribute__ ((aligned (8))); + + +struct cpt_signal_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_leader; + __u8 cpt_pgrp_type; + __u8 cpt_old_pgrp_type; + __u8 cpt_session_type; +#define CPT_PGRP_NORMAL 0 +#define CPT_PGRP_ORPHAN 1 +#define CPT_PGRP_STRAY 2 + __u8 __cpt_pad1; + __u64 cpt_pgrp; + __u64 cpt_old_pgrp; + __u64 cpt_session; + __u64 cpt_sigpending; + __u64 cpt_ctty; + + __u32 cpt_curr_target; + __u32 cpt_group_exit; + __u32 cpt_group_exit_code; + __u32 cpt_group_exit_task; + __u32 cpt_notify_count; + __u32 cpt_group_stop_count; + __u32 cpt_stop_state; + __u32 __cpt_pad2; + + __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; + __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; + + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; +} __attribute__ ((aligned (8))); +/* Followed by list of posix timers. */ + +struct cpt_sighand_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + +} __attribute__ ((aligned (8))); +/* Followed by list of sighandlers.
*/ + +struct cpt_sighandler_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_signo; + __u32 __cpt_pad1; + __u64 cpt_handler; + __u64 cpt_restorer; + __u64 cpt_flags; + __u64 cpt_mask; +} __attribute__ ((aligned (8))); + +struct cpt_netdev_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_flags; + __u8 cpt_name[16]; +} __attribute__ ((aligned (8))); + +struct cpt_ifaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u8 cpt_family; + __u8 cpt_masklen; + __u8 cpt_flags; + __u8 cpt_scope; + __u32 cpt_address[4]; + __u32 cpt_peer[4]; + __u32 cpt_broadcast[4]; + __u8 cpt_label[16]; + __u32 cpt_valid_lft; + __u32 cpt_prefered_lft; +} __attribute__ ((aligned (8))); + +struct cpt_ipct_tuple +{ + __u32 cpt_src; + __u16 cpt_srcport; + __u16 __cpt_pad1; + + __u32 cpt_dst; + __u16 cpt_dstport; + __u8 cpt_protonum; + __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ +} __attribute__ ((aligned (8))); + +struct cpt_nat_manip +{ + __u8 cpt_direction; + __u8 cpt_hooknum; + __u8 cpt_maniptype; + __u8 __cpt_pad1; + + __u32 cpt_manip_addr; + __u16 cpt_manip_port; + __u16 __cpt_pad2; + __u32 __cpt_pad3; +} __attribute__ ((aligned (8))); + +struct cpt_nat_seq +{ + __u32 cpt_correction_pos; + __u32 cpt_offset_before; + __u32 cpt_offset_after; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_ip_connexpect_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_timeout; + __u32 cpt_sibling_conntrack; /* Index of child conntrack */ + __u32 cpt_seq; /* id in 2.6.15 */ + + struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ + struct cpt_ipct_tuple cpt_tuple; + struct cpt_ipct_tuple cpt_mask; + + /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */ + __u32 cpt_help[3]; /* NU 2.6.15 */ + __u16 cpt_manip_proto; + __u8 cpt_dir; + __u8 cpt_flags; +} __attribute__ ((aligned (8))); + +struct cpt_ip_conntrack_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + struct cpt_ipct_tuple cpt_tuple[2]; + __u64 cpt_status; + __u64 cpt_timeout; + __u32 cpt_index; + __u8 cpt_ct_helper; + __u8 cpt_nat_helper; + __u16 cpt_pad1; + + /* union ip_conntrack_proto. Used by tcp and icmp. */ + __u32 cpt_proto_data[12]; + + /* union ip_conntrack_help. Used by ftp and pptp helper. + * We do not support pptp... 
+ */ + __u32 cpt_help_data[6]; + + /* nat info */ + __u32 cpt_initialized; /* NU 2.6.15 */ + __u32 cpt_num_manips; /* NU 2.6.15 */ + struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ + + struct cpt_nat_seq cpt_nat_seq[2]; + + __u32 cpt_masq_index; + __u32 cpt_id; + __u32 cpt_mark; +} __attribute__ ((aligned (8))); + +struct cpt_ubparm +{ + __u64 barrier; + __u64 limit; + __u64 held; + __u64 maxheld; + __u64 minheld; + __u64 failcnt; +} __attribute__ ((aligned (8))); + +struct cpt_beancounter_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_parent; + __u32 cpt_id; + __u32 __cpt_pad; + struct cpt_ubparm cpt_parms[32 * 2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_sgreg_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; + __u32 cpt_id; + __u16 cpt_resource; + __u8 cpt_regname[32]; + __u8 __cpt_pad2[2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_obj_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +#ifdef __KERNEL__ + +static inline void *cpt_ptr_import(__u64 ptr) +{ + return (void*)(unsigned long)ptr; +} + +static inline __u64 cpt_ptr_export(void __user *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) +{ + memcpy(sig, &ptr, sizeof(*sig)); +} + +static inline __u64 cpt_sigset_export(sigset_t *sig) +{ + return *(__u64*)sig; +} + +static inline __u64 cpt_timespec_export(struct timespec *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; +} + +static inline void cpt_timespec_import(struct timespec *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_nsec = (val&0xFFFFFFFF); +} + +static inline __u64 cpt_timeval_export(struct timeval *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_usec; +} + +static inline void cpt_timeval_import(struct timeval *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_usec = (val&0xFFFFFFFF); +} + +#endif + +#endif /* __CPT_IMAGE_H_ */ diff -uprN linux-2.6.18/include/linux/cpt_ioctl.h linux-2.6.18.ovz/include/linux/cpt_ioctl.h --- linux-2.6.18/include/linux/cpt_ioctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/cpt_ioctl.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,43 @@ +/* + * + * include/linux/cpt_ioctl.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _CPT_IOCTL_H_ +#define _CPT_IOCTL_H_ 1 + +#include +#include + +#define CPTCTLTYPE '-' +#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) +#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) +#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) +#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) +#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) +#define CPT_DUMP _IO(CPTCTLTYPE, 6) +#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) +#define CPT_RESUME _IO(CPTCTLTYPE, 8) +#define CPT_KILL _IO(CPTCTLTYPE, 9) +#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) +#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) +#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) +#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) +#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) +#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) +#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) +#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) +#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) +#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) +#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) +#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) + +#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) + +#endif diff -uprN linux-2.6.18/include/linux/dcache.h linux-2.6.18.ovz/include/linux/dcache.h --- linux-2.6.18/include/linux/dcache.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/dcache.h 2007-06-13 06:55:07.000000000 -0400 @@ -9,6 +9,8 @@ #include #include +#include + struct nameidata; struct vfsmount; @@ -111,6 +113,9 @@ struct dentry { struct dcookie_struct *d_cookie; /* cookie, if any */ #endif int d_mounted; +#ifdef CONFIG_USER_RESOURCE + struct dentry_beancounter dentry_bc; +#endif unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; @@ -173,9 +178,13 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 +#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ + +extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d); #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ +extern kmem_cache_t *dentry_cache; extern spinlock_t dcache_lock; /** @@ -291,7 +300,12 @@ extern struct dentry * d_hash_and_lookup /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); +extern int d_root_check(struct dentry *, struct vfsmount *); extern char * d_path(struct dentry *, struct vfsmount *, char *, int); +extern char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, + struct dentry *root, struct vfsmount *rootmnt, + char *buffer, int buflen); + /* Allocation counts.. 
*/ @@ -311,6 +325,12 @@ extern char * d_path(struct dentry *, st static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { +#ifdef CONFIG_USER_RESOURCE + preempt_disable(); + if (ub_dentry_on && ub_dget_testone(dentry)) + BUG(); + preempt_enable_no_resched(); +#endif BUG_ON(!atomic_read(&dentry->d_count)); atomic_inc(&dentry->d_count); } @@ -354,6 +374,8 @@ extern struct dentry *lookup_create(stru extern int sysctl_vfs_cache_pressure; +extern int check_area_access_ve(struct dentry *, struct vfsmount *); +extern int check_area_execute_ve(struct dentry *, struct vfsmount *); #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ diff -uprN linux-2.6.18/include/linux/device.h linux-2.6.18.ovz/include/linux/device.h --- linux-2.6.18/include/linux/device.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/device.h 2007-06-13 06:55:07.000000000 -0400 @@ -281,6 +281,8 @@ extern struct class_device *class_device __attribute__((format(printf,5,6))); extern void class_device_destroy(struct class *cls, dev_t devt); +extern struct class net_class; + /* interface for exporting device attributes */ struct device_attribute { diff -uprN linux-2.6.18/include/linux/devpts_fs.h linux-2.6.18.ovz/include/linux/devpts_fs.h --- linux-2.6.18/include/linux/devpts_fs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/devpts_fs.h 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,16 @@ int devpts_pty_new(struct tty_struct *tt struct tty_struct *devpts_get_tty(int number); /* get tty structure */ void devpts_pty_kill(int number); /* unlink */ +struct devpts_config { + int setuid; + int setgid; + uid_t uid; + gid_t gid; + umode_t mode; +}; + +extern struct devpts_config devpts_config; +extern struct file_system_type devpts_fs_type; #else /* Dummy stubs in the no-pty case */ diff -uprN linux-2.6.18/include/linux/drbd.h linux-2.6.18.ovz/include/linux/drbd.h --- linux-2.6.18/include/linux/drbd.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/drbd.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,284 @@ +/* + drbd.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2001-2007, Philipp Reisner . + Copyright (C) 2001-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include + +#include + +#ifdef __KERNEL__ +#include +#else +#include +#include +#include +#endif + +enum io_error_handler { + PassOn, /* FIXME should the better be named "Ignore"? 
*/ + CallIOEHelper, + Detach +}; + +enum fencing_policy { + DontCare, + Resource, + Stonith +}; + +enum disconnect_handler { + Reconnect, + DropNetConf, + FreezeIO +}; + +enum after_sb_handler { + Disconnect, + DiscardYoungerPri, + DiscardOlderPri, + DiscardZeroChg, + DiscardLeastChg, + DiscardLocal, + DiscardRemote, + Consensus, + DiscardSecondary, + CallHelper, + Violently +}; + +/* KEEP the order, do not delete or insert! + * Or change the API_VERSION, too. */ +enum ret_codes { + RetCodeBase=100, + NoError, // 101 ... + LAAlreadyInUse, + OAAlreadyInUse, + LDNameInvalid, + MDNameInvalid, + LDAlreadyInUse, + LDNoBlockDev, + MDNoBlockDev, + LDOpenFailed, + MDOpenFailed, + LDDeviceTooSmall, + MDDeviceTooSmall, + LDNoConfig, + LDMounted, + MDMounted, + LDMDInvalid, + LDDeviceTooLarge, + MDIOError, + MDInvalid, + CRAMAlgNotAvail, + CRAMAlgNotDigest, + KMallocFailed, + DiscardNotAllowed, + HaveDiskConfig, + HaveNetConfig, + UnknownMandatoryTag, + MinorNotKnown, + StateNotAllowed, + GotSignal, // EINTR + NoResizeDuringResync, + APrimaryNodeNeeded, + SyncAfterInvalid, + SyncAfterCycle, + PauseFlagAlreadySet, + PauseFlagAlreadyClear, + DiskLowerThanOutdated, + UnknownNetLinkPacket, + HaveNoDiskConfig, + ProtocolCRequired, + + /* insert new ones above this line */ + AfterLastRetCode +}; + +#define DRBD_PROT_A 1 +#define DRBD_PROT_B 2 +#define DRBD_PROT_C 3 + +typedef enum { + Unknown=0, + Primary=1, // role + Secondary=2, // role + role_mask=3, +} drbd_role_t; + +/* The order of these constants is important. + * The lower ones (<WFReportParams) indicate + * that there is no socket! + * >=WFReportParams ==> There is a socket + * + * THINK + * Skipped should be < Connected, + * so writes on a Primary after Skipped sync are not mirrored either ? + */ +typedef enum { + StandAlone, + Disconnecting, // Temporal state on the way to StandAlone. + Unconnected, // >= Unconnected -> inc_net() succeeds + Timeout, /// These temporal states are all used on the way + BrokenPipe, /// from >= Connected to Unconnected. + NetworkFailure, /// The 'disconnect reason' states + ProtocolError, /// + TearDown, /// I do not allow to change between them. + WFConnection, + WFReportParams, // we have a socket + Connected, // we have introduced each other + StartingSyncS, // starting full sync by IOCTL. + StartingSyncT, // starting full sync by IOCTL. + WFBitMapS, + WFBitMapT, + WFSyncUUID, + SyncSource, // The distance between original state and pause + SyncTarget, // state must be the same for source and target. (+2) + PausedSyncS, // All SyncStates are tested with this comparison + PausedSyncT, // xx >= SyncSource && xx <= PausedSyncT + conn_mask=31 +} drbd_conns_t; + +typedef enum { + Diskless, + Attaching, /* In the process of reading the meta-data */ + Failed, /* Becomes Diskless as soon as we have told the peer */ + /* when >= Failed it is legal to access mdev->bc */ + Negotiating, /* Late attaching state, we need to talk to the peer... */ + Inconsistent, + Outdated, + DUnknown, /* Only used for the peer, never for myself */ + Consistent, /* Might be Outdated, might be UpToDate ... */ + UpToDate, /* Only this disk state allows applications' IO ! */ + disk_mask=15 +} drbd_disks_t; + +typedef union { + struct { + unsigned role : 2 ; // 3/4 primary/secondary/unknown + unsigned peer : 2 ; // 3/4 primary/secondary/unknown + unsigned conn : 5 ; // 17/32 cstates + unsigned disk : 4 ; // 8/16 from Diskless to UpToDate + unsigned pdsk : 4 ; // 8/16 from Diskless to UpToDate + unsigned susp : 1 ; // 2/2 IO suspended no/yes + unsigned aftr_isp : 1 ; // isp ..
imposed sync pause + unsigned peer_isp : 1 ; + unsigned user_isp : 1 ; + unsigned _pad : 11; // 0 unused + }; + unsigned int i; +} drbd_state_t; + +typedef enum { + SS_CW_NoNeed=4, + SS_CW_Success=3, + SS_NothingToDo=2, + SS_Success=1, + SS_UnknownError=0, // Used to sleep longer in _drbd_request_state + SS_TwoPrimaries=-1, + SS_NoUpToDateDisk=-2, + SS_BothInconsistent=-4, + SS_SyncingDiskless=-5, + SS_ConnectedOutdates=-6, + SS_PrimaryNOP=-7, + SS_ResyncRunning=-8, + SS_AlreadyStandAlone=-9, + SS_CW_FailedByPeer=-10, + SS_CanNotOutdateDL=-11, + SS_DeviceInUse=-12 +} set_st_err_t; + +/* from drbd_strings.c */ +extern const char* conns_to_name(drbd_conns_t); +extern const char* roles_to_name(drbd_role_t); +extern const char* disks_to_name(drbd_disks_t); +extern const char* set_st_err_name(set_st_err_t); + +#ifndef BDEVNAME_SIZE +# define BDEVNAME_SIZE 32 +#endif + +#define SHARED_SECRET_MAX 64 + +enum MetaDataFlags { + __MDF_Consistent, + __MDF_PrimaryInd, + __MDF_ConnectedInd, + __MDF_FullSync, + __MDF_WasUpToDate, + __MDF_PeerOutDated // or less/lower. +}; +#define MDF_Consistent (1<<__MDF_Consistent) +#define MDF_PrimaryInd (1<<__MDF_PrimaryInd) +#define MDF_ConnectedInd (1<<__MDF_ConnectedInd) +#define MDF_FullSync (1<<__MDF_FullSync) +#define MDF_WasUpToDate (1<<__MDF_WasUpToDate) +#define MDF_PeerOutDated (1<<__MDF_PeerOutDated) + +enum UuidIndex { + Current, + Bitmap, + History_start, + History_end, + UUID_SIZE, // In the packet we store the number of dirty bits here + UUID_FLAGS, // In the packet we store flags here. + EXT_UUID_SIZE // Everything. +}; + +#define UUID_JUST_CREATED ((__u64)4) + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* these are of type "int" */ +#define DRBD_MD_INDEX_INTERNAL -1 +#define DRBD_MD_INDEX_FLEX_EXT -2 +#define DRBD_MD_INDEX_FLEX_INT -3 + +// Start of the new netlink/connector stuff + +#define DRBD_NL_CREATE_DEVICE 0x01 +#define DRBD_NL_SET_DEFAULTS 0x02 + +// The following line should be moved over to linux/connector.h +// when the time comes +#define CN_IDX_DRBD 0x4 +#define CN_VAL_DRBD 0x1 + +struct drbd_nl_cfg_req { + int packet_type; + int drbd_minor; + int flags; + unsigned short tag_list[]; +}; + +struct drbd_nl_cfg_reply { + int packet_type; + int minor; + int ret_code; // enum ret_code or set_st_err_t + unsigned short tag_list[]; // only used with get_* calls +}; + +#endif diff -uprN linux-2.6.18/include/linux/drbd_config.h linux-2.6.18.ovz/include/linux/drbd_config.h --- linux-2.6.18/include/linux/drbd_config.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/drbd_config.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,79 @@ +/* + drbd_config.h + DRBD's compile time configuration. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef DRBD_CONFIG_H +#define DRBD_CONFIG_H + +extern const char * drbd_buildtag(void); + +#define REL_VERSION "8.0.3" +#define API_VERSION 86 +#define PRO_VERSION 86 + +// undef if you need the workaround in drbd_receiver +#define HAVE_UML_TO_VIRT 1 + +#define DBG_ALL_SYMBOLS // no static functs, improves quality of OOPS traces + +//#define DBG_SPINLOCKS // enables MUST_HOLD macro (assertions for spinlocks) +//#define DBG_ASSERTS // drbd_assert_breakpoint() function +#define DUMP_MD 2 // Dump even all cstate changes (I like it!) +//#define PARANOIA // some extra checks + +// Dump every hour the usage / not usage of zero copy IO +//#define SHOW_SENDPAGE_USAGE + +// Define this to enable dynamic tracing controlled by module parameters +// at run time. This enables ALL use of dynamic tracing including packet +// and bio dumping, etc +#define ENABLE_DYNAMIC_TRACE + +// You can disable the use of the sendpage() call (= zero copy +// IO ) If you have the feeling that this might be the cause +// for troubles. +// #define DRBD_DISABLE_SENDPAGE + +// Enable fault insertion code +#define DRBD_ENABLE_FAULTS + +// RedHat's 2.6.9 kernels have the gfp_t type. Mainline has this feature +// since 2.6.16. If you build for RedHat enable the line below. +#define KERNEL_HAS_GFP_T + +// kernel.org has atomic_add_return since 2.6.10. some vendor kernels +// have it backported, though. Others don't. +//#define NEED_BACKPORT_OF_ATOMIC_ADD + +// 2.6.something has deprecated kmem_cache_t +// some older still use it. +// some have it defined as struct kmem_cache_s, some as struct kmem_cache +//#define USE_KMEM_CACHE_S + +// 2.6.something has sock_create_kern (SE-linux security context stuff) +// some older distribution kernels don't. +//#define DEFINE_SOCK_CREATE_KERN + +// in older kernels (vanilla < 2.6.16) struct netlink_skb_parms has a +// member called dst_groups. Later it is called dst_group (without 's'). +//#define DRBD_NL_DST_GROUPS + +// in older kernels (vanilla < 2.6.14) is no kzalloc() +//#define NEED_BACKPORT_OF_KZALLOC + +#endif diff -uprN linux-2.6.18/include/linux/drbd_limits.h linux-2.6.18.ovz/include/linux/drbd_limits.h --- linux-2.6.18/include/linux/drbd_limits.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/drbd_limits.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,124 @@ +/* + drbd_limits.h + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. +*/ + +/* + * Our current limitations. + * Some of them are hard limits, + * some of them are arbitrary range limits, that make it easier to provide + * feedback about nonsense settings for certain configurable values. 
+ */ + +#ifndef DRBD_LIMITS_H +#define DRBD_LIMITS_H 1 + +#define DEBUG_RANGE_CHECK 0 + +#define DRBD_MINOR_COUNT_MIN 1 +#define DRBD_MINOR_COUNT_MAX 255 + +#define DRBD_DIALOG_REFRESH_MIN 0 +#define DRBD_DIALOG_REFRESH_MAX 600 + +/* valid port number */ +#define DRBD_PORT_MIN 1 +#define DRBD_PORT_MAX 0xffff + +/* startup { */ + /* if you want more than 3.4 days, disable */ +#define DRBD_WFC_TIMEOUT_MIN 0 +#define DRBD_WFC_TIMEOUT_MAX 300000 +#define DRBD_WFC_TIMEOUT_DEF 0 + +#define DRBD_DEGR_WFC_TIMEOUT_MIN 0 +#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 +#define DRBD_DEGR_WFC_TIMEOUT_DEF 60 + +/* }*/ + +/* net { */ + /* timeout, unit centiseconds + * more than one minute timeout is not useful */ +#define DRBD_TIMEOUT_MIN 1 +#define DRBD_TIMEOUT_MAX 600 +#define DRBD_TIMEOUT_DEF 60 // 6 seconds + + /* active connection retries when WFConnection */ +#define DRBD_CONNECT_INT_MIN 1 +#define DRBD_CONNECT_INT_MAX 120 +#define DRBD_CONNECT_INT_DEF 10 //seconds + + /* keep-alive probes when idle */ +#define DRBD_PING_INT_MIN 1 +#define DRBD_PING_INT_MAX 120 +#define DRBD_PING_INT_DEF 10 + + /* timeout for the ping packets.*/ +#define DRBD_PING_TIMEO_MIN 1 +#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_DEF 5 + + /* max number of write requests between write barriers */ +#define DRBD_MAX_EPOCH_SIZE_MIN 1 +#define DRBD_MAX_EPOCH_SIZE_MAX 20000 +#define DRBD_MAX_EPOCH_SIZE_DEF 2048 + + /* I don't think that a tcp send buffer of more than 10M is useful */ +#define DRBD_SNDBUF_SIZE_MIN 1 +#define DRBD_SNDBUF_SIZE_MAX 10000000 +#define DRBD_SNDBUF_SIZE_DEF (2*65535) + + /* @4k PageSize -> 128kB - 512MB */ +#define DRBD_MAX_BUFFERS_MIN 32 +#define DRBD_MAX_BUFFERS_MAX 131072 +#define DRBD_MAX_BUFFERS_DEF 2048 + + /* @4k PageSize -> 4kB - 512MB */ +#define DRBD_UNPLUG_WATERMARK_MIN 1 +#define DRBD_UNPLUG_WATERMARK_MAX 131072 +#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) + + /* 0 is disabled. + * 200 should be more than enough even for very short timeouts */ +#define DRBD_KO_COUNT_MIN 0 +#define DRBD_KO_COUNT_MAX 200 +#define DRBD_KO_COUNT_DEF 0 +/* } */ + +/* syncer { */ + /* FIXME allow rate to be zero? */ +#define DRBD_RATE_MIN 1 +#define DRBD_RATE_MAX 700000 +#define DRBD_RATE_DEF 250 // kb/second + + /* less than 7 would hit performance unnecessarily. + * 3833 is the largest prime that still fits + * into 64 sectors of activity log */ +#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MAX 3833 +#define DRBD_AL_EXTENTS_DEF 127 + +#define DRBD_AFTER_MIN -1 +#define DRBD_AFTER_MAX 255 +#define DRBD_AFTER_DEF -1 + +/* } */ + +/* drbdsetup XY resize -d Z + * you are free to reduce the device size to nothing, if you want to. + * but more than 3998G are currently not possible */ +/* DRBD_MAX_SECTORS */ +#define DRBD_DISK_SIZE_SECT_MIN 0 +#define DRBD_DISK_SIZE_SECT_MAX ((128LLU*1024*2 - 72)*512LLU*8*8) +#define DRBD_DISK_SIZE_SECT_DEF 0 // = disabled = no user size... + +#define DRBD_ON_IO_ERROR_DEF PassOn +#define DRBD_FENCING_DEF DontCare +#define DRBD_AFTER_SB_0P_DEF Disconnect +#define DRBD_AFTER_SB_1P_DEF Disconnect +#define DRBD_AFTER_SB_2P_DEF Disconnect +#define DRBD_RR_CONFLICT_DEF Disconnect + +#undef RANGE +#endif diff -uprN linux-2.6.18/include/linux/drbd_nl.h linux-2.6.18.ovz/include/linux/drbd_nl.h --- linux-2.6.18/include/linux/drbd_nl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/drbd_nl.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,98 @@ +/* + PACKET( name, + TYPE ( pn, pr, member ) + ... + ) + + You may never reissue one of the pn arguments +*/
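What follows is an X-macro list: drbd_nl.h is included several times with different definitions of PACKET/INTEGER/INT64/BIT/STRING, so the single packet list below expands into an enum, size-deducing structs and tag tables in drbd_tag_magic.h further down. A minimal self-contained illustration of the technique (the file and macro names here are invented for the example, not part of DRBD):

    /* colors.def -- the single source list, analogous to drbd_nl.h */
    COLOR(red,   1)
    COLOR(green, 2)
    COLOR(blue,  3)
    #undef COLOR

    /* consumer -- analogous to drbd_tag_magic.h */
    #define COLOR(name, id) C_ ## name = id,
    enum color {
    #include "colors.def"
    };

    #define COLOR(name, id) [id] = #name,
    static const char *color_names[] = {
    #include "colors.def"
    };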
+ +#if !defined(PACKET) || !defined(STRING) || !defined(INTEGER) || !defined(BIT) || !defined(INT64) +#error "The macros PACKET, STRING, INTEGER, INT64 and BIT need to be defined" +#endif + +PACKET(primary, 1, + BIT( 1, T_MAY_IGNORE, overwrite_peer) +) + +PACKET(secondary, 2, ) + +PACKET(disk_conf, 3, + INT64( 2, T_MAY_IGNORE, disk_size) + STRING( 3, T_MANDATORY, backing_dev, 32) + STRING( 4, T_MANDATORY, meta_dev, 32) + INTEGER( 5, T_MANDATORY, meta_dev_idx) + INTEGER( 6, T_MAY_IGNORE, on_io_error) + INTEGER( 7, T_MAY_IGNORE, fencing) + BIT( 37, T_MAY_IGNORE, use_bmbv) +) + +PACKET(detach, 4,) + +PACKET(net_conf, 5, + STRING( 8, T_MANDATORY, my_addr, 128) + STRING( 9, T_MANDATORY, peer_addr, 128) + STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) + STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) + INTEGER( 14, T_MAY_IGNORE, timeout) + INTEGER( 15, T_MANDATORY, wire_protocol) + INTEGER( 16, T_MAY_IGNORE, try_connect_int) + INTEGER( 17, T_MAY_IGNORE, ping_int) + INTEGER( 18, T_MAY_IGNORE, max_epoch_size) + INTEGER( 19, T_MAY_IGNORE, max_buffers) + INTEGER( 20, T_MAY_IGNORE, unplug_watermark) + INTEGER( 21, T_MAY_IGNORE, sndbuf_size) + INTEGER( 22, T_MAY_IGNORE, ko_count) + INTEGER( 24, T_MAY_IGNORE, after_sb_0p) + INTEGER( 25, T_MAY_IGNORE, after_sb_1p) + INTEGER( 26, T_MAY_IGNORE, after_sb_2p) + INTEGER( 39, T_MAY_IGNORE, rr_conflict) + INTEGER( 40, T_MAY_IGNORE, ping_timeo) + BIT( 27, T_MAY_IGNORE, want_lose) + BIT( 28, T_MAY_IGNORE, two_primaries) + BIT( 41, T_MAY_IGNORE, always_asbp) +) + +PACKET(disconnect, 6, ) + +PACKET(resize, 7, + INT64( 29, T_MAY_IGNORE, resize_size) +) + +PACKET(syncer_conf, 8, + INTEGER( 30, T_MAY_IGNORE, rate) + INTEGER( 31, T_MAY_IGNORE, after) + INTEGER( 32, T_MAY_IGNORE, al_extents) +) + +PACKET(invalidate, 9, ) +PACKET(invalidate_peer, 10, ) +PACKET(pause_sync, 11, ) +PACKET(resume_sync, 12, ) +PACKET(suspend_io, 13, ) +PACKET(resume_io, 14, ) +PACKET(outdate, 15, ) +PACKET(get_config, 16, ) +PACKET(get_state, 17, + INTEGER( 33, T_MAY_IGNORE, state_i) +) + +PACKET(get_uuids, 18, + STRING( 34, T_MAY_IGNORE, uuids, (UUID_SIZE*sizeof(__u64))) + INTEGER( 35, T_MAY_IGNORE, uuids_flags) +) + +PACKET(get_timeout_flag, 19, + BIT( 36, T_MAY_IGNORE, use_degraded) +) + +PACKET(call_helper, 20, + STRING( 38, T_MAY_IGNORE, helper, 32) +) + +#undef PACKET +#undef INTEGER +#undef INT64 +#undef BIT +#undef STRING + diff -uprN linux-2.6.18/include/linux/drbd_tag_magic.h linux-2.6.18.ovz/include/linux/drbd_tag_magic.h --- linux-2.6.18/include/linux/drbd_tag_magic.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/drbd_tag_magic.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,77 @@ +#ifndef DRBD_TAG_MAGIC_H +#define DRBD_TAG_MAGIC_H + +#define TT_END 0 +#define TT_REMOVED 0xE000 + +// declare packet_type enums +enum packet_types { +#define PACKET(name, number, fields) P_ ## name = number, +#define INTEGER(pn,pr,member) +#define INT64(pn,pr,member) +#define BIT(pn,pr,member) +#define STRING(pn,pr,member,len) +#include "drbd_nl.h" + P_nl_after_last_packet, +}; + +// These structs are used to deduce the size of the tag lists: +#define PACKET(name, number ,fields) struct name ## _tag_len_struct { fields }; +#define INTEGER(pn,pr,member) int member; int tag_and_len ## member; +#define INT64(pn,pr,member) __u64 member; int tag_and_len ## member; +#define BIT(pn,pr,member) unsigned char member : 1; int tag_and_len ## member; +#define STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; \ + int tag_and_len ## member; +#include "linux/drbd_nl.h"
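To make the preprocessor trick above concrete (an editorial illustration, not text from the patch): for the `primary` packet of drbd_nl.h, the struct-generating definitions expand to roughly the struct below; note how `tag_and_len ## member` pastes into a single identifier, and that the sizeof() of such a struct is what bounds the corresponding tag list.

    struct primary_tag_len_struct {
    	unsigned char overwrite_peer : 1;
    	int tag_and_lenoverwrite_peer;
    };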
+ +// declare tag-list-sizes +const int tag_list_sizes[] = { +#define PACKET(name,number,fields) 2 fields , +#define INTEGER(pn,pr,member) +4+4 +#define INT64(pn,pr,member) +4+8 +#define BIT(pn,pr,member) +4+1 +#define STRING(pn,pr,member,len) +4+len +#include "drbd_nl.h" +}; + +/* The two highest bits are used for the tag type */ +#define TT_MASK 0xC000 +#define TT_INTEGER 0x0000 +#define TT_INT64 0x4000 +#define TT_BIT 0x8000 +#define TT_STRING 0xC000 +/* The next bit indicates if processing of the tag is mandatory */ +#define T_MANDATORY 0x2000 +#define T_MAY_IGNORE 0x0000 +#define TN_MASK 0x1fff +/* The remaining 13 bits are used to enumerate the tags */ + +#define tag_type(T) ((T) & TT_MASK) +#define tag_number(T) ((T) & TN_MASK) + +// declare tag enums +#define PACKET(name, number, fields) fields +enum drbd_tags { +#define INTEGER(pn,pr,member) T_ ## member = pn | TT_INTEGER | pr , +#define INT64(pn,pr,member) T_ ## member = pn | TT_INT64 | pr , +#define BIT(pn,pr,member) T_ ## member = pn | TT_BIT | pr , +#define STRING(pn,pr,member,len) T_ ## member = pn | TT_STRING | pr , +#include "drbd_nl.h" +}; + +struct tag { + const char* name; + int type_n_flags; +}; + +// declare tag names +#define PACKET(name, number, fields) fields +const struct tag tag_descriptions[] = { +#define INTEGER(pn,pr,member) [ pn ] = { #member, TT_INTEGER | pr }, +#define INT64(pn,pr,member) [ pn ] = { #member, TT_INT64 | pr }, +#define BIT(pn,pr,member) [ pn ] = { #member, TT_BIT | pr }, +#define STRING(pn,pr,member,len) [ pn ] = { #member, TT_STRING | pr }, +#include "drbd_nl.h" +}; + +#endif diff -uprN linux-2.6.18/include/linux/dvb/Kbuild linux-2.6.18.ovz/include/linux/dvb/Kbuild --- linux-2.6.18/include/linux/dvb/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/dvb/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,9 @@ -header-y += ca.h frontend.h net.h osd.h version.h -unifdef-y := audio.h dmx.h video.h +header-y += ca.h +header-y += frontend.h +header-y += net.h +header-y += osd.h +header-y += version.h + +unifdef-y += audio.h +unifdef-y += dmx.h +unifdef-y += video.h diff -uprN linux-2.6.18/include/linux/elevator.h linux-2.6.18.ovz/include/linux/elevator.h --- linux-2.6.18/include/linux/elevator.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/elevator.h 2007-06-13 06:55:07.000000000 -0400 @@ -8,6 +8,8 @@ typedef void (elevator_merge_req_fn) (re typedef void (elevator_merged_fn) (request_queue_t *, struct request *); +typedef int (elevator_allow_merge_fn) (request_queue_t *, struct request *, struct bio *); + typedef int (elevator_dispatch_fn) (request_queue_t *, int); typedef void (elevator_add_req_fn) (request_queue_t *, struct request *); @@ -29,6 +31,7 @@ struct elevator_ops elevator_merge_fn *elevator_merge_fn; elevator_merged_fn *elevator_merged_fn; elevator_merge_req_fn *elevator_merge_req_fn; + elevator_allow_merge_fn *elevator_allow_merge_fn; elevator_dispatch_fn *elevator_dispatch_fn; elevator_add_req_fn *elevator_add_req_fn; @@ -49,6 +52,11 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; void (*trim)(struct io_context *); + /* In the original cfq design the task holds a cfqq refcount and puts it + * on exit via the io context. Now async cfqqs are held by UB, + * so we need some way to put these queues. Use this function.
+ */ + void (*put_queue)(struct cfq_queue *); }; #define ELV_NAME_MAX (16) diff -uprN linux-2.6.18/include/linux/elfcore.h linux-2.6.18.ovz/include/linux/elfcore.h --- linux-2.6.18/include/linux/elfcore.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/elfcore.h 2007-06-13 06:55:07.000000000 -0400 @@ -7,6 +7,8 @@ #include #include +extern int sysctl_at_vsyscall; + struct elf_siginfo { int si_signo; /* signal number */ diff -uprN linux-2.6.18/include/linux/eventpoll.h linux-2.6.18.ovz/include/linux/eventpoll.h --- linux-2.6.18/include/linux/eventpoll.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/eventpoll.h 2007-06-13 06:55:07.000000000 -0400 @@ -58,6 +58,91 @@ static inline void eventpoll_init_file(s spin_lock_init(&file->f_ep_lock); } +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protects access to this structure */ + rwlock_t lock; + + /* + * This semaphore is used to ensure that files are not removed + * while epoll is using them. This is read-held during the event + * collection loop and it is write-held during the file cleanup + * path, the epoll file exit code and the ctl operations. + */ + struct rw_semaphore sem; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB-Tree root used to store monitored fd structs */ + struct rb_root rbr; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the hash. + */ +struct epitem { + /* RB-Tree node used to link this structure to the eventpoll rb-tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queues attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* The structure that describes the interested events and the source fd */ + struct epoll_event event; + + /* + * Used to keep track of the usage count of the structure. This prevents + * the structure from disappearing from underneath our processing. + */ + atomic_t usecnt; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* List header used to link the item to the transfer list */ + struct list_head txlink; + + /* + * This is used during the collection/transfer of events to userspace + * to pin an item's returned event set.
+ */ + unsigned int revents; +}; + +extern struct semaphore epsem; +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); +void ep_release_epitem(struct epitem *epi); /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -90,6 +175,8 @@ static inline void eventpoll_release(str eventpoll_release_file(file); } +extern struct mutex epmutex; + #else static inline void eventpoll_init_file(struct file *file) {} diff -uprN linux-2.6.18/include/linux/fairsched.h linux-2.6.18.ovz/include/linux/fairsched.h --- linux-2.6.18/include/linux/fairsched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/fairsched.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,139 @@ +#ifndef __LINUX_FAIRSCHED_H__ +#define __LINUX_FAIRSCHED_H__ + +/* + * Fair Scheduler + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#define FAIRSCHED_SET_RATE 0 +#define FAIRSCHED_DROP_RATE 1 +#define FAIRSCHED_GET_RATE 2 + +#ifdef __KERNEL__ +#include +#include + +#define FAIRSCHED_HAS_CPU_BINDING 0 + +typedef struct { cycles_t t; } fschtag_t; +typedef struct { unsigned long d; } fschdur_t; +typedef struct { cycles_t v; } fschvalue_t; + +struct vcpu_scheduler; + +struct fairsched_node { + struct list_head runlist; + + /* + * Fair Scheduler fields + * + * nr_running >= nr_ready (!= if delayed) + */ + fschtag_t start_tag; + int nr_ready; + int nr_runnable; + int nr_pcpu; + int vcpus; + + /* + * Rate limiter fields + */ + cycles_t last_updated_at; + fschvalue_t value; /* leaky function value */ + cycles_t delay; /* removed from schedule till */ + unsigned char delayed; + + /* + * Configuration + * + * Read-only most of the time. + */ + unsigned weight ____cacheline_aligned_in_smp; + /* fairness weight */ + unsigned char rate_limited; + unsigned rate; /* max CPU share */ + fschtag_t max_latency; + unsigned min_weight; + + struct list_head nodelist; + int id; +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif + struct vcpu_scheduler *vsched; +}; + +#define for_each_fairsched_node(n) \ + list_for_each_entry((n), &fairsched_node_head, nodelist) + +#ifdef CONFIG_FAIRSCHED + +#define FSCHWEIGHT_MAX ((1 << 16) - 1) +#define FSCHRATE_SHIFT 10 +/* + * The fairsched timeslice value (in msecs) specifies the maximum possible + * time a node can run continuously without rescheduling; in other words, + * the main Linux scheduler must call fairsched_schedule() within + * FSCH_TIMESLICE msecs or the fairscheduler logic will be broken. + * + * NOTE: must correspond to the VCPU_TIMESLICE_MAX value + */ +#define FSCH_TIMESLICE 8 + +/* + * Fairsched nodes used in boot process. + */ +extern struct fairsched_node fairsched_init_node; +extern struct fairsched_node fairsched_idle_node; + +/* + * For proc output. + */ +extern unsigned fairsched_nr_cpus; +extern void fairsched_cpu_online_map(int id, cpumask_t *mask); + +/* I hope vsched_id is always equal to fairsched node id --SAW */ +#define task_fairsched_node_id(p) task_vsched_id(p) + +/* + * Core functions. + */
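Before the declarations, a hedged user-space sketch of the management interface declared further below (sys_fairsched_mknod/sys_fairsched_mvpr). The __NR_* numbers, the parent id and the weight are placeholders chosen for illustration; real syscall numbers are assigned by the specific OpenVZ kernel and architecture, so treat this as a shape, not a recipe:

    #include <sys/syscall.h>
    #include <unistd.h>

    /* Placeholders: real values are kernel- and arch-specific. */
    #define __NR_fairsched_mknod	500
    #define __NR_fairsched_mvpr	501

    int main(void)
    {
    	unsigned int id = 1000;	/* new fairsched node id (illustrative) */

    	/* create a node (parent id and weight chosen arbitrarily here) */
    	if (syscall(__NR_fairsched_mknod, 0, 500, id) < 0)
    		return 1;
    	/* move the calling process into the new node */
    	if (syscall(__NR_fairsched_mvpr, getpid(), id) < 0)
    		return 1;
    	return 0;
    }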
+ +/* + * Core functions. + */ +extern void fairsched_incrun(struct fairsched_node *node); +extern void fairsched_decrun(struct fairsched_node *node); +extern void fairsched_inccpu(struct fairsched_node *node); +extern void fairsched_deccpu(struct fairsched_node *node); +extern struct fairsched_node *fairsched_schedule( + struct fairsched_node *prev_node, + struct fairsched_node *cur_node, + int cur_node_active, + cycles_t time); + +/* + * Management functions. + */ +void fairsched_init_early(void); +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid); +asmlinkage int sys_fairsched_rmnod(unsigned int id); +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus); + +#else /* CONFIG_FAIRSCHED */ + +#define task_fairsched_node_id(p) 0 +#define fairsched_incrun(p) do { } while (0) +#define fairsched_decrun(p) do { } while (0) +#define fairsched_inccpu(p) do { } while (0) +#define fairsched_deccpu(p) do { } while (0) +#define fairsched_cpu_online_map(id, mask) do { *(mask) = cpu_online_map; } while (0) + +#endif /* CONFIG_FAIRSCHED */ +#endif /* __KERNEL__ */ + +#endif /* __LINUX_FAIRSCHED_H__ */ diff -uprN linux-2.6.18/include/linux/faudit.h linux-2.6.18.ovz/include/linux/faudit.h --- linux-2.6.18/include/linux/faudit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/faudit.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,46 @@ +/* + * include/linux/faudit.h + * + * Copyright (C) 2005 SWSoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __FAUDIT_H_ +#define __FAUDIT_H_ + +#include +#include + +struct vfsmount; +struct dentry; +struct super_block; +struct kstatfs; +struct kstat; +struct pt_regs; + +struct faudit_regs_arg { + int err; + struct pt_regs *regs; +}; + +struct faudit_stat_arg { + int err; + struct vfsmount *mnt; + struct dentry *dentry; + struct kstat *stat; +}; + +struct faudit_statfs_arg { + int err; + struct super_block *sb; + struct kstatfs *stat; +}; + +#define VIRTINFO_FAUDIT (0) +#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) +#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) + +#endif diff -uprN linux-2.6.18/include/linux/file.h linux-2.6.18.ovz/include/linux/file.h --- linux-2.6.18/include/linux/file.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/file.h 2007-06-13 06:55:07.000000000 -0400 @@ -112,5 +112,6 @@ struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void FASTCALL(put_files_struct(struct files_struct *fs)); +void reset_files_struct(struct task_struct *, struct files_struct *); #endif /* __LINUX_FILE_H */
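The sys_fairsched_* entry points above are plain syscalls, so a container manager needs no special library to build a CPU hierarchy. A hedged sketch of the intended call sequence; the __NR_fairsched_* numbers below are placeholders, the real ones are arch-specific and assigned elsewhere in this patch:

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#ifndef __NR_fairsched_mknod
#define __NR_fairsched_mknod	500	/* placeholder, not the real number */
#define __NR_fairsched_mvpr	502	/* placeholder, not the real number */
#endif

int main(void)
{
	unsigned int parent = 0, weight = 500, nodeid = 101;

	/* create a scheduling node under the root node with the given
	 * fairness weight (bounded by FSCHWEIGHT_MAX) */
	if (syscall(__NR_fairsched_mknod, parent, weight, nodeid) < 0)
		perror("fairsched_mknod");

	/* charge this process (and its future children) to the node */
	if (syscall(__NR_fairsched_mvpr, getpid(), nodeid) < 0)
		perror("fairsched_mvpr");
	return 0;
}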
diff -uprN linux-2.6.18/include/linux/fs.h linux-2.6.18.ovz/include/linux/fs.h --- linux-2.6.18/include/linux/fs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/fs.h 2007-06-13 06:55:07.000000000 -0400 @@ -67,6 +67,7 @@ extern int dir_notify_enable; #define FMODE_LSEEK 4 #define FMODE_PREAD 8 #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +#define FMODE_QUOTACTL 4 /* File is being opened for execution. Primary users of this flag are distributed filesystems that can use it to achieve correct ETXTBUSY @@ -91,6 +92,7 @@ extern int dir_notify_enable; /* public flags for file_system_type */ #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 +#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon * as nfs_rename() will be cleaned up */ @@ -312,6 +314,9 @@ struct iattr { * Includes for diskquotas. */ #include +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) +#include +#endif /** * enum positive_aop_returns - aop return codes with specific semantics @@ -382,6 +387,7 @@ struct address_space_operations { /* migrate the contents of a page to the specified target */ int (*migratepage) (struct address_space *, struct page *, struct page *); + int (*launder_page) (struct page *); }; struct backing_dev_info; @@ -527,6 +533,9 @@ struct inode { #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_ilink i_qlnk; +#endif /* These three should probably be a union */ struct list_head i_devices; struct pipe_inode_info *i_pipe; @@ -581,6 +590,8 @@ enum inode_i_mutex_lock_class I_MUTEX_QUOTA }; +extern kmem_cache_t *inode_cachep; + /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic @@ -687,6 +698,7 @@ struct file { struct fown_struct f_owner; unsigned int f_uid, f_gid; struct file_ra_state f_ra; + struct user_beancounter *f_ub; unsigned long f_version; void *f_security; @@ -700,7 +712,9 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; + struct ve_struct *owner_env; }; + extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); @@ -765,6 +779,9 @@ struct file_lock { struct file *fl_file; unsigned char fl_flags; unsigned char fl_type; +#ifdef CONFIG_USER_RESOURCE + unsigned char fl_charged; +#endif loff_t fl_start; loff_t fl_end; @@ -1147,6 +1164,8 @@ struct super_operations { ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + + struct inode *(*get_quota_root)(struct super_block *); }; /* Inode state bits. Protected by inode_lock. */
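The new get_quota_root operation gives the VZ quota code a filesystem-neutral way to find the inode that anchors per-mount quota state (the vz_quota_ilink i_qlnk links above hang off inodes reached from it). A sketch of what an implementation might look like; everything except the hook name is hypothetical:

static struct inode *exfs_get_quota_root(struct super_block *sb)
{
	/* the root inode is the natural anchor for quota traversal */
	return sb->s_root->d_inode;
}

static struct super_operations exfs_super_ops = {
	/* ... the usual operations ... */
	.get_quota_root	= exfs_get_quota_root,
};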
@@ -1318,8 +1337,13 @@ struct file_system_type { struct list_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; + struct file_system_type *proto; + struct ve_struct *owner_env; }; +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); + extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), @@ -1358,10 +1382,15 @@ void unnamed_dev_init(void); extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); +extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *, + struct file_system_type **, struct vfsmount **); +extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *); +extern void umount_ve_fs_type(struct file_system_type *local_fs_type); extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern void umount_tree(struct vfsmount *, int, struct list_head *); +#define kern_umount mntput extern void release_mounts(struct list_head *); extern long do_mount(char *, char *, char *, unsigned long, void *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); @@ -1369,6 +1398,7 @@ extern void mnt_set_mountpoint(struct vf struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int faudit_statfs(struct super_block *, struct kstatfs *); /* /sys/fs */ extern struct subsystem fs_subsys; @@ -1484,7 +1514,7 @@ extern void chrdev_show(struct seq_file #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); -extern struct block_device *lookup_bdev(const char *); +extern struct block_device *lookup_bdev(const char *, int mode); extern struct block_device *open_bdev_excl(const char *, int, void *); extern void close_bdev_excl(struct block_device *); extern void blkdev_show(struct seq_file *,off_t); @@ -1512,7 +1542,7 @@ extern int fs_may_remount_ro(struct supe #define bio_data_dir(bio) ((bio)->bi_rw & 1) extern int check_disk_change(struct block_device *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, int); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); unsigned long invalidate_mapping_pages(struct address_space *mapping, diff -uprN linux-2.6.18/include/linux/genhd.h linux-2.6.18.ovz/include/linux/genhd.h --- linux-2.6.18/include/linux/genhd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/genhd.h 2007-06-13 06:55:07.000000000 -0400 @@ -417,6 +417,7 @@ static inline struct block_device *bdget return bdget(MKDEV(disk->major, disk->first_minor) + index); } +extern struct subsystem block_subsys; #endif #endif
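register_ve_fs_type() is the per-VE counterpart of register_filesystem(): it registers a private copy of a template file_system_type (the copy points back through ->proto and records its VE in ->owner_env), so mounts done inside the VE stay isolated. A hedged sketch of the intended call pattern at VE start; the wrapper and the way the template is obtained are assumptions:

static int ve_register_fs(struct ve_struct *ve,
			  struct file_system_type *templ)
{
	struct file_system_type *local;
	struct vfsmount *mnt;
	int err;

	err = register_ve_fs_type(ve, templ, &local, &mnt);
	if (err)
		return err;

	/* store local and mnt in the VE; VE stop undoes this with
	 * unregister_ve_fs_type(local, mnt) */
	return 0;
}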
diff -uprN linux-2.6.18/include/linux/gfp.h linux-2.6.18.ovz/include/linux/gfp.h --- linux-2.6.18/include/linux/gfp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/gfp.h 2007-06-13 06:55:07.000000000 -0400 @@ -46,6 +46,8 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ +#define __GFP_UBC ((__force gfp_t)0x40000u)/* charge kmem in buddy and slab */ +#define __GFP_SOFT_UBC ((__force gfp_t)0x80000u)/* use soft charging */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -54,7 +56,8 @@ struct vm_area_struct; #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ - __GFP_NOMEMALLOC|__GFP_HARDWALL) + __GFP_NOMEMALLOC|__GFP_HARDWALL| \ + __GFP_UBC|__GFP_SOFT_UBC) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) @@ -63,7 +66,9 @@ struct vm_area_struct; #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ __GFP_HIGHMEM) diff -uprN linux-2.6.18/include/linux/hardirq.h linux-2.6.18.ovz/include/linux/hardirq.h --- linux-2.6.18/include/linux/hardirq.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/hardirq.h 2007-06-13 06:55:07.000000000 -0400 @@ -7,6 +7,9 @@ #include #include +#include +#include + /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: @@ -95,6 +98,24 @@ static inline void account_system_vtime( } #endif +#define save_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_save_context(tsk); \ + ub_save_context(tsk); \ + } \ + } while (0) + +#define restore_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_restore_context(tsk); \ + ub_restore_context(tsk); \ + } \ + } while (0) + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -105,6 +126,7 @@ static inline void account_system_vtime( do { \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ + save_context(); \ trace_hardirq_enter(); \ } while (0) @@ -115,6 +137,7 @@ static inline void account_system_vtime( do { \ trace_hardirq_exit(); \ account_system_vtime(current); \ + restore_context(); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff -uprN linux-2.6.18/include/linux/hrtimer.h linux-2.6.18.ovz/include/linux/hrtimer.h --- linux-2.6.18/include/linux/hrtimer.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/hrtimer.h 2007-06-13 06:55:07.000000000 -0400 @@ -148,4 +148,9 @@ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); +extern long nanosleep_restart(struct restart_block *restart); + +extern ktime_t schedule_hrtimer(struct hrtimer *timer, + const enum hrtimer_mode mode); + #endif
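The pair of flags just added to gfp.h is the whole interface to kernel-memory charging: an allocation whose gfp mask carries __GFP_UBC is charged to the current beancounter's kmemsize, and __GFP_SOFT_UBC turns the limit check into a soft one. A sketch of the difference at a call site (the charging itself lives in the buddy and slab changes elsewhere in this patch):

#include <linux/slab.h>

static void *alloc_charged(size_t size)
{
	/* charged to the caller's beancounter; can fail once the
	 * container is over its kmemsize limit */
	return kmalloc(size, GFP_KERNEL_UBC);
}

static void *alloc_host(size_t size)
{
	/* plain GFP_KERNEL: accounted to the host, never to a VE */
	return kmalloc(size, GFP_KERNEL);
}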
diff -uprN linux-2.6.18/include/linux/if_vlan.h linux-2.6.18.ovz/include/linux/if_vlan.h --- linux-2.6.18/include/linux/if_vlan.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/if_vlan.h 2007-06-13 06:55:07.000000000 -0400 @@ -77,6 +77,9 @@ struct vlan_group { struct hlist_node hlist; /* linked list */ struct net_device *vlan_devices[VLAN_GROUP_ARRAY_LEN]; struct rcu_head rcu; +#ifdef CONFIG_VE + struct ve_struct *owner; +#endif }; struct vlan_priority_tci_mapping { diff -uprN linux-2.6.18/include/linux/inetdevice.h linux-2.6.18.ovz/include/linux/inetdevice.h --- linux-2.6.18/include/linux/inetdevice.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/inetdevice.h 2007-06-13 06:55:07.000000000 -0400 @@ -35,6 +35,12 @@ struct ipv4_devconf }; extern struct ipv4_devconf ipv4_devconf; +extern struct ipv4_devconf ipv4_devconf_dflt; +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf)) +#else +#define ve_ipv4_devconf ipv4_devconf +#endif struct in_device { @@ -61,29 +67,29 @@ struct in_device }; #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding) -#define IN_DEV_MFORWARD(in_dev) (ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) -#define IN_DEV_RPFILTER(in_dev) (ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) -#define IN_DEV_SOURCE_ROUTE(in_dev) (ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) -#define IN_DEV_BOOTP_RELAY(in_dev) (ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) - -#define IN_DEV_LOG_MARTIANS(in_dev) (ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) -#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) -#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) -#define IN_DEV_TX_REDIRECTS(in_dev) (ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) -#define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) +#define IN_DEV_MFORWARD(in_dev) (ve_ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) +#define IN_DEV_RPFILTER(in_dev) (ve_ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) +#define IN_DEV_SOURCE_ROUTE(in_dev) (ve_ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) +#define IN_DEV_BOOTP_RELAY(in_dev) (ve_ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) + +#define IN_DEV_LOG_MARTIANS(in_dev) (ve_ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) +#define IN_DEV_PROXY_ARP(in_dev) (ve_ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) +#define IN_DEV_SHARED_MEDIA(in_dev) (ve_ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) +#define IN_DEV_TX_REDIRECTS(in_dev) (ve_ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) +#define IN_DEV_SEC_REDIRECTS(in_dev) (ve_ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag) #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ - (ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ + (ve_ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ || (!IN_DEV_FORWARD(in_dev) && \ - (ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) + (ve_ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) -#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) -#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) -#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) +#define IN_DEV_ARPFILTER(in_dev) (ve_ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) +#define IN_DEV_ARP_ANNOUNCE(in_dev)
(max(ve_ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) +#define IN_DEV_ARP_IGNORE(in_dev) (max(ve_ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) struct in_ifaddr { @@ -114,6 +120,7 @@ extern u32 inet_select_addr(const struc extern u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope); extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask); extern void inet_forward_change(void); +extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); static __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa) { @@ -181,6 +188,16 @@ static inline void in_dev_put(struct in_ #define __in_dev_put(idev) atomic_dec(&(idev)->refcnt) #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt) +struct ve_struct; +#ifdef CONFIG_INET +extern int devinet_sysctl_init(struct ve_struct *); +extern void devinet_sysctl_fini(struct ve_struct *); +extern void devinet_sysctl_free(struct ve_struct *); +#else +static inline int devinet_sysctl_init(struct ve_struct *ve) { return 0; } +static inline void devinet_sysctl_fini(struct ve_struct *ve) { ; } +static inline void devinet_sysctl_free(struct ve_struct *ve) { ; } +#endif #endif /* __KERNEL__ */ static __inline__ __u32 inet_make_mask(int logmask) diff -uprN linux-2.6.18/include/linux/init_task.h linux-2.6.18.ovz/include/linux/init_task.h --- linux-2.6.18/include/linux/init_task.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/init_task.h 2007-06-13 06:55:07.000000000 -0400 @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #define INIT_FDTABLE \ { \ @@ -68,6 +70,22 @@ .session = 1, \ } +#ifdef CONFIG_VE +/* one for ve0, one for init_task */ +#define INIT_NSPROXY_COUNT ATOMIC_INIT(2) +#else +#define INIT_NSPROXY_COUNT ATOMIC_INIT(1) +#endif + +extern struct nsproxy init_nsproxy; +#define INIT_NSPROXY(nsproxy) { \ + .count = INIT_NSPROXY_COUNT, \ + .nslock = SPIN_LOCK_UNLOCKED, \ + .uts_ns = &init_uts_ns, \ + .namespace = NULL, \ + INIT_IPC_NS(ipc_ns) \ +} + #define INIT_SIGHAND(sighand) { \ .count = ATOMIC_INIT(1), \ .action = { { { .sa_handler = NULL, } }, }, \ @@ -117,6 +135,7 @@ extern struct group_info init_groups; .files = &init_files, \ .signal = &init_signals, \ .sighand = &init_sighand, \ + .nsproxy = &init_nsproxy, \ .pending = { \ .list = LIST_HEAD_INIT(tsk.pending.list), \ .signal = {{0}}}, \ diff -uprN linux-2.6.18/include/linux/ioprio.h linux-2.6.18.ovz/include/linux/ioprio.h --- linux-2.6.18/include/linux/ioprio.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/ioprio.h 2007-06-13 06:55:07.000000000 -0400 @@ -38,6 +38,7 @@ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, + IOPRIO_WHO_UBC = 1000, }; /* diff -uprN linux-2.6.18/include/linux/ipc.h linux-2.6.18.ovz/include/linux/ipc.h --- linux-2.6.18/include/linux/ipc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/ipc.h 2007-06-13 06:55:07.000000000 -0400 @@ -2,6 +2,7 @@ #define _LINUX_IPC_H #include +#include #define IPC_PRIVATE ((__kernel_key_t) 0) @@ -68,6 +69,59 @@ struct kern_ipc_perm void *security; }; +struct ipc_ids; +struct ipc_namespace { + struct kref kref; + struct ipc_ids *ids[3]; + + int sem_ctls[4]; + int used_sems; + + int msg_ctlmax; + int msg_ctlmnb; + int msg_ctlmni; + + size_t shm_ctlmax; + size_t shm_ctlall; + int shm_ctlmni; + int shm_tot; +}; + +extern struct ipc_namespace init_ipc_ns; + +#ifdef CONFIG_SYSVIPC +#define INIT_IPC_NS(ns) .ns = &init_ipc_ns, +#else +#define 
INIT_IPC_NS(ns) +#endif + +#ifdef CONFIG_IPC_NS +extern void free_ipc_ns(struct kref *kref); +extern int copy_ipcs(unsigned long flags, struct task_struct *tsk); +extern int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns); +#else +static inline int copy_ipcs(unsigned long flags, struct task_struct *tsk) +{ + return 0; +} +#endif + +static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) +{ +#ifdef CONFIG_IPC_NS + if (ns) + kref_get(&ns->kref); +#endif + return ns; +} + +static inline void put_ipc_ns(struct ipc_namespace *ns) +{ +#ifdef CONFIG_IPC_NS + kref_put(&ns->kref, free_ipc_ns); +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_IPC_H */ diff -uprN linux-2.6.18/include/linux/ipv6.h linux-2.6.18.ovz/include/linux/ipv6.h --- linux-2.6.18/include/linux/ipv6.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/ipv6.h 2007-06-13 06:55:07.000000000 -0400 @@ -428,12 +428,13 @@ static inline struct raw6_sock *raw6_sk( #define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\ +#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif,__ve)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ ((__sk)->sk_family == AF_INET6) && \ ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ + ve_accessible_strict((__sk)->owner_env, (__ve)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #endif /* __KERNEL__ */ diff -uprN linux-2.6.18/include/linux/jbd.h linux-2.6.18.ovz/include/linux/jbd.h --- linux-2.6.18/include/linux/jbd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/jbd.h 2007-06-13 06:55:07.000000000 -0400 @@ -251,10 +251,15 @@ typedef struct journal_superblock_s #define J_ASSERT(assert) \ do { \ if (!(assert)) { \ + unsigned long stack; \ printk (KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ __FUNCTION__, __FILE__, __LINE__, # assert); \ - BUG(); \ + printk("Stack=%p current=%p pid=%d ve=%d comm='%s'\n", \ + &stack, current, current->pid, \ + get_exec_env()->veid, \ + current->comm); \ + dump_stack(); \ } \ } while (0) diff -uprN linux-2.6.18/include/linux/jiffies.h linux-2.6.18.ovz/include/linux/jiffies.h --- linux-2.6.18/include/linux/jiffies.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/jiffies.h 2007-06-13 06:55:07.000000000 -0400 @@ -80,6 +80,7 @@ */ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; +extern unsigned long cycles_per_jiffy, cycles_per_clock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); @@ -394,12 +395,14 @@ static inline clock_t jiffies_to_clock_t static inline unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 + WARN_ON((long)x < 0); if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else u64 jif; + WARN_ON((long)x < 0); /* Don't worry about loss of precision here .. 
 */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; @@ -413,6 +416,7 @@ static inline unsigned long clock_t_to_j static inline u64 jiffies_64_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 do_div(x, HZ / USER_HZ); #else @@ -429,6 +433,7 @@ static inline u64 jiffies_64_to_clock_t( static inline u64 nsec_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (NSEC_PER_SEC % USER_HZ) == 0 do_div(x, (NSEC_PER_SEC / USER_HZ)); #elif (USER_HZ % 512) == 0 diff -uprN linux-2.6.18/include/linux/kdev_t.h linux-2.6.18.ovz/include/linux/kdev_t.h --- linux-2.6.18/include/linux/kdev_t.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/kdev_t.h 2007-06-13 06:55:07.000000000 -0400 @@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de return dev & 0x3ffff; } +#define UNNAMED_MAJOR_COUNT 16 + +#if UNNAMED_MAJOR_COUNT > 1 + +extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; + +static inline dev_t make_unnamed_dev(int idx) +{ + /* + * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the + * unnamed device index into the major number. + */ + return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], + idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return MINOR(dev) | (i << 8); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return i < UNNAMED_MAJOR_COUNT; +} + +#else /* UNNAMED_MAJOR_COUNT */ + +static inline dev_t make_unnamed_dev(int idx) +{ + return MKDEV(0, idx); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + return MINOR(dev); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + return MAJOR(dev) == 0; +} + +#endif /* UNNAMED_MAJOR_COUNT */ + #else /* __KERNEL__ */ diff -uprN linux-2.6.18/include/linux/kernel.h linux-2.6.18.ovz/include/linux/kernel.h --- linux-2.6.18/include/linux/kernel.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/kernel.h 2007-06-13 06:55:07.000000000 -0400 @@ -143,6 +143,11 @@ asmlinkage int vprintk(const char *fmt, __attribute__ ((format (printf, 1, 0))); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); +asmlinkage int ve_printk(int, const char * fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void prepare_printk(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -150,8 +155,16 @@ static inline int vprintk(const char *s, static inline int printk(const char *s, ...) __attribute__ ((format (printf, 1, 2))); static inline int printk(const char *s, ...) { return 0; } +static inline int ve_printk(int d, const char *s, ...) + __attribute__ ((format (printf, 2, 3))); +static inline int ve_printk(int d, const char *s, ...) { return 0; } +#define prepare_printk() do { } while (0) #endif +#define VE0_LOG 1 +#define VE_LOG 2 +#define VE_LOG_BOTH (VE0_LOG | VE_LOG) +
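ve_printk() mirrors printk() but takes an explicit destination, so a message can go to the calling VE's private log buffer, to the host's, or to both. A short sketch of the intended use (the routing itself is implemented in the kernel/printk.c part of this patch):

static void example_report(int veid)
{
	ve_printk(VE_LOG, "visible only in the current VE's log\n");
	ve_printk(VE0_LOG, "visible only in the host (VE0) log\n");
	ve_printk(VE_LOG_BOTH, "VE %d event, logged on both sides\n", veid);
}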
unsigned long int_sqrt(unsigned long); static inline int __attribute_pure__ long_log2(unsigned long x) @@ -171,9 +184,14 @@ __attribute_const__ roundup_pow_of_two(u extern int printk_ratelimit(void); extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst); +extern int console_silence_loglevel; + static inline void console_silent(void) { - console_loglevel = 0; + if (console_loglevel > console_silence_loglevel) { + printk(KERN_EMERG "console shuts up ...\n"); + console_loglevel = 0; + } } static inline void console_verbose(void) @@ -183,10 +201,13 @@ static inline void console_verbose(void) } extern void bust_spinlocks(int yes); +extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int decode_call_traces; extern int tainted; +extern int kernel_text_csum_broken; extern const char *print_tainted(void); extern void add_taint(unsigned); diff -uprN linux-2.6.18/include/linux/kmem_cache.h linux-2.6.18.ovz/include/linux/kmem_cache.h --- linux-2.6.18/include/linux/kmem_cache.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/kmem_cache.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,224 @@ +#ifndef __KMEM_CACHE_H__ +#define __KMEM_CACHE_H__ +#include +#include +#include +#include +#include +#include + +/* + * DEBUG - 1 for kmem_cache_create() to honour: SLAB_DEBUG_INITIAL, + * SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define SLAB_DEBUG 1 +#define SLAB_STATS 1 +#define SLAB_FORCED_DEBUG 1 +#else +#define SLAB_DEBUG 0 +#define SLAB_STATS 0 +#define SLAB_FORCED_DEBUG 0 +#endif + +/* + * struct array_cache + * + * Purpose: + * - LIFO ordering, to hand out cache-warm objects from _alloc + * - reduce the number of linked list operations + * - reduce spinlock operations + * + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + * + */ +struct array_cache { + unsigned int avail; + unsigned int limit; + unsigned int batchcount; + unsigned int touched; + spinlock_t lock; + void *entry[0]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + * [0] is for gcc 2.95. It should really be []. + */ +}; + +/* bootstrap: The caches do not work without cpuarrays anymore, + * but the cpuarrays are allocated from the generic caches... + */ +#define BOOT_CPUCACHE_ENTRIES 1 +struct arraycache_init { + struct array_cache cache; + void *entries[BOOT_CPUCACHE_ENTRIES]; +}; + +/* + * The slab lists for all objects.
+ */ +struct kmem_list3 { + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + spinlock_t list_lock; + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +}; + +/* + * struct kmem_cache + * + * manages a cache. + */ + +struct kmem_cache { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + + unsigned int buffer_size; +/* 3) touched by every alloc & free from the backend */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + gfp_t gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + struct kmem_cache *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor) (void *, struct kmem_cache *, unsigned long); + + /* de-constructor func */ + void (*dtor) (void *, struct kmem_cache *, unsigned long); + +/* 5) cache creation/removal */ + const char *name; + struct list_head next; + +/* 6) statistics */ + unsigned long grown; + unsigned long reaped; + unsigned long shrunk; +#if SLAB_STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + unsigned long node_overflow; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#if SLAB_DEBUG + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; +#endif +#ifdef CONFIG_USER_RESOURCE + unsigned int objuse; +#endif +}; + +#define CFLGS_OFF_SLAB (0x80000000UL) +#define CFLGS_ENVIDS (0x04000000UL) +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define ENVIDS(x) ((x)->flags & CFLGS_ENVIDS) +#define kmem_mark_nocharge(c) do { (c)->flags |= SLAB_NO_CHARGE; } while (0) + +struct slab; +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. 
+ */ +static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +{ + page->lru.next = (struct list_head *)cache; +} + +static inline struct kmem_cache *page_get_cache(struct page *page) +{ + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); + BUG_ON(!PageSlab(page)); + return (struct kmem_cache *)page->lru.next; +} + +static inline void page_set_slab(struct page *page, struct slab *slab) +{ + page->lru.prev = (struct list_head *)slab; +} + +static inline struct slab *page_get_slab(struct page *page) +{ + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); + BUG_ON(!PageSlab(page)); + return (struct slab *)page->lru.prev; +} + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_cache(page); +} + +static inline struct slab *virt_to_slab(const void *obj) +{ + struct page *page = virt_to_page(obj); + return page_get_slab(page); +} + +#include + +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +static inline unsigned int obj_to_index(struct kmem_cache *cache, + struct slab *slab, void *obj) +{ + return (unsigned)(obj - slab->s_mem) / cache->buffer_size; +} + +#endif diff -uprN linux-2.6.18/include/linux/kmem_slab.h linux-2.6.18.ovz/include/linux/kmem_slab.h --- linux-2.6.18/include/linux/kmem_slab.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/kmem_slab.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,72 @@ +#ifndef __KMEM_SLAB_H__ +#define __KMEM_SLAB_H__ + +/* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32-bit archs with 4 kB pages, this is 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +typedef unsigned int kmem_bufctl_t; +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) + +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from a general cache. + * Slabs are chained into three lists: fully used, partial, fully free slabs. + */ +struct slab { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; +};
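index_to_obj() and obj_to_index() in kmem_cache.h above are exact inverses: objects sit at s_mem + buffer_size * idx, so recovering the index is a subtraction and a division. A tiny consistency sketch (values in the comment are illustrative):

/* e.g. with s_mem == 0x1000 and buffer_size == 0x40,
 * index_to_obj(c, s, 2) == 0x1080 and
 * obj_to_index(c, s, (void *)0x1080) == 2 */
static void slab_index_roundtrip(struct kmem_cache *c, struct slab *s)
{
	void *obj = index_to_obj(c, s, 2);
	BUG_ON(obj_to_index(c, s, obj) != 2);	/* holds by construction */
}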
+ +/* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + * + * We assume struct slab_rcu can overlay struct slab when destroying. + */ +struct slab_rcu { + struct rcu_head head; + struct kmem_cache *cachep; + void *addr; +}; + +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *) (slabp + 1); +} + +#endif diff -uprN linux-2.6.18/include/linux/kobject.h linux-2.6.18.ovz/include/linux/kobject.h --- linux-2.6.18/include/linux/kobject.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/kobject.h 2007-06-13 06:55:07.000000000 -0400 @@ -46,6 +46,8 @@ enum kobject_action { KOBJ_UMOUNT = (__force kobject_action_t) 0x05, /* umount event for block devices (broken) */ KOBJ_OFFLINE = (__force kobject_action_t) 0x06, /* device offline */ KOBJ_ONLINE = (__force kobject_action_t) 0x07, /* device online */ + KOBJ_START = (__force kobject_action_t) 0x08, /* start subsystem */ + KOBJ_STOP = (__force kobject_action_t) 0x09, /* stop subsystem */ }; struct kobject { @@ -193,6 +195,9 @@ extern struct subsystem kernel_subsys; /* The global /sys/hypervisor/ subsystem */ extern struct subsystem hypervisor_subsys; +extern struct subsystem class_obj_subsys; +extern struct subsystem class_subsys; + /** * Helpers for setting the kset of registered objects. * Often, a registered object belongs to a kset embedded in a diff -uprN linux-2.6.18/include/linux/list.h linux-2.6.18.ovz/include/linux/list.h --- linux-2.6.18/include/linux/list.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/list.h 2007-06-13 06:55:07.000000000 -0400 @@ -353,6 +353,9 @@ static inline void list_splice_init(stru #define list_entry(ptr, type, member) \ container_of(ptr, type, member) +#define list_first_entry(ptr, type, member) \ + container_of((ptr)->next, type, member) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. @@ -454,6 +457,20 @@ static inline void list_splice_init(stru pos = list_entry(pos->member.next, typeof(*pos), member)) /** + * list_for_each_entry_continue_reverse - iterate backwards over list of given + * type, continuing after an existing point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, typeof(*pos), member), \ + prefetch(pos->member.prev); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member), \ + prefetch(pos->member.prev)) + +/** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage diff -uprN linux-2.6.18/include/linux/lockd/lockd.h linux-2.6.18.ovz/include/linux/lockd/lockd.h --- linux-2.6.18/include/linux/lockd/lockd.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/lockd/lockd.h 2007-06-13 06:55:07.000000000 -0400 @@ -61,6 +61,7 @@ struct nlm_host { spinlock_t h_lock; struct list_head h_granted; /* Locks in GRANTED state */ struct list_head h_reclaim; /* Locks in RECLAIM state */ + struct ve_struct * owner_env; /* VE owning the host */ }; /* @@ -80,7 +81,7 @@ struct nlm_wait; /* * Memory chunk for NLM client RPC request. */ -#define NLMCLNT_OHSIZE (sizeof(system_utsname.nodename)+10) +#define NLMCLNT_OHSIZE (sizeof(utsname()->nodename)+10) struct nlm_rqst { unsigned int a_flags; /* initial RPC task flags */ struct nlm_host * a_host; /* host handle */ @@ -141,8 +142,11 @@ extern struct svc_procedure nlmsvc_proce #ifdef CONFIG_LOCKD_V4 extern struct svc_procedure nlmsvc_procedures4[]; #endif -extern int nlmsvc_grace_period; -extern unsigned long nlmsvc_timeout; + +#include +extern int _nlmsvc_grace_period; +extern unsigned long _nlmsvc_timeout; + /* * Lockd client functions diff -uprN linux-2.6.18/include/linux/major.h linux-2.6.18.ovz/include/linux/major.h --- linux-2.6.18/include/linux/major.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/major.h 2007-06-13 06:55:07.000000000 -0400 @@ -166,4 +166,7 @@ #define VIOTAPE_MAJOR 230 +#define UNNAMED_EXTRA_MAJOR 130 +#define UNNAMED_EXTRA_MAJOR_COUNT 120 + #endif diff -uprN linux-2.6.18/include/linux/mm.h linux-2.6.18.ovz/include/linux/mm.h --- linux-2.6.18/include/linux/mm.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/mm.h 2007-06-13 06:55:07.000000000 -0400 @@ -267,6 +267,12 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. 
highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_USER_RESOURCE + union { + struct user_beancounter *page_ub; + struct page_beancounter *page_pb; + } bc; +#endif }; #define page_private(page) ((page)->private) @@ -650,16 +656,9 @@ struct page *shmem_nopage(struct vm_area int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr); -int shmem_lock(struct file *file, int lock, struct user_struct *user); #else #define shmem_nopage filemap_nopage -static inline int shmem_lock(struct file *file, int lock, - struct user_struct *user) -{ - return 0; -} - static inline int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { @@ -720,7 +719,9 @@ void free_pgd_range(struct mmu_gather ** void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, + unsigned long addr, size_t size); int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot); void unmap_mapping_range(struct address_space *mapping, diff -uprN linux-2.6.18/include/linux/mman.h linux-2.6.18.ovz/include/linux/mman.h --- linux-2.6.18/include/linux/mman.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/mman.h 2007-06-13 06:55:07.000000000 -0400 @@ -61,6 +61,9 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | +#ifdef MAP_GROWSUP + _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) | +#endif _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); diff -uprN linux-2.6.18/include/linux/mmzone.h linux-2.6.18.ovz/include/linux/mmzone.h --- linux-2.6.18/include/linux/mmzone.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/mmzone.h 2007-06-13 06:55:07.000000000 -0400 @@ -155,6 +155,7 @@ struct zone { * zone reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_ratio; + unsigned long min_slab_pages; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -199,13 +200,9 @@ struct zone { * under - it drives the swappiness decision: whether to unmap mapped * pages. * - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. - * - * Access to both these fields is quite racy even on uniprocessor. But + * Access to this field is quite racy even on uniprocessor. But * it is expected to average out OK. */ - int temp_priority; int prev_priority; @@ -421,6 +418,8 @@ int percpu_pagelist_fraction_sysctl_hand void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */
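The bc union added to struct page in the mm.h hunk above is the beancounter code's back-pointer: while a page is charged, bc.page_ub names the owning user_beancounter, and for user-mapped pages the same slot holds a page_beancounter chain (bc.page_pb) instead. A heavily hedged sketch of the invariant; the helper below is hypothetical, the real charging lives in the ub code of this patch:

static void set_page_owner(struct page *pg, struct user_beancounter *ub)
{
	/* valid only while the page is charged; cleared on uncharge */
	pg->bc.page_ub = ub;
}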
@@ -629,6 +628,12 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif diff -uprN linux-2.6.18/include/linux/mount.h linux-2.6.18.ovz/include/linux/mount.h --- linux-2.6.18/include/linux/mount.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/mount.h 2007-06-13 06:55:07.000000000 -0400 @@ -54,6 +54,7 @@ struct vfsmount { struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */ struct namespace *mnt_namespace; /* containing namespace */ int mnt_pinned; + unsigned owner; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) diff -uprN linux-2.6.18/include/linux/msg.h linux-2.6.18.ovz/include/linux/msg.h --- linux-2.6.18/include/linux/msg.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/msg.h 2007-06-13 06:55:07.000000000 -0400 @@ -92,6 +92,8 @@ struct msg_queue { struct list_head q_senders; }; +int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); + #endif /* __KERNEL__ */ #endif /* _LINUX_MSG_H */ diff -uprN linux-2.6.18/include/linux/namei.h linux-2.6.18.ovz/include/linux/namei.h --- linux-2.6.18/include/linux/namei.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/namei.h 2007-06-13 06:55:07.000000000 -0400 @@ -48,12 +48,15 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA #define LOOKUP_PARENT 16 #define LOOKUP_NOALT 32 #define LOOKUP_REVAL 64 +#define LOOKUP_STRICT 128 /* no symlinks or other filesystems */ + /* * Intent data */ #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) #define LOOKUP_ACCESS (0x0400) +#define LOOKUP_NOAREACHECK (0x0800) /* no area check on lookup */ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *)); diff -uprN linux-2.6.18/include/linux/namespace.h linux-2.6.18.ovz/include/linux/namespace.h --- linux-2.6.18/include/linux/namespace.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/namespace.h 2007-06-13 06:55:07.000000000 -0400 @@ -4,6 +4,7 @@ #include #include +#include struct namespace { atomic_t count; @@ -13,6 +14,8 @@ struct namespace { int event; }; +extern struct rw_semaphore namespace_sem; + extern int copy_namespace(int, struct task_struct *); extern void __put_namespace(struct namespace *namespace); extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *); @@ -26,11 +29,8 @@ static inline void put_namespace(struct static inline void exit_namespace(struct task_struct *p) { - struct namespace *namespace = p->namespace; + struct namespace *namespace = p->nsproxy->namespace; if (namespace) { - task_lock(p); - p->namespace = NULL; - task_unlock(p); put_namespace(namespace); } } diff -uprN linux-2.6.18/include/linux/netdevice.h linux-2.6.18.ovz/include/linux/netdevice.h --- linux-2.6.18/include/linux/netdevice.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netdevice.h 2007-06-13 06:55:07.000000000 -0400 @@ -37,6 +37,7 @@ #include #include #include +#include struct divert_blk; struct vlan_group; @@ -235,6 +236,11 @@ enum netdev_state_t __LINK_STATE_QDISC_RUNNING, }; +struct netdev_bc { + struct user_beancounter *exec_ub, *owner_ub; +}; + +#define 
netdev_bc(dev) (&(dev)->dev_bc) /* * This structure holds at boot time configured netdevice settings. They @@ -319,6 +325,10 @@ struct net_device #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) +/* device is venet device */ +#define NETIF_F_VENET (1 << (NETIF_F_GSO_SHIFT - 1)) +/* can be registered inside VE */ +#define NETIF_F_VIRTUAL (1 << (NETIF_F_GSO_SHIFT - 2)) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) @@ -521,10 +531,19 @@ struct net_device struct divert_blk *divert; #endif /* CONFIG_NET_DIVERT */ + struct ve_struct *owner_env; /* Owner VE of the interface */ + struct netdev_bc dev_bc; + /* class/net/name entry */ struct class_device class_dev; /* space for optional statistics and wireless sysfs groups */ struct attribute_group *sysfs_groups[3]; + +#ifdef CONFIG_VE + /* List entry in global devices list to keep track of their names + * assignment */ + struct list_head dev_global_list_entry; +#endif }; #define NETDEV_ALIGN 32 @@ -560,10 +579,25 @@ struct packet_type { #include #include +extern struct net_device templ_loopback_dev; extern struct net_device loopback_dev; /* The loopback */ +#if defined(CONFIG_VE) && defined(CONFIG_NET) +#define loopback_dev (*get_exec_env()->_loopback_dev) +#define ve0_loopback (*get_ve0()->_loopback_dev) +#define dev_base (get_exec_env()->_net_dev_base) +#define visible_dev_head(x) (&(x)->_net_dev_head) +#define visible_dev_index_head(x) (&(x)->_net_dev_index_head) +#else extern struct net_device *dev_base; /* All devices */ +#define ve0_loopback loopback_dev +#define visible_dev_head(x) NULL +#define visible_dev_index_head(x) NULL +#endif extern rwlock_t dev_base_lock; /* Device list lock */ +struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env); +struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env); + extern int netdev_boot_setup_check(struct net_device *dev); extern unsigned long netdev_boot_base(const char *prefix, int unit); extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr); @@ -997,6 +1031,18 @@ extern void dev_seq_stop(struct seq_file extern void linkwatch_run_queue(void); +#if defined(CONFIG_VE) && defined(CONFIG_NET) +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return !(dev->features & NETIF_F_VIRTUAL); +} +#else +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return 0; +} +#endif + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff -uprN linux-2.6.18/include/linux/netfilter/Kbuild linux-2.6.18.ovz/include/linux/netfilter/Kbuild --- linux-2.6.18/include/linux/netfilter/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,11 +1,38 @@ -header-y := nf_conntrack_sctp.h nf_conntrack_tuple_common.h \ - nfnetlink_conntrack.h nfnetlink_log.h nfnetlink_queue.h \ - xt_CLASSIFY.h xt_comment.h xt_connbytes.h xt_connmark.h \ - xt_CONNMARK.h xt_conntrack.h xt_dccp.h xt_esp.h \ - xt_helper.h xt_length.h xt_limit.h xt_mac.h xt_mark.h \ - xt_MARK.h xt_multiport.h xt_NFQUEUE.h xt_pkttype.h \ - xt_policy.h xt_realm.h xt_sctp.h xt_state.h xt_string.h \ - xt_tcpmss.h xt_tcpudp.h xt_SECMARK.h xt_CONNSECMARK.h +header-y += nf_conntrack_sctp.h +header-y += nf_conntrack_tuple_common.h +header-y += 
nfnetlink_conntrack.h +header-y += nfnetlink_log.h +header-y += nfnetlink_queue.h +header-y += xt_CLASSIFY.h +header-y += xt_comment.h +header-y += xt_connbytes.h +header-y += xt_connmark.h +header-y += xt_CONNMARK.h +header-y += xt_conntrack.h +header-y += xt_dccp.h +header-y += xt_esp.h +header-y += xt_helper.h +header-y += xt_length.h +header-y += xt_limit.h +header-y += xt_mac.h +header-y += xt_mark.h +header-y += xt_MARK.h +header-y += xt_multiport.h +header-y += xt_NFQUEUE.h +header-y += xt_pkttype.h +header-y += xt_policy.h +header-y += xt_realm.h +header-y += xt_sctp.h +header-y += xt_state.h +header-y += xt_string.h +header-y += xt_tcpmss.h +header-y += xt_tcpudp.h +header-y += xt_SECMARK.h +header-y += xt_CONNSECMARK.h -unifdef-y := nf_conntrack_common.h nf_conntrack_ftp.h \ - nf_conntrack_tcp.h nfnetlink.h x_tables.h xt_physdev.h +unifdef-y += nf_conntrack_common.h +unifdef-y += nf_conntrack_ftp.h +unifdef-y += nf_conntrack_tcp.h +unifdef-y += nfnetlink.h +unifdef-y += x_tables.h +unifdef-y += xt_physdev.h diff -uprN linux-2.6.18/include/linux/netfilter/nf_conntrack_ftp.h linux-2.6.18.ovz/include/linux/netfilter/nf_conntrack_ftp.h --- linux-2.6.18/include/linux/netfilter/nf_conntrack_ftp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter/nf_conntrack_ftp.h 2007-06-13 06:55:07.000000000 -0400 @@ -32,13 +32,22 @@ struct ip_conntrack_expect; /* For NAT to hook in when we find a packet which describes what other * connection we should expect. */ -extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, +typedef unsigned int (*ip_nat_helper_ftp_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, enum ip_ct_ftp_type type, unsigned int matchoff, unsigned int matchlen, struct ip_conntrack_expect *exp, u32 *seq); +extern ip_nat_helper_ftp_hook ip_nat_ftp_hook; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_ip_nat_ftp_hook \ + ((ip_nat_helper_ftp_hook) \ + (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook)) +#else +#define ve_ip_nat_ftp_hook ip_nat_ftp_hook +#endif #endif /* __KERNEL__ */ #endif /* _NF_CONNTRACK_FTP_H */ diff -uprN linux-2.6.18/include/linux/netfilter/x_tables.h linux-2.6.18.ovz/include/linux/netfilter/x_tables.h --- linux-2.6.18/include/linux/netfilter/x_tables.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter/x_tables.h 2007-06-13 06:55:07.000000000 -0400 @@ -275,6 +275,7 @@ struct xt_table_info { /* Size per table */ unsigned int size; + unsigned int alloc_size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ @@ -304,6 +305,10 @@ extern int xt_register_table(struct xt_t struct xt_table_info *bootstrap, struct xt_table_info *newinfo); extern void *xt_unregister_table(struct xt_table *table); +extern struct xt_table *virt_xt_register_table(struct xt_table *table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo); +extern void *virt_xt_unregister_table(struct xt_table *table); extern struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff -uprN linux-2.6.18/include/linux/netfilter/xt_limit.h linux-2.6.18.ovz/include/linux/netfilter/xt_limit.h --- linux-2.6.18/include/linux/netfilter/xt_limit.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter/xt_limit.h 2007-06-13 06:55:07.000000000 -0400 @@ -18,4 +18,20 @@ struct xt_rateinfo { /* Ugly, ugly fucker. 
*/ struct xt_rateinfo *master; }; + +#ifdef CONFIG_COMPAT +struct compat_xt_rateinfo { + u_int32_t avg; /* Average secs between packets * scale */ + u_int32_t burst; /* Period multiplier for upper limit. */ + + /* Used internally by the kernel */ + compat_ulong_t prev; + u_int32_t credit; + u_int32_t credit_cap, cost; + + /* Ugly, ugly fucker. */ + compat_uptr_t master; +}; +#endif + #endif /*_XT_RATE_H*/ diff -uprN linux-2.6.18/include/linux/netfilter.h linux-2.6.18.ovz/include/linux/netfilter.h --- linux-2.6.18/include/linux/netfilter.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter.h 2007-06-13 06:55:07.000000000 -0400 @@ -117,7 +117,13 @@ void nf_unregister_hooks(struct nf_hook_ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_hooks \ + ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) +#else extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +#define ve_nf_hooks nf_hooks +#endif /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will * disappear once iptables is replaced with pkttables. Please DO NOT use them @@ -195,7 +201,7 @@ static inline int nf_hook_thresh(int pf, if (!cond) return 1; #ifndef CONFIG_NETFILTER_DEBUG - if (list_empty(&nf_hooks[pf][hook])) + if (list_empty(&ve_nf_hooks[pf][hook])) return 1; #endif return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh); diff -uprN linux-2.6.18/include/linux/netfilter_arp/Kbuild linux-2.6.18.ovz/include/linux/netfilter_arp/Kbuild --- linux-2.6.18/include/linux/netfilter_arp/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_arp/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,3 @@ -header-y := arpt_mangle.h -unifdef-y := arp_tables.h +header-y += arpt_mangle.h + +unifdef-y += arp_tables.h diff -uprN linux-2.6.18/include/linux/netfilter_bridge/Kbuild linux-2.6.18.ovz/include/linux/netfilter_bridge/Kbuild --- linux-2.6.18/include/linux/netfilter_bridge/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_bridge/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,4 +1,17 @@ -header-y += ebt_among.h ebt_arp.h ebt_arpreply.h ebt_ip.h ebt_limit.h \ - ebt_log.h ebt_mark_m.h ebt_mark_t.h ebt_nat.h ebt_pkttype.h \ - ebt_redirect.h ebt_stp.h ebt_ulog.h ebt_vlan.h -unifdef-y := ebtables.h ebt_802_3.h +header-y += ebt_among.h +header-y += ebt_arp.h +header-y += ebt_arpreply.h +header-y += ebt_ip.h +header-y += ebt_limit.h +header-y += ebt_log.h +header-y += ebt_mark_m.h +header-y += ebt_mark_t.h +header-y += ebt_nat.h +header-y += ebt_pkttype.h +header-y += ebt_redirect.h +header-y += ebt_stp.h +header-y += ebt_ulog.h +header-y += ebt_vlan.h + +unifdef-y += ebtables.h +unifdef-y += ebt_802_3.h diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/Kbuild linux-2.6.18.ovz/include/linux/netfilter_ipv4/Kbuild --- linux-2.6.18/include/linux/netfilter_ipv4/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,21 +1,63 @@ +header-y += ip_conntrack_helper.h +header-y += ip_conntrack_helper_h323_asn1.h +header-y += ip_conntrack_helper_h323_types.h +header-y += ip_conntrack_protocol.h +header-y += ip_conntrack_sctp.h +header-y += ip_conntrack_tcp.h +header-y += ip_conntrack_tftp.h +header-y += ip_nat_pptp.h +header-y += ipt_addrtype.h +header-y += ipt_ah.h +header-y += ipt_CLASSIFY.h +header-y += 
ipt_CLUSTERIP.h +header-y += ipt_comment.h +header-y += ipt_connbytes.h +header-y += ipt_connmark.h +header-y += ipt_CONNMARK.h +header-y += ipt_conntrack.h +header-y += ipt_dccp.h +header-y += ipt_dscp.h +header-y += ipt_DSCP.h +header-y += ipt_ecn.h +header-y += ipt_ECN.h +header-y += ipt_esp.h +header-y += ipt_hashlimit.h +header-y += ipt_helper.h +header-y += ipt_iprange.h +header-y += ipt_length.h +header-y += ipt_limit.h +header-y += ipt_LOG.h +header-y += ipt_mac.h +header-y += ipt_mark.h +header-y += ipt_MARK.h +header-y += ipt_multiport.h +header-y += ipt_NFQUEUE.h +header-y += ipt_owner.h +header-y += ipt_physdev.h +header-y += ipt_pkttype.h +header-y += ipt_policy.h +header-y += ipt_realm.h +header-y += ipt_recent.h +header-y += ipt_REJECT.h +header-y += ipt_SAME.h +header-y += ipt_sctp.h +header-y += ipt_state.h +header-y += ipt_string.h +header-y += ipt_tcpmss.h +header-y += ipt_TCPMSS.h +header-y += ipt_tos.h +header-y += ipt_TOS.h +header-y += ipt_ttl.h +header-y += ipt_TTL.h +header-y += ipt_ULOG.h -header-y := ip_conntrack_helper.h ip_conntrack_helper_h323_asn1.h \ - ip_conntrack_helper_h323_types.h ip_conntrack_protocol.h \ - ip_conntrack_sctp.h ip_conntrack_tcp.h ip_conntrack_tftp.h \ - ip_nat_pptp.h ipt_addrtype.h ipt_ah.h \ - ipt_CLASSIFY.h ipt_CLUSTERIP.h ipt_comment.h \ - ipt_connbytes.h ipt_connmark.h ipt_CONNMARK.h \ - ipt_conntrack.h ipt_dccp.h ipt_dscp.h ipt_DSCP.h ipt_ecn.h \ - ipt_ECN.h ipt_esp.h ipt_hashlimit.h ipt_helper.h \ - ipt_iprange.h ipt_length.h ipt_limit.h ipt_LOG.h ipt_mac.h \ - ipt_mark.h ipt_MARK.h ipt_multiport.h ipt_NFQUEUE.h \ - ipt_owner.h ipt_physdev.h ipt_pkttype.h ipt_policy.h \ - ipt_realm.h ipt_recent.h ipt_REJECT.h ipt_SAME.h \ - ipt_sctp.h ipt_state.h ipt_string.h ipt_tcpmss.h \ - ipt_TCPMSS.h ipt_tos.h ipt_TOS.h ipt_ttl.h ipt_TTL.h \ - ipt_ULOG.h - -unifdef-y := ip_conntrack.h ip_conntrack_h323.h ip_conntrack_irc.h \ - ip_conntrack_pptp.h ip_conntrack_proto_gre.h \ - ip_conntrack_tuple.h ip_nat.h ip_nat_rule.h ip_queue.h \ - ip_tables.h +unifdef-y += ip_conntrack.h +unifdef-y += ip_conntrack_h323.h +unifdef-y += ip_conntrack_irc.h +unifdef-y += ip_conntrack_pptp.h +unifdef-y += ip_conntrack_proto_gre.h +unifdef-y += ip_conntrack_tuple.h +unifdef-y += ip_nat.h +unifdef-y += ip_nat_rule.h +unifdef-y += ip_queue.h +unifdef-y += ip_tables.h diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack.h 2007-06-13 06:55:07.000000000 -0400 @@ -72,6 +72,10 @@ do { \ struct ip_conntrack_helper; +#ifdef CONFIG_VE_IPTABLES +#include +#endif + struct ip_conntrack { /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, @@ -127,6 +131,9 @@ struct ip_conntrack /* Traversed often, so hopefully in different cacheline to top */ /* These are my tuples; original and reply */ struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ct_owner_env; +#endif }; struct ip_conntrack_expect @@ -238,7 +245,15 @@ extern void ip_conntrack_tcp_update(stru enum ip_conntrack_dir dir); /* Call me when a conntrack is destroyed. 
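 * (Editor's annotation, not a patch line: with CONFIG_VE_IPTABLES the
 *  hook below is reached through the per-VE alias, so a destroy callback
 *  would be installed and invoked roughly as follows; my_destroyed_cb is
 *  a hypothetical function:
 *	ve_ip_conntrack_destroyed = my_destroyed_cb;
 *	...
 *	if (ve_ip_conntrack_destroyed)
 *		ve_ip_conntrack_destroyed(ct);)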
*/ +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_ip_conntrack_destroyed \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_destroyed) +#else extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack); +#define ve_ip_conntrack_destroyed ip_conntrack_destroyed +#endif + /* Fake conntrack entry for untracked connections */ extern struct ip_conntrack ip_conntrack_untracked; @@ -267,7 +282,7 @@ extern void ip_conntrack_proto_put(struc extern void ip_ct_remove_expectations(struct ip_conntrack *ct); extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, - struct ip_conntrack_tuple *); + struct ip_conntrack_tuple *, struct user_beancounter *); extern void ip_conntrack_free(struct ip_conntrack *ct); @@ -276,6 +291,8 @@ extern void ip_conntrack_hash_insert(str extern struct ip_conntrack_expect * __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); +extern void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp); + extern struct ip_conntrack_expect * ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); @@ -298,6 +315,7 @@ static inline int is_dying(struct ip_con extern unsigned int ip_conntrack_htable_size; extern int ip_conntrack_checksum; +extern int ip_conntrack_disable_ve0; #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) @@ -349,6 +367,9 @@ ip_conntrack_event_cache(enum ip_conntra struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; struct ip_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(ip_conntrack_ecache); if (ct != ecache->ct) @@ -360,7 +381,7 @@ ip_conntrack_event_cache(enum ip_conntra static inline void ip_conntrack_event(enum ip_conntrack_events event, struct ip_conntrack *ct) { - if (is_confirmed(ct) && !is_dying(ct)) + if (is_confirmed(ct) && !is_dying(ct) && ve_is_super(get_exec_env())) atomic_notifier_call_chain(&ip_conntrack_chain, event, ct); } @@ -368,7 +389,9 @@ static inline void ip_conntrack_expect_event(enum ip_conntrack_expect_events event, struct ip_conntrack_expect *exp) { - atomic_notifier_call_chain(&ip_conntrack_expect_chain, event, exp); + if (ve_is_super(get_exec_env())) + atomic_notifier_call_chain(&ip_conntrack_expect_chain, event, + exp); } #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_core.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_core.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_core.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,7 +3,6 @@ #include #define MAX_IP_CT_PROTO 256 -extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use @@ -54,8 +53,26 @@ static inline int ip_conntrack_confirm(s extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp); +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_ip_ct_protos \ + (get_exec_env()->_ip_conntrack->_ip_ct_protos) +#define ve_ip_conntrack_hash \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_hash) +#define ve_ip_conntrack_expect_list \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_expect_list) +#define ve_ip_conntrack_vmalloc \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_vmalloc) +#else +extern struct 
ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; +#define ve_ip_ct_protos ip_ct_protos +#define ve_ip_conntrack_hash ip_conntrack_hash +#define ve_ip_conntrack_expect_list ip_conntrack_expect_list +#define ve_ip_conntrack_vmalloc ip_conntrack_vmalloc +#endif /* CONFIG_VE_IPTABLES */ + extern rwlock_t ip_conntrack_lock; #endif /* _IP_CONNTRACK_CORE_H */ diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_helper.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_helper.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2007-06-13 06:55:07.000000000 -0400 @@ -31,6 +31,9 @@ struct ip_conntrack_helper extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *); +extern int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *); +extern void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *); + /* Allocate space for an expectation: this is mandatory before calling ip_conntrack_expect_related. You will have to call put afterwards. */ extern struct ip_conntrack_expect * @@ -41,4 +44,5 @@ extern void ip_conntrack_expect_put(stru extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp); extern void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp); +extern struct list_head helpers; #endif /*_IP_CONNTRACK_HELPER_H*/ diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_irc.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_irc.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2007-06-13 06:55:07.000000000 -0400 @@ -14,16 +14,26 @@ #ifndef _IP_CONNTRACK_IRC_H #define _IP_CONNTRACK_IRC_H +#include + /* This structure exists only once per master */ struct ip_ct_irc_master { }; #ifdef __KERNEL__ -extern unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp); +typedef unsigned int (*ip_nat_helper_irc_hook)(struct sk_buff **, + enum ip_conntrack_info, unsigned int, unsigned int, + struct ip_conntrack_expect *); + +extern ip_nat_helper_irc_hook ip_nat_irc_hook; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_ip_nat_irc_hook \ + ((ip_nat_helper_irc_hook) \ + (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook)) +#else +#define ve_ip_nat_irc_hook ip_nat_irc_hook +#endif #define IRC_PORT 6667 diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_protocol.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_protocol.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2007-06-13 06:55:07.000000000 -0400 @@ -67,6 +67,7 @@ struct ip_conntrack_protocol /* Protocol registration. 
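 * (Editor's sketch, not a patch line: under CONFIG_VE_IPTABLES the
 *  registration below lands in the per-VE array declared in
 *  ip_conntrack_core.h; in essence, assuming the slot is unused:
 *	ve_ip_ct_protos[proto->proto] = proto;)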
*/ extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); + /* Existing built-in protocols */ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; @@ -74,6 +75,41 @@ extern struct ip_conntrack_protocol ip_c extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; extern int ip_conntrack_protocol_tcp_init(void); +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +#include +#define ve_ip_ct_tcp_timeouts \ + (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeouts) +#define ve_ip_ct_udp_timeout \ + (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout) +#define ve_ip_ct_udp_timeout_stream \ + (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout_stream) +#define ve_ip_ct_icmp_timeout \ + (get_exec_env()->_ip_conntrack->_ip_ct_icmp_timeout) +#define ve_ip_ct_generic_timeout \ + (get_exec_env()->_ip_conntrack->_ip_ct_generic_timeout) +#define ve_ip_ct_log_invalid \ + (get_exec_env()->_ip_conntrack->_ip_ct_log_invalid) +#define ve_ip_ct_tcp_timeout_max_retrans \ + (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeout_max_retrans) +#define ve_ip_ct_tcp_loose \ + (get_exec_env()->_ip_conntrack->_ip_ct_tcp_loose) +#define ve_ip_ct_tcp_be_liberal \ + (get_exec_env()->_ip_conntrack->_ip_ct_tcp_be_liberal) +#define ve_ip_ct_tcp_max_retrans \ + (get_exec_env()->_ip_conntrack->_ip_ct_tcp_max_retrans) +#else +#define ve_ip_ct_tcp_timeouts *tcp_timeouts +#define ve_ip_ct_udp_timeout ip_ct_udp_timeout +#define ve_ip_ct_udp_timeout_stream ip_ct_udp_timeout_stream +#define ve_ip_ct_icmp_timeout ip_ct_icmp_timeout +#define ve_ip_ct_generic_timeout ip_ct_generic_timeout +#define ve_ip_ct_log_invalid ip_ct_log_invalid +#define ve_ip_ct_tcp_timeout_max_retrans ip_ct_tcp_timeout_max_retrans +#define ve_ip_ct_tcp_loose ip_ct_tcp_loose +#define ve_ip_ct_tcp_be_liberal ip_ct_tcp_be_liberal +#define ve_ip_ct_tcp_max_retrans ip_ct_tcp_max_retrans +#endif + /* Log invalid packets */ extern unsigned int ip_ct_log_invalid; @@ -85,10 +121,10 @@ extern int ip_ct_port_nfattr_to_tuple(st #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ - (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) + (ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) #else #define LOG_INVALID(proto) \ - ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ + ((ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) \ && net_ratelimit()) #endif #else diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_nat.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_nat.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_nat.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_nat.h 2007-06-13 06:55:07.000000000 -0400 @@ -72,6 +72,8 @@ extern unsigned int ip_nat_setup_info(st extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack); +extern void ip_nat_hash_conntrack(struct ip_conntrack *conntrack); + /* Calculate relative checksum. 
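 *  (Editor's illustration of the typical call, assuming a rewritten
 *   32-bit address field and an existing 16-bit checksum:
 *	iph->check = ip_nat_cheat_check(~old_addr, new_addr, iph->check);)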
*/ extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_nat_rule.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_nat_rule.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_nat_rule.h 2007-06-13 06:55:07.000000000 -0400 @@ -6,7 +6,7 @@ #ifdef __KERNEL__ -extern int ip_nat_rule_init(void) __init; +extern int ip_nat_rule_init(void); extern void ip_nat_rule_cleanup(void); extern int ip_nat_rule_find(struct sk_buff **pskb, unsigned int hooknum, diff -uprN linux-2.6.18/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_tables.h --- linux-2.6.18/include/linux/netfilter_ipv4/ip_tables.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4/ip_tables.h 2007-06-13 06:55:07.000000000 -0400 @@ -282,7 +282,7 @@ extern void ipt_init(void) __init; //#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl) //#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl) -extern int ipt_register_table(struct ipt_table *table, +extern struct ipt_table *ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl); extern void ipt_unregister_table(struct ipt_table *table); diff -uprN linux-2.6.18/include/linux/netfilter_ipv4.h linux-2.6.18.ovz/include/linux/netfilter_ipv4.h --- linux-2.6.18/include/linux/netfilter_ipv4.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv4.h 2007-06-13 06:55:07.000000000 -0400 @@ -77,7 +77,7 @@ enum nf_ip_hook_priorities { #define SO_ORIGINAL_DST 80 #ifdef __KERNEL__ -extern int ip_route_me_harder(struct sk_buff **pskb); +extern int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type); extern int ip_xfrm_me_harder(struct sk_buff **pskb); extern unsigned int nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); diff -uprN linux-2.6.18/include/linux/netfilter_ipv6/Kbuild linux-2.6.18.ovz/include/linux/netfilter_ipv6/Kbuild --- linux-2.6.18/include/linux/netfilter_ipv6/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv6/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,21 @@ -header-y += ip6t_HL.h ip6t_LOG.h ip6t_MARK.h ip6t_REJECT.h ip6t_ah.h \ - ip6t_esp.h ip6t_frag.h ip6t_hl.h ip6t_ipv6header.h \ - ip6t_length.h ip6t_limit.h ip6t_mac.h ip6t_mark.h \ - ip6t_multiport.h ip6t_opts.h ip6t_owner.h ip6t_policy.h \ - ip6t_physdev.h ip6t_rt.h -unifdef-y := ip6_tables.h +header-y += ip6t_HL.h +header-y += ip6t_LOG.h +header-y += ip6t_MARK.h +header-y += ip6t_REJECT.h +header-y += ip6t_ah.h +header-y += ip6t_esp.h +header-y += ip6t_frag.h +header-y += ip6t_hl.h +header-y += ip6t_ipv6header.h +header-y += ip6t_length.h +header-y += ip6t_limit.h +header-y += ip6t_mac.h +header-y += ip6t_mark.h +header-y += ip6t_multiport.h +header-y += ip6t_opts.h +header-y += ip6t_owner.h +header-y += ip6t_policy.h +header-y += ip6t_physdev.h +header-y += ip6t_rt.h + +unifdef-y += ip6_tables.h diff -uprN linux-2.6.18/include/linux/netfilter_ipv6/ip6_tables.h linux-2.6.18.ovz/include/linux/netfilter_ipv6/ip6_tables.h --- linux-2.6.18/include/linux/netfilter_ipv6/ip6_tables.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/netfilter_ipv6/ip6_tables.h 2007-06-13 06:55:07.000000000 -0400 @@ -293,7 +293,7 @@ extern void ip6t_init(void) __init; xt_register_match(match); }) 
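/* Editor's annotation, not a patch line: as with ipt_register_table
 * above, ip6t_register_table now returns the table actually registered
 * (possibly a per-VE copy). A hedged usage sketch, assuming an
 * ERR_PTR-style return and illustrative names packet_filter/initial_repl:
 *	struct ip6t_table *t = ip6t_register_table(&packet_filter, &initial_repl);
 *	if (IS_ERR(t))
 *		return PTR_ERR(t);
 */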
#define ip6t_unregister_match(match) xt_unregister_match(match) -extern int ip6t_register_table(struct ip6t_table *table, +extern struct ip6t_table *ip6t_register_table(struct ip6t_table *table, const struct ip6t_replace *repl); extern void ip6t_unregister_table(struct ip6t_table *table); extern unsigned int ip6t_do_table(struct sk_buff **pskb, diff -uprN linux-2.6.18/include/linux/nfcalls.h linux-2.6.18.ovz/include/linux/nfcalls.h --- linux-2.6.18/include/linux/nfcalls.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/nfcalls.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,186 @@ +/* + * include/linux/nfcalls.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_NFCALLS_H +#define _LINUX_NFCALLS_H + +#include + +#ifdef CONFIG_MODULES +extern struct module no_module; + +#define DECL_KSYM_MODULE(name) \ + extern struct module *vz_mod_##name + +#define INIT_KSYM_MODULE(name) \ + struct module *vz_mod_##name = &no_module; \ + EXPORT_SYMBOL(vz_mod_##name) + +static inline void __vzksym_modresolve(struct module **modp, struct module *mod) +{ + /* + * we want to be sure that pointer updates are visible first: + * 1. wmb() is here only to be on the safe side + * (note, no rmb() in KSYMSAFECALL) + * 2. synchronize_sched() guarantees that updates are visible + * on all cpus and allows us to remove rmb() in KSYMSAFECALL + */ + wmb(); synchronize_sched(); + *modp = mod; + /* just to be sure, our changes are visible as soon as possible */ + wmb(); synchronize_sched(); +} + +static inline void __vzksym_modunresolve(struct module **modp) +{ + /* + * try_module_get() in KSYMSAFECALL should fail at this moment since + * THIS_MODULE is in unloading state (we should be called from fini), + * no need to synchronize pointers/ve_module updates. + */ + *modp = &no_module; + /* + * synchronize_sched() guarantees here that we see + * updated module pointer before the module really gets away + */ + synchronize_sched(); +} + +static inline int __vzksym_module_get(struct module *mod) +{ + /* + * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE + * and smp_read_barrier_depends() here...
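+ * (Editor's sketch of the overall calling convention, assuming a
+ *  module "foo" that exports init_foo(); all names are illustrative:
+ *	foo's init:	KSYMRESOLVE(init_foo); KSYMMODRESOLVE(foo);
+ *	foo's exit:	KSYMMODUNRESOLVE(foo); KSYMUNRESOLVE(init_foo);
+ *	a caller:	err = KSYMERRCALL(1, foo, init_foo, ());)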
+ */ + smp_read_barrier_depends(); /* for module loading */ + if (!try_module_get(mod)) + return -EBUSY; + + return 0; +} + +static inline void __vzksym_module_put(struct module *mod) +{ + module_put(mod); +} +#else +#define DECL_KSYM_MODULE(name) +#define INIT_KSYM_MODULE(name) +#define __vzksym_modresolve(modp, mod) +#define __vzksym_modunresolve(modp) +#define __vzksym_module_get(mod) (0) +#define __vzksym_module_put(mod) +#endif + +#define __KSYMERRCALL(err, type, mod, name, args) \ +({ \ + type ret = (type)err; \ + if (!__vzksym_module_get(vz_mod_##mod)) { \ + if (vz_##name) \ + ret = ((*vz_##name)args); \ + __vzksym_module_put(vz_mod_##mod); \ + } \ + ret; \ +}) +#define __KSYMSAFECALL_VOID(mod, name, args) \ +do { \ + if (!__vzksym_module_get(vz_mod_##mod)) { \ + if (vz_##name) \ + ((*vz_##name)args); \ + __vzksym_module_put(vz_mod_##mod); \ + } \ +} while (0) + +#define DECL_KSYM_CALL(type, name, args) \ + extern type (*vz_##name) args +#define INIT_KSYM_CALL(type, name, args) \ + type (*vz_##name) args; \ + EXPORT_SYMBOL(vz_##name) + +#define KSYMERRCALL(err, mod, name, args) \ + __KSYMERRCALL(err, int, mod, name, args) +#define KSYMSAFECALL(type, mod, name, args) \ + __KSYMERRCALL(0, type, mod, name, args) +#define KSYMSAFECALL_VOID(mod, name, args) \ + __KSYMSAFECALL_VOID(mod, name, args) +#define KSYMREF(name) vz_##name + +/* should be called _after_ KSYMRESOLVE's */ +#define KSYMMODRESOLVE(name) \ + __vzksym_modresolve(&vz_mod_##name, THIS_MODULE) +#define KSYMMODUNRESOLVE(name) \ + __vzksym_modunresolve(&vz_mod_##name) + +#define KSYMRESOLVE(name) \ + vz_##name = &name +#define KSYMUNRESOLVE(name) \ + vz_##name = NULL + +#if defined(CONFIG_VE) +DECL_KSYM_MODULE(ip_tables); +DECL_KSYM_MODULE(ip6_tables); +DECL_KSYM_MODULE(iptable_filter); +DECL_KSYM_MODULE(ip6table_filter); +DECL_KSYM_MODULE(iptable_mangle); +DECL_KSYM_MODULE(ip6table_mangle); +DECL_KSYM_MODULE(ip_conntrack); +DECL_KSYM_MODULE(ip_conntrack_ftp); +DECL_KSYM_MODULE(ip_conntrack_irc); +DECL_KSYM_MODULE(xt_conntrack); +DECL_KSYM_MODULE(ip_nat); +DECL_KSYM_MODULE(iptable_nat); +DECL_KSYM_MODULE(ip_nat_ftp); +DECL_KSYM_MODULE(ip_nat_irc); + +struct sk_buff; + +DECL_KSYM_CALL(int, init_netfilter, (void)); +DECL_KSYM_CALL(int, init_iptables, (void)); +DECL_KSYM_CALL(int, init_ip6tables, (void)); +DECL_KSYM_CALL(int, init_iptable_filter, (void)); +DECL_KSYM_CALL(int, init_ip6table_filter, (void)); +DECL_KSYM_CALL(int, init_iptable_mangle, (void)); +DECL_KSYM_CALL(int, init_ip6table_mangle, (void)); +DECL_KSYM_CALL(int, init_iptable_conntrack, (void)); +DECL_KSYM_CALL(int, init_ip_ct_ftp, (void)); +DECL_KSYM_CALL(int, init_ip_ct_irc, (void)); +DECL_KSYM_CALL(int, ip_nat_init, (void)); +DECL_KSYM_CALL(int, init_iptable_nat, (void)); +DECL_KSYM_CALL(int, init_iptable_nat_ftp, (void)); +DECL_KSYM_CALL(int, init_iptable_nat_irc, (void)); +DECL_KSYM_CALL(void, fini_iptable_nat_irc, (void)); +DECL_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); +DECL_KSYM_CALL(void, fini_iptable_nat, (void)); +DECL_KSYM_CALL(void, ip_nat_cleanup, (void)); +DECL_KSYM_CALL(void, fini_ip_ct_irc, (void)); +DECL_KSYM_CALL(void, fini_ip_ct_ftp, (void)); +DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); +DECL_KSYM_CALL(void, fini_ip6table_filter, (void)); +DECL_KSYM_CALL(void, fini_iptable_filter, (void)); +DECL_KSYM_CALL(void, fini_ip6table_mangle, (void)); +DECL_KSYM_CALL(void, fini_iptable_mangle, (void)); +DECL_KSYM_CALL(void, fini_ip6tables, (void)); +DECL_KSYM_CALL(void, fini_iptables, (void)); +DECL_KSYM_CALL(void, fini_netfilter, 
(void)); + +#include +#endif /* CONFIG_VE */ + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +DECL_KSYM_MODULE(vzethdev); +DECL_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) +DECL_KSYM_MODULE(vzmon); +DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); +#endif + +#endif /* _LINUX_NFCALLS_H */ diff -uprN linux-2.6.18/include/linux/nfsd/Kbuild linux-2.6.18.ovz/include/linux/nfsd/Kbuild --- linux-2.6.18/include/linux/nfsd/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/nfsd/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,7 @@ -unifdef-y := const.h export.h stats.h syscall.h nfsfh.h debug.h auth.h - +unifdef-y += const.h +unifdef-y += export.h +unifdef-y += stats.h +unifdef-y += syscall.h +unifdef-y += nfsfh.h +unifdef-y += debug.h +unifdef-y += auth.h diff -uprN linux-2.6.18/include/linux/notifier.h linux-2.6.18.ovz/include/linux/notifier.h --- linux-2.6.18/include/linux/notifier.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/notifier.h 2007-06-13 06:55:07.000000000 -0400 @@ -108,8 +108,9 @@ extern int raw_notifier_call_chain(struc #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ +#define NOTIFY_FAIL 0x0002 /* Reject */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ -#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) +#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. diff -uprN linux-2.6.18/include/linux/nsproxy.h linux-2.6.18.ovz/include/linux/nsproxy.h --- linux-2.6.18/include/linux/nsproxy.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/nsproxy.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,52 @@ +#ifndef _LINUX_NSPROXY_H +#define _LINUX_NSPROXY_H + +#include +#include + +struct namespace; +struct uts_namespace; +struct ipc_namespace; + +/* + * A structure to contain pointers to all per-process + * namespaces - fs (mount), uts, network, sysvipc, etc. + * + * 'count' is the number of tasks holding a reference. + * The count for each namespace, then, will be the number + * of nsproxies pointing to it, not the number of tasks. + * + * The nsproxy is shared by tasks which share all namespaces. + * As soon as a single namespace is cloned or unshared, the + * nsproxy is copied. 
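+ *
+ * (Editor's illustration of the refcounting contract, using the
+ *  helpers declared below; error handling elided:
+ *	struct nsproxy *new_ns = dup_namespaces(tsk->nsproxy);
+ *	tsk->nsproxy = new_ns;	- private copy for this task
+ *	put_nsproxy(old_ns);	- drop the previously shared one)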
+ */ +struct nsproxy { + atomic_t count; + spinlock_t nslock; + struct uts_namespace *uts_ns; + struct ipc_namespace *ipc_ns; + struct namespace *namespace; +}; +extern struct nsproxy init_nsproxy; + +struct nsproxy *dup_namespaces(struct nsproxy *orig); +int copy_namespaces(int flags, struct task_struct *tsk); +void get_task_namespaces(struct task_struct *tsk); +void free_nsproxy(struct nsproxy *ns); + +static inline struct nsproxy *get_nsproxy(struct nsproxy *n) +{ + atomic_inc(&n->count); + return n; +} + +static inline void put_nsproxy(struct nsproxy *ns) +{ + if (atomic_dec_and_test(&ns->count)) { + free_nsproxy(ns); + } +} + +extern void exit_task_namespaces(struct task_struct *); +struct namespace * get_task_mnt_ns(struct task_struct *); +#endif diff -uprN linux-2.6.18/include/linux/page-flags.h linux-2.6.18.ovz/include/linux/page-flags.h --- linux-2.6.18/include/linux/page-flags.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/page-flags.h 2007-06-13 06:55:07.000000000 -0400 @@ -86,6 +86,8 @@ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_checkpointed 21 /* Page transferred */ + #if (BITS_PER_LONG > 32) /* @@ -247,6 +249,8 @@ #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define ClearPageCheckpointed(page) clear_bit(PG_checkpointed, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -uprN linux-2.6.18/include/linux/percpu.h linux-2.6.18.ovz/include/linux/percpu.h --- linux-2.6.18/include/linux/percpu.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/percpu.h 2007-06-13 06:55:07.000000000 -0400 @@ -32,16 +32,25 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -extern void *__alloc_percpu(size_t size); +#define static_percpu_ptr(sptr, sptrs) ({ \ + int i; \ + for (i = 0; i < NR_CPUS; i++) \ + (sptr)->ptrs[i] = &(sptrs)[i]; \ + ((void *)(~(unsigned long)(sptr))); \ + }) + +extern void *__alloc_percpu_mask(size_t size, gfp_t gfp); extern void free_percpu(const void *); #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size) +#define static_percpu_ptr(sptr, sptrs) (&sptrs[0]) + +static inline void *__alloc_percpu_mask(size_t size, gfp_t gfp) { - void *ret = kmalloc(size, GFP_KERNEL); + void *ret = kmalloc(size, gfp); if (ret) memset(ret, 0, size); return ret; @@ -54,6 +63,11 @@ static inline void free_percpu(const voi #endif /* CONFIG_SMP */ /* Simple wrapper for the common case: zeros memory. 
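 *  (Editor's note: the GFP-aware variant added below also permits
 *   allocation from atomic context, e.g., for a hypothetical type:
 *	struct my_stat *s = alloc_percpu_atomic(struct my_stat);)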
*/ -#define alloc_percpu(type) ((type *)(__alloc_percpu(sizeof(type)))) +#define __alloc_percpu(size) \ + __alloc_percpu_mask((size), GFP_KERNEL) +#define alloc_percpu(type) \ + ((type *)(__alloc_percpu_mask(sizeof(type), GFP_KERNEL))) +#define alloc_percpu_atomic(type) \ + ((type *)(__alloc_percpu_mask(sizeof(type), GFP_ATOMIC))) #endif /* __LINUX_PERCPU_H */ diff -uprN linux-2.6.18/include/linux/pid.h linux-2.6.18.ovz/include/linux/pid.h --- linux-2.6.18/include/linux/pid.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/pid.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,6 +3,18 @@ #include +#define VPID_BIT 10 +#define VPID_DIV (1<pids[(type)].node.next) +extern int alloc_pidmap(void); +extern fastcall void free_pidmap(int pid); + +#ifndef CONFIG_VE + +#define vpid_to_pid(pid) (pid) +#define __vpid_to_pid(pid) (pid) +#define pid_to_vpid(pid) (pid) +#define _pid_to_vpid(pid) (pid) + +#define comb_vpid_to_pid(pid) (pid) +#define comb_pid_to_vpid(pid) (pid) + +#else + +struct ve_struct; +extern void free_vpid(struct pid *pid); +extern pid_t alloc_vpid(struct pid *pid, pid_t vpid); +extern pid_t vpid_to_pid(pid_t pid); +extern pid_t __vpid_to_pid(pid_t pid); +extern pid_t pid_to_vpid(pid_t pid); +extern pid_t _pid_to_vpid(pid_t pid); -#define pid_next_task(task, type) \ - hlist_entry(pid_next(task, type), struct task_struct, \ - pids[(type)].node) +static inline int comb_vpid_to_pid(int vpid) +{ + int pid = vpid; + if (vpid > 0) { + pid = vpid_to_pid(vpid); + if (unlikely(pid < 0)) + return 0; + } else if (vpid < 0) { + pid = vpid_to_pid(-vpid); + if (unlikely(pid < 0)) + return 0; + pid = -pid; + } + return pid; +} + +static inline int comb_pid_to_vpid(int pid) +{ + int vpid = pid; + + if (pid > 0) { + vpid = pid_to_vpid(pid); + if (unlikely(vpid < 0)) + return 0; + } else if (pid < 0) { + vpid = pid_to_vpid(-pid); + if (unlikely(vpid < 0)) + return 0; + vpid = -vpid; + } + return vpid; +} + +extern int glob_virt_pids; +#endif + +#define pid_next_all(task, type) \ + ((task)->pids[(type)].node.next) + +#define pid_next_task_all(task, type) \ + hlist_entry(pid_next_all(task, type), \ + struct task_struct, pids[(type)].node) /* We could use hlist_for_each_entry_rcu here but it takes more arguments * than the do_each_task_pid/while_each_task_pid. So we roll our own * to preserve the existing interface. 
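 * (Editor's usage sketch for the VE-aware variants added below,
 *  iterating the members of a process group visible from the
 *  current VE:
 *	do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) {
 *		...
 *	} while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p);)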
*/ -#define do_each_task_pid(who, type, task) \ - if ((task = find_task_by_pid_type(type, who))) { \ - prefetch(pid_next(task, type)); \ +#define do_each_task_pid_all(who, type, task) \ + if ((task = find_task_by_pid_type_all(type, who))) { \ + prefetch(pid_next_all(task, type)); \ do { -#define while_each_task_pid(who, type, task) \ - } while (pid_next(task, type) && ({ \ - task = pid_next_task(task, type); \ +#define while_each_task_pid_all(who, type, task) \ + } while (pid_next_all(task, type) && ({ \ + task = pid_next_task_all(task, type); \ rcu_dereference(task); \ - prefetch(pid_next(task, type)); \ + prefetch(pid_next_all(task, type)); \ 1; }) ); \ } +#ifndef CONFIG_VE +#define __do_each_task_pid_ve(who, type, task, owner) \ + do_each_task_pid_all(who, type, task) +#define __while_each_task_pid_ve(who, type, task, owner) \ + while_each_task_pid_all(who, type, task) +#else /* CONFIG_VE */ +#define __do_each_task_pid_ve(who, type, task, owner) \ + do_each_task_pid_all(who, type, task) \ + if (ve_accessible(VE_TASK_INFO(task)->owner_env, owner)) +#define __while_each_task_pid_ve(who, type, task, owner) \ + while_each_task_pid_all(who, type, task) +#endif /* CONFIG_VE */ + +#define do_each_task_pid_ve(who, type, task) \ + __do_each_task_pid_ve(who, type, task, get_exec_env()); +#define while_each_task_pid_ve(who, type, task) \ + __while_each_task_pid_ve(who, type, task, get_exec_env()); + #endif /* _LINUX_PID_H */ diff -uprN linux-2.6.18/include/linux/proc_fs.h linux-2.6.18.ovz/include/linux/proc_fs.h --- linux-2.6.18/include/linux/proc_fs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/proc_fs.h 2007-06-13 06:55:07.000000000 -0400 @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -86,8 +87,16 @@ struct vmcore { extern struct proc_dir_entry proc_root; extern struct proc_dir_entry *proc_root_fs; +extern struct file_system_type proc_fs_type; + +#ifdef CONFIG_VE +#include +#define proc_net (get_exec_env()->_proc_net) +#define proc_net_stat (get_exec_env()->_proc_net_stat) +#else extern struct proc_dir_entry *proc_net; extern struct proc_dir_entry *proc_net_stat; +#endif extern struct proc_dir_entry *proc_bus; extern struct proc_dir_entry *proc_root_driver; extern struct proc_dir_entry *proc_root_kcore; @@ -108,7 +117,11 @@ char *task_mem(struct mm_struct *, char extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); +extern struct proc_dir_entry *create_proc_glob_entry(const char *name, + mode_t mode, + struct proc_dir_entry *parent); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); +extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent); extern struct vfsmount *proc_mnt; extern int proc_fill_super(struct super_block *,void *,int); @@ -195,6 +208,15 @@ static inline struct proc_dir_entry *pro return res; } +static inline struct proc_dir_entry *proc_glob_fops_create(const char *name, + mode_t mode, struct file_operations *fops) +{ + struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL); + if (res) + res->proc_fops = fops; + return res; +} + static inline void proc_net_remove(const char *name) { remove_proc_entry(name,proc_net); @@ -207,6 +229,7 @@ static inline void proc_net_remove(const #define proc_bus NULL #define proc_net_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) +#define proc_glob_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) #define proc_net_create(name, mode, info) ({ (void)(mode), NULL; 
}) static inline void proc_net_remove(const char *name) {} @@ -214,6 +237,8 @@ static inline void proc_flush_task(struc static inline struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent) { return NULL; } +static inline struct proc_dir_entry *create_proc_glob_entry(const char *name, + mode_t mode, struct proc_dir_entry *parent) { return NULL; } #define remove_proc_entry(name, parent) do {} while (0) @@ -237,6 +262,48 @@ extern struct proc_dir_entry proc_root; #endif /* CONFIG_PROC_FS */ +static inline struct proc_dir_entry *create_proc_entry_mod(const char *name, + mode_t mode, + struct proc_dir_entry *parent, + struct module *owner) +{ + struct proc_dir_entry *ent; + + /* + * lock_kernel() here protects against proc_lookup() + * which can find this freshly created entry w/o owner being set. + * This can lead to the module being put more times than it was got. + */ + lock_kernel(); + ent = create_proc_entry(name, mode, parent); + if (ent) + ent->owner = owner; + unlock_kernel(); + + return ent; +} + +static inline struct proc_dir_entry *create_proc_glob_entry_mod(const char *name, + mode_t mode, + struct proc_dir_entry *parent, + struct module *owner) +{ + struct proc_dir_entry *ent; + + /* + * lock_kernel() here protects against proc_lookup() + * which can find this freshly created entry w/o owner being set. + * This can lead to the module being put more times than it was got. + */ + lock_kernel(); + ent = create_proc_glob_entry(name, mode, parent); + if (ent) + ent->owner = owner; + unlock_kernel(); + + return ent; +} + #if !defined(CONFIG_PROC_KCORE) static inline void kclist_add(struct kcore_list *new, void *addr, size_t size) { @@ -266,10 +333,24 @@ static inline struct proc_dir_entry *PDE return PROC_I(inode)->pde; } +static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) +{ + if (de) + atomic_inc(&de->count); + return de; +} + +void de_put(struct proc_dir_entry *de); + struct proc_maps_private { struct pid *pid; struct task_struct *task; struct vm_area_struct *tail_vma; }; +#define LPDE(inode) (PROC_I((inode))->pde) +#ifdef CONFIG_VE +#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe)) +#endif + #endif /* _LINUX_PROC_FS_H */ diff -uprN linux-2.6.18/include/linux/quota.h linux-2.6.18.ovz/include/linux/quota.h --- linux-2.6.18/include/linux/quota.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/quota.h 2007-06-13 06:55:07.000000000 -0400 @@ -44,8 +44,6 @@ typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ typedef __u64 qsize_t; /* Type in which we store sizes */ -extern spinlock_t dq_data_lock; - /* Size of blocks in which are counted size limits */ #define QUOTABLOCK_BITS 10 #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) @@ -134,6 +132,10 @@ struct if_dqinfo { #include #include +#include + +extern spinlock_t dq_data_lock; + #include #include #include @@ -242,6 +244,8 @@ struct quota_format_ops { int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ }; +struct inode; +struct iattr; /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); @@ -256,9 +260,11 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ + int (*rename) (struct inode *, struct inode *, struct inode *); }; /*
Operations handling requests from userspace */ +struct v2_disk_dqblk; struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *); int (*quota_off)(struct super_block *, int); @@ -271,6 +277,10 @@ struct quotactl_ops { int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); +#ifdef CONFIG_QUOTA_COMPAT + int (*get_quoti)(struct super_block *, int, unsigned int, + struct v2_disk_dqblk __user *); +#endif }; struct quota_format_type { @@ -291,6 +301,10 @@ struct quota_info { struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_master *vzdq_master; + int vzdq_count; +#endif }; /* Inline would be better but we need to dereference super_block which is not defined yet */ diff -uprN linux-2.6.18/include/linux/quotaops.h linux-2.6.18.ovz/include/linux/quotaops.h --- linux-2.6.18/include/linux/quotaops.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/quotaops.h 2007-06-13 06:55:07.000000000 -0400 @@ -170,6 +170,19 @@ static __inline__ int DQUOT_TRANSFER(str return 0; } +static __inline__ int DQUOT_RENAME(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct dquot_operations *q_op; + + q_op = inode->i_sb->dq_op; + if (q_op && q_op->rename) { + if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) + return 1; + } + return 0; +} + /* The following two functions cannot be called inside a transaction */ #define DQUOT_SYNC(sb) sync_dquots(sb, -1) @@ -196,6 +209,7 @@ static __inline__ int DQUOT_OFF(struct s #define DQUOT_SYNC(sb) do { } while(0) #define DQUOT_OFF(sb) do { } while(0) #define DQUOT_TRANSFER(inode, iattr) (0) +#define DQUOT_RENAME(inode, old_dir, new_dir) (0) static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); diff -uprN linux-2.6.18/include/linux/raid/Kbuild linux-2.6.18.ovz/include/linux/raid/Kbuild --- linux-2.6.18/include/linux/raid/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/raid/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1,2 @@ -header-y += md_p.h md_u.h +header-y += md_p.h +header-y += md_u.h diff -uprN linux-2.6.18/include/linux/rmap.h linux-2.6.18.ovz/include/linux/rmap.h --- linux-2.6.18/include/linux/rmap.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/rmap.h 2007-06-13 06:55:07.000000000 -0400 @@ -73,6 +73,7 @@ void page_add_anon_rmap(struct page *, s void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); +struct anon_vma *page_lock_anon_vma(struct page *page); /** * page_dup_rmap - duplicate pte mapping to a page diff -uprN linux-2.6.18/include/linux/sched.h linux-2.6.18.ovz/include/linux/sched.h --- linux-2.6.18/include/linux/sched.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sched.h 2007-06-13 06:55:07.000000000 -0400 @@ -24,6 +24,11 @@ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 
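/* Editor's annotation, not a patch line: CLONE_NAMESPACES_MASK below
 * gives VE code a single test to veto namespace-creating clones; a
 * hedged sketch of such a check:
 *	if (!ve_is_super(get_exec_env()) &&
 *	    (clone_flags & CLONE_NAMESPACES_MASK))
 *		return -EPERM;
 */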
+#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ +#define CLONE_NEWIPC 0x08000000 /* New ipcs */ + +/* mask of clones which are disabled in OpenVZ VEs */ +#define CLONE_NAMESPACES_MASK (CLONE_NEWUTS | CLONE_NEWIPC) /* * Scheduling policies @@ -80,10 +85,14 @@ struct sched_param { #include #include #include +#include #include +#include + struct exec_domain; +struct ve_struct; struct futex_pi_state; /* @@ -116,17 +125,34 @@ extern unsigned long avenrun[]; /* Load load += n*(FIXED_1-exp); \ load >>= FSHIFT; +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + extern unsigned long total_forks; extern int nr_threads; extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); + +extern unsigned long nr_sleeping(void); +extern unsigned long nr_stopped(void); +extern unsigned long nr_zombie; +extern atomic_t nr_dead; extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern unsigned long weighted_cpuload(const int cpu); +#ifdef CONFIG_VE +struct ve_struct; +extern unsigned long nr_running_ve(struct ve_struct *); +extern unsigned long nr_iowait_ve(void); +extern unsigned long nr_uninterruptible_ve(struct ve_struct *); +#else +#define nr_running_ve(ve) 0 +#define nr_iowait_ve() 0 +#define nr_uninterruptible_ve(ve) 0 +#endif /* * Task state bitmask. NOTE! These bits are also @@ -194,6 +220,8 @@ extern cpumask_t nohz_cpu_mask; extern void show_state(void); extern void show_regs(struct pt_regs *); +extern void smp_show_regs(struct pt_regs *, void *); +extern void show_vsched(void); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -238,7 +266,7 @@ extern signed long schedule_timeout_inte extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -struct namespace; +struct nsproxy; /* Maximum number of active map areas.. This is a random (large) number */ #define DEFAULT_MAX_MAP_COUNT 65536 @@ -337,6 +365,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ unsigned dumpable:2; + unsigned vps_dumpable:2; + unsigned oom_killed:1; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ @@ -353,6 +383,9 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *mm_ub; +#endif }; struct sighand_struct { @@ -369,6 +402,8 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +#include + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -420,6 +455,10 @@ struct signal_struct { pid_t session; /* boolean value for session group leader */ int leader; +#ifdef CONFIG_VE + pid_t vpgrp; + pid_t vsession; +#endif struct tty_struct *tty; /* NULL if no tty */ @@ -754,6 +793,9 @@ static inline void prefetch_stack(struct struct audit_context; /* See audit.c */ struct mempolicy; struct pipe_inode_info; +struct uts_namespace; +struct vcpu_scheduler; +struct vcpu_struct; enum sleep_type { SLEEP_NORMAL, @@ -778,6 +820,14 @@ struct task_struct { int oncpu; #endif #endif +#ifdef CONFIG_SCHED_VCPU + struct vcpu_scheduler *vsched; + struct vcpu_struct *vcpu; + + /* id's are saved to avoid locking (e.g. 
on vsched->id access) */ + int vsched_id; + int vcpu_id; +#endif int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; @@ -819,6 +869,10 @@ struct task_struct { unsigned did_exec:1; pid_t pid; pid_t tgid; +#ifdef CONFIG_VE + pid_t vpid; + pid_t vtgid; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with @@ -880,8 +934,8 @@ struct task_struct { struct fs_struct *fs; /* open file information */ struct files_struct *files; -/* namespace */ - struct namespace *namespace; +/* namespaces */ + struct nsproxy *nsproxy; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; @@ -955,6 +1009,11 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ + +/* state tracking for suspend */ + __u8 pn_state; + __u8 stopped_state:1; + /* * current io wait handle: wait queue entry to use for io waits * If this thread is processing aio, this points at the waitqueue @@ -964,6 +1023,7 @@ struct task_struct { wait_queue_t *io_wait; /* i/o counters(bytes read/written, #syscalls */ u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; #if defined(CONFIG_BSD_PROCESS_ACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -996,6 +1056,16 @@ struct task_struct { #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif +#ifdef CONFIG_USER_RESOURCE + struct task_beancounter task_bc; +#endif +#ifdef CONFIG_VE + struct ve_task_info ve_task_info; +#endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + unsigned long magic; + struct inode *ino; +#endif }; static inline pid_t process_group(struct task_struct *tsk) @@ -1027,6 +1097,43 @@ static inline void put_task_struct(struc __put_task_struct(t); } +#ifndef CONFIG_VE +#define set_pn_state(tsk, state) do { } while(0) +#define clear_pn_state(tsk) do { } while(0) +#define set_stop_state(tsk) do { } while(0) +#define clear_stop_state(tsk) do { } while(0) +#else +#define PN_STOP_TF 1 /* was not in 2.6.8 */ +#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */ +#define PN_STOP_ENTRY 3 +#define PN_STOP_FORK 4 +#define PN_STOP_VFORK 5 +#define PN_STOP_SIGNAL 6 +#define PN_STOP_EXIT 7 +#define PN_STOP_EXEC 8 +#define PN_STOP_LEAVE 9 + +static inline void set_pn_state(struct task_struct *tsk, int state) +{ + tsk->pn_state = state; +} + +static inline void clear_pn_state(struct task_struct *tsk) +{ + tsk->pn_state = 0; +} + +static inline void set_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 1; +} + +static inline void clear_stop_state(struct task_struct *tsk) +{ + tsk->stopped_state = 0; +} +#endif + /* * Per process flags */ @@ -1042,7 +1149,7 @@ static inline void put_task_struct(struc #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ -#define PF_FREEZE 0x00004000 /* this task is being frozen for suspend now */ +#define PF_EXIT_RESTART 0x00004000 /* do_exit() restarted, see do_exit() */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1097,6 +1204,21 @@ extern unsigned long long sched_clock(vo extern unsigned long long 
current_sched_time(const struct task_struct *current_task); +static inline unsigned long cycles_to_clocks(cycles_t cycles) +{ + extern unsigned long cycles_per_clock; + do_div(cycles, cycles_per_clock); + return cycles; +} + +static inline u64 cycles_to_jiffies(cycles_t cycles) +{ + extern unsigned long cycles_per_jiffy; + do_div(cycles, cycles_per_jiffy); + return cycles; +} + + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -1162,12 +1284,222 @@ extern struct task_struct init_task; extern struct mm_struct init_mm; -#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) -extern struct task_struct *find_task_by_pid_type(int type, int pid); +#define find_task_by_pid_all(nr) \ + find_task_by_pid_type_all(PIDTYPE_PID, nr) +extern struct task_struct *find_task_by_pid_type_all(int type, int pid); extern void set_special_pids(pid_t session, pid_t pgrp); extern void __set_special_pids(pid_t session, pid_t pgrp); +#ifndef CONFIG_VE +#define find_task_by_pid_ve find_task_by_pid_all + +#define ve_is_super(env) 1 +#define ve_accessible(target, owner) 1 +#define ve_accessible_strict(target, owner) 1 +#define ve_accessible_veid(target, owner) 1 +#define ve_accessible_strict_veid(target, owner) 1 + +#define VEID(envid) 0 + +static inline pid_t virt_pid(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t virt_tgid(struct task_struct *tsk) +{ + return tsk->tgid; +} + +static inline pid_t virt_pgid(struct task_struct *tsk) +{ + return tsk->signal->pgrp; +} + +static inline pid_t virt_sid(struct task_struct *tsk) +{ + return tsk->signal->session; +} + +#define get_task_pid_ve(tsk, ve) get_task_pid(tsk) + +static inline pid_t get_task_pid(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t get_task_tgid(struct task_struct *tsk) +{ + return tsk->tgid; +} + +static inline pid_t get_task_pgid(struct task_struct *tsk) +{ + return tsk->signal->pgrp; +} + +static inline pid_t get_task_sid(struct task_struct *tsk) +{ + return tsk->signal->session; +} + +static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) +{ +} + +static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) +{ +} + +static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) +{ +} + +static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) +{ +} + +static inline pid_t get_task_ppid(struct task_struct *p) +{ + return pid_alive(p) ? 
p->group_leader->real_parent->tgid : 0; +} + +#else /* CONFIG_VE */ + +#include +#include + +#define find_task_by_pid_ve(nr) \ + find_task_by_pid_type_ve(PIDTYPE_PID, nr) + +extern struct task_struct *find_task_by_pid_type_ve(int type, int pid); + +#define VEID(envid) ((envid)->veid) + +#define ve_is_super(env) ((env) == get_ve0()) +#define ve_accessible_strict(target, owner) ((target) == (owner)) +static inline int ve_accessible(struct ve_struct *target, + struct ve_struct *owner) { + return ve_is_super(owner) || ve_accessible_strict(target, owner); +} + +#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) +static inline int ve_accessible_veid(envid_t target, envid_t owner) +{ + return get_ve0()->veid == owner || + ve_accessible_strict_veid(target, owner); +} + +static inline pid_t virt_pid(struct task_struct *tsk) +{ + return tsk->vpid; +} + +static inline pid_t virt_tgid(struct task_struct *tsk) +{ + return tsk->vtgid; +} + +static inline pid_t virt_pgid(struct task_struct *tsk) +{ + return tsk->signal->vpgrp; +} + +static inline pid_t virt_sid(struct task_struct *tsk) +{ + return tsk->signal->vsession; +} + +static inline pid_t get_task_pid_ve(struct task_struct *tsk, struct ve_struct *env) +{ + return ve_is_super(env) ? tsk->pid : virt_pid(tsk); +} + +static inline pid_t get_task_pid(struct task_struct *tsk) +{ + return get_task_pid_ve(tsk, get_exec_env()); +} + +static inline pid_t get_task_tgid(struct task_struct *tsk) +{ + return ve_is_super(get_exec_env()) ? tsk->tgid : virt_tgid(tsk); +} + +static inline pid_t get_task_pgid(struct task_struct *tsk) +{ + return ve_is_super(get_exec_env()) ? tsk->signal->pgrp : virt_pgid(tsk); +} + +static inline pid_t get_task_sid(struct task_struct *tsk) +{ + return ve_is_super(get_exec_env()) ? tsk->signal->session : virt_sid(tsk); +} + +static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) +{ + tsk->vpid = pid; +} + +static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) +{ + tsk->vtgid = pid; +} + +static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) +{ + tsk->signal->vpgrp = pid; +} + +static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) +{ + tsk->signal->vsession = pid; +} + +static inline pid_t get_task_ppid(struct task_struct *p) +{ + struct task_struct *parent; + struct ve_struct *env; + + if (!pid_alive(p)) + return 0; + env = get_exec_env(); + if (get_task_pid_ve(p, env) == 1) + return 0; + parent = p->group_leader->real_parent; + return ve_accessible(VE_TASK_INFO(parent)->owner_env, env) ? + get_task_tgid(parent) : 1; +} + +void ve_sched_get_cpu_stat(struct ve_struct *envid, cycles_t *idle, + cycles_t *strv, unsigned int cpu); +void ve_sched_attach(struct ve_struct *envid); + +#endif /* CONFIG_VE */ + + +#ifdef CONFIG_VE +extern cycles_t __ve_sched_get_idle_time(struct ve_struct *ve, int cpu); +extern cycles_t ve_sched_get_iowait_time(int cpu); +#else +#define __ve_sched_get_idle_time(ve, cpu) 0 +#define ve_sched_get_iowait_time(cpu) 0 +#endif + +#define ve_sched_get_idle_time(cpu) \ + __ve_sched_get_idle_time(get_exec_env(), cpu) + +#ifdef CONFIG_SCHED_VCPU +struct vcpu_scheduler; +extern void fastcall vsched_cpu_online_map(struct vcpu_scheduler *sched, + cpumask_t *mask); +#else +#define vsched_cpu_online_map(vsched, mask) do { \ + *mask = cpu_online_map; \ + } while (0) +#endif + /* per-UID process charging. 
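 *  (Editor's note: the reference rules are unchanged by the patch,
 *   e.g., assuming the upstream free_uid() counterpart:
 *	struct user_struct *u = alloc_uid(uid);	- charged reference
 *	...
 *	free_uid(u);)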
*/ +extern int set_user(uid_t new_ruid, int dumpclear); extern struct user_struct * alloc_uid(uid_t); static inline struct user_struct *get_uid(struct user_struct *u) { @@ -1185,7 +1517,7 @@ extern int FASTCALL(wake_up_state(struct extern int FASTCALL(wake_up_process(struct task_struct * tsk)); extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, unsigned long clone_flags)); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU) extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } @@ -1300,12 +1632,19 @@ extern struct task_struct *child_reaper; extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + long pid0); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); extern void get_task_comm(char *to, struct task_struct *tsk); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU) extern void wait_task_inactive(struct task_struct * p); #else #define wait_task_inactive(p) do { } while (0) @@ -1314,28 +1653,99 @@ extern void wait_task_inactive(struct ta #define remove_parent(p) list_del_init(&(p)->sibling) #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task_all(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +#define for_each_process_all(p) \ + for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do +#define do_each_thread_all(g, t) \ + for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do + +#define while_each_thread_all(g, t) \ + while ((t = next_thread(t)) != g) + +#ifndef CONFIG_VE + +#define for_each_process_ve(p) for_each_process_all(p) +#define do_each_thread_ve(g, t) do_each_thread_all(g, t) +#define while_each_thread_ve(g, t) while_each_thread_all(g, t) +#define first_task_ve() next_task_ve(&init_task) +#define __first_task_ve(owner) next_task_ve(&init_task) +#define __next_task_ve(owner, p) next_task_ve(p) +#define next_task_ve(p) \ + (next_task_all(p) != &init_task ? next_task_all(p) : NULL) + +#else /* CONFIG_VE */ + +static inline struct task_struct *ve_lh2task(struct ve_struct *ve, + struct list_head *lh) +{ + return lh == &ve->vetask_lh ? 
NULL : + list_entry(lh, struct task_struct, ve_task_info.vetask_list); +} + +static inline struct task_struct *__first_task_ve(struct ve_struct *ve) +{ + struct task_struct *tsk; + + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(&init_task); + if (tsk == &init_task) + tsk = NULL; + } else { + tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next)); + } + return tsk; +} -#define while_each_thread(g, t) \ +static inline struct task_struct *__next_task_ve(struct ve_struct *ve, + struct task_struct *tsk) +{ + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(tsk); + if (tsk == &init_task) + tsk = NULL; + } else { + BUG_ON(tsk->ve_task_info.owner_env != ve); + tsk = ve_lh2task(ve, rcu_dereference(tsk-> + ve_task_info.vetask_list.next)); + } + return tsk; +} + +#define first_task_ve() __first_task_ve(get_exec_env()) +#define next_task_ve(p) __next_task_ve(get_exec_env(), p) +/* no one uses prev_task_ve(), copy next_task_ve() if needed */ + +#define for_each_process_ve(p) \ + for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) + +#define do_each_thread_ve(g, t) \ + for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do + +#define while_each_thread_ve(g, t) \ while ((t = next_thread(t)) != g) +#endif /* CONFIG_VE */ + /* de_thread depends on thread_group_leader not being a pid based check */ #define thread_group_leader(p) (p == p->group_leader) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), + struct task_struct *tsk; + + tsk = list_entry(rcu_dereference(p->thread_group.next), struct task_struct, thread_group); +#ifdef CONFIG_VE + /* all threads should belong to ONE ve! */ + BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); +#endif + return tsk; } static inline int thread_group_empty(struct task_struct *p) @@ -1486,28 +1896,63 @@ extern void signal_wake_up(struct task_s */ #ifdef CONFIG_SMP -static inline unsigned int task_cpu(const struct task_struct *p) +static inline unsigned int task_pcpu(const struct task_struct *p) { return task_thread_info(p)->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu) { task_thread_info(p)->cpu = cpu; } #else +static inline unsigned int task_pcpu(const struct task_struct *p) +{ + return 0; +} + +static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SCHED_VCPU + +static inline unsigned int task_vsched_id(const struct task_struct *p) +{ + return p->vsched_id; +} + static inline unsigned int task_cpu(const struct task_struct *p) { + return p->vcpu_id; +} + +extern void set_task_cpu(struct task_struct *p, unsigned int vcpu); +extern int vcpu_online(int cpu); + +#else + +static inline unsigned int task_vsched_id(const struct task_struct *p) +{ return 0; } +static inline unsigned int task_cpu(const struct task_struct *p) +{ + return task_pcpu(p); +} + static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { + set_task_pcpu(p, cpu); } -#endif /* CONFIG_SMP */ +#define vcpu_online(cpu) cpu_online(cpu) +#endif /* CONFIG_SCHED_VCPU */ #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT extern void arch_pick_mmap_layout(struct mm_struct *mm); @@ -1540,20 +1985,12 @@ static inline int frozen(struct task_str } /* - * Check if there is a request to freeze a process - */ -static inline int freezing(struct task_struct *p) -{ - return p->flags & PF_FREEZE; -} - -/* * 
Request that a process be frozen * FIXME: SMP problem. We may not modify other process' flags! */ static inline void freeze(struct task_struct *p) { - p->flags |= PF_FREEZE; + set_tsk_thread_flag(p, TIF_FREEZE); } /* @@ -1561,7 +1998,7 @@ static inline void freeze(struct task_st */ static inline void do_not_freeze(struct task_struct *p) { - p->flags &= ~PF_FREEZE; + clear_tsk_thread_flag(p, TIF_FREEZE); } /* @@ -1582,35 +2019,44 @@ static inline int thaw_process(struct ta */ static inline void frozen_process(struct task_struct *p) { - p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; + clear_tsk_thread_flag(p, TIF_FREEZE); + p->flags |= PF_FROZEN; } -extern void refrigerator(void); extern int freeze_processes(void); extern void thaw_processes(void); -static inline int try_to_freeze(void) -{ - if (freezing(current)) { - refrigerator(); - return 1; - } else - return 0; -} #else static inline int frozen(struct task_struct *p) { return 0; } -static inline int freezing(struct task_struct *p) { return 0; } static inline void freeze(struct task_struct *p) { BUG(); } static inline int thaw_process(struct task_struct *p) { return 1; } static inline void frozen_process(struct task_struct *p) { BUG(); } -static inline void refrigerator(void) {} static inline int freeze_processes(void) { BUG(); return 0; } static inline void thaw_processes(void) {} -static inline int try_to_freeze(void) { return 0; } - #endif /* CONFIG_PM */ + +extern void refrigerator(void); + +/* + * Check if there is a request to freeze a process + */ +static inline int freezing(struct task_struct *p) +{ + return test_tsk_thread_flag(p, TIF_FREEZE); +} + +static inline int try_to_freeze(void) +{ + if (freezing(current)) { + refrigerator(); + return 1; + } else + return 0; +} + + #endif /* __KERNEL__ */ #endif diff -uprN linux-2.6.18/include/linux/scx200.h linux-2.6.18.ovz/include/linux/scx200.h --- linux-2.6.18/include/linux/scx200.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/scx200.h 2007-06-13 06:55:07.000000000 -0400 @@ -32,7 +32,7 @@ extern unsigned scx200_cb_base; /* High Resolution Timer */ #define SCx200_TIMER_OFFSET 0x08 -#define SCx200_TIMER_SIZE 0x05 +#define SCx200_TIMER_SIZE 0x06 /* Clock Generators */ #define SCx200_CLOCKGEN_OFFSET 0x10 diff -uprN linux-2.6.18/include/linux/sem.h linux-2.6.18.ovz/include/linux/sem.h --- linux-2.6.18/include/linux/sem.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sem.h 2007-06-13 06:55:07.000000000 -0400 @@ -155,6 +155,9 @@ static inline void exit_sem(struct task_ } #endif +int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); + #endif /* __KERNEL__ */ #endif /* _LINUX_SEM_H */ diff -uprN linux-2.6.18/include/linux/serial_core.h linux-2.6.18.ovz/include/linux/serial_core.h --- linux-2.6.18/include/linux/serial_core.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/serial_core.h 2007-06-13 06:55:07.000000000 -0400 @@ -319,6 +319,7 @@ struct uart_info { #define UIF_CTS_FLOW ((__force uif_t) (1 << 26)) #define UIF_NORMAL_ACTIVE ((__force uif_t) (1 << 29)) #define UIF_INITIALIZED ((__force uif_t) (1 << 31)) +#define UIF_SUSPENDED ((__force uif_t) (1 << 30)) int blocked_open; diff -uprN linux-2.6.18/include/linux/shm.h linux-2.6.18.ovz/include/linux/shm.h --- linux-2.6.18/include/linux/shm.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/shm.h 2007-06-13 06:55:07.000000000 -0400 @@ -104,6 +104,10 @@ 
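A sketch of how a kernel thread would use the relocated freezer helpers now that try_to_freeze() is available even without CONFIG_PM; the thread body and timeout are illustrative only:

static int my_worker(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* parks in refrigerator() while TIF_FREEZE is set */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}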
static inline long do_shmat(int shmid, c } #endif +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); +void shm_clean_ns(struct ipc_namespace *ns); + #endif /* __KERNEL__ */ #endif /* _LINUX_SHM_H_ */ diff -uprN linux-2.6.18/include/linux/shmem_fs.h linux-2.6.18.ovz/include/linux/shmem_fs.h --- linux-2.6.18/include/linux/shmem_fs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/shmem_fs.h 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,9 @@ struct shmem_inode_info { swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ struct list_head swaplist; /* chain of maybes on swap */ struct inode vfs_inode; +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *shmi_ub; +#endif }; struct shmem_sb_info { @@ -36,4 +39,6 @@ static inline struct shmem_inode_info *S return container_of(inode, struct shmem_inode_info, vfs_inode); } +extern struct file_system_type tmpfs_fs_type; + #endif diff -uprN linux-2.6.18/include/linux/signal.h linux-2.6.18.ovz/include/linux/signal.h --- linux-2.6.18/include/linux/signal.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/signal.h 2007-06-13 06:55:07.000000000 -0400 @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include #include +#include /* * Real Time signals may be queued. @@ -17,6 +18,9 @@ struct sigqueue { int flags; siginfo_t info; struct user_struct *user; +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *sig_ub; +#endif }; /* flags values. */ @@ -241,6 +245,8 @@ extern int sigprocmask(int, sigset_t *, struct pt_regs; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern kmem_cache_t *sigqueue_cachep; + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ diff -uprN linux-2.6.18/include/linux/skbuff.h linux-2.6.18.ovz/include/linux/skbuff.h --- linux-2.6.18/include/linux/skbuff.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/skbuff.h 2007-06-13 06:55:07.000000000 -0400 @@ -228,6 +228,8 @@ enum { * @secmark: security marking */ +#include + struct sk_buff { /* These two members must be first. */ struct sk_buff *next; @@ -282,7 +284,9 @@ struct sk_buff { nfctinfo:3; __u8 pkt_type:3, fclone:2, - ipvs_property:1; + ipvs_property:1, + accounted:1, + redirected:1; __be16 protocol; void (*destructor)(struct sk_buff *skb); @@ -317,6 +321,8 @@ struct sk_buff { *data, *tail, *end; + struct skb_beancounter skb_bc; + struct ve_struct *owner_env; }; #ifdef __KERNEL__ @@ -324,6 +330,7 @@ struct sk_buff { * Handling routines are only of interest to the kernel */ #include +#include #include @@ -619,6 +626,13 @@ static inline void skb_queue_head_init(s list->qlen = 0; } +static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) +{ + skb_queue_head_init(list); + lockdep_set_class(&list->lock, class); +} + /* * Insert an sk_buff at the start of a list. 
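A hedged sketch of the new skb_queue_head_init_class() helper above, which lets a driver give its queue spinlock a distinct lockdep class; the names are hypothetical and the key must have static storage:

static struct lock_class_key my_rx_queue_key;

static void my_rx_queue_init(struct sk_buff_head *q)
{
	/* same as skb_queue_head_init(), plus a private lockdep class so
	 * nesting this queue's lock inside another does not trip lockdep */
	skb_queue_head_init_class(q, &my_rx_queue_key);
}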
* @@ -1064,6 +1078,8 @@ static inline void pskb_trim_unique(stru */ static inline void skb_orphan(struct sk_buff *skb) { + ub_skb_uncharge(skb); + if (skb->destructor) skb->destructor(skb); skb->destructor = NULL; diff -uprN linux-2.6.18/include/linux/slab.h linux-2.6.18.ovz/include/linux/slab.h --- linux-2.6.18/include/linux/slab.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/slab.h 2007-06-13 06:55:07.000000000 -0400 @@ -47,6 +47,26 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_DESTROY_BY_RCU 0x00080000UL /* defer freeing pages to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ +/* + * allocation rules: __GFP_UBC 0 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * cache (SLAB_UBC) charge charge + * (usual caches: mm, vma, task_struct, ...) + * + * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- + * (ub_kmalloc) (kmalloc) + * + * cache (no UB flags) BUG() --- + * (nonub caches, mempools) + * + * pages charge --- + * (ub_vmalloc, (vmalloc, + * poll, fdsets, ...) non-ub allocs) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#define SLAB_UBC 0x20000000UL /* alloc space for ubs ... */ +#define SLAB_NO_CHARGE 0x40000000UL /* ... but don't charge */ + /* flags passed to a constructor func */ #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ @@ -68,6 +88,7 @@ extern void kmem_cache_free(kmem_cache_t extern unsigned int kmem_cache_size(kmem_cache_t *); extern const char *kmem_cache_name(kmem_cache_t *); extern kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags); +extern void show_slab_info(void); /* Size description struct for general caches. */ struct cache_sizes { @@ -76,6 +97,7 @@ struct cache_sizes { kmem_cache_t *cs_dmacachep; }; extern struct cache_sizes malloc_sizes[]; +extern int malloc_cache_num; extern void *__kmalloc(size_t, gfp_t); #ifndef CONFIG_DEBUG_SLAB @@ -133,7 +155,7 @@ extern void *__kmalloc_track_caller(size */ static inline void *kmalloc(size_t size, gfp_t flags) { - if (__builtin_constant_p(size)) { + if (__builtin_constant_p(size) && __builtin_constant_p(flags)) { int i = 0; #define CACHE(x) \ if (size <= x) \ @@ -147,6 +169,8 @@ static inline void *kmalloc(size_t size, __you_cannot_kmalloc_that_much(); } found: + if (flags & __GFP_UBC) + i += malloc_cache_num; return kmem_cache_alloc((flags & GFP_DMA) ? malloc_sizes[i].cs_dmacachep : malloc_sizes[i].cs_cachep, flags); @@ -154,6 +178,7 @@ found: return __kmalloc(size, flags); } +#define ub_kmalloc(size, flags) kmalloc(size, ((flags) | __GFP_UBC)) extern void *__kzalloc(size_t, gfp_t); /** @@ -177,12 +202,15 @@ static inline void *kzalloc(size_t size, __you_cannot_kzalloc_that_much(); } found: + if (flags & __GFP_UBC) + i += malloc_cache_num; return kmem_cache_zalloc((flags & GFP_DMA) ? malloc_sizes[i].cs_dmacachep : malloc_sizes[i].cs_cachep, flags); } return __kzalloc(size, flags); } +#define ub_kzalloc(size, flags) kzalloc(size, (flags) | __GFP_UBC) /** * kcalloc - allocate memory for an array. The memory is set to zero. 
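The charging table above boils down to: only allocations carrying __GFP_UBC are billed to the current beancounter, and they are served from the shadow half of malloc_sizes[] (index shifted by malloc_cache_num). A minimal sketch, side by side:

/* Illustrative only: one charged and one uncharged allocation. */
static void *alloc_for_container(size_t len)
{
	void *plain = kmalloc(len, GFP_KERNEL);		/* no __GFP_UBC: never charged */
	void *charged = ub_kmalloc(len, GFP_KERNEL);	/* kmalloc(len, GFP_KERNEL | __GFP_UBC) */

	kfree(plain);
	return charged;		/* caller frees; error handling omitted */
}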
diff -uprN linux-2.6.18/include/linux/smp.h linux-2.6.18.ovz/include/linux/smp.h --- linux-2.6.18/include/linux/smp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/smp.h 2007-06-13 06:55:07.000000000 -0400 @@ -9,6 +9,9 @@ extern void cpu_idle(void); +struct pt_regs; +typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); + #ifdef CONFIG_SMP #include @@ -48,6 +51,8 @@ extern int __cpu_up(unsigned int cpunum) */ extern void smp_cpus_done(unsigned int max_cpus); +extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); + /* * Call a function on all other processors */ @@ -97,6 +102,12 @@ static inline void smp_send_reschedule(i #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) +static inline int smp_nmi_call_function(smp_nmi_function func, + void *info, int wait) +{ + return 0; +} + #endif /* !SMP */ /* diff -uprN linux-2.6.18/include/linux/socket.h linux-2.6.18.ovz/include/linux/socket.h --- linux-2.6.18/include/linux/socket.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/socket.h 2007-06-13 06:55:07.000000000 -0400 @@ -286,6 +286,16 @@ struct ucred { #define IPX_TYPE 1 #ifdef __KERNEL__ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). + */ + extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len); @@ -299,6 +309,7 @@ extern int memcpy_toiovec(struct iovec * extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int vz_security_proto_check(int family, int type, int protocol); #endif #endif /* not kernel and not glibc */ diff -uprN linux-2.6.18/include/linux/stddef.h linux-2.6.18.ovz/include/linux/stddef.h --- linux-2.6.18/include/linux/stddef.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/stddef.h 2007-06-13 06:55:07.000000000 -0400 @@ -10,11 +10,13 @@ #define NULL ((void *)0) #endif +#ifdef __KERNEL__ #undef offsetof #ifdef __compiler_offsetof #define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) #else #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif +#endif /* __KERNEL__ */ #endif diff -uprN linux-2.6.18/include/linux/sunrpc/Kbuild linux-2.6.18.ovz/include/linux/sunrpc/Kbuild --- linux-2.6.18/include/linux/sunrpc/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sunrpc/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1 @@ -unifdef-y := debug.h +unifdef-y += debug.h diff -uprN linux-2.6.18/include/linux/sunrpc/clnt.h linux-2.6.18.ovz/include/linux/sunrpc/clnt.h --- linux-2.6.18/include/linux/sunrpc/clnt.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sunrpc/clnt.h 2007-06-13 06:55:07.000000000 -0400 @@ -52,7 +52,8 @@ struct rpc_clnt { cl_intr : 1,/* interruptible */ cl_autobind : 1,/* use getport() */ cl_oneshot : 1,/* dispose after use */ - cl_dead : 1;/* abandoned */ + cl_dead : 1,/* abandoned */ + cl_broken : 1;/* no response for too long */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ struct rpc_portmap * cl_pmap; /* port mapping */ @@ -66,6 +67,8 @@ struct rpc_clnt { struct rpc_rtt
cl_rtt_default; struct rpc_portmap cl_pmap_default; char cl_inline_name[32]; + + unsigned long cl_pr_time; }; #define cl_timeout cl_xprt->timeout #define cl_prog cl_pmap->pm_prog diff -uprN linux-2.6.18/include/linux/sunrpc/debug.h linux-2.6.18.ovz/include/linux/sunrpc/debug.h --- linux-2.6.18/include/linux/sunrpc/debug.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sunrpc/debug.h 2007-06-13 06:55:07.000000000 -0400 @@ -95,6 +95,7 @@ enum { CTL_SLOTTABLE_TCP, CTL_MIN_RESVPORT, CTL_MAX_RESVPORT, + CTL_ABORT_TIMEOUT, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff -uprN linux-2.6.18/include/linux/sunrpc/xprt.h linux-2.6.18.ovz/include/linux/sunrpc/xprt.h --- linux-2.6.18/include/linux/sunrpc/xprt.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sunrpc/xprt.h 2007-06-13 06:55:07.000000000 -0400 @@ -41,6 +41,14 @@ extern unsigned int xprt_max_resvport; #define RPC_DEF_MAX_RESVPORT (1023U) /* + * Grand abort timeout (stop the client if it occurs) + */ +extern int xprt_abort_timeout; + +#define RPC_MIN_ABORT_TIMEOUT 300 +#define RPC_MAX_ABORT_TIMEOUT INT_MAX + +/* * This describes a timeout strategy */ struct rpc_timeout { @@ -122,6 +130,7 @@ struct rpc_xprt { struct rpc_xprt_ops * ops; /* transport methods */ struct socket * sock; /* BSD socket layer */ struct sock * inet; /* INET layer */ + struct ve_struct * owner_env; /* VE owner of mount */ struct rpc_timeout timeout; /* timeout parms */ struct sockaddr_in addr; /* server address */ diff -uprN linux-2.6.18/include/linux/swap.h linux-2.6.18.ovz/include/linux/swap.h --- linux-2.6.18/include/linux/swap.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/swap.h 2007-06-13 06:55:07.000000000 -0400 @@ -13,6 +13,7 @@ #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 +#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */ static inline int current_is_kswapd(void) { @@ -88,6 +89,7 @@ struct address_space; struct sysinfo; struct writeback_control; struct zone; +struct user_beancounter; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of @@ -117,6 +119,7 @@ enum { SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_READONLY = (1 << 2), }; #define SWAP_CLUSTER_MAX 32 @@ -127,6 +130,7 @@ enum { /* * The in-memory structure used to track swap areas. */ +struct user_beancounter; struct swap_info_struct { unsigned int flags; int prio; /* swap priority */ @@ -144,6 +148,9 @@ struct swap_info_struct { unsigned int max; unsigned int inuse_pages; int next; /* next entry on swap list */ +#ifdef CONFIG_USER_SWAP_ACCOUNTING + struct user_beancounter **swap_ubs; +#endif }; struct swap_list_t { @@ -151,11 +158,16 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +extern struct swap_list_t swap_list; +extern struct swap_info_struct swap_info[MAX_SWAPFILES]; + /* Swap 50% full? Release swapcache more aggressively..
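With swap_list and swap_info[] now exported, external code can walk the swap devices. A sketch that skips devices carrying the new SWP_READONLY flag (illustrative, and it assumes the usual head/next chaining of swap_list_t):

static int count_writable_swap(void)
{
	int type, nr = 0;

	spin_lock(&swap_lock);
	for (type = swap_list.head; type >= 0; type = swap_info[type].next)
		if (!(swap_info[type].flags & SWP_READONLY))
			nr++;
	spin_unlock(&swap_lock);
	return nr;
}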
*/ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) /* linux/mm/oom_kill.c */ extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern int oom_kill_process(struct task_struct *p, const char *message); +extern struct task_struct *oom_select_bad_process(struct user_beancounter *ub); /* linux/mm/memory.c */ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); @@ -190,6 +202,7 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; extern int sysctl_min_unmapped_ratio; +extern int sysctl_min_slab_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 @@ -220,6 +233,9 @@ extern struct address_space swapper_spac extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, gfp_t); extern void __delete_from_swap_cache(struct page *); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); +extern int __add_to_swap_cache(struct page *page, + swp_entry_t entry, gfp_t gfp_mask); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); extern int move_from_swap_cache(struct page *, unsigned long, @@ -233,7 +249,7 @@ extern struct page * read_swap_cache_asy extern long total_swap_pages; extern unsigned int nr_swapfiles; extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct user_beancounter *); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); @@ -245,6 +261,7 @@ extern sector_t map_swap_page(struct swa extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +extern int try_to_remove_exclusive_swap_page(struct page *); struct backing_dev_info; extern spinlock_t swap_lock; @@ -346,7 +363,7 @@ static inline int remove_exclusive_swap_ return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct user_beancounter *ub) { swp_entry_t entry; entry.val = 0; diff -uprN linux-2.6.18/include/linux/sysctl.h linux-2.6.18.ovz/include/linux/sysctl.h --- linux-2.6.18/include/linux/sysctl.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sysctl.h 2007-06-13 06:55:07.000000000 -0400 @@ -150,6 +150,17 @@ enum KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, + KERN_SILENCE_LEVEL=200, /* int: Console silence loglevel */ + KERN_ALLOC_FAIL_WARN=201, /* int: whether we'll print "alloc failure" */ + KERN_VIRT_PIDS=202, /* int: VE pids virtualization */ + KERN_VIRT_OSRELEASE=205,/* virtualization of utsname.release */ + KERN_FAIRSCHED_MAX_LATENCY=211, /* int: Max start_tag delta */ + KERN_VCPU_SCHED_TIMESLICE=212, + KERN_VCPU_TIMESLICE=213, + KERN_SCALE_VCPU_FREQUENCY=214, /* Scale cpu frequency inside VE */ + KERN_VCPU_HOT_TIMESLICE=215, + KERN_VE_ALLOW_KTHREADS=207, + KERN_VE_MEMINFO=208, /* int: use privvmpages(0) or oomguarpages(1) */ }; @@ -191,6 +202,7 @@ enum VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? 
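The oom_select_bad_process()/oom_kill_process() exports above let beancounter code run a scoped OOM kill. A hedged sketch of that calling pattern, omitting the tasklist locking the real callers need; the message string is hypothetical:

static void ub_oom_kill(struct user_beancounter *ub)
{
	struct task_struct *p;

	p = oom_select_bad_process(ub);	/* victim search restricted to this UB */
	if (p != NULL)
		oom_kill_process(p, "UB out of memory");
}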
*/ + VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ }; @@ -406,15 +418,20 @@ enum NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, NET_IPV4_IPFRAG_MAX_DIST=112, + NET_TCP_MAX_TW_BUCKETS_UB=151, + NET_TCP_MAX_TW_KMEM_FRACTION=152, NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, NET_TCP_DMA_COPYBREAK=116, NET_TCP_SLOW_START_AFTER_IDLE=117, + NET_TCP_PORT_FORWARD_RANGE=150, + NET_TCP_USE_SG=245, }; enum { NET_IPV4_ROUTE_FLUSH=1, + NET_IPV4_ROUTE_SRC_CHECK=188, NET_IPV4_ROUTE_MIN_DELAY=2, NET_IPV4_ROUTE_MAX_DELAY=3, NET_IPV4_ROUTE_GC_THRESH=4, @@ -794,6 +811,12 @@ enum FS_AIO_NR=18, /* current system-wide number of aio requests */ FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */ FS_INOTIFY=20, /* inotify submenu */ + FS_AT_VSYSCALL=21, /* int: to announce vsyscall data */ +}; + +/* /proc/sys/debug */ +enum { + DBG_DECODE_CALLTRACES = 1, /* int: decode call traces on oops */ }; /* /proc/sys/fs/quota/ */ @@ -904,6 +927,8 @@ enum #ifdef __KERNEL__ #include +extern int ve_allow_kthreads; + extern void sysctl_init(void); typedef struct ctl_table ctl_table; @@ -948,6 +973,7 @@ extern ctl_handler sysctl_string; extern ctl_handler sysctl_intvec; extern ctl_handler sysctl_jiffies; extern ctl_handler sysctl_ms_jiffies; +extern ctl_handler sysctl_strategy_bset; /* @@ -988,6 +1014,8 @@ extern ctl_handler sysctl_ms_jiffies; */ /* A sysctl table is an array of struct ctl_table: */ +struct ve_struct; + struct ctl_table { int ctl_name; /* Binary ID */ @@ -1001,6 +1029,8 @@ struct ctl_table struct proc_dir_entry *de; /* /proc control block */ void *extra1; void *extra2; + struct ve_struct *owner_env; + int virt_handler; }; /* struct ctl_table_header is used to maintain dynamic lists of @@ -1017,6 +1047,9 @@ struct ctl_table_header * register_sysct int insert_at_head); void unregister_sysctl_table(struct ctl_table_header * table); +ctl_table *clone_sysctl_template(ctl_table *tmpl); +void free_sysctl_clone(ctl_table *clone); + #else /* __KERNEL__ */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.18/include/linux/sysfs.h linux-2.6.18.ovz/include/linux/sysfs.h --- linux-2.6.18/include/linux/sysfs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/sysfs.h 2007-06-13 06:55:07.000000000 -0400 @@ -120,6 +120,8 @@ int sysfs_create_group(struct kobject *, void sysfs_remove_group(struct kobject *, const struct attribute_group *); void sysfs_notify(struct kobject * k, char *dir, char *attr); +extern struct file_system_type sysfs_fs_type; + #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir(struct kobject * k) diff -uprN linux-2.6.18/include/linux/task_io_accounting.h linux-2.6.18.ovz/include/linux/task_io_accounting.h --- linux-2.6.18/include/linux/task_io_accounting.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/task_io_accounting.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,42 @@ +/* + * task_io_accounting: a structure which is used for recording a single task's + * IO statistics. + * + * Don't include this header file directly - it is designed to be dragged in via + * sched.h. + * + * Blame akpm@osdl.org for all this. + */ + +#ifndef __TASK_IO_ACCOUNTING_H_ +#define __TASK_IO_ACCOUNTING_H_ + +#ifdef CONFIG_TASK_IO_ACCOUNTING +struct task_io_accounting { + /* + * The number of bytes which this task has caused to be read from + * storage. + */ + u64 read_bytes; + + /* + * The number of bytes which this task has caused, or shall cause to be + * written to disk. 
+ */ + u64 write_bytes; + + /* + * A task can cause "negative" IO too. If this task truncates some + * dirty pagecache, some IO which another task has been accounted for + * (in its write_bytes) will not be happening. We _could_ just + * subtract that from the truncating task's write_bytes, but there is + * information loss in doing that. + */ + u64 cancelled_write_bytes; +}; +#else +struct task_io_accounting { +}; +#endif + +#endif diff -uprN linux-2.6.18/include/linux/task_io_accounting_ops.h linux-2.6.18.ovz/include/linux/task_io_accounting_ops.h --- linux-2.6.18/include/linux/task_io_accounting_ops.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/task_io_accounting_ops.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,58 @@ +/* + * Task I/O accounting operations + */ +#ifndef __TASK_IO_ACCOUNTING_OPS_INCLUDED +#define __TASK_IO_ACCOUNTING_OPS_INCLUDED + +#include + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static inline void task_io_account_read(size_t bytes) +{ + ub_io_account_read(bytes); + current->ioac.read_bytes += bytes; +} + +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) +{ + if (sync) + ub_io_account_write(bytes); + else + ub_io_account_dirty(page, bytes); + + current->ioac.write_bytes += bytes; +} + +static inline void task_io_account_cancelled_write(size_t bytes) +{ + ub_io_account_write_cancelled(bytes); + current->ioac.cancelled_write_bytes += bytes; +} + +static inline void task_io_accounting_init(struct task_struct *tsk) +{ + memset(&tsk->ioac, 0, sizeof(tsk->ioac)); +} + +#else + +static inline void task_io_account_read(size_t bytes) +{ +} + +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) +{ +} + +static inline void task_io_account_cancelled_write(size_t bytes) +{ +} + +static inline void task_io_accounting_init(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_TASK_IO_ACCOUNTING */ +#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ diff -uprN linux-2.6.18/include/linux/tc_act/Kbuild linux-2.6.18.ovz/include/linux/tc_act/Kbuild --- linux-2.6.18/include/linux/tc_act/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/tc_act/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1,4 @@ -header-y += tc_gact.h tc_ipt.h tc_mirred.h tc_pedit.h +header-y += tc_gact.h +header-y += tc_ipt.h +header-y += tc_mirred.h +header-y += tc_pedit.h diff -uprN linux-2.6.18/include/linux/tc_ematch/Kbuild linux-2.6.18.ovz/include/linux/tc_ematch/Kbuild --- linux-2.6.18/include/linux/tc_ematch/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/tc_ematch/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1,4 @@ -headers-y := tc_em_cmp.h tc_em_meta.h tc_em_nbyte.h tc_em_text.h +header-y += tc_em_cmp.h +header-y += tc_em_meta.h +header-y += tc_em_nbyte.h +header-y += tc_em_text.h diff -uprN linux-2.6.18/include/linux/time.h linux-2.6.18.ovz/include/linux/time.h --- linux-2.6.18/include/linux/time.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/time.h 2007-06-13 06:55:07.000000000 -0400 @@ -108,7 +108,7 @@ extern void do_gettimeofday(struct timev extern int do_settimeofday(struct timespec *tv); extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) -extern long do_utimes(int dfd, char __user *filename, struct timeval *times); +extern long do_utimes(int dfd, char __user *filename, struct timeval *times, int flags); struct itimerval; extern int 
do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); diff -uprN linux-2.6.18/include/linux/tty.h linux-2.6.18.ovz/include/linux/tty.h --- linux-2.6.18/include/linux/tty.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/tty.h 2007-06-13 06:55:07.000000000 -0400 @@ -227,6 +227,7 @@ struct tty_struct { spinlock_t read_lock; /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; + struct ve_struct *owner_env; }; /* tty magic number */ @@ -254,6 +255,7 @@ struct tty_struct { #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ +#define TTY_CHARGED 19 /* Charged as ub resource */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) diff -uprN linux-2.6.18/include/linux/tty_driver.h linux-2.6.18.ovz/include/linux/tty_driver.h --- linux-2.6.18/include/linux/tty_driver.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/tty_driver.h 2007-06-13 06:55:07.000000000 -0400 @@ -213,14 +213,29 @@ struct tty_driver { unsigned int set, unsigned int clear); struct list_head tty_drivers; + struct ve_struct *owner_env; }; +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +#ifdef CONFIG_LEGACY_PTYS +extern struct tty_driver *pty_driver; +extern struct tty_driver *pty_slave_driver; +#endif + extern struct list_head tty_drivers; +extern rwlock_t tty_driver_guard; struct tty_driver *alloc_tty_driver(int lines); void put_tty_driver(struct tty_driver *driver); void tty_set_operations(struct tty_driver *driver, struct tty_operations *op); +struct class *init_ve_tty_class(void); +void fini_ve_tty_class(struct class *ve_tty_class); + /* tty driver magic number */ #define TTY_DRIVER_MAGIC 0x5402 diff -uprN linux-2.6.18/include/linux/ufs_fs.h linux-2.6.18.ovz/include/linux/ufs_fs.h --- linux-2.6.18/include/linux/ufs_fs.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/ufs_fs.h 2007-06-13 06:55:07.000000000 -0400 @@ -900,7 +900,7 @@ struct ufs_super_block_third { __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ __fs64 fs_pendingblocks;/* blocks in process of being freed */ __fs32 fs_pendinginodes;/*inodes in process of being freed */ - } fs_u2; + } __attribute__ ((packed)) fs_u2; } fs_un1; union { struct { diff -uprN linux-2.6.18/include/linux/utsname.h linux-2.6.18.ovz/include/linux/utsname.h --- linux-2.6.18/include/linux/utsname.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/utsname.h 2007-06-13 06:55:07.000000000 -0400 @@ -1,6 +1,11 @@ #ifndef _LINUX_UTSNAME_H #define _LINUX_UTSNAME_H +#include +#include +#include +#include + #define __OLD_UTS_LEN 8 struct oldold_utsname { @@ -30,7 +35,52 @@ struct new_utsname { char domainname[65]; }; -extern struct new_utsname system_utsname; +struct uts_namespace { + struct kref kref; + struct new_utsname name; +}; +extern struct uts_namespace init_uts_ns; +extern struct new_utsname virt_utsname; + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + kref_get(&ns->kref); +} + +#ifdef CONFIG_UTS_NS +extern int unshare_utsname(unsigned long unshare_flags, + struct uts_namespace **new_uts); +extern int copy_utsname(int flags, struct task_struct *tsk); +extern void free_uts_ns(struct kref *kref); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + 
kref_put(&ns->kref, free_uts_ns); +} +#else +static inline int unshare_utsname(unsigned long unshare_flags, + struct uts_namespace **new_uts) +{ + return -EINVAL; +} +static inline int copy_utsname(int flags, struct task_struct *tsk) +{ + return 0; +} +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} +#endif + +static inline struct new_utsname *utsname(void) +{ + return &current->nsproxy->uts_ns->name; +} + +static inline struct new_utsname *init_utsname(void) +{ + return &init_uts_ns.name; +} extern struct rw_semaphore uts_sem; #endif diff -uprN linux-2.6.18/include/linux/ve.h linux-2.6.18.ovz/include/linux/ve.h --- linux-2.6.18/include/linux/ve.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/ve.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,354 @@ +/* + * include/linux/ve.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VE_H +#define _LINUX_VE_H + +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +#include +#include +#include +#include +#include +#include +#include + +#ifdef VZMON_DEBUG +# define VZTRACE(fmt,args...) \ + printk(KERN_DEBUG fmt, ##args) +#else +# define VZTRACE(fmt,args...) +#endif /* VZMON_DEBUG */ + +struct tty_driver; +struct devpts_config; +struct task_struct; +struct new_utsname; +struct file_system_type; +struct icmp_mib; +struct ip_mib; +struct tcp_mib; +struct udp_mib; +struct linux_mib; +struct fib_info; +struct fib_rule; +struct veip_struct; +struct ve_monitor; +struct nsproxy; + +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct fib_table; +struct devcnfv4_struct; +struct ve_nfs_context; +#ifdef CONFIG_VE_IPTABLES +struct xt_af; +struct xt_table; +struct xt_target; +struct ip_conntrack; +typedef unsigned int (*ip_nat_helper_func)(void); +struct ve_ip_conntrack { + struct list_head *_ip_conntrack_hash; + struct list_head _ip_conntrack_expect_list; + struct list_head _ip_conntrack_unconfirmed; + struct ip_conntrack_protocol ** _ip_ct_protos; + struct list_head _ip_conntrack_helpers; + int _ip_conntrack_max; + int _ip_conntrack_vmalloc; + atomic_t _ip_conntrack_count; + void (*_ip_conntrack_destroyed)(struct ip_conntrack *conntrack); +#ifdef CONFIG_SYSCTL + unsigned long _ip_ct_tcp_timeouts[10]; + unsigned long _ip_ct_udp_timeout; + unsigned long _ip_ct_udp_timeout_stream; + unsigned long _ip_ct_icmp_timeout; + unsigned long _ip_ct_generic_timeout; + unsigned int _ip_ct_log_invalid; + unsigned long _ip_ct_tcp_timeout_max_retrans; + int _ip_ct_tcp_loose; + int _ip_ct_tcp_be_liberal; + int _ip_ct_tcp_max_retrans; + struct ctl_table_header *_ip_ct_sysctl_header; + ctl_table *_ip_ct_net_table; + ctl_table *_ip_ct_ipv4_table; + ctl_table *_ip_ct_netfilter_table; + ctl_table *_ip_ct_sysctl_table; +#endif /*CONFIG_SYSCTL*/ + + struct ip_nat_protocol **_ip_nat_protos; + ip_nat_helper_func _ip_nat_ftp_hook; + ip_nat_helper_func _ip_nat_irc_hook; + struct list_head *_ip_nat_bysource; + struct xt_table *_ip_nat_table; + + /* resource accounting */ + struct user_beancounter *ub; +}; +#endif +#endif + +#define UIDHASH_BITS_VE 6 +#define UIDHASH_SZ_VE (1 << UIDHASH_BITS_VE) + +struct ve_cpu_stats { + cycles_t idle_time; + cycles_t iowait_time; + cycles_t strt_idle_time; + cycles_t used_time; + seqcount_t stat_lock; + int nr_running; + int nr_unint; + cputime64_t user; + cputime64_t nice; + cputime64_t system; +} ____cacheline_aligned; + +struct ve_struct { + struct list_head ve_list; +
envid_t veid; + struct task_struct *init_entry; + struct list_head vetask_lh; + /* capability bounding set */ + kernel_cap_t ve_cap_bset; + atomic_t pcounter; + /* ref counter to ve from ipc */ + atomic_t counter; + unsigned int class_id; + struct rw_semaphore op_sem; + int is_running; + int is_locked; + atomic_t suspend; + int virt_pids; + /* see vzcalluser.h for VE_FEATURE_XXX definitions */ + __u64 features; + +/* VE's root */ + struct vfsmount *fs_rootmnt; + struct dentry *fs_root; + +/* sysctl */ + struct list_head sysctl_lh; + struct ctl_table_header *quota_header; + struct ctl_table *quota_table; + struct file_system_type *proc_fstype; + struct vfsmount *proc_mnt; + struct proc_dir_entry *proc_root; + struct proc_dir_entry *proc_sys_root; + struct proc_dir_entry *_proc_net; + struct proc_dir_entry *_proc_net_stat; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct proc_dir_entry *_proc_net_devsnmp6; +#endif + +/* BSD pty's */ +#ifdef CONFIG_LEGACY_PTYS + struct tty_driver *pty_driver; + struct tty_driver *pty_slave_driver; +#endif +#ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; + struct tty_driver *pts_driver; + struct idr *allocated_ptys; + struct file_system_type *devpts_fstype; + struct vfsmount *devpts_mnt; + struct dentry *devpts_root; + struct devpts_config *devpts_config; +#endif + + struct ve_nfs_context *nfs_context; + + struct file_system_type *shmem_fstype; + struct vfsmount *shmem_mnt; +#ifdef CONFIG_SYSFS + struct file_system_type *sysfs_fstype; + struct vfsmount *sysfs_mnt; + struct super_block *sysfs_sb; + struct sysfs_dirent *sysfs_root; +#endif + struct subsystem *class_subsys; + struct subsystem *class_obj_subsys; + struct class *tty_class; + +/* User uids hash */ + struct list_head uidhash_table[UIDHASH_SZ_VE]; + +#ifdef CONFIG_NET + struct class *net_class; + struct hlist_head _net_dev_head; + struct hlist_head _net_dev_index_head; + struct net_device *_net_dev_base, **_net_dev_tail; + int ifindex; + struct net_device *_loopback_dev; + struct net_device_stats *_loopback_stats; +#ifdef CONFIG_INET + struct ipv4_devconf *_ipv4_devconf; + struct ipv4_devconf *_ipv4_devconf_dflt; + struct ctl_table_header *forward_header; + struct ctl_table *forward_table; + unsigned long rt_flush_required; + struct neigh_table *ve_arp_tbl; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_devconf *_ipv6_devconf; + struct ipv6_devconf *_ipv6_devconf_dflt; + struct neigh_table *ve_nd_tbl; +#endif +#endif +#endif +#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE) + struct veip_struct *veip; + struct net_device *_venet_dev; +#endif + +/* per VE CPU stats*/ + struct timespec start_timespec; + u64 start_jiffies; /* Deprecated */ + cycles_t start_cycles; + unsigned long avenrun[3]; /* loadavg data */ + + cycles_t cpu_used_ve; + struct kstat_lat_pcpu_struct sched_lat_ve; + +#ifdef CONFIG_INET + struct hlist_head *_fib_info_hash; + struct hlist_head *_fib_info_laddrhash; + int _fib_hash_size; + int _fib_info_cnt; + + struct fib_rule *_local_rule; + struct hlist_head _fib_rules; +#ifdef CONFIG_IP_MULTIPLE_TABLES + /* XXX: why a magic constant? 
*/ + struct fib_table *_fib_tables[256]; /* RT_TABLE_MAX - for now */ +#else + struct fib_table *_main_table; + struct fib_table *_local_table; +#endif + struct icmp_mib *_icmp_statistics[2]; + struct ipstats_mib *_ip_statistics[2]; + struct tcp_mib *_tcp_statistics[2]; + struct udp_mib *_udp_statistics[2]; + struct linux_mib *_net_statistics[2]; + struct venet_stat *stat; +#ifdef CONFIG_VE_IPTABLES +/* core/netfilter.c virtualization */ + void *_nf_hooks; + struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ + struct xt_table *_ve_ip6t_filter_pf; + struct xt_table *_ipt_mangle_table; + struct xt_table *_ip6t_mangle_table; + struct list_head _xt_tables[NPROTO]; + + __u64 _iptables_modules; + struct ve_ip_conntrack *_ip_conntrack; +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct fib6_table *_fib6_table; + struct ipstats_mib *_ipv6_statistics[2]; + struct icmpv6_mib *_icmpv6_statistics[2]; + struct udp_mib *_udp_stats_in6[2]; +#endif +#endif + wait_queue_head_t *_log_wait; + unsigned long *_log_start; + unsigned long *_log_end; + unsigned long *_logged_chars; + char *log_buf; +#define VE_DEFAULT_LOG_BUF_LEN 4096 + + struct ve_cpu_stats *cpu_stats; + unsigned long down_at; + struct list_head cleanup_list; +#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) + struct list_head _fuse_conn_list; + struct super_block *_fuse_control_sb; + + struct file_system_type *fuse_fs_type; + struct file_system_type *fuse_ctl_fs_type; +#endif +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + struct proc_dir_entry *_proc_vlan_dir; + struct proc_dir_entry *_proc_vlan_conf; +#endif + unsigned long jiffies_fixup; + unsigned char disable_net; + unsigned char sparse_vpid; + struct ve_monitor *monitor; + struct proc_dir_entry *monitor_proc; + unsigned long meminfo_val; + + struct nsproxy *ve_ns; +}; + +#define VE_CPU_STATS(ve, cpu) (per_cpu_ptr((ve)->cpu_stats, cpu)) + +extern int nr_ve; + +#ifdef CONFIG_VE + +void do_update_load_avg_ve(void); +void do_env_free(struct ve_struct *ptr); + +static inline struct ve_struct *get_ve(struct ve_struct *ptr) +{ + if (ptr != NULL) + atomic_inc(&ptr->counter); + return ptr; +} + +static inline void put_ve(struct ve_struct *ptr) +{ + if (ptr && atomic_dec_and_test(&ptr->counter)) { + if (atomic_read(&ptr->pcounter) > 0) + BUG(); + if (ptr->is_running) + BUG(); + do_env_free(ptr); + } +} + +static inline void pget_ve(struct ve_struct *ptr) +{ + atomic_inc(&ptr->pcounter); +} + +void ve_cleanup_schedule(struct ve_struct *); +static inline void pput_ve(struct ve_struct *ptr) +{ + if (unlikely(atomic_dec_and_test(&ptr->pcounter))) + ve_cleanup_schedule(ptr); +} + +extern spinlock_t ve_cleanup_lock; +extern struct list_head ve_cleanup_list; +extern struct task_struct *ve_cleanup_thread; + +extern unsigned long long ve_relative_clock(struct timespec * ts); + +#ifdef CONFIG_FAIRSCHED +#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) +#else +#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) +#endif +#else /* CONFIG_VE */ +#define ve_utsname system_utsname +#define get_ve(ve) (NULL) +#define put_ve(ve) do { } while (0) +#define pget_ve(ve) do { } while (0) +#define pput_ve(ve) do { } while (0) +#endif /* CONFIG_VE */ + +#endif /* _LINUX_VE_H */ diff -uprN linux-2.6.18/include/linux/ve_nfs.h linux-2.6.18.ovz/include/linux/ve_nfs.h --- linux-2.6.18/include/linux/ve_nfs.h 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.18.ovz/include/linux/ve_nfs.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,35 @@ +/* + * include/linux/ve_nfs.h + * + * VE context for NFS + * + * Copyright (C) 2007 SWsoft + */ + +#ifndef __VE_NFS_H__ +#define __VE_NFS_H__ + +#ifdef CONFIG_VE +struct ve_nfs_context { + struct file_system_type *fstype; + unsigned int _nlmsvc_users; + pid_t _nlmsvc_pid; + int _nlmsvc_grace_period; + unsigned long _nlmsvc_timeout; +}; + +#define NFS_CTX_FIELD(arg) \ + (*(get_exec_env()->nfs_context == NULL ? &_##arg : \ + &get_exec_env()->nfs_context->_##arg)) +#define nlmsvc_grace_period NFS_CTX_FIELD(nlmsvc_grace_period) +#define nlmsvc_timeout NFS_CTX_FIELD(nlmsvc_timeout) +#define nlmsvc_users NFS_CTX_FIELD(nlmsvc_users) +#define nlmsvc_pid NFS_CTX_FIELD(nlmsvc_pid) +#else +#define nlmsvc_grace_period _nlmsvc_grace_period +#define nlmsvc_timeout _nlmsvc_timeout +#define nlmsvc_users _nlmsvc_users +#define nlmsvc_pid _nlmsvc_pid +#endif + +#endif diff -uprN linux-2.6.18/include/linux/ve_proto.h linux-2.6.18.ovz/include/linux/ve_proto.h --- linux-2.6.18/include/linux/ve_proto.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/ve_proto.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,90 @@ +/* + * include/linux/ve_proto.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_H__ +#define __VE_H__ + +#ifdef CONFIG_VE + +struct ve_struct; + +#ifdef CONFIG_INET +void ip_fragment_cleanup(struct ve_struct *envid); +void tcp_v4_kill_ve_sockets(struct ve_struct *envid); +struct fib_table * fib_hash_init(int id); +#ifdef CONFIG_VE_NETDEV +int venet_init(void); +#endif +#else +static inline void ip_fragment_cleanup(struct ve_struct *ve) { ; } +#endif + +extern struct list_head ve_list_head; +#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list) +extern rwlock_t ve_list_lock; +extern struct ve_struct *get_ve_by_id(envid_t); +extern struct ve_struct *__find_ve_by_id(envid_t); + +struct env_create_param3; +extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, + struct env_create_param3 *data, int datalen); +extern void ve_move_task(struct task_struct *, struct ve_struct *); + +int set_device_perms_ve(envid_t veid, unsigned type, dev_t dev, unsigned mask); +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); +void clean_device_perms_ve(envid_t veid); +extern struct file_operations proc_devperms_ops; + +enum { + VE_SS_CHAIN, + + VE_MAX_CHAINS +}; + +typedef int ve_hook_init_fn(void *data); +typedef void ve_hook_fini_fn(void *data); + +struct ve_hook +{ + ve_hook_init_fn *init; + ve_hook_fini_fn *fini; + struct module *owner; + + /* Functions are called in ascending priority */ + int priority; + + /* Private part */ + struct list_head list; +}; + +enum { + HOOK_PRIO_DEFAULT = 0, + + HOOK_PRIO_FS = HOOK_PRIO_DEFAULT, + + HOOK_PRIO_NET_PRE, + HOOK_PRIO_NET, + HOOK_PRIO_NET_POST, + + HOOK_PRIO_AFTERALL = INT_MAX +}; + +extern int ve_hook_iterate_init(int chain, void *data); +extern void ve_hook_iterate_fini(int chain, void *data); + +extern void ve_hook_register(int chain, struct ve_hook *vh); +extern void ve_hook_unregister(struct ve_hook *vh); +#else /* CONFIG_VE */ +#define ve_hook_register(ch, vh) do { } while (0) +#define ve_hook_unregister(ve) do { } while (0) + +#define get_device_perms_ve(t, d, a) (0) +#endif /* CONFIG_VE */ +#endif diff -uprN linux-2.6.18/include/linux/ve_task.h linux-2.6.18.ovz/include/linux/ve_task.h ---
linux-2.6.18/include/linux/ve_task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/ve_task.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,68 @@ +/* + * include/linux/ve_task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_TASK_H__ +#define __VE_TASK_H__ + +#include +#include + +struct ve_task_info { +/* virtualization */ + struct ve_struct *owner_env; + struct ve_struct *exec_env; + struct ve_struct *saved_env; + struct list_head vetask_list; + struct dentry *glob_proc_dentry; +/* statistics: scheduling latency */ + cycles_t sleep_time; + cycles_t sched_time; + cycles_t sleep_stamp; + cycles_t wakeup_stamp; + seqcount_t wakeup_lock; +}; + +#define VE_TASK_INFO(task) (&(task)->ve_task_info) +#define VE_TASK_LIST_2_TASK(lh) \ + list_entry(lh, struct task_struct, ve_task_info.vetask_list) + +#ifdef CONFIG_VE +extern struct ve_struct ve0; +#define get_ve0() (&ve0) + +#define ve_save_context(t) do { \ + t->ve_task_info.saved_env = \ + t->ve_task_info.exec_env; \ + t->ve_task_info.exec_env = get_ve0(); \ + } while (0) +#define ve_restore_context(t) do { \ + t->ve_task_info.exec_env = \ + t->ve_task_info.saved_env; \ + } while (0) + +#define get_exec_env() (current->ve_task_info.exec_env) +#define set_exec_env(ve) ({ \ + struct ve_task_info *vi; \ + struct ve_struct *old; \ + \ + vi = &current->ve_task_info; \ + old = vi->exec_env; \ + vi->exec_env = ve; \ + old; \ + }) +#else +#define get_ve0() (NULL) +#define get_exec_env() (NULL) +#define set_exec_env(new_env) (NULL) +#define ve_save_context(t) do { } while (0) +#define ve_restore_context(t) do { } while (0) +#endif + +#endif /* __VE_TASK_H__ */ diff -uprN linux-2.6.18/include/linux/veip.h linux-2.6.18.ovz/include/linux/veip.h --- linux-2.6.18/include/linux/veip.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/veip.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,15 @@ +#ifndef __VE_IP_H_ +#define __VE_IP_H_ + +struct ve_addr_struct { + int family; + __u32 key[4]; +}; + +struct sockaddr; + +extern void veaddr_print(char *, int, struct ve_addr_struct *); +extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr); + +#endif diff -uprN linux-2.6.18/include/linux/venet.h linux-2.6.18.ovz/include/linux/venet.h --- linux-2.6.18/include/linux/venet.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/venet.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,73 @@ +/* + * include/linux/venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
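set_exec_env() above returns the previous context, which gives the save/switch/restore idiom used throughout the patch. A minimal sketch (illustrative):

/* Run code "inside" a target VE, then restore the caller's context. */
static void run_in_ve(struct ve_struct *ve)
{
	struct ve_struct *old_env;

	old_env = set_exec_env(ve);	/* from here, get_exec_env() == ve */
	/* ... per-VE work ... */
	set_exec_env(old_env);		/* always restore the saved context */
}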
+ * + */ + +#ifndef _VENET_H +#define _VENET_H + +#include +#include +#include +#include + +#define VEIP_HASH_SZ 512 + +struct ve_struct; +struct venet_stat; + +struct ip_entry_struct +{ + struct ve_addr_struct addr; + struct ve_struct *active_env; + struct venet_stat *stat; + struct veip_struct *veip; + struct list_head ip_hash; + struct list_head ve_list; +}; + +struct veip_struct +{ + struct list_head src_lh; + struct list_head dst_lh; + struct list_head ip_lh; + struct list_head list; + envid_t veid; +}; + +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_unhash(struct ip_entry_struct *entry); +/* veip_hash_lock should be taken for read by caller */ +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *); + +/* veip_hash_lock should be taken for read by caller */ +struct veip_struct *veip_find(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +struct veip_struct *veip_findcreate(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +void veip_put(struct veip_struct *veip); + +extern struct list_head veip_lh; + +int veip_start(struct ve_struct *ve); +void veip_stop(struct ve_struct *ve); +__exit void veip_cleanup(void); +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); +int venet_change_skb_owner(struct sk_buff *skb); + +extern struct list_head ip_entry_hash_table[]; +extern rwlock_t veip_hash_lock; + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v); +#endif + +#endif diff -uprN linux-2.6.18/include/linux/veprintk.h linux-2.6.18.ovz/include/linux/veprintk.h --- linux-2.6.18/include/linux/veprintk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/veprintk.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,38 @@ +/* + * include/linux/veprintk.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_PRINTK_H__ +#define __VE_PRINTK_H__ + +#ifdef CONFIG_VE + +#define ve_log_wait (*(get_exec_env()->_log_wait)) +#define ve_log_start (*(get_exec_env()->_log_start)) +#define ve_log_end (*(get_exec_env()->_log_end)) +#define ve_logged_chars (*(get_exec_env()->_logged_chars)) +#define ve_log_buf (get_exec_env()->log_buf) +#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ + log_buf_len : VE_DEFAULT_LOG_BUF_LEN) +#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) +#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) + +#else + +#define ve_log_wait log_wait +#define ve_log_start log_start +#define ve_log_end log_end +#define ve_logged_chars logged_chars +#define ve_log_buf log_buf +#define ve_log_buf_len log_buf_len +#define VE_LOG_BUF_MASK LOG_BUF_MASK +#define VE_LOG_BUF(idx) LOG_BUF(idx) + +#endif /* CONFIG_VE */ +#endif /* __VE_PRINTK_H__ */ diff -uprN linux-2.6.18/include/linux/virtinfo.h linux-2.6.18.ovz/include/linux/virtinfo.h --- linux-2.6.18/include/linux/virtinfo.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/virtinfo.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,99 @@ +/* + * include/linux/virtinfo.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
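The comments above spell out veip_hash_lock's contract: read lock for lookups, write lock for hash changes. A sketch of a compliant lookup (illustrative; a real caller must pin the entry before using it past the unlock):

static struct ip_entry_struct *venet_lookup(struct ve_addr_struct *addr)
{
	struct ip_entry_struct *entry;

	read_lock(&veip_hash_lock);
	entry = venet_entry_lookup(addr);
	read_unlock(&veip_hash_lock);
	return entry;
}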
+ * + */ + +#ifndef __LINUX_VIRTINFO_H +#define __LINUX_VIRTINFO_H + +#include +#include +#include +#include + +struct vnotifier_block +{ + int (*notifier_call)(struct vnotifier_block *self, + unsigned long, void *, int); + struct vnotifier_block *next; + int priority; +}; + +extern struct semaphore virtinfo_sem; +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); +int virtinfo_notifier_call(int type, unsigned long n, void *data); + +struct page_info { + unsigned long nr_file_dirty; + unsigned long nr_writeback; + unsigned long nr_anon_pages; + unsigned long nr_file_mapped; + unsigned long nr_slab; + unsigned long nr_pagetable; + unsigned long nr_unstable_nfs; + unsigned long nr_bounce; +}; + +struct meminfo { + struct sysinfo si; + struct page_info pi; + unsigned long active, inactive; + unsigned long cache, swapcache; + unsigned long committed_space; + unsigned long allowed; + unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; +}; + +#define VIRTINFO_MEMINFO 0 +#define VIRTINFO_ENOUGHMEM 1 +#define VIRTINFO_DOFORK 2 +#define VIRTINFO_DOEXIT 3 +#define VIRTINFO_DOEXECVE 4 +#define VIRTINFO_DOFORKRET 5 +#define VIRTINFO_DOFORKPOST 6 +#define VIRTINFO_EXIT 7 +#define VIRTINFO_EXITMMAP 8 +#define VIRTINFO_EXECMMAP 9 +#define VIRTINFO_OUTOFMEM 10 +#define VIRTINFO_PAGEIN 11 +#define VIRTINFO_SYSINFO 12 +#define VIRTINFO_NEWUBC 13 +#define VIRTINFO_VMSTAT 14 + +enum virt_info_types { + VITYPE_GENERAL, + VITYPE_FAUDIT, + VITYPE_QUOTA, + VITYPE_SCP, + + VIRT_TYPES +}; + +#ifdef CONFIG_VZ_GENCALLS + +static inline int virtinfo_gencall(unsigned long n, void *data) +{ + int r; + + r = virtinfo_notifier_call(VITYPE_GENERAL, n, data); + if (r & NOTIFY_FAIL) + return -ENOBUFS; + if (r & NOTIFY_OK) + return -ERESTARTNOINTR; + return 0; +} + +#else + +#define virtinfo_gencall(n, data) 0 + +#endif + +#endif /* __LINUX_VIRTINFO_H */ diff -uprN linux-2.6.18/include/linux/virtinfoscp.h linux-2.6.18.ovz/include/linux/virtinfoscp.h --- linux-2.6.18/include/linux/virtinfoscp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/virtinfoscp.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,21 @@ +#ifndef __VIRTINFO_SCP_H__ +#define __VIRTINFO_SCP_H__ + +/* + * Dump and restore operations are non-symmetric. + * With respect to finish/fail hooks, 2 dump hooks are called from + * different proc operations, but restore hooks are called from a single one. 
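A hedged sketch of hooking the notifier chain declared above. The handler, its event filter, and the cap value are illustrative; treating old_ret as an accumulated NOTIFY_* mask is an assumption drawn from virtinfo_gencall():

static int my_meminfo_call(struct vnotifier_block *self,
		unsigned long event, void *data, int old_ret)
{
	struct meminfo *mi = data;

	if (event == VIRTINFO_MEMINFO)
		mi->si.totalram = min(mi->si.totalram, 0x100000UL);	/* hypothetical cap */
	return old_ret;
}

static struct vnotifier_block my_meminfo_nb = {
	.notifier_call	= my_meminfo_call,
	.priority	= 0,
};

/* module init would then call:
 *	virtinfo_notifier_register(VITYPE_GENERAL, &my_meminfo_nb);
 */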
+ */ +#define VIRTINFO_SCP_COLLECT 0x10 +#define VIRTINFO_SCP_DUMP 0x11 +#define VIRTINFO_SCP_DMPFIN 0x12 +#define VIRTINFO_SCP_RSTCHECK 0x13 +#define VIRTINFO_SCP_RESTORE 0x14 +#define VIRTINFO_SCP_RSTFAIL 0x15 + +#define VIRTINFO_SCP_RSTTSK 0x20 +#define VIRTINFO_SCP_RSTMM 0x21 + +#define VIRTNOTIFY_CHANGE 0x100 + +#endif /* __VIRTINFO_SCP_H__ */ diff -uprN linux-2.6.18/include/linux/vmalloc.h linux-2.6.18.ovz/include/linux/vmalloc.h --- linux-2.6.18/include/linux/vmalloc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/vmalloc.h 2007-06-13 06:55:07.000000000 -0400 @@ -22,6 +22,10 @@ struct vm_area_struct; #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif +/* align size to 2^n page boundary */ +#define POWER2_PAGE_ALIGN(size) \ + ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) + struct vm_struct { void *addr; unsigned long size; @@ -36,12 +40,16 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *ub_vmalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); +extern void *ub_vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *vmalloc_best(unsigned long size); +extern void *ub_vmalloc_best(unsigned long size); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, @@ -61,6 +69,9 @@ extern int remap_vmalloc_range(struct vm extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct * get_vm_area_best(unsigned long size, + unsigned long flags); +extern void vprintstat(void); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node); extern struct vm_struct *remove_vm_area(void *addr); diff -uprN linux-2.6.18/include/linux/vmstat.h linux-2.6.18.ovz/include/linux/vmstat.h --- linux-2.6.18/include/linux/vmstat.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/linux/vmstat.h 2007-06-13 06:55:07.000000000 -0400 @@ -61,6 +61,7 @@ static inline void count_vm_events(enum put_cpu(); } +extern unsigned long vm_events(enum vm_event_item i); extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); diff -uprN linux-2.6.18/include/linux/vsched.h linux-2.6.18.ovz/include/linux/vsched.h --- linux-2.6.18/include/linux/vsched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vsched.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,27 @@ +/* + * include/linux/vsched.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
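POWER2_PAGE_ALIGN() above rounds a size up to a whole power-of-two number of pages: get_order() picks the smallest n such that 2^n pages cover the size. With 4 KB pages, 5000 bytes gives order 1, i.e. 8192 bytes:

/* Illustrative wrapper: POWER2_PAGE_ALIGN(5000) == 8192 on 4K-page systems. */
static unsigned long rounded_vmalloc_size(unsigned long size)
{
	return POWER2_PAGE_ALIGN(size);
}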
+ * + */ + +#ifndef __VSCHED_H__ +#define __VSCHED_H__ + +#include +#include +#include +#include + +extern int vsched_create(int id, struct fairsched_node *node); +extern int vsched_destroy(struct vcpu_scheduler *vsched); + +extern int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched); +extern int vsched_set_vcpus(struct vcpu_scheduler *vsched, unsigned int vcpus); + +unsigned long ve_scale_khz(unsigned long khz); + +#endif diff -uprN linux-2.6.18/include/linux/vzcalluser.h linux-2.6.18.ovz/include/linux/vzcalluser.h --- linux-2.6.18/include/linux/vzcalluser.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzcalluser.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,240 @@ +/* + * include/linux/vzcalluser.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VZCALLUSER_H +#define _LINUX_VZCALLUSER_H + +#include +#include + +#define KERN_VZ_PRIV_RANGE 51 + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +#ifndef __KERNEL__ +#define __user +#endif + +/* + * VE management ioctls + */ + +struct vzctl_old_env_create { + envid_t veid; + unsigned flags; +#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ +#define VE_EXCLUSIVE 2 /* Fail if exists */ +#define VE_ENTER 4 /* Enter existing VE */ +#define VE_TEST 8 /* Test if VE exists */ +#define VE_LOCK 16 /* Do not allow entering created VE */ +#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ + __u32 addr; +}; + +struct vzctl_mark_env_to_down { + envid_t veid; +}; + +struct vzctl_setdevperms { + envid_t veid; + unsigned type; +#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ +#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ +#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ + unsigned dev; + unsigned mask; +}; + +struct vzctl_ve_netdev { + envid_t veid; + int op; +#define VE_NETDEV_ADD 1 +#define VE_NETDEV_DEL 2 + char __user *dev_name; +}; + +struct vzctl_ve_meminfo { + envid_t veid; + unsigned long val; +}; + +/* these masks represent modules */ +#define VE_IP_IPTABLES_MOD (1U<<0) +#define VE_IP_FILTER_MOD (1U<<1) +#define VE_IP_MANGLE_MOD (1U<<2) +#define VE_IP_CONNTRACK_MOD (1U<<14) +#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) +#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) +#define VE_IP_NAT_MOD (1U<<20) +#define VE_IP_NAT_FTP_MOD (1U<<21) +#define VE_IP_NAT_IRC_MOD (1U<<22) +#define VE_IP_IPTABLES6_MOD (1U<<26) +#define VE_IP_FILTER6_MOD (1U<<27) +#define VE_IP_MANGLE6_MOD (1U<<28) +#define VE_IP_IPTABLE_NAT_MOD (1U<<29) + +/* these masks represent modules with their dependences */ +#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) +#define VE_IP_FILTER (VE_IP_FILTER_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_IPTABLES6 (VE_IP_IPTABLES6_MOD) +#define VE_IP_FILTER6 (VE_IP_FILTER6_MOD | VE_IP_IPTABLES6) +#define VE_IP_MANGLE6 (VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6) +#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT (VE_IP_NAT_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_FTP) +#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_IRC) +#define VE_IP_IPTABLE_NAT (VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK) + 
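Because the masks above fold dependencies in, VE_IPT_CMP(x, y) is a subset test: every bit of feature y must already be granted in x. A sketch, assuming CONFIG_VE_IPTABLES so ve_struct carries _iptables_modules:

/* May this VE use NAT-FTP? Requires the NAT, conntrack and
 * conntrack-FTP bits too, since VE_IP_NAT_FTP folds them in. */
static int ve_may_use_nat_ftp(struct ve_struct *ve)
{
	return VE_IPT_CMP(ve->_iptables_modules, VE_IP_NAT_FTP);
}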
+/* safe iptables mask to be used by default */ +#define VE_IP_DEFAULT \ + (VE_IP_IPTABLES | \ + VE_IP_FILTER | VE_IP_MANGLE) + +#define VE_IPT_CMP(x,y) (((x) & (y)) == (y)) + +struct vzctl_env_create_cid { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct vzctl_env_create { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct env_create_param { + __u64 iptables_mask; +}; + +#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) + +struct env_create_param2 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; /* 0 - don't care, same as in host */ +}; + +struct env_create_param3 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; + __u32 pad; + __u64 known_features; +}; + +#define VE_FEATURE_SYSFS (1ULL << 0) +#define VE_FEATURE_NFS (1ULL << 1) +#define VE_FEATURE_DEF_PERMS (1ULL << 2) + +#define VE_FEATURES_OLD (VE_FEATURE_SYSFS) +#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | \ + VE_FEATURE_DEF_PERMS) + +typedef struct env_create_param3 env_create_param_t; +#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(env_create_param_t) + +struct vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + env_create_param_t __user *data; + int datalen; +}; + +struct vz_load_avg { + int val_int; + int val_frac; +}; + +struct vz_cpu_stat { + unsigned long user_jif; + unsigned long nice_jif; + unsigned long system_jif; + unsigned long uptime_jif; + __u64 idle_clk; + __u64 strv_clk; + __u64 uptime_clk; + struct vz_load_avg avenrun[3]; /* loadavg data */ +}; + +struct vzctl_cpustatctl { + envid_t veid; + struct vz_cpu_stat __user *cpustat; +}; + +#define VZCTLTYPE '.' +#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ + struct vzctl_old_env_create) +#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ + struct vzctl_mark_env_to_down) +#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ + struct vzctl_setdevperms) +#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ + struct vzctl_env_create_cid) +#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ + struct vzctl_env_create) +#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ + struct vzctl_cpustatctl) +#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct vzctl_env_create_data) +#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct vzctl_ve_netdev) +#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct vzctl_ve_meminfo) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +#include + +struct compat_vzctl_ve_netdev { + envid_t veid; + int op; + compat_uptr_t dev_name; +}; + +struct compat_vzctl_ve_meminfo { + envid_t veid; + compat_ulong_t val; +}; + +struct compat_vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + compat_uptr_t data; + int datalen; +}; + +#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct compat_vzctl_env_create_data) +#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct compat_vzctl_ve_netdev) +#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct compat_vzctl_ve_meminfo) +#endif +#endif + +#endif diff -uprN linux-2.6.18/include/linux/vzctl.h linux-2.6.18.ovz/include/linux/vzctl.h --- linux-2.6.18/include/linux/vzctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzctl.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,30 @@ +/* + * include/linux/vzctl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
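Together, the env_create_param3 layout and the VZCTL_ENV_CREATE_DATA ioctl form the container-creation entry point. A hedged userspace sketch of how a management tool might drive it; the /dev/vzctl device node and the VE id 101 are assumptions taken from OpenVZ userspace convention, not something this header defines:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vzcalluser.h>

int main(void)
{
	struct vzctl_env_create_data cd;
	env_create_param_t p;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	p.iptables_mask  = VE_IP_DEFAULT;	/* safe default from above */
	p.feature_mask   = VE_FEATURES_DEF;
	p.known_features = VE_FEATURES_DEF;
	p.total_vcpus    = 0;			/* 0 - same as host */

	memset(&cd, 0, sizeof(cd));
	cd.veid    = 101;			/* hypothetical VE id */
	cd.flags   = VE_CREATE;			/* VE_ENTER added automatically */
	cd.data    = &p;
	cd.datalen = sizeof(p);

	fd = open("/dev/vzctl", O_RDWR);	/* assumed device node */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	ret = ioctl(fd, VZCTL_ENV_CREATE_DATA, &cd);
	if (ret < 0)
		perror("VZCTL_ENV_CREATE_DATA");
	close(fd);
	return ret < 0;
}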
+ *
+ */
+
+#ifndef _LINUX_VZCTL_H
+#define _LINUX_VZCTL_H
+
+#include
+
+struct module;
+struct inode;
+struct file;
+struct vzioctlinfo {
+	unsigned type;
+	int (*ioctl)(struct file *, unsigned int, unsigned long);
+	int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	struct module *owner;
+	struct list_head list;
+};
+
+extern void vzioctl_register(struct vzioctlinfo *inf);
+extern void vzioctl_unregister(struct vzioctlinfo *inf);
+
+#endif
diff -uprN linux-2.6.18/include/linux/vzctl_quota.h linux-2.6.18.ovz/include/linux/vzctl_quota.h
--- linux-2.6.18/include/linux/vzctl_quota.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/linux/vzctl_quota.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,74 @@
+/*
+ * include/linux/vzctl_quota.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_VZCTL_QUOTA_H__
+#define __LINUX_VZCTL_QUOTA_H__
+
+#include
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * Quota management ioctl
+ */
+
+struct vz_quota_stat;
+struct vzctl_quotactl {
+	int cmd;
+	unsigned int quota_id;
+	struct vz_quota_stat __user *qstat;
+	char __user *ve_root;
+};
+
+struct vzctl_quotaugidctl {
+	int cmd;		/* subcommand */
+	unsigned int quota_id;	/* quota id this applies to */
+	unsigned int ugid_index;/* for reading statistics: index of first
+				   uid/gid record to read */
+	unsigned int ugid_size;	/* size of ugid_buf array */
+	void *addr;		/* user-level buffer */
+};
+
+#define VZDQCTLTYPE '+'
+#define VZCTL_QUOTA_DEPR_CTL	_IOWR(VZDQCTLTYPE, 1, \
+					struct vzctl_quotactl)
+#define VZCTL_QUOTA_NEW_CTL	_IOWR(VZDQCTLTYPE, 2, \
+					struct vzctl_quotactl)
+#define VZCTL_QUOTA_UGID_CTL	_IOWR(VZDQCTLTYPE, 3, \
+					struct vzctl_quotaugidctl)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+struct compat_vzctl_quotactl {
+	int cmd;
+	unsigned int quota_id;
+	compat_uptr_t qstat;
+	compat_uptr_t ve_root;
+};
+
+struct compat_vzctl_quotaugidctl {
+	int cmd;		/* subcommand */
+	unsigned int quota_id;	/* quota id this applies to */
+	unsigned int ugid_index;/* for reading statistics: index of first
+				   uid/gid record to read */
+	unsigned int ugid_size;	/* size of ugid_buf array */
+	compat_uptr_t addr;	/* user-level buffer */
+};
+
+#define VZCTL_COMPAT_QUOTA_CTL		_IOWR(VZDQCTLTYPE, 2, \
+					struct compat_vzctl_quotactl)
+#define VZCTL_COMPAT_QUOTA_UGID_CTL	_IOWR(VZDQCTLTYPE, 3, \
+					struct compat_vzctl_quotaugidctl)
+#endif
+#endif
+
+#endif /* __LINUX_VZCTL_QUOTA_H__ */
diff -uprN linux-2.6.18/include/linux/vzctl_venet.h linux-2.6.18.ovz/include/linux/vzctl_venet.h
--- linux-2.6.18/include/linux/vzctl_venet.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/linux/vzctl_venet.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,51 @@
+/*
+ * include/linux/vzctl_venet.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
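vzctl.h above declares the dispatcher (vzioctl_register) through which subsystems attach their ioctl ranges, and vzctl_quota.h defines the quota range itself. A sketch of a userspace statistics query follows; it assumes the quota ioctls are reachable through the same /dev/vzctl node, that struct vz_quota_stat and the VZ_DQ_GETSTAT subcommand come from vzquota.h further below, and that passing a NULL ve_root is acceptable for a read-only query:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vzquota.h>	/* struct vz_quota_stat, VZ_DQ_GETSTAT */
#include <linux/vzctl_quota.h>

int main(void)
{
	struct vz_quota_stat qstat;
	struct vzctl_quotactl qc = {
		.cmd      = VZ_DQ_GETSTAT,
		.quota_id = 101,	/* hypothetical quota id */
		.qstat    = &qstat,
		.ve_root  = NULL,	/* presumably unused for GETSTAT */
	};
	int fd = open("/dev/vzctl", O_RDWR);	/* assumed device node */

	if (fd < 0 || ioctl(fd, VZCTL_QUOTA_NEW_CTL, &qc) < 0) {
		perror("quota getstat");
		return 1;
	}
	printf("used: %llu of %llu bytes\n",
			(unsigned long long)qstat.dq_stat.bcurrent,
			(unsigned long long)qstat.dq_stat.bhardlimit);
	close(fd);
	return 0;
}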
+ * + */ + +#ifndef _VZCTL_VENET_H +#define _VZCTL_VENET_H + +#include +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_ip_map { + envid_t veid; + int op; +#define VE_IP_ADD 1 +#define VE_IP_DEL 2 + struct sockaddr *addr; + int addrlen; +}; + +#define VENETCTLTYPE '(' + +#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct vzctl_ve_ip_map) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +struct compat_vzctl_ve_ip_map { + envid_t veid; + int op; + compat_uptr_t addr; + int addrlen; +}; + +#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct compat_vzctl_ve_ip_map) +#endif +#endif + +#endif diff -uprN linux-2.6.18/include/linux/vzctl_veth.h linux-2.6.18.ovz/include/linux/vzctl_veth.h --- linux-2.6.18/include/linux/vzctl_veth.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzctl_veth.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,42 @@ +/* + * include/linux/vzctl_veth.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZCTL_VETH_H +#define _VZCTL_VETH_H + +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_hwaddr { + envid_t veid; + int op; +#define VE_ETH_ADD 1 +#define VE_ETH_DEL 2 +#define VE_ETH_ALLOW_MAC_CHANGE 3 +#define VE_ETH_DENY_MAC_CHANGE 4 + unsigned char dev_addr[6]; + int addrlen; + char dev_name[16]; + unsigned char dev_addr_ve[6]; + int addrlen_ve; + char dev_name_ve[16]; +}; + +#define VETHCTLTYPE '[' + +#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \ + struct vzctl_ve_hwaddr) + +#endif diff -uprN linux-2.6.18/include/linux/vzdq_tree.h linux-2.6.18.ovz/include/linux/vzdq_tree.h --- linux-2.6.18/include/linux/vzdq_tree.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzdq_tree.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,99 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo disk quota tree definition + */ + +#ifndef _VZDQ_TREE_H +#define _VZDQ_TREE_H + +#include +#include + +typedef unsigned int quotaid_t; +#define QUOTAID_BITS 32 +#define QUOTAID_BBITS 4 +#define QUOTAID_EBITS 8 + +#if QUOTAID_EBITS % QUOTAID_BBITS +#error Quota bit assumption failure +#endif + +#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) +#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) +#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ + / QUOTAID_BBITS) +#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ + / QUOTAID_EBITS) +#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) + +/* + * Depth of keeping unused node (not inclusive). + * 0 means release all nodes including root, + * QUOTATREE_DEPTH means never release nodes. + * Current value: release all nodes strictly after QUOTATREE_EDEPTH + * (measured in external shift units). + */ +#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ + - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ + + 1) + +/* + * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. + * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), + * and each node contains 2^QUOTAID_BBITS pointers. + * Level 0 is a (single) tree root node. + * + * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. + * Nodes of lower levels contain pointers to nodes. 
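For the defaults above (QUOTAID_BITS=32, QUOTAID_BBITS=4, QUOTAID_EBITS=8) the derived geometry is 16-way nodes (QUOTATREE_BSIZE = 1 << 4), 8 tree levels (QUOTATREE_DEPTH = (32 + 3) / 4), 4 external levels (QUOTATREE_EDEPTH = (32 + 7) / 8), and a keep-depth of QUOTATREE_CDEPTH = 8 - 2*8/4 + 1 = 5, meaning unused nodes on levels 5 and deeper are released. A hypothetical sanity check in the header's own #if/#error style makes the arithmetic explicit:

#if QUOTATREE_BSIZE != 16 || QUOTATREE_DEPTH != 8
#error Quota tree geometry assumption failure
#endif
#if QUOTATREE_EDEPTH != 4 || QUOTATREE_CDEPTH != 5
#error Quota tree release-depth assumption failure
#endif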
+ *
+ * A double pointer in the array of an i-level node, pointing to an
+ * (i+1)-level node (such as the one inside quotatree_find_state), is
+ * marked with level (i+1), not i.
+ * The level 0 double pointer is the pointer to the root inside the tree
+ * struct.
+ *
+ * The tree is permanent, i.e. all index blocks allocated are kept alive to
+ * preserve the block numbers in the quota file tree and keep its changes
+ * local.
+ */
+struct quotatree_node {
+	struct list_head list;
+	quotaid_t num;
+	void *blocks[QUOTATREE_BSIZE];
+};
+
+struct quotatree_level {
+	struct list_head usedlh, freelh;
+	quotaid_t freenum;
+};
+
+struct quotatree_tree {
+	struct quotatree_level levels[QUOTATREE_DEPTH];
+	struct quotatree_node *root;
+	unsigned int leaf_num;
+};
+
+struct quotatree_find_state {
+	void **block;
+	int level;
+};
+
+/* number of leaves (objects) and leaf level of the tree */
+#define QTREE_LEAFNUM(tree)	((tree)->leaf_num)
+#define QTREE_LEAFLVL(tree)	(&(tree)->levels[QUOTATREE_DEPTH - 1])
+
+struct quotatree_tree *quotatree_alloc(void);
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st);
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+		struct quotatree_find_state *st, void *data);
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id);
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *));
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id);
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index);
+
+#endif /* _VZDQ_TREE_H */
+
diff -uprN linux-2.6.18/include/linux/vzevent.h linux-2.6.18.ovz/include/linux/vzevent.h
--- linux-2.6.18/include/linux/vzevent.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/linux/vzevent.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,13 @@
+#ifndef __LINUX_VZ_EVENT_H__
+#define __LINUX_VZ_EVENT_H__
+
+#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
+extern int vzevent_send(int msg, const char *attrs_fmt, ...);
+#else
+static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
+{
+	return 0;
+}
+#endif
+
+#endif /* __LINUX_VZ_EVENT_H__ */
diff -uprN linux-2.6.18/include/linux/vzquota.h linux-2.6.18.ovz/include/linux/vzquota.h
--- linux-2.6.18/include/linux/vzquota.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/linux/vzquota.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,378 @@
+/*
+ *
+ * Copyright (C) 2001-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
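quotatree_find and quotatree_insert are designed as a pair: the find fills a quotatree_find_state describing where the lookup stopped, and a subsequent insert reuses that position instead of walking the tree again. A minimal kernel-side sketch of the implied contract (a hypothetical helper; the return-value conventions are assumptions read off the declarations above, not guaranteed by them):

/* Sketch: attach per-id data to the quota tree. Assumes quotatree_find()
 * returns the existing payload or NULL while filling *st for a later
 * insert at the same position, and that quotatree_insert() returns 0 on
 * success. */
static void *quota_attach(struct quotatree_tree *tree, quotaid_t id,
		void *payload)
{
	struct quotatree_find_state st;
	void *data;

	data = quotatree_find(tree, id, &st);
	if (data != NULL)
		return data;		/* id already has a leaf */
	if (quotatree_insert(tree, id, &st, payload) != 0)
		return NULL;		/* presumably allocation failure */
	return payload;
}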
+ *
+ * This file contains Virtuozzo disk quota implementation
+ */
+
+#ifndef _VZDQUOTA_H
+#define _VZDQUOTA_H
+
+#include
+#include
+
+/* vzquotactl syscall commands */
+#define VZ_DQ_CREATE		5	/* create quota master block */
+#define VZ_DQ_DESTROY		6	/* destroy qmblk */
+#define VZ_DQ_ON		7	/* mark dentry with already created qmblk */
+#define VZ_DQ_OFF		8	/* remove mark, don't destroy qmblk */
+#define VZ_DQ_SETLIMIT		9	/* set new limits */
+#define VZ_DQ_GETSTAT		10	/* get usage statistics */
+/* set of syscalls to maintain UGID quotas */
+#define VZ_DQ_UGID_GETSTAT	1	/* get usage/limits for ugid(s) */
+#define VZ_DQ_UGID_ADDSTAT	2	/* set usage/limits statistics for ugid(s) */
+#define VZ_DQ_UGID_GETGRACE	3	/* get expire times */
+#define VZ_DQ_UGID_SETGRACE	4	/* set expire times */
+#define VZ_DQ_UGID_GETCONFIG	5	/* get ugid_max limit, cnt, flags of qmblk */
+#define VZ_DQ_UGID_SETCONFIG	6	/* set ugid_max limit, flags of qmblk */
+#define VZ_DQ_UGID_SETLIMIT	7	/* set ugid B/I limits */
+#define VZ_DQ_UGID_SETINFO	8	/* set ugid info */
+
+/* common structure for vz and ugid quota */
+struct dq_stat {
+	/* blocks limits */
+	__u64	bhardlimit;	/* absolute limit in bytes */
+	__u64	bsoftlimit;	/* preferred limit in bytes */
+	time_t	btime;		/* time limit for excessive disk use */
+	__u64	bcurrent;	/* current bytes count */
+	/* inodes limits */
+	__u32	ihardlimit;	/* absolute limit on allocated inodes */
+	__u32	isoftlimit;	/* preferred inode limit */
+	time_t	itime;		/* time limit for excessive inode use */
+	__u32	icurrent;	/* current # allocated inodes */
+};
+
+/* One second resolution for grace times */
+#define CURRENT_TIME_SECONDS	(get_seconds())
+
+/* Values for dq_info->flags */
+#define VZ_QUOTA_INODES		0x01	/* inodes limit warning printed */
+#define VZ_QUOTA_SPACE		0x02	/* space limit warning printed */
+
+struct dq_info {
+	time_t		bexpire;	/* expire timeout for excessive disk use */
+	time_t		iexpire;	/* expire timeout for excessive inode use */
+	unsigned	flags;		/* see previous defines */
+};
+
+struct vz_quota_stat {
+	struct dq_stat	dq_stat;
+	struct dq_info	dq_info;
+};
+
+/* UID/GID interface record - for user-kernel level exchange */
+struct vz_quota_iface {
+	unsigned int	qi_id;		/* UID/GID this applies to */
+	unsigned int	qi_type;	/* USRQUOTA|GRPQUOTA */
+	struct dq_stat	qi_stat;	/* limits, options, usage stats */
+};
+
+#ifdef CONFIG_COMPAT
+#include
+struct compat_dq_stat {
+	/* blocks limits */
+	__u64		bhardlimit;	/* absolute limit in bytes */
+	__u64		bsoftlimit;	/* preferred limit in bytes */
+	compat_time_t	btime;		/* time limit for excessive disk use */
+	__u64		bcurrent;	/* current bytes count */
+	/* inodes limits */
+	__u32		ihardlimit;	/* absolute limit on allocated inodes */
+	__u32		isoftlimit;	/* preferred inode limit */
+	compat_time_t	itime;		/* time limit for excessive inode use */
+	__u32		icurrent;	/* current # allocated inodes */
+};
+
+struct compat_dq_info {
+	compat_time_t	bexpire;	/* expire timeout for excessive disk use */
+	compat_time_t	iexpire;	/* expire timeout for excessive inode use */
+	unsigned	flags;		/* see previous defines */
+};
+
+struct compat_vz_quota_stat {
+	struct compat_dq_stat	dq_stat;
+	struct compat_dq_info	dq_info;
+};
+
+struct compat_vz_quota_iface {
+	unsigned int		qi_id;		/* UID/GID this applies to */
+	unsigned int		qi_type;	/* USRQUOTA|GRPQUOTA */
+	struct compat_dq_stat	qi_stat;	/* limits, options, usage stats */
+};
+
+static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs,
+		struct dq_stat *dqs)
+{
+	dqs->bhardlimit =
odqs->bhardlimit; + dqs->bsoftlimit = odqs->bsoftlimit; + dqs->bcurrent = odqs->bcurrent; + dqs->btime = odqs->btime; + + dqs->ihardlimit = odqs->ihardlimit; + dqs->isoftlimit = odqs->isoftlimit; + dqs->icurrent = odqs->icurrent; + dqs->itime = odqs->itime; +} + +static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi, + struct dq_info *dqi) +{ + dqi->bexpire = odqi->bexpire; + dqi->iexpire = odqi->iexpire; + dqi->flags = odqi->flags; +} + +static inline void dqstat2compat_dqstat(struct dq_stat *dqs, + struct compat_dq_stat *odqs) +{ + odqs->bhardlimit = dqs->bhardlimit; + odqs->bsoftlimit = dqs->bsoftlimit; + odqs->bcurrent = dqs->bcurrent; + odqs->btime = (compat_time_t)dqs->btime; + + odqs->ihardlimit = dqs->ihardlimit; + odqs->isoftlimit = dqs->isoftlimit; + odqs->icurrent = dqs->icurrent; + odqs->itime = (compat_time_t)dqs->itime; +} + +static inline void dqinfo2compat_dqinfo(struct dq_info *dqi, + struct compat_dq_info *odqi) +{ + odqi->bexpire = (compat_time_t)dqi->bexpire; + odqi->iexpire = (compat_time_t)dqi->iexpire; + odqi->flags = dqi->flags; +} +#endif + +/* values for flags and dq_flags */ +/* this flag is set if the userspace has been unable to provide usage + * information about all ugids + * if the flag is set, we don't allocate new UG quota blocks (their + * current usage is unknown) or free existing UG quota blocks (not to + * lose information that this block is ok) */ +#define VZDQUG_FIXED_SET 0x01 +/* permit to use ugid quota */ +#define VZDQUG_ON 0x02 +#define VZDQ_USRQUOTA 0x10 +#define VZDQ_GRPQUOTA 0x20 +#define VZDQ_NOACT 0x1000 /* not actual */ +#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ + +struct vz_quota_ugid_stat { + unsigned int limit; /* max amount of ugid records */ + unsigned int count; /* amount of ugid records */ + unsigned int flags; +}; + +struct vz_quota_ugid_setlimit { + unsigned int type; /* quota type (USR/GRP) */ + unsigned int id; /* ugid */ + struct if_dqblk dqb; /* limits info */ +}; + +struct vz_quota_ugid_setinfo { + unsigned int type; /* quota type (USR/GRP) */ + struct if_dqinfo dqi; /* grace info */ +}; + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +/* Values for dq_info flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +/* values for dq_state */ +#define VZDQ_STARTING 0 /* created, not turned on yet */ +#define VZDQ_WORKING 1 /* quota created, turned on */ +#define VZDQ_STOPING 2 /* created, turned on and off */ + +/* master quota record - one per veid */ +struct vz_quota_master { + struct list_head dq_hash; /* next quota in hash list */ + atomic_t dq_count; /* inode reference count */ + unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ + unsigned int dq_state; /* see values above */ + unsigned int dq_id; /* VEID this applies to */ + struct dq_stat dq_stat; /* limits, grace, usage stats */ + struct dq_info dq_info; /* grace times and flags */ + spinlock_t dq_data_lock; /* for dq_stat */ + + struct semaphore dq_sem; /* semaphore to protect + ugid tree */ + + struct list_head dq_ilink_list; /* list of vz_quota_ilink */ + struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ + struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ + unsigned int dq_ugid_count; /* amount of ugid records */ + unsigned int dq_ugid_max; /* max amount of ugid records */ + struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ + + struct dentry *dq_root_dentry;/* dentry of fs 
tree */ + struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */ + struct super_block *dq_sb; /* superblock of our quota root */ +}; + +/* UID/GID quota record - one per pair (quota_master, uid or gid) */ +struct vz_quota_ugid { + unsigned int qugid_id; /* UID/GID this applies to */ + struct dq_stat qugid_stat; /* limits, options, usage stats */ + int qugid_type; /* USRQUOTA|GRPQUOTA */ + atomic_t qugid_count; /* reference count */ +}; + +#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) + +struct vz_quota_datast { + struct vz_quota_ilink qlnk; +}; + +#define VIRTINFO_QUOTA_GETSTAT 0 +#define VIRTINFO_QUOTA_ON 1 +#define VIRTINFO_QUOTA_OFF 2 +#define VIRTINFO_QUOTA_DISABLE 3 + +struct virt_info_quota { + struct super_block *super; + struct dq_stat *qstat; +}; + +/* + * Interface to VZ quota core + */ +#define INODE_QLNK(inode) (&(inode)->i_qlnk) +#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) + +#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) + +#define VZ_QUOTAO_SETE 1 +#define VZ_QUOTAO_INIT 2 +#define VZ_QUOTAO_DESTR 3 +#define VZ_QUOTAO_SWAP 4 +#define VZ_QUOTAO_INICAL 5 +#define VZ_QUOTAO_DRCAL 6 +#define VZ_QUOTAO_QSET 7 +#define VZ_QUOTAO_TRANS 8 +#define VZ_QUOTAO_ACT 9 +#define VZ_QUOTAO_DTREE 10 +#define VZ_QUOTAO_DET 11 +#define VZ_QUOTAO_ON 12 +#define VZ_QUOTAO_RE_LOCK 13 + +#define DQUOT_CMD_ALLOC 0 +#define DQUOT_CMD_PREALLOC 1 +#define DQUOT_CMD_CHECK 12 +#define DQUOT_CMD_FORCE 13 + +extern struct semaphore vz_quota_sem; +void inode_qmblk_lock(struct super_block *sb); +void inode_qmblk_unlock(struct super_block *sb); +void qmblk_data_read_lock(struct vz_quota_master *qmblk); +void qmblk_data_read_unlock(struct vz_quota_master *qmblk); +void qmblk_data_write_lock(struct vz_quota_master *qmblk); +void qmblk_data_write_unlock(struct vz_quota_master *qmblk); + +/* for quota operations */ +void vzquota_inode_init_call(struct inode *inode); +void vzquota_inode_drop_call(struct inode *inode); +int vzquota_inode_transfer_call(struct inode *, struct iattr *); +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *); +void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir); +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); +/* for second-level quota */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +/* for management operations */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat); +void vzquota_free_master(struct vz_quota_master *); +struct vz_quota_master *vzquota_find_master(unsigned int quota_id); +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk); +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk); +int vzquota_get_super(struct super_block *sb); +void vzquota_put_super(struct super_block *sb); + +static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) +{ + if (!atomic_read(&qmblk->dq_count)) + BUG(); + atomic_inc(&qmblk->dq_count); + return qmblk; +} + +static inline void __qmblk_put(struct vz_quota_master *qmblk) +{ + atomic_dec(&qmblk->dq_count); +} + +static inline void qmblk_put(struct vz_quota_master *qmblk) +{ + if (!atomic_dec_and_test(&qmblk->dq_count)) + return; + vzquota_free_master(qmblk); +} + +extern struct list_head vzquota_hash_table[]; +extern int vzquota_hash_size; + +/* + * Interface to 
VZ UGID quota + */ +extern struct quotactl_ops vz_quotactl_operations; +extern struct dquot_operations vz_quota_operations2; +extern struct quota_format_type vz_quota_empty_v2_format; + +#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \ + qmblk->dq_uid_tree : \ + qmblk->dq_gid_tree) + +#define VZDQUG_FIND_DONT_ALLOC 1 +#define VZDQUG_FIND_FAKE 2 +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid); +void vzquota_kill_ugid(struct vz_quota_master *qmblk); +int vzquota_ugid_init(void); +void vzquota_ugid_release(void); +int vzquota_transfer_usage(struct inode *inode, int mask, + struct vz_quota_ilink *qlnk); +void vzquota_inode_off(struct inode *inode); + +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat); + +/* + * Other VZ quota parts + */ +extern struct dquot_operations vz_quota_operations; + +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat); +int vzquota_proc_init(void); +void vzquota_proc_release(void); +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +extern struct semaphore vz_quota_sem; + +void vzaquota_init(void); +void vzaquota_fini(void); + +#endif /* __KERNEL__ */ + +#endif /* _VZDQUOTA_H */ diff -uprN linux-2.6.18/include/linux/vzquota_qlnk.h linux-2.6.18.ovz/include/linux/vzquota_qlnk.h --- linux-2.6.18/include/linux/vzquota_qlnk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzquota_qlnk.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,25 @@ +/* + * include/linux/vzquota_qlnk.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZDQUOTA_QLNK_H +#define _VZDQUOTA_QLNK_H + +struct vz_quota_master; +struct vz_quota_ugid; + +/* inode link, used to track inodes using quota via dq_ilink_list */ +struct vz_quota_ilink { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *qugid[MAXQUOTAS]; + struct list_head list; + unsigned char origin[2]; +}; + +#endif /* _VZDQUOTA_QLNK_H */ diff -uprN linux-2.6.18/include/linux/vzratelimit.h linux-2.6.18.ovz/include/linux/vzratelimit.h --- linux-2.6.18/include/linux/vzratelimit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzratelimit.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,28 @@ +/* + * include/linux/vzratelimit.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZ_RATELIMIT_H__ +#define __VZ_RATELIMIT_H__ + +/* + * Generic ratelimiting stuff. + */ + +struct vz_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. 
*/ +int vz_ratelimit(struct vz_rate_info *p); + +#endif /* __VZ_RATELIMIT_H__ */ diff -uprN linux-2.6.18/include/linux/vzstat.h linux-2.6.18.ovz/include/linux/vzstat.h --- linux-2.6.18/include/linux/vzstat.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/linux/vzstat.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,182 @@ +/* + * include/linux/vzstat.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZSTAT_H__ +#define __VZSTAT_H__ + +struct swap_cache_info_struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; + unsigned long remove_race; +}; + +struct kstat_lat_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; +}; +struct kstat_lat_pcpu_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; + seqcount_t lock; +} ____cacheline_aligned_in_smp; + +struct kstat_lat_struct { + struct kstat_lat_snap_struct cur, last; + cycles_t avg[3]; +}; +struct kstat_lat_pcpu_struct { + struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; + cycles_t max_snap; + struct kstat_lat_snap_struct last; + cycles_t avg[3]; +}; + +struct kstat_perf_snap_struct { + cycles_t wall_tottime, cpu_tottime; + cycles_t wall_maxdur, cpu_maxdur; + unsigned long count; +}; +struct kstat_perf_struct { + struct kstat_perf_snap_struct cur, last; +}; + +struct kstat_zone_avg { + unsigned long free_pages_avg[3], + nr_active_avg[3], + nr_inactive_avg[3]; +}; + +#define KSTAT_ALLOCSTAT_NR 5 + +struct kernel_stat_glob { + unsigned long nr_unint_avg[3]; + + unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_pcpu_struct sched_lat; + struct kstat_lat_struct swap_in; + + struct kstat_perf_struct ttfp, cache_reap, + refill_inact, shrink_icache, shrink_dcache; + + struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ +} ____cacheline_aligned; + +extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; +extern spinlock_t kstat_glb_lock; + +#ifdef CONFIG_VE +#define KSTAT_PERF_ENTER(name) \ + unsigned long flags; \ + cycles_t start, sleep_time; \ + \ + start = get_cycles(); \ + sleep_time = VE_TASK_INFO(current)->sleep_time; \ + +#define KSTAT_PERF_LEAVE(name) \ + spin_lock_irqsave(&kstat_glb_lock, flags); \ + kstat_glob.name.cur.count++; \ + start = get_cycles() - start; \ + if (kstat_glob.name.cur.wall_maxdur < start) \ + kstat_glob.name.cur.wall_maxdur = start;\ + kstat_glob.name.cur.wall_tottime += start; \ + start -= VE_TASK_INFO(current)->sleep_time - \ + sleep_time; \ + if (kstat_glob.name.cur.cpu_maxdur < start) \ + kstat_glob.name.cur.cpu_maxdur = start; \ + kstat_glob.name.cur.cpu_tottime += start; \ + spin_unlock_irqrestore(&kstat_glb_lock, flags); \ + +#else +#define KSTAT_PERF_ENTER(name) +#define KSTAT_PERF_LEAVE(name) +#endif + +/* + * Add another statistics reading. + * Serialization is the caller's due. 
+ */ +static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, + cycles_t dur) +{ + p->cur.count++; + if (p->cur.maxlat < dur) + p->cur.maxlat = dur; + p->cur.totlat += dur; +} + +static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, + cycles_t dur) +{ + struct kstat_lat_pcpu_snap_struct *cur; + + cur = &p->cur[cpu]; + write_seqcount_begin(&cur->lock); + cur->count++; + if (cur->maxlat < dur) + cur->maxlat = dur; + cur->totlat += dur; + write_seqcount_end(&cur->lock); +} + +/* + * Move current statistics to last, clear last. + * Serialization is the caller's due. + */ +static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) +{ + cycles_t m; + memcpy(&p->last, &p->cur, sizeof(p->last)); + p->cur.maxlat = 0; + m = p->last.maxlat; + CALC_LOAD(p->avg[0], EXP_1, m) + CALC_LOAD(p->avg[1], EXP_5, m) + CALC_LOAD(p->avg[2], EXP_15, m) +} + +static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) +{ + unsigned i, cpu; + struct kstat_lat_pcpu_snap_struct snap, *cur; + cycles_t m; + + memset(&p->last, 0, sizeof(p->last)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cur = &p->cur[cpu]; + do { + i = read_seqcount_begin(&cur->lock); + memcpy(&snap, cur, sizeof(snap)); + } while (read_seqcount_retry(&cur->lock, i)); + /* + * read above and this update of maxlat is not atomic, + * but this is OK, since it happens rarely and losing + * a couple of peaks is not essential. xemul + */ + cur->maxlat = 0; + + p->last.count += snap.count; + p->last.totlat += snap.totlat; + if (p->last.maxlat < snap.maxlat) + p->last.maxlat = snap.maxlat; + } + + m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap); + CALC_LOAD(p->avg[0], EXP_1, m); + CALC_LOAD(p->avg[1], EXP_5, m); + CALC_LOAD(p->avg[2], EXP_15, m); + /* reset max_snap to calculate it correctly next time */ + p->max_snap = 0; +} + +#endif /* __VZSTAT_H__ */ diff -uprN linux-2.6.18/include/media/cx2341x.h linux-2.6.18.ovz/include/media/cx2341x.h --- linux-2.6.18/include/media/cx2341x.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/media/cx2341x.h 2007-06-13 06:55:07.000000000 -0400 @@ -49,7 +49,7 @@ struct cx2341x_mpeg_params { enum v4l2_mpeg_audio_mode_extension audio_mode_extension; enum v4l2_mpeg_audio_emphasis audio_emphasis; enum v4l2_mpeg_audio_crc audio_crc; - u8 audio_properties; + u16 audio_properties; /* video */ enum v4l2_mpeg_video_encoding video_encoding; diff -uprN linux-2.6.18/include/mtd/Kbuild linux-2.6.18.ovz/include/mtd/Kbuild --- linux-2.6.18/include/mtd/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/mtd/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,6 @@ -unifdef-y := mtd-abi.h -header-y := inftl-user.h jffs2-user.h mtd-user.h nftl-user.h +header-y += inftl-user.h +header-y += jffs2-user.h +header-y += mtd-user.h +header-y += nftl-user.h + +unifdef-y += mtd-abi.h diff -uprN linux-2.6.18/include/net/addrconf.h linux-2.6.18.ovz/include/net/addrconf.h --- linux-2.6.18/include/net/addrconf.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/addrconf.h 2007-06-13 06:55:07.000000000 -0400 @@ -243,5 +243,19 @@ extern int if6_proc_init(void); extern void if6_proc_exit(void); #endif +int addrconf_ifdown(struct net_device *dev, int how); +int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, + __u32 prefered_lft, __u32 valid_lft); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int addrconf_sysctl_init(struct ve_struct *ve); +void addrconf_sysctl_fini(struct ve_struct *ve); +void 
addrconf_sysctl_free(struct ve_struct *ve); +#else +#define addrconf_sysctl_init(ve) (0) +#define addrconf_sysctl_fini(ve) do { } while (0) +#define addrconf_sysctl_free(ve) do { } while (0) +#endif + #endif #endif diff -uprN linux-2.6.18/include/net/af_unix.h linux-2.6.18.ovz/include/net/af_unix.h --- linux-2.6.18/include/net/af_unix.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/af_unix.h 2007-06-13 06:55:07.000000000 -0400 @@ -9,6 +9,7 @@ extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); +extern void unix_destruct_fds(struct sk_buff *skb); #define UNIX_HASH_SIZE 256 @@ -19,23 +20,37 @@ extern atomic_t unix_tot_inflight; static inline struct sock *first_unix_socket(int *i) { + struct sock *s; + struct ve_struct *ve; + + ve = get_exec_env(); for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); + for (s = sk_head(&unix_socket_table[*i]); + s != NULL && !ve_accessible(s->owner_env, ve); + s = sk_next(s)); + if (s != NULL) + return s; } return NULL; } static inline struct sock *next_unix_socket(int *i, struct sock *s) { - struct sock *next = sk_next(s); - /* More in this chain? */ - if (next) - return next; + struct ve_struct *ve; + + ve = get_exec_env(); + for (s = sk_next(s); s != NULL; s = sk_next(s)) { + if (!ve_accessible(s->owner_env, ve)) + continue; + return s; + } /* Look for next non-empty chain. */ for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); + for (s = sk_head(&unix_socket_table[*i]); + s != NULL && !ve_accessible(s->owner_env, ve); + s = sk_next(s)); + if (s != NULL) + return s; } return NULL; } diff -uprN linux-2.6.18/include/net/arp.h linux-2.6.18.ovz/include/net/arp.h --- linux-2.6.18/include/net/arp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/arp.h 2007-06-13 06:55:07.000000000 -0400 @@ -7,7 +7,16 @@ #define HAVE_ARP_CREATE -extern struct neigh_table arp_tbl; +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define arp_tbl (*(get_exec_env()->ve_arp_tbl)) +extern int ve_arp_init(struct ve_struct *ve); +extern void ve_arp_fini(struct ve_struct *ve); +#else +extern struct neigh_table global_arp_tbl; +#define arp_tbl global_arp_tbl +static inline int ve_arp_init(struct ve_struct *ve) { return 0; } +static inline void ve_arp_fini(struct ve_struct *ve) { ; } +#endif extern void arp_init(void); extern int arp_find(unsigned char *haddr, struct sk_buff *skb); diff -uprN linux-2.6.18/include/net/flow.h linux-2.6.18.ovz/include/net/flow.h --- linux-2.6.18/include/net/flow.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/flow.h 2007-06-13 06:55:07.000000000 -0400 @@ -10,6 +10,7 @@ #include #include +struct ve_struct; struct flowi { int oif; int iif; @@ -78,6 +79,9 @@ struct flowi { #define fl_icmp_type uli_u.icmpt.type #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif } __attribute__((__aligned__(BITS_PER_LONG/8))); #define FLOW_DIR_IN 0 diff -uprN linux-2.6.18/include/net/icmp.h linux-2.6.18.ovz/include/net/icmp.h --- linux-2.6.18/include/net/icmp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/icmp.h 2007-06-13 06:55:07.000000000 -0400 @@ -30,9 +30,14 @@ struct icmp_err { extern struct icmp_err icmp_err_convert[]; DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); -#define 
ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) -#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) -#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) +#else +#define ve_icmp_statistics icmp_statistics +#endif +#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) +#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) +#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) struct dst_entry; struct net_proto_family; diff -uprN linux-2.6.18/include/net/if_inet6.h linux-2.6.18.ovz/include/net/if_inet6.h --- linux-2.6.18/include/net/if_inet6.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/if_inet6.h 2007-06-13 06:55:07.000000000 -0400 @@ -191,7 +191,14 @@ struct inet6_dev unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ }; -extern struct ipv6_devconf ipv6_devconf; +extern struct ipv6_devconf global_ipv6_devconf; +extern struct ipv6_devconf global_ipv6_devconf_dflt; + +#ifdef CONFIG_VE +#define ve_ipv6_devconf (*(get_exec_env()->_ipv6_devconf)) +#else +#define ve_ipv6_devconf global_ipv6_devconf +#endif static inline void ipv6_eth_mc_map(struct in6_addr *addr, char *buf) { diff -uprN linux-2.6.18/include/net/inet6_hashtables.h linux-2.6.18.ovz/include/net/inet6_hashtables.h --- linux-2.6.18/include/net/inet6_hashtables.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/inet6_hashtables.h 2007-06-13 06:55:07.000000000 -0400 @@ -26,11 +26,13 @@ struct inet_hashinfo; /* I have no idea if this is a good hash for v6 or not. -DaveM */ static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const u16 fport) + const struct in6_addr *faddr, const u16 fport, + const envid_t veid) { unsigned int hashent = (lport ^ fport); hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); + hashent ^= (veid ^ (veid >> 16)); hashent ^= hashent >> 16; hashent ^= hashent >> 8; return hashent; @@ -44,7 +46,7 @@ static inline int inet6_sk_ehashfn(const const struct in6_addr *faddr = &np->daddr; const __u16 lport = inet->num; const __u16 fport = inet->dport; - return inet6_ehashfn(laddr, lport, faddr, fport); + return inet6_ehashfn(laddr, lport, faddr, fport, VEID(sk->owner_env)); } extern void __inet6_hash(struct inet_hashinfo *hashinfo, struct sock *sk); diff -uprN linux-2.6.18/include/net/inet_hashtables.h linux-2.6.18.ovz/include/net/inet_hashtables.h --- linux-2.6.18/include/net/inet_hashtables.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/inet_hashtables.h 2007-06-13 06:55:07.000000000 -0400 @@ -74,6 +74,7 @@ struct inet_ehash_bucket { * ports are created in O(1) time? I thought so. 
;-) -DaveM */ struct inet_bind_bucket { + struct ve_struct *owner_env; unsigned short port; signed short fastreuse; struct hlist_node node; @@ -138,37 +139,43 @@ static inline struct inet_ehash_bucket * extern struct inet_bind_bucket * inet_bind_bucket_create(kmem_cache_t *cachep, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, + struct ve_struct *env); extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb); -static inline int inet_bhashfn(const __u16 lport, const int bhash_size) +static inline int inet_bhashfn(const __u16 lport, const int bhash_size, + unsigned veid) { - return lport & (bhash_size - 1); + return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); } extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum); /* These can have wildcards, don't try too hard. */ -static inline int inet_lhashfn(const unsigned short num) +static inline int inet_lhashfn(const unsigned short num, unsigned veid) { - return num & (INET_LHTABLE_SIZE - 1); + return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); } static inline int inet_sk_listen_hashfn(const struct sock *sk) { - return inet_lhashfn(inet_sk(sk)->num); + return inet_lhashfn(inet_sk(sk)->num, VEID(sk->owner_env)); } /* Caller must disable local BH processing. */ static inline void __inet_inherit_port(struct inet_hashinfo *table, struct sock *sk, struct sock *child) { - const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); - struct inet_bind_hashbucket *head = &table->bhash[bhash]; + int bhash; + struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; + bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, + VEID(child->owner_env)); + head = &table->bhash[bhash]; + spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; sk_add_bind_node(child, &tb->owners); @@ -274,7 +281,8 @@ static inline int inet_iif(const struct extern struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, const unsigned short hnum, - const int dif); + const int dif, + struct ve_struct *env); /* Optimize the common listener case. 
*/ static inline struct sock * @@ -284,18 +292,21 @@ static inline struct sock * { struct sock *sk = NULL; const struct hlist_head *head; + struct ve_struct *env; + env = get_exec_env(); read_lock(&hashinfo->lhash_lock); - head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; if (!hlist_empty(head)) { const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); if (inet->num == hnum && !sk->sk_node.next && + ve_accessible_strict(sk->owner_env, env) && (!inet->rcv_saddr || inet->rcv_saddr == daddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && !sk->sk_bound_dev_if) goto sherry_cache; - sk = __inet_lookup_listener(head, daddr, hnum, dif); + sk = __inet_lookup_listener(head, daddr, hnum, dif, env); } if (sk) { sherry_cache: @@ -322,25 +333,25 @@ sherry_cache: #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); #endif /* __BIG_ENDIAN */ -#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ +#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ +#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #else /* 32-bit arch */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) -#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ (((__sk)->sk_hash == (__hash)) && \ (inet_sk(__sk)->daddr == (__saddr)) && \ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ (((__sk)->sk_hash == (__hash)) && \ (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ @@ -348,6 +359,18 @@ sherry_cache: (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #endif /* 64-bit arch */ +#define INET_MATCH(__sk, __hash, __cookie, __saddr, \ + __daddr, __ports, __dif, __ve) \ + (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ + (__daddr), (__ports), (__dif)) \ + && ve_accessible_strict((__sk)->owner_env, (__ve))) + +#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \ + __daddr, __ports, __dif, __ve) \ + (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ + (__daddr), (__ports), (__dif)) \ + && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve))) + /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM @@ -367,19 +390,25 @@ static inline struct sock * /* Optimize here for direct hit, only listening connections can * have wildcards anyways. 
*/ - unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); - struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); - + unsigned int hash; + struct inet_ehash_bucket *head; + struct ve_struct *env; + + env = get_exec_env(); + hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(env)); + head = inet_ehash_bucket(hashinfo, hash); prefetch(head->chain.first); read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { - if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk, hash, acookie, saddr, daddr, + ports, dif, env)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { - if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, + ports, dif, env)) goto hit; } sk = NULL; diff -uprN linux-2.6.18/include/net/inet_sock.h linux-2.6.18.ovz/include/net/inet_sock.h --- linux-2.6.18/include/net/inet_sock.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/inet_sock.h 2007-06-13 06:55:07.000000000 -0400 @@ -170,9 +170,10 @@ static inline void inet_sk_copy_descenda extern int inet_sk_rebuild_header(struct sock *sk); static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, - const __u32 faddr, const __u16 fport) + const __u32 faddr, const __u16 fport, + const envid_t veid) { - unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); + int h = (laddr ^ lport) ^ (faddr ^ fport) ^ (veid ^ (veid >> 16)); h ^= h >> 16; h ^= h >> 8; return h; @@ -185,8 +186,9 @@ static inline int inet_sk_ehashfn(const const __u16 lport = inet->num; const __u32 faddr = inet->daddr; const __u16 fport = inet->dport; + envid_t veid = VEID(sk->owner_env); - return inet_ehashfn(laddr, lport, faddr, fport); + return inet_ehashfn(laddr, lport, faddr, fport, veid); } #endif /* _INET_SOCK_H */ diff -uprN linux-2.6.18/include/net/inet_timewait_sock.h linux-2.6.18.ovz/include/net/inet_timewait_sock.h --- linux-2.6.18/include/net/inet_timewait_sock.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/inet_timewait_sock.h 2007-06-13 06:55:07.000000000 -0400 @@ -81,6 +81,7 @@ struct inet_timewait_death_row { struct inet_hashinfo *hashinfo; int sysctl_tw_recycle; int sysctl_max_tw_buckets; + int ub_managed; }; extern void inet_twdr_hangman(unsigned long data); @@ -133,6 +134,7 @@ struct inet_timewait_sock { unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; + envid_t tw_owner_env; }; static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, diff -uprN linux-2.6.18/include/net/ip.h linux-2.6.18.ovz/include/net/ip.h --- linux-2.6.18/include/net/ip.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ip.h 2007-06-13 06:55:07.000000000 -0400 @@ -151,15 +151,25 @@ struct ipv4_config extern struct ipv4_config ipv4_config; DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); -#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) -#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) -#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) +#ifdef CONFIG_VE +#define ve_ip_statistics (get_exec_env()->_ip_statistics) +#else +#define ve_ip_statistics ip_statistics +#endif +#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) +#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) +#define IP_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(ve_ip_statistics, field) DECLARE_SNMP_STAT(struct linux_mib, net_statistics); -#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field) -#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) -#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) -#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) -#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_net_statistics (get_exec_env()->_net_statistics) +#else +#define ve_net_statistics net_statistics +#endif +#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) +#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) +#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) +#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) +#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) extern int sysctl_local_port_range[2]; extern int sysctl_ip_default_ttl; @@ -383,4 +393,11 @@ extern int ip_misc_proc_init(void); extern struct ctl_table ipv4_table[]; +#ifdef CONFIG_SYSCTL +extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, + int nlen, void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context); +#endif #endif /* _IP_H */ diff -uprN linux-2.6.18/include/net/ip6_fib.h linux-2.6.18.ovz/include/net/ip6_fib.h --- linux-2.6.18/include/net/ip6_fib.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ip6_fib.h 2007-06-13 06:55:07.000000000 -0400 @@ -78,6 +78,15 @@ struct rt6_info u8 rt6i_protocol; }; +struct fib6_table +{ + struct list_head list; + struct fib6_node root; + struct ve_struct *owner_env; +}; + +extern struct list_head fib6_table_list; + struct fib6_walker_t { struct fib6_walker_t *prev, *next; @@ -143,7 +152,7 @@ struct rt6_statistics { typedef void (*f_pnode)(struct fib6_node *fn, void *); -extern struct fib6_node ip6_routing_table; +extern struct fib6_node ve0_ip6_routing_table; /* * exported functions diff -uprN linux-2.6.18/include/net/ip6_route.h linux-2.6.18.ovz/include/net/ip6_route.h --- linux-2.6.18/include/net/ip6_route.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ip6_route.h 2007-06-13 06:55:07.000000000 -0400 @@ -165,5 +165,13 @@ static inline int ipv6_unicast_destinati return rt->rt6i_flags & RTF_LOCAL; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int init_ve_route6(struct ve_struct *ve); +void fini_ve_route6(struct ve_struct *ve); +#else +#define init_ve_route6(ve) (0) +#define fini_ve_route6(ve) do { } while (0) +#endif + #endif #endif diff -uprN linux-2.6.18/include/net/ip_fib.h linux-2.6.18.ovz/include/net/ip_fib.h --- linux-2.6.18/include/net/ip_fib.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ip_fib.h 2007-06-13 06:55:07.000000000 -0400 @@ -167,10 +167,22 @@ struct fib_table { unsigned char tb_data[0]; }; +struct fn_zone; +struct fn_hash +{ + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + #ifndef CONFIG_IP_MULTIPLE_TABLES +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ip_fib_local_table get_exec_env()->_local_table +#define ip_fib_main_table get_exec_env()->_main_table +#else extern struct 
fib_table *ip_fib_local_table; extern struct fib_table *ip_fib_main_table; +#endif static inline struct fib_table *fib_get_table(int id) { @@ -202,7 +214,12 @@ static inline void fib_select_default(co #define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL]) #define ip_fib_main_table (fib_tables[RT_TABLE_MAIN]) +#ifdef CONFIG_VE +#define fib_tables get_exec_env()->_fib_tables +#else extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; +#endif + extern int fib_lookup(const struct flowi *flp, struct fib_result *res); extern struct fib_table *__fib_new_table(int id); extern void fib_rule_put(struct fib_rule *r); @@ -249,10 +266,19 @@ extern u32 __fib_res_prefsrc(struct fib /* Exported by fib_hash.c */ extern struct fib_table *fib_hash_init(int id); +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct ve_struct; +extern int init_ve_route(struct ve_struct *ve); +extern void fini_ve_route(struct ve_struct *ve); +#else +#define init_ve_route(ve) (0) +#define fini_ve_route(ve) do { } while (0) +#endif #ifdef CONFIG_IP_MULTIPLE_TABLES /* Exported by fib_rules.c */ - +extern int fib_rules_create(void); +extern void fib_rules_destroy(void); extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); diff -uprN linux-2.6.18/include/net/ipv6.h linux-2.6.18.ovz/include/net/ipv6.h --- linux-2.6.18/include/net/ipv6.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ipv6.h 2007-06-13 06:55:07.000000000 -0400 @@ -112,39 +112,48 @@ extern int sysctl_mld_max_msf; /* MIBs */ DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); -#define IP6_INC_STATS(field) SNMP_INC_STATS(ipv6_statistics, field) -#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ipv6_statistics, field) -#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ipv6_statistics, field) +#ifdef CONFIG_VE +#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics) +#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics) +#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6) +#else +#define ve_ipv6_statistics ipv6_statistics +#define ve_icmpv6_statistics icmpv6_statistics +#define ve_udp_stats_in6 udp_stats_in6 +#endif +#define IP6_INC_STATS(field) SNMP_INC_STATS(ve_ipv6_statistics, field) +#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ipv6_statistics, field) +#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ipv6_statistics, field) DECLARE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); #define ICMP6_INC_STATS(idev, field) ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS(idev->stats.icmpv6, field); \ - SNMP_INC_STATS(icmpv6_statistics, field); \ + SNMP_INC_STATS(ve_icmpv6_statistics, field); \ }) #define ICMP6_INC_STATS_BH(idev, field) ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_BH((_idev)->stats.icmpv6, field); \ - SNMP_INC_STATS_BH(icmpv6_statistics, field); \ + SNMP_INC_STATS_BH(ve_icmpv6_statistics, field); \ }) #define ICMP6_INC_STATS_USER(idev, field) ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_USER(_idev->stats.icmpv6, field); \ - SNMP_INC_STATS_USER(icmpv6_statistics, field); \ + SNMP_INC_STATS_USER(ve_icmpv6_statistics, field); \ }) #define ICMP6_INC_STATS_OFFSET_BH(idev, field, offset) ({ \ struct inet6_dev *_idev = idev; \ __typeof__(offset) _offset = (offset); \ if (likely(_idev != NULL)) 
\ SNMP_INC_STATS_OFFSET_BH(_idev->stats.icmpv6, field, _offset); \ - SNMP_INC_STATS_OFFSET_BH(icmpv6_statistics, field, _offset); \ + SNMP_INC_STATS_OFFSET_BH(ve_icmpv6_statistics, field, _offset); \ }) DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); -#define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field) -#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field) -#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field) +#define UDP6_INC_STATS(field) SNMP_INC_STATS(ve_udp_stats_in6, field) +#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_stats_in6, field) +#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_stats_in6, field) int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); @@ -153,6 +162,14 @@ int snmp6_free_dev(struct inet6_dev *ide int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign); void snmp6_mib_free(void *ptr[2]); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int ve_snmp_proc_init(void); +void ve_snmp_proc_fini(void); +#else +#define ve_snmp_proc_init(void) (0) +#define ve_snmp_proc_fini(void) do { } while (0) +#endif + struct ip6_ra_chain { struct ip6_ra_chain *next; diff -uprN linux-2.6.18/include/net/ndisc.h linux-2.6.18.ovz/include/net/ndisc.h --- linux-2.6.18/include/net/ndisc.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/ndisc.h 2007-06-13 06:55:07.000000000 -0400 @@ -51,7 +51,20 @@ struct net_device; struct net_proto_family; struct sk_buff; -extern struct neigh_table nd_tbl; +#ifdef CONFIG_VE +#define nd_tbl (*(get_exec_env()->ve_nd_tbl)) +#else +#define nd_tbl global_nd_tbl +extern struct neigh_table global_nd_tbl; +#endif + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +extern int ve_ndisc_init(struct ve_struct *ve); +extern void ve_ndisc_fini(struct ve_struct *ve); +#else +#define ve_ndisc_init(ve) (0) +#define ve_ndisc_fini(ve) do { } while (0) +#endif struct nd_msg { struct icmp6hdr icmph; @@ -129,6 +142,7 @@ extern int ndisc_ifinfo_sysctl_change extern void inet6_ifinfo_notify(int event, struct inet6_dev *idev); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline struct neighbour * ndisc_get_neigh(struct net_device *dev, struct in6_addr *addr) { @@ -137,6 +151,7 @@ static inline struct neighbour * ndisc_g return NULL; } +#endif #endif /* __KERNEL__ */ diff -uprN linux-2.6.18/include/net/neighbour.h linux-2.6.18.ovz/include/net/neighbour.h --- linux-2.6.18/include/net/neighbour.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/neighbour.h 2007-06-13 06:55:07.000000000 -0400 @@ -191,6 +191,8 @@ struct neigh_table atomic_t entries; rwlock_t lock; unsigned long last_rand; + struct ve_struct *owner_env; + struct user_beancounter *owner_ub; kmem_cache_t *kmem_cachep; struct neigh_statistics *stats; struct neighbour **hash_buckets; @@ -210,8 +212,8 @@ struct neigh_table #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 -extern void neigh_table_init(struct neigh_table *tbl); -extern void neigh_table_init_no_netlink(struct neigh_table *tbl); +extern int neigh_table_init(struct neigh_table *tbl); +extern int neigh_table_init_no_netlink(struct neigh_table *tbl); extern int neigh_table_clear(struct neigh_table *tbl); extern struct neighbour * neigh_lookup(struct neigh_table *tbl, const void *pkey, diff -uprN linux-2.6.18/include/net/netlink_sock.h linux-2.6.18.ovz/include/net/netlink_sock.h --- linux-2.6.18/include/net/netlink_sock.h 
1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/net/netlink_sock.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,22 @@ +#ifndef __NET_NETLINK_SOCK_H +#define __NET_NETLINK_SOCK_H + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + spinlock_t cb_lock; + void (*data_ready)(struct sock *sk, int bytes); + struct module *module; +}; + +#endif /* __NET_NETLINK_SOCK_H */ diff -uprN linux-2.6.18/include/net/route.h linux-2.6.18.ovz/include/net/route.h --- linux-2.6.18/include/net/route.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/route.h 2007-06-13 06:55:07.000000000 -0400 @@ -137,6 +137,7 @@ static inline void ip_rt_put(struct rtab #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern __u8 ip_tos2prio[16]; +extern int ip_rt_src_check; static inline char rt_tos2priority(u8 tos) { @@ -200,4 +201,14 @@ static inline struct inet_peer *rt_get_p extern ctl_table ipv4_route_table[]; +#ifdef CONFIG_SYSCTL +extern int ipv4_flush_delay; +extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); +extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, int nlen, void __user *oldval, + size_t __user *oldlenp, void __user *newval, + size_t newlen, void **context); +#endif #endif /* _ROUTE_H */ diff -uprN linux-2.6.18/include/net/scm.h linux-2.6.18.ovz/include/net/scm.h --- linux-2.6.18/include/net/scm.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/scm.h 2007-06-13 06:55:07.000000000 -0400 @@ -54,7 +54,7 @@ static __inline__ int scm_send(struct so struct task_struct *p = current; scm->creds.uid = p->uid; scm->creds.gid = p->gid; - scm->creds.pid = p->tgid; + scm->creds.pid = virt_tgid(p); scm->fp = NULL; scm->seq = 0; unix_get_peersec_dgram(sock, scm); diff -uprN linux-2.6.18/include/net/sock.h linux-2.6.18.ovz/include/net/sock.h --- linux-2.6.18/include/net/sock.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/sock.h 2007-06-13 06:55:07.000000000 -0400 @@ -55,6 +55,8 @@ #include #include +#include + /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of @@ -258,6 +260,8 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_beancounter sk_bc; + struct ve_struct *owner_env; }; /* @@ -492,6 +496,8 @@ static inline void sk_add_backlog(struct }) extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount); extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); extern void sk_stream_wait_close(struct sock *sk, long timeo_p); extern int sk_stream_error(struct sock *sk, int flags, int err); @@ -721,8 +727,11 @@ static inline void sk_stream_writequeue_ static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) { - return (int)skb->truesize <= sk->sk_forward_alloc || - sk_stream_mem_schedule(sk, skb->truesize, 1); + if ((int)skb->truesize > sk->sk_forward_alloc && + !sk_stream_mem_schedule(sk, skb->truesize, 1)) + /* The situation is bad according to mainstream. 
Den */ + return 0; + return ub_tcprcvbuf_charge(sk, skb) == 0; } static inline int sk_stream_wmem_schedule(struct sock *sk, int size) @@ -783,6 +792,11 @@ extern struct sk_buff *sock_alloc_send unsigned long size, int noblock, int *errcode); +extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, + unsigned long size, + unsigned long size2, + int noblock, + int *errcode); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); @@ -1041,6 +1055,8 @@ static inline int sk_can_gso(const struc static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + extern int sysctl_tcp_use_sg; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1051,6 +1067,8 @@ static inline void sk_setup_caps(struct else sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; } + if (!sysctl_tcp_use_sg) + sk->sk_route_caps &= ~NETIF_F_SG; } static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) diff -uprN linux-2.6.18/include/net/tcp.h linux-2.6.18.ovz/include/net/tcp.h --- linux-2.6.18/include/net/tcp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/tcp.h 2007-06-13 06:55:07.000000000 -0400 @@ -40,6 +40,13 @@ #include #include +#include + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +#define TW_WSCALE_MASK 0x0f +#define TW_WSCALE_SPEC 0x10 extern struct inet_hashinfo tcp_hashinfo; @@ -214,7 +221,9 @@ extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; +#ifndef sysctl_tcp_adv_win_scale extern int sysctl_tcp_adv_win_scale; +#endif extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; @@ -227,6 +236,10 @@ extern int sysctl_tcp_mtu_probing; extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; +extern int sysctl_tcp_use_sg; +extern int sysctl_tcp_max_tw_kmem_fraction; +extern int sysctl_tcp_max_tw_buckets_ub; + extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -258,12 +271,17 @@ static inline int between(__u32 seq1, __ extern struct proto tcp_prot; DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); -#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) -#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) -#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) -#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) -#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) -#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) +#else +#define ve_tcp_statistics tcp_statistics +#endif +#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) +#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) +#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) +#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) +#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) +#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) extern void tcp_v4_err(struct sk_buff *skb, u32); @@ -510,7 +528,7 @@ extern u32 __tcp_select_window(struct so 
* to use only the low 32-bits of jiffies and hide the ugly * casts with the following macro. */ -#define tcp_time_stamp ((__u32)(jiffies)) +#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission diff -uprN linux-2.6.18/include/net/udp.h linux-2.6.18.ovz/include/net/udp.h --- linux-2.6.18/include/net/udp.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/net/udp.h 2007-06-13 06:55:07.000000000 -0400 @@ -39,13 +39,19 @@ extern rwlock_t udp_hash_lock; extern int udp_port_rover; -static inline int udp_lport_inuse(u16 num) +static inline int udp_hashfn(u16 num, unsigned veid) +{ + return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); +} + +static inline int udp_lport_inuse(u16 num, struct ve_struct *env) { struct sock *sk; struct hlist_node *node; - sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)]) - if (inet_sk(sk)->num == num) + sk_for_each(sk, node, &udp_hash[udp_hashfn(num, VEID(env))]) + if (inet_sk(sk)->num == num && + ve_accessible_strict(sk->owner_env, env)) return 1; return 0; } @@ -75,9 +81,14 @@ extern unsigned int udp_poll(struct file poll_table *wait); DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); -#define UDP_INC_STATS(field) SNMP_INC_STATS(udp_statistics, field) -#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field) -#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field) +#ifdef CONFIG_VE +#define ve_udp_statistics (get_exec_env()->_udp_statistics) +#else +#define ve_udp_statistics udp_statistics +#endif +#define UDP_INC_STATS(field) SNMP_INC_STATS(ve_udp_statistics, field) +#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_statistics, field) +#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_statistics, field) /* /proc */ struct udp_seq_afinfo { diff -uprN linux-2.6.18/include/rdma/Kbuild linux-2.6.18.ovz/include/rdma/Kbuild --- linux-2.6.18/include/rdma/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/rdma/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1 @@ -header-y := ib_user_mad.h +header-y += ib_user_mad.h diff -uprN linux-2.6.18/include/scsi/Kbuild linux-2.6.18.ovz/include/scsi/Kbuild --- linux-2.6.18/include/scsi/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/scsi/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1,2 +1,4 @@ header-y += scsi.h -unifdef-y := scsi_ioctl.h sg.h + +unifdef-y += scsi_ioctl.h +unifdef-y += sg.h diff -uprN linux-2.6.18/include/scsi/libsas.h linux-2.6.18.ovz/include/scsi/libsas.h --- linux-2.6.18/include/scsi/libsas.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/scsi/libsas.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,628 @@ +/* + * SAS host prototypes and structures header file + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
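The udp.h hunk above replaces the flat port masking with udp_hashfn(), which folds the owning VE's ID into the bucket index, and udp_lport_inuse() now reports a port as busy only when the socket belongs to the same VE. A small user-space restatement of the hash, assuming UDP_HTABLE_SIZE is 128 (its value in this tree); the sample port and VEIDs are arbitrary:

#include <stdio.h>

#define UDP_HTABLE_SIZE 128	/* value used by the 2.6.18 tree */

/* Same arithmetic as udp_hashfn() above: XOR-fold the 32-bit VEID
 * and add it to the port before masking, so equal ports owned by
 * different VEs usually land in different hash chains. */
static int udp_hashfn(unsigned short num, unsigned int veid)
{
	return (num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1);
}

int main(void)
{
	printf("port 53, veid 101 -> bucket %d\n", udp_hashfn(53, 101));
	printf("port 53, veid 102 -> bucket %d\n", udp_hashfn(53, 102));
	return 0;
}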
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#ifndef _LIBSAS_H_ +#define _LIBSAS_H_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct block_device; + +enum sas_class { + SAS, + EXPANDER +}; + +enum sas_phy_role { + PHY_ROLE_NONE = 0, + PHY_ROLE_TARGET = 0x40, + PHY_ROLE_INITIATOR = 0x80, +}; + +enum sas_phy_type { + PHY_TYPE_PHYSICAL, + PHY_TYPE_VIRTUAL +}; + +/* The events are mnemonically described in sas_dump.c + * so when updating/adding events here, please also + * update the other file too. + */ +enum ha_event { + HAE_RESET = 0U, + HA_NUM_EVENTS = 1, +}; + +enum port_event { + PORTE_BYTES_DMAED = 0U, + PORTE_BROADCAST_RCVD = 1, + PORTE_LINK_RESET_ERR = 2, + PORTE_TIMER_EVENT = 3, + PORTE_HARD_RESET = 4, + PORT_NUM_EVENTS = 5, +}; + +enum phy_event { + PHYE_LOSS_OF_SIGNAL = 0U, + PHYE_OOB_DONE = 1, + PHYE_OOB_ERROR = 2, + PHYE_SPINUP_HOLD = 3, /* hot plug SATA, no COMWAKE sent */ + PHY_NUM_EVENTS = 4, +}; + +enum discover_event { + DISCE_DISCOVER_DOMAIN = 0U, + DISCE_REVALIDATE_DOMAIN = 1, + DISCE_PORT_GONE = 2, + DISC_NUM_EVENTS = 3, +}; + +/* ---------- Expander Devices ---------- */ + +#define ETASK 0xFA + +#define to_dom_device(_obj) container_of(_obj, struct domain_device, dev_obj) +#define to_dev_attr(_attr) container_of(_attr, struct domain_dev_attribute,\ + attr) + +enum routing_attribute { + DIRECT_ROUTING, + SUBTRACTIVE_ROUTING, + TABLE_ROUTING, +}; + +enum ex_phy_state { + PHY_EMPTY, + PHY_VACANT, + PHY_NOT_PRESENT, + PHY_DEVICE_DISCOVERED +}; + +struct ex_phy { + int phy_id; + + enum ex_phy_state phy_state; + + enum sas_dev_type attached_dev_type; + enum sas_phy_linkrate linkrate; + + u8 attached_sata_host:1; + u8 attached_sata_dev:1; + u8 attached_sata_ps:1; + + enum sas_proto attached_tproto; + enum sas_proto attached_iproto; + + u8 attached_sas_addr[SAS_ADDR_SIZE]; + u8 attached_phy_id; + + u8 phy_change_count; + enum routing_attribute routing_attr; + u8 virtual:1; + + int last_da_index; + + struct sas_phy *phy; + struct sas_port *port; +}; + +struct expander_device { + struct list_head children; + + u16 ex_change_count; + u16 max_route_indexes; + u8 num_phys; + u8 configuring:1; + u8 conf_route_table:1; + u8 enclosure_logical_id[8]; + + struct ex_phy *ex_phy; + struct sas_port *parent_port; +}; + +/* ---------- SATA device ---------- */ +enum ata_command_set { + ATA_COMMAND_SET = 0, + ATAPI_COMMAND_SET = 1, +}; + +struct sata_device { + enum ata_command_set command_set; + struct smp_resp rps_resp; /* report_phy_sata_resp */ + __le16 *identify_device; + __le16 *identify_packet_device; + + u8 port_no; /* port number, if this is a PM (Port) */ + struct list_head children; /* PM Ports if this is a PM */ +}; + +/* ---------- Domain device ---------- */ +struct domain_device { + enum sas_dev_type dev_type; + + enum sas_phy_linkrate linkrate; + enum sas_phy_linkrate min_linkrate; + enum sas_phy_linkrate max_linkrate; + + int pathways; + + struct domain_device *parent; + struct list_head siblings; /* devices on the same level */ + struct asd_sas_port *port; /* shortcut to root of the tree */ + + struct list_head dev_list_node; + + enum sas_proto iproto; + enum sas_proto tproto; + + struct sas_rphy *rphy; + + u8 sas_addr[SAS_ADDR_SIZE]; + u8 hashed_sas_addr[HASHED_SAS_ADDR_SIZE]; + + u8 frame_rcvd[32]; + + union { + struct expander_device 
ex_dev; + struct sata_device sata_dev; /* STP & directly attached */ + }; + + void *lldd_dev; +}; + +struct sas_discovery { + spinlock_t disc_event_lock; + struct work_struct disc_work[DISC_NUM_EVENTS]; + unsigned long pending; + u8 fanout_sas_addr[8]; + u8 eeds_a[8]; + u8 eeds_b[8]; + int max_level; +}; + + +/* The port struct is Class:RW, driver:RO */ +struct asd_sas_port { +/* private: */ + struct completion port_gone_completion; + + struct sas_discovery disc; + struct domain_device *port_dev; + spinlock_t dev_list_lock; + struct list_head dev_list; + enum sas_phy_linkrate linkrate; + + struct sas_phy *phy; + struct work_struct work; + +/* public: */ + int id; + + enum sas_class class; + u8 sas_addr[SAS_ADDR_SIZE]; + u8 attached_sas_addr[SAS_ADDR_SIZE]; + enum sas_proto iproto; + enum sas_proto tproto; + + enum sas_oob_mode oob_mode; + + spinlock_t phy_list_lock; + struct list_head phy_list; + int num_phys; + u32 phy_mask; + + struct sas_ha_struct *ha; + + struct sas_port *port; + + void *lldd_port; /* not touched by the sas class code */ +}; + +/* The phy pretty much is controlled by the LLDD. + * The class only reads those fields. + */ +struct asd_sas_phy { +/* private: */ + /* protected by ha->event_lock */ + struct work_struct port_events[PORT_NUM_EVENTS]; + struct work_struct phy_events[PHY_NUM_EVENTS]; + + unsigned long port_events_pending; + unsigned long phy_events_pending; + + int error; + + struct sas_phy *phy; + +/* public: */ + /* The following are class:RO, driver:R/W */ + int enabled; /* must be set */ + + int id; /* must be set */ + enum sas_class class; + enum sas_proto iproto; + enum sas_proto tproto; + + enum sas_phy_type type; + enum sas_phy_role role; + enum sas_oob_mode oob_mode; + enum sas_phy_linkrate linkrate; + + u8 *sas_addr; /* must be set */ + u8 attached_sas_addr[SAS_ADDR_SIZE]; /* class:RO, driver: R/W */ + + spinlock_t frame_rcvd_lock; + u8 *frame_rcvd; /* must be set */ + int frame_rcvd_size; + + spinlock_t sas_prim_lock; + u32 sas_prim; + + struct list_head port_phy_el; /* driver:RO */ + struct asd_sas_port *port; /* Class:RW, driver: RO */ + + struct sas_ha_struct *ha; /* may be set; the class sets it anyway */ + + void *lldd_phy; /* not touched by the sas_class_code */ +}; + +struct scsi_core { + struct Scsi_Host *shost; + + spinlock_t task_queue_lock; + struct list_head task_queue; + int task_queue_size; + + struct semaphore queue_thread_sema; + int queue_thread_kill; +}; + +struct sas_ha_struct { +/* private: */ + spinlock_t event_lock; + struct work_struct ha_events[HA_NUM_EVENTS]; + unsigned long pending; + + struct scsi_core core; + +/* public: */ + char *sas_ha_name; + struct pci_dev *pcidev; /* should be set */ + struct module *lldd_module; /* should be set */ + + u8 *sas_addr; /* must be set */ + u8 hashed_sas_addr[HASHED_SAS_ADDR_SIZE]; + + spinlock_t phy_port_lock; + struct asd_sas_phy **sas_phy; /* array of valid pointers, must be set */ + struct asd_sas_port **sas_port; /* array of valid pointers, must be set */ + int num_phys; /* must be set, gt 0, static */ + + /* The class calls this to send a task for execution. */ + int lldd_max_execute_num; + int lldd_queue_size; + + /* LLDD calls these to notify the class of an event. 
*/ + void (*notify_ha_event)(struct sas_ha_struct *, enum ha_event); + void (*notify_port_event)(struct asd_sas_phy *, enum port_event); + void (*notify_phy_event)(struct asd_sas_phy *, enum phy_event); + + void *lldd_ha; /* not touched by sas class code */ +}; + +#define SHOST_TO_SAS_HA(_shost) (*(struct sas_ha_struct **)(_shost)->hostdata) + +static inline struct domain_device * +starget_to_domain_dev(struct scsi_target *starget) { + return starget->hostdata; +} + +static inline struct domain_device * +sdev_to_domain_dev(struct scsi_device *sdev) { + return starget_to_domain_dev(sdev->sdev_target); +} + +static inline struct domain_device * +cmd_to_domain_dev(struct scsi_cmnd *cmd) +{ + return sdev_to_domain_dev(cmd->device); +} + +void sas_hash_addr(u8 *hashed, const u8 *sas_addr); + +/* Before calling a notify event, LLDD should use this function + * when the link is severed (possibly from its tasklet). + * The idea is that the Class only reads those, while the LLDD, + * can R/W these (thus avoiding a race). + */ +static inline void sas_phy_disconnected(struct asd_sas_phy *phy) +{ + phy->oob_mode = OOB_NOT_CONNECTED; + phy->linkrate = PHY_LINKRATE_NONE; +} + +/* ---------- Tasks ---------- */ +/* + service_response | SAS_TASK_COMPLETE | SAS_TASK_UNDELIVERED | + exec_status | | | + ---------------------+---------------------+-----------------------+ + SAM_... | X | | + DEV_NO_RESPONSE | X | X | + INTERRUPTED | X | | + QUEUE_FULL | | X | + DEVICE_UNKNOWN | | X | + SG_ERR | | X | + ---------------------+---------------------+-----------------------+ + */ + +enum service_response { + SAS_TASK_COMPLETE, + SAS_TASK_UNDELIVERED = -1, +}; + +enum exec_status { + SAM_GOOD = 0, + SAM_CHECK_COND = 2, + SAM_COND_MET = 4, + SAM_BUSY = 8, + SAM_INTERMEDIATE = 0x10, + SAM_IM_COND_MET = 0x12, + SAM_RESV_CONFLICT= 0x14, + SAM_TASK_SET_FULL= 0x28, + SAM_ACA_ACTIVE = 0x30, + SAM_TASK_ABORTED = 0x40, + + SAS_DEV_NO_RESPONSE = 0x80, + SAS_DATA_UNDERRUN, + SAS_DATA_OVERRUN, + SAS_INTERRUPTED, + SAS_QUEUE_FULL, + SAS_DEVICE_UNKNOWN, + SAS_SG_ERR, + SAS_OPEN_REJECT, + SAS_OPEN_TO, + SAS_PROTO_RESPONSE, + SAS_PHY_DOWN, + SAS_NAK_R_ERR, + SAS_PENDING, + SAS_ABORTED_TASK, +}; + +/* When a task finishes with a response, the LLDD examines the + * response: + * - For an ATA task task_status_struct::stat is set to + * SAS_PROTO_RESPONSE, and the task_status_struct::buf is set to the + * contents of struct ata_task_resp. + * - For SSP tasks, if no data is present or status/TMF response + * is valid, task_status_struct::stat is set. If data is present + * (SENSE data), the LLDD copies up to SAS_STATUS_BUF_SIZE, sets + * task_status_struct::buf_valid_size, and task_status_struct::stat is + * set to SAM_CHECK_COND. + * + * "buf" has format SCSI Sense for SSP task, or struct ata_task_resp + * for ATA task. + * + * "frame_len" is the total frame length, which could be more or less + * than actually copied. + * + * Tasks ending with response, always set the residual field. + */ +struct ata_task_resp { + u16 frame_len; + u8 ending_fis[24]; /* dev to host or data-in */ + u32 sstatus; + u32 serror; + u32 scontrol; + u32 sactive; +}; + +#define SAS_STATUS_BUF_SIZE 96 + +struct task_status_struct { + enum service_response resp; + enum exec_status stat; + int buf_valid_size; + + u8 buf[SAS_STATUS_BUF_SIZE]; + + u32 residual; + enum sas_open_rej_reason open_rej_reason; +}; + +/* ATA and ATAPI task queuable to a SAS LLDD. 
+ */ +struct sas_ata_task { + struct host_to_dev_fis fis; + u8 atapi_packet[16]; /* 0 if not ATAPI task */ + + u8 retry_count; /* hardware retry, should be > 0 */ + + u8 dma_xfer:1; /* PIO:0 or DMA:1 */ + u8 use_ncq:1; + u8 set_affil_pol:1; + u8 stp_affil_pol:1; + + u8 device_control_reg_update:1; +}; + +struct sas_smp_task { + struct scatterlist smp_req; + struct scatterlist smp_resp; +}; + +enum task_attribute { + TASK_ATTR_SIMPLE = 0, + TASK_ATTR_HOQ = 1, + TASK_ATTR_ORDERED= 2, + TASK_ATTR_ACA = 4, +}; + +struct sas_ssp_task { + u8 retry_count; /* hardware retry, should be > 0 */ + + u8 LUN[8]; + u8 enable_first_burst:1; + enum task_attribute task_attr; + u8 task_prio; + u8 cdb[16]; +}; + +struct sas_task { + struct domain_device *dev; + struct list_head list; + + spinlock_t task_state_lock; + unsigned task_state_flags; + + enum sas_proto task_proto; + + /* Used by the discovery code. */ + struct timer_list timer; + struct completion completion; + + union { + struct sas_ata_task ata_task; + struct sas_smp_task smp_task; + struct sas_ssp_task ssp_task; + }; + + struct scatterlist *scatter; + int num_scatter; + u32 total_xfer_len; + u8 data_dir:2; /* Use PCI_DMA_... */ + + struct task_status_struct task_status; + void (*task_done)(struct sas_task *); + + void *lldd_task; /* for use by LLDDs */ + void *uldd_task; +}; + + + +#define SAS_TASK_STATE_PENDING 1 +#define SAS_TASK_STATE_DONE 2 +#define SAS_TASK_STATE_ABORTED 4 + +static inline struct sas_task *sas_alloc_task(unsigned long flags) +{ + extern kmem_cache_t *sas_task_cache; + struct sas_task *task = kmem_cache_alloc(sas_task_cache, flags); + + if (task) { + memset(task, 0, sizeof(*task)); + INIT_LIST_HEAD(&task->list); + spin_lock_init(&task->task_state_lock); + task->task_state_flags = SAS_TASK_STATE_PENDING; + init_timer(&task->timer); + init_completion(&task->completion); + } + + return task; +} + +static inline void sas_free_task(struct sas_task *task) +{ + if (task) { + extern kmem_cache_t *sas_task_cache; + BUG_ON(!list_empty(&task->list)); + kmem_cache_free(sas_task_cache, task); + } +} + +struct sas_domain_function_template { + /* The class calls these to notify the LLDD of an event. */ + void (*lldd_port_formed)(struct asd_sas_phy *); + void (*lldd_port_deformed)(struct asd_sas_phy *); + + /* The class calls these when a device is found or gone. */ + int (*lldd_dev_found)(struct domain_device *); + void (*lldd_dev_gone)(struct domain_device *); + + int (*lldd_execute_task)(struct sas_task *, int num, + unsigned long gfp_flags); + + /* Task Management Functions. Must be called from process context. 
*/ + int (*lldd_abort_task)(struct sas_task *); + int (*lldd_abort_task_set)(struct domain_device *, u8 *lun); + int (*lldd_clear_aca)(struct domain_device *, u8 *lun); + int (*lldd_clear_task_set)(struct domain_device *, u8 *lun); + int (*lldd_I_T_nexus_reset)(struct domain_device *); + int (*lldd_lu_reset)(struct domain_device *, u8 *lun); + int (*lldd_query_task)(struct sas_task *); + + /* Port and Adapter management */ + int (*lldd_clear_nexus_port)(struct asd_sas_port *); + int (*lldd_clear_nexus_ha)(struct sas_ha_struct *); + + /* Phy management */ + int (*lldd_control_phy)(struct asd_sas_phy *, enum phy_func); +}; + +extern int sas_register_ha(struct sas_ha_struct *); +extern int sas_unregister_ha(struct sas_ha_struct *); + +extern int sas_queuecommand(struct scsi_cmnd *, + void (*scsi_done)(struct scsi_cmnd *)); +extern int sas_target_alloc(struct scsi_target *); +extern int sas_slave_alloc(struct scsi_device *); +extern int sas_slave_configure(struct scsi_device *); +extern void sas_slave_destroy(struct scsi_device *); +extern int sas_change_queue_depth(struct scsi_device *, int new_depth); +extern int sas_change_queue_type(struct scsi_device *, int qt); +extern int sas_bios_param(struct scsi_device *, + struct block_device *, + sector_t capacity, int *hsc); +extern struct scsi_transport_template * +sas_domain_attach_transport(struct sas_domain_function_template *); +extern void sas_domain_release_transport(struct scsi_transport_template *); + +int sas_discover_root_expander(struct domain_device *); + +void sas_init_ex_attr(void); + +int sas_ex_revalidate_domain(struct domain_device *); + +void sas_unregister_domain_devices(struct asd_sas_port *port); +void sas_init_disc(struct sas_discovery *disc, struct asd_sas_port *); +int sas_discover_event(struct asd_sas_port *, enum discover_event ev); + +int sas_discover_sata(struct domain_device *); +int sas_discover_end_dev(struct domain_device *); + +void sas_unregister_dev(struct domain_device *); + +void sas_init_dev(struct domain_device *); + +#endif /* _SASLIB_H_ */ diff -uprN linux-2.6.18/include/scsi/sas.h linux-2.6.18.ovz/include/scsi/sas.h --- linux-2.6.18/include/scsi/sas.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/scsi/sas.h 2007-06-13 06:55:08.000000000 -0400 @@ -0,0 +1,644 @@ +/* + * SAS structures and definitions header file + * + * Copyright (C) 2005 Adaptec, Inc. All rights reserved. + * Copyright (C) 2005 Luben Tuikov + * + * This file is licensed under GPLv2. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
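The table in the libsas task-status comment above pairs each exec_status with the service_response values it can legally accompany. A compilable sketch of one way an upper layer could read that table; the enum values are copied from the header, but the classify() policy is illustrative, not kernel code:

#include <stdio.h>

enum service_response {
	SAS_TASK_COMPLETE	= 0,
	SAS_TASK_UNDELIVERED	= -1,
};

enum exec_status {			/* values as in the header */
	SAM_GOOD		= 0,
	SAS_DEV_NO_RESPONSE	= 0x80,
	SAS_DATA_UNDERRUN,
	SAS_DATA_OVERRUN,
	SAS_INTERRUPTED,
	SAS_QUEUE_FULL,
	SAS_DEVICE_UNKNOWN,
	SAS_SG_ERR,
};

/* Per the matrix: QUEUE_FULL, DEVICE_UNKNOWN and SG_ERR appear only
 * with UNDELIVERED; INTERRUPTED only with COMPLETE; DEV_NO_RESPONSE
 * may appear with either.  The verdicts are illustrative. */
static const char *classify(enum service_response resp, enum exec_status stat)
{
	if (resp == SAS_TASK_UNDELIVERED)
		return stat == SAS_QUEUE_FULL ? "requeue" : "fail";
	return stat == SAM_GOOD ? "done" : "error path";
}

int main(void)
{
	printf("%s\n", classify(SAS_TASK_UNDELIVERED, SAS_QUEUE_FULL));
	printf("%s\n", classify(SAS_TASK_COMPLETE, SAM_GOOD));
	return 0;
}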
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#ifndef _SAS_H_ +#define _SAS_H_ + +#include +#include + +#define SAS_ADDR_SIZE 8 +#define HASHED_SAS_ADDR_SIZE 3 +#define SAS_ADDR(_sa) ((unsigned long long) be64_to_cpu(*(__be64 *)(_sa))) + +#define SMP_REQUEST 0x40 +#define SMP_RESPONSE 0x41 + +#define SSP_DATA 0x01 +#define SSP_XFER_RDY 0x05 +#define SSP_COMMAND 0x06 +#define SSP_RESPONSE 0x07 +#define SSP_TASK 0x16 + +#define SMP_REPORT_GENERAL 0x00 +#define SMP_REPORT_MANUF_INFO 0x01 +#define SMP_READ_GPIO_REG 0x02 +#define SMP_DISCOVER 0x10 +#define SMP_REPORT_PHY_ERR_LOG 0x11 +#define SMP_REPORT_PHY_SATA 0x12 +#define SMP_REPORT_ROUTE_INFO 0x13 +#define SMP_WRITE_GPIO_REG 0x82 +#define SMP_CONF_ROUTE_INFO 0x90 +#define SMP_PHY_CONTROL 0x91 +#define SMP_PHY_TEST_FUNCTION 0x92 + +#define SMP_RESP_FUNC_ACC 0x00 +#define SMP_RESP_FUNC_UNK 0x01 +#define SMP_RESP_FUNC_FAILED 0x02 +#define SMP_RESP_INV_FRM_LEN 0x03 +#define SMP_RESP_NO_PHY 0x10 +#define SMP_RESP_NO_INDEX 0x11 +#define SMP_RESP_PHY_NO_SATA 0x12 +#define SMP_RESP_PHY_UNK_OP 0x13 +#define SMP_RESP_PHY_UNK_TESTF 0x14 +#define SMP_RESP_PHY_TEST_INPROG 0x15 +#define SMP_RESP_PHY_VACANT 0x16 + +/* SAM TMFs */ +#define TMF_ABORT_TASK 0x01 +#define TMF_ABORT_TASK_SET 0x02 +#define TMF_CLEAR_TASK_SET 0x04 +#define TMF_LU_RESET 0x08 +#define TMF_CLEAR_ACA 0x40 +#define TMF_QUERY_TASK 0x80 + +/* SAS TMF responses */ +#define TMF_RESP_FUNC_COMPLETE 0x00 +#define TMF_RESP_INVALID_FRAME 0x02 +#define TMF_RESP_FUNC_ESUPP 0x04 +#define TMF_RESP_FUNC_FAILED 0x05 +#define TMF_RESP_FUNC_SUCC 0x08 +#define TMF_RESP_NO_LUN 0x09 +#define TMF_RESP_OVERLAPPED_TAG 0x0A + +enum sas_oob_mode { + OOB_NOT_CONNECTED, + SATA_OOB_MODE, + SAS_OOB_MODE +}; + +/* See sas_discover.c if you plan on changing these. + */ +enum sas_dev_type { + NO_DEVICE = 0, /* protocol */ + SAS_END_DEV = 1, /* protocol */ + EDGE_DEV = 2, /* protocol */ + FANOUT_DEV = 3, /* protocol */ + SAS_HA = 4, + SATA_DEV = 5, + SATA_PM = 7, + SATA_PM_PORT= 8, +}; + +enum sas_phy_linkrate { + PHY_LINKRATE_NONE = 0, + PHY_LINKRATE_UNKNOWN = 0, + PHY_DISABLED, + PHY_RESET_PROBLEM, + PHY_SPINUP_HOLD, + PHY_PORT_SELECTOR, + PHY_LINKRATE_1_5 = 0x08, + PHY_LINKRATE_G1 = PHY_LINKRATE_1_5, + PHY_LINKRATE_3 = 0x09, + PHY_LINKRATE_G2 = PHY_LINKRATE_3, + PHY_LINKRATE_6 = 0x0A, +}; + +/* Partly from IDENTIFY address frame. */ +enum sas_proto { + SATA_PROTO = 1, + SAS_PROTO_SMP = 2, /* protocol */ + SAS_PROTO_STP = 4, /* protocol */ + SAS_PROTO_SSP = 8, /* protocol */ + SAS_PROTO_ALL = 0xE, +}; + +/* From the spec; local phys only */ +enum phy_func { + PHY_FUNC_NOP, + PHY_FUNC_LINK_RESET, /* Enables the phy */ + PHY_FUNC_HARD_RESET, + PHY_FUNC_DISABLE, + PHY_FUNC_CLEAR_ERROR_LOG = 5, + PHY_FUNC_CLEAR_AFFIL, + PHY_FUNC_TX_SATA_PS_SIGNAL, + PHY_FUNC_RELEASE_SPINUP_HOLD = 0x10, /* LOCAL PORT ONLY! */ +}; + +/* SAS LLDD would need to report only _very_few_ of those, like BROADCAST. + * Most of those are here for completeness. 
+ */ +enum sas_prim { + SAS_PRIM_AIP_NORMAL = 1, + SAS_PRIM_AIP_R0 = 2, + SAS_PRIM_AIP_R1 = 3, + SAS_PRIM_AIP_R2 = 4, + SAS_PRIM_AIP_WC = 5, + SAS_PRIM_AIP_WD = 6, + SAS_PRIM_AIP_WP = 7, + SAS_PRIM_AIP_RWP = 8, + + SAS_PRIM_BC_CH = 9, + SAS_PRIM_BC_RCH0 = 10, + SAS_PRIM_BC_RCH1 = 11, + SAS_PRIM_BC_R0 = 12, + SAS_PRIM_BC_R1 = 13, + SAS_PRIM_BC_R2 = 14, + SAS_PRIM_BC_R3 = 15, + SAS_PRIM_BC_R4 = 16, + + SAS_PRIM_NOTIFY_ENSP= 17, + SAS_PRIM_NOTIFY_R0 = 18, + SAS_PRIM_NOTIFY_R1 = 19, + SAS_PRIM_NOTIFY_R2 = 20, + + SAS_PRIM_CLOSE_CLAF = 21, + SAS_PRIM_CLOSE_NORM = 22, + SAS_PRIM_CLOSE_R0 = 23, + SAS_PRIM_CLOSE_R1 = 24, + + SAS_PRIM_OPEN_RTRY = 25, + SAS_PRIM_OPEN_RJCT = 26, + SAS_PRIM_OPEN_ACPT = 27, + + SAS_PRIM_DONE = 28, + SAS_PRIM_BREAK = 29, + + SATA_PRIM_DMAT = 33, + SATA_PRIM_PMNAK = 34, + SATA_PRIM_PMACK = 35, + SATA_PRIM_PMREQ_S = 36, + SATA_PRIM_PMREQ_P = 37, + SATA_SATA_R_ERR = 38, +}; + +enum sas_open_rej_reason { + /* Abandon open */ + SAS_OREJ_UNKNOWN = 0, + SAS_OREJ_BAD_DEST = 1, + SAS_OREJ_CONN_RATE = 2, + SAS_OREJ_EPROTO = 3, + SAS_OREJ_RESV_AB0 = 4, + SAS_OREJ_RESV_AB1 = 5, + SAS_OREJ_RESV_AB2 = 6, + SAS_OREJ_RESV_AB3 = 7, + SAS_OREJ_WRONG_DEST= 8, + SAS_OREJ_STP_NORES = 9, + + /* Retry open */ + SAS_OREJ_NO_DEST = 10, + SAS_OREJ_PATH_BLOCKED = 11, + SAS_OREJ_RSVD_CONT0 = 12, + SAS_OREJ_RSVD_CONT1 = 13, + SAS_OREJ_RSVD_INIT0 = 14, + SAS_OREJ_RSVD_INIT1 = 15, + SAS_OREJ_RSVD_STOP0 = 16, + SAS_OREJ_RSVD_STOP1 = 17, + SAS_OREJ_RSVD_RETRY = 18, +}; + +struct dev_to_host_fis { + u8 fis_type; /* 0x34 */ + u8 flags; + u8 status; + u8 error; + + u8 lbal; + union { u8 lbam; u8 byte_count_low; }; + union { u8 lbah; u8 byte_count_high; }; + u8 device; + + u8 lbal_exp; + u8 lbam_exp; + u8 lbah_exp; + u8 _r_a; + + union { u8 sector_count; u8 interrupt_reason; }; + u8 sector_count_exp; + u8 _r_b; + u8 _r_c; + + u32 _r_d; +} __attribute__ ((packed)); + +struct host_to_dev_fis { + u8 fis_type; /* 0x27 */ + u8 flags; + u8 command; + u8 features; + + u8 lbal; + union { u8 lbam; u8 byte_count_low; }; + union { u8 lbah; u8 byte_count_high; }; + u8 device; + + u8 lbal_exp; + u8 lbam_exp; + u8 lbah_exp; + u8 features_exp; + + union { u8 sector_count; u8 interrupt_reason; }; + u8 sector_count_exp; + u8 _r_a; + u8 control; + + u32 _r_b; +} __attribute__ ((packed)); + +/* Prefer to have code clarity over header file clarity. 
+ */ +#ifdef __LITTLE_ENDIAN_BITFIELD +struct sas_identify_frame { + /* Byte 0 */ + u8 frame_type:4; + u8 dev_type:3; + u8 _un0:1; + + /* Byte 1 */ + u8 _un1; + + /* Byte 2 */ + union { + struct { + u8 _un20:1; + u8 smp_iport:1; + u8 stp_iport:1; + u8 ssp_iport:1; + u8 _un247:4; + }; + u8 initiator_bits; + }; + + /* Byte 3 */ + union { + struct { + u8 _un30:1; + u8 smp_tport:1; + u8 stp_tport:1; + u8 ssp_tport:1; + u8 _un347:4; + }; + u8 target_bits; + }; + + /* Byte 4 - 11 */ + u8 _un4_11[8]; + + /* Byte 12 - 19 */ + u8 sas_addr[SAS_ADDR_SIZE]; + + /* Byte 20 */ + u8 phy_id; + + u8 _un21_27[7]; + + __be32 crc; +} __attribute__ ((packed)); + +struct ssp_frame_hdr { + u8 frame_type; + u8 hashed_dest_addr[HASHED_SAS_ADDR_SIZE]; + u8 _r_a; + u8 hashed_src_addr[HASHED_SAS_ADDR_SIZE]; + __be16 _r_b; + + u8 changing_data_ptr:1; + u8 retransmit:1; + u8 retry_data_frames:1; + u8 _r_c:5; + + u8 num_fill_bytes:2; + u8 _r_d:6; + + u32 _r_e; + __be16 tag; + __be16 tptt; + __be32 data_offs; +} __attribute__ ((packed)); + +struct ssp_response_iu { + u8 _r_a[10]; + + u8 datapres:2; + u8 _r_b:6; + + u8 status; + + u32 _r_c; + + __be32 sense_data_len; + __be32 response_data_len; + + u8 resp_data[0]; + u8 sense_data[0]; +} __attribute__ ((packed)); + +/* ---------- SMP ---------- */ + +struct report_general_resp { + __be16 change_count; + __be16 route_indexes; + u8 _r_a; + u8 num_phys; + + u8 conf_route_table:1; + u8 configuring:1; + u8 _r_b:6; + + u8 _r_c; + + u8 enclosure_logical_id[8]; + + u8 _r_d[12]; +} __attribute__ ((packed)); + +struct discover_resp { + u8 _r_a[5]; + + u8 phy_id; + __be16 _r_b; + + u8 _r_c:4; + u8 attached_dev_type:3; + u8 _r_d:1; + + u8 linkrate:4; + u8 _r_e:4; + + u8 attached_sata_host:1; + u8 iproto:3; + u8 _r_f:4; + + u8 attached_sata_dev:1; + u8 tproto:3; + u8 _r_g:3; + u8 attached_sata_ps:1; + + u8 sas_addr[8]; + u8 attached_sas_addr[8]; + u8 attached_phy_id; + + u8 _r_h[7]; + + u8 hmin_linkrate:4; + u8 pmin_linkrate:4; + u8 hmax_linkrate:4; + u8 pmax_linkrate:4; + + u8 change_count; + + u8 pptv:4; + u8 _r_i:3; + u8 virtual:1; + + u8 routing_attr:4; + u8 _r_j:4; + + u8 conn_type; + u8 conn_el_index; + u8 conn_phy_link; + + u8 _r_k[8]; +} __attribute__ ((packed)); + +struct report_phy_sata_resp { + u8 _r_a[5]; + + u8 phy_id; + u8 _r_b; + + u8 affil_valid:1; + u8 affil_supp:1; + u8 _r_c:6; + + u32 _r_d; + + u8 stp_sas_addr[8]; + + struct dev_to_host_fis fis; + + u32 _r_e; + + u8 affil_stp_ini_addr[8]; + + __be32 crc; +} __attribute__ ((packed)); + +struct smp_resp { + u8 frame_type; + u8 function; + u8 result; + u8 reserved; + union { + struct report_general_resp rg; + struct discover_resp disc; + struct report_phy_sata_resp rps; + }; +} __attribute__ ((packed)); + +#elif defined(__BIG_ENDIAN_BITFIELD) +struct sas_identify_frame { + /* Byte 0 */ + u8 _un0:1; + u8 dev_type:3; + u8 frame_type:4; + + /* Byte 1 */ + u8 _un1; + + /* Byte 2 */ + union { + struct { + u8 _un247:4; + u8 ssp_iport:1; + u8 stp_iport:1; + u8 smp_iport:1; + u8 _un20:1; + }; + u8 initiator_bits; + }; + + /* Byte 3 */ + union { + struct { + u8 _un347:4; + u8 ssp_tport:1; + u8 stp_tport:1; + u8 smp_tport:1; + u8 _un30:1; + }; + u8 target_bits; + }; + + /* Byte 4 - 11 */ + u8 _un4_11[8]; + + /* Byte 12 - 19 */ + u8 sas_addr[SAS_ADDR_SIZE]; + + /* Byte 20 */ + u8 phy_id; + + u8 _un21_27[7]; + + __be32 crc; +} __attribute__ ((packed)); + +struct ssp_frame_hdr { + u8 frame_type; + u8 hashed_dest_addr[HASHED_SAS_ADDR_SIZE]; + u8 _r_a; + u8 hashed_src_addr[HASHED_SAS_ADDR_SIZE]; + __be16 _r_b; + + u8 _r_c:5; + u8 
retry_data_frames:1; + u8 retransmit:1; + u8 changing_data_ptr:1; + + u8 _r_d:6; + u8 num_fill_bytes:2; + + u32 _r_e; + __be16 tag; + __be16 tptt; + __be32 data_offs; +} __attribute__ ((packed)); + +struct ssp_response_iu { + u8 _r_a[10]; + + u8 _r_b:6; + u8 datapres:2; + + u8 status; + + u32 _r_c; + + __be32 sense_data_len; + __be32 response_data_len; + + u8 resp_data[0]; + u8 sense_data[0]; +} __attribute__ ((packed)); + +/* ---------- SMP ---------- */ + +struct report_general_resp { + __be16 change_count; + __be16 route_indexes; + u8 _r_a; + u8 num_phys; + + u8 _r_b:6; + u8 configuring:1; + u8 conf_route_table:1; + + u8 _r_c; + + u8 enclosure_logical_id[8]; + + u8 _r_d[12]; +} __attribute__ ((packed)); + +struct discover_resp { + u8 _r_a[5]; + + u8 phy_id; + __be16 _r_b; + + u8 _r_d:1; + u8 attached_dev_type:3; + u8 _r_c:4; + + u8 _r_e:4; + u8 linkrate:4; + + u8 _r_f:4; + u8 iproto:3; + u8 attached_sata_host:1; + + u8 attached_sata_ps:1; + u8 _r_g:3; + u8 tproto:3; + u8 attached_sata_dev:1; + + u8 sas_addr[8]; + u8 attached_sas_addr[8]; + u8 attached_phy_id; + + u8 _r_h[7]; + + u8 pmin_linkrate:4; + u8 hmin_linkrate:4; + u8 pmax_linkrate:4; + u8 hmax_linkrate:4; + + u8 change_count; + + u8 virtual:1; + u8 _r_i:3; + u8 pptv:4; + + u8 _r_j:4; + u8 routing_attr:4; + + u8 conn_type; + u8 conn_el_index; + u8 conn_phy_link; + + u8 _r_k[8]; +} __attribute__ ((packed)); + +struct report_phy_sata_resp { + u8 _r_a[5]; + + u8 phy_id; + u8 _r_b; + + u8 _r_c:6; + u8 affil_supp:1; + u8 affil_valid:1; + + u32 _r_d; + + u8 stp_sas_addr[8]; + + struct dev_to_host_fis fis; + + u32 _r_e; + + u8 affil_stp_ini_addr[8]; + + __be32 crc; +} __attribute__ ((packed)); + +struct smp_resp { + u8 frame_type; + u8 function; + u8 result; + u8 reserved; + union { + struct report_general_resp rg; + struct discover_resp disc; + struct report_phy_sata_resp rps; + }; +} __attribute__ ((packed)); + +#else +#error "Bitfield order not defined!" 
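The duplicated little/big-endian structure definitions above exist because C leaves bitfield layout to the compiler, and the #error closes the hole where neither macro is defined. For narrow fields, explicit shift-and-mask extraction avoids the duplication entirely (the same idea scsi_to_u32() below applies to 32-bit quantities). A sketch for the IDENTIFY frame's dev_type bits, which sit in bits 6:4 of byte 0 under either layout:

#include <stdio.h>
#include <stdint.h>

/* dev_type occupies bits 6:4 of byte 0 of the IDENTIFY address frame
 * under both bitfield layouts above; plain shifts read it portably. */
static unsigned int id_frame_dev_type(const uint8_t *frame)
{
	return (frame[0] >> 4) & 0x7;
}

int main(void)
{
	uint8_t frame[1] = { 0x12 };	/* frame_type 2, dev_type 1 */

	printf("dev_type = %u\n", id_frame_dev_type(frame));
	return 0;
}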
+#endif
+
+#endif /* _SAS_H_ */
diff -uprN linux-2.6.18/include/scsi/scsi.h linux-2.6.18.ovz/include/scsi/scsi.h
--- linux-2.6.18/include/scsi/scsi.h	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/include/scsi/scsi.h	2007-06-13 06:55:08.000000000 -0400
@@ -433,4 +433,10 @@ struct scsi_lun {
 /* Used to obtain the PCI location of a device */
 #define SCSI_IOCTL_GET_PCI		0x5387
 
+/* Pull a u32 out of a SCSI message (using BE SCSI conventions) */
+static inline u32 scsi_to_u32(u8 *ptr)
+{
+	return (ptr[0]<<24) + (ptr[1]<<16) + (ptr[2]<<8) + ptr[3];
+}
+
 #endif /* _SCSI_SCSI_H */
diff -uprN linux-2.6.18/include/scsi/scsi_transport_sas.h linux-2.6.18.ovz/include/scsi/scsi_transport_sas.h
--- linux-2.6.18/include/scsi/scsi_transport_sas.h	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/include/scsi/scsi_transport_sas.h	2007-06-13 06:55:08.000000000 -0400
@@ -57,9 +57,6 @@ struct sas_phy {
 	enum sas_linkrate	maximum_linkrate_hw;
 	enum sas_linkrate	maximum_linkrate;
 
-	/* internal state */
-	unsigned int		local_attached : 1;
-
 	/* link error statistics */
 	u32			invalid_dword_count;
 	u32			running_disparity_error_count;
@@ -196,4 +193,6 @@ scsi_is_sas_expander_device(struct devic
 		rphy->identify.device_type == SAS_EDGE_EXPANDER_DEVICE;
 }
 
+#define scsi_is_sas_phy_local(phy)	scsi_is_host_device((phy)->dev.parent)
+
 #endif /* SCSI_TRANSPORT_SAS_H */
diff -uprN linux-2.6.18/include/sound/Kbuild linux-2.6.18.ovz/include/sound/Kbuild
--- linux-2.6.18/include/sound/Kbuild	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/include/sound/Kbuild	2007-06-13 06:55:07.000000000 -0400
@@ -1,2 +1,10 @@
-header-y := asound_fm.h hdsp.h hdspm.h sfnt_info.h sscape_ioctl.h
-unifdef-y := asequencer.h asound.h emu10k1.h sb16_csp.h
+header-y += asound_fm.h
+header-y += hdsp.h
+header-y += hdspm.h
+header-y += sfnt_info.h
+header-y += sscape_ioctl.h
+
+unifdef-y += asequencer.h
+unifdef-y += asound.h
+unifdef-y += emu10k1.h
+unifdef-y += sb16_csp.h
diff -uprN linux-2.6.18/include/ub/beancounter.h linux-2.6.18.ovz/include/ub/beancounter.h
--- linux-2.6.18/include/ub/beancounter.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/beancounter.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,452 @@
+/*
+ *  include/ub/beancounter.h
+ *
+ *  Copyright (C) 1999-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  Andrey Savochkin	saw@sw-soft.com
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+/*
+ * Generic ratelimiting stuff.
+ */
+
+struct ub_rate_info {
+	int burst;
+	int interval;		/* jiffy_t per event */
+	int bucket;		/* kind of leaky bucket */
+	unsigned long last;	/* last event */
+};
+
+/* Return true if rate limit permits. */
+int ub_ratelimit(struct ub_rate_info *);
+
+
+/*
+ * This magic is used to distinguish user beancounter and pages beancounter
+ * in struct page. page_ub and page_bc are placed in union and MAGIC
+ * ensures that we don't use pbc as ubc in ub_page_uncharge().
+ */
+#define UB_MAGIC		0x62756275
+
+/*
+ * Resource list.
+ */
+
+#define UB_KMEMSIZE	0	/* Unswappable kernel memory size including
+				 * struct task, page directories, etc.
+				 */
+#define UB_LOCKEDPAGES	1	/* Mlock()ed pages. */
+#define UB_PRIVVMPAGES	2	/* Total number of pages, counting potentially
+				 * private pages as private and used.
+				 */
+#define UB_SHMPAGES	3	/* IPC SHM segment size. */
+#define UB_DUMMY	4	/* Dummy resource (compatibility) */
+#define UB_NUMPROC	5	/* Number of processes.
*/ +#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */ +#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, + * checked against PRIVVMPAGES. + */ +#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. + * Only limit is used, no accounting. + */ +#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ +#define UB_NUMFLOCK 10 /* Number of file locks. */ +#define UB_NUMPTY 11 /* Number of PTYs. */ +#define UB_NUMSIGINFO 12 /* Number of siginfos. */ +#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ +#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ +#define UB_OTHERSOCKBUF 15 /* Total size of other socket + * send buffers (all buffers for PF_UNIX). + */ +#define UB_DGRAMRCVBUF 16 /* Total size of other socket + * receive buffers. + */ +#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ +#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ +#define UB_NUMFILE 19 /* Number of open files. */ + +#define UB_NUMXTENT 23 +#define UB_RESOURCES_COMPAT 24 + +/* Add new resources here */ + +#define UB_RESOURCES 24 + +#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) +#define UB_TMPFSPAGES (UB_RESOURCES + 1) +#define UB_SWAPPAGES (UB_RESOURCES + 2) +#define UB_HELDPAGES (UB_RESOURCES + 3) + +struct ubparm { + /* + * A barrier over which resource allocations are failed gracefully. + * If the amount of consumed memory is over the barrier further sbrk() + * or mmap() calls fail, the existing processes are not killed. + */ + unsigned long barrier; + /* hard resource limit */ + unsigned long limit; + /* consumed resources */ + unsigned long held; + /* maximum amount of consumed resources through the last period */ + unsigned long maxheld; + /* minimum amount of consumed resources through the last period */ + unsigned long minheld; + /* count of failed charges */ + unsigned long failcnt; +}; + +/* + * Kernel internal part. + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. 
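The ubparm structure above encodes the two-level scheme described in its comments: crossing the barrier makes further allocations fail gracefully, the limit is the hard ceiling, and failures only increment failcnt. A user-space sketch of that bookkeeping; the mapping of UB_SOFT to barrier and UB_HARD to limit is how the comments read, while the authoritative policy lives in __charge_beancounter_locked():

#include <stdio.h>

struct ubparm {
	unsigned long barrier;
	unsigned long limit;
	unsigned long held;
	unsigned long maxheld;
	unsigned long failcnt;
};

enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };

/* Soft charges stop at the barrier, hard ones at the limit, forced
 * ones always succeed; a refused charge only bumps failcnt. */
static int charge(struct ubparm *p, unsigned long val, enum ub_severity s)
{
	unsigned long ceiling = (s == UB_SOFT) ? p->barrier : p->limit;

	if (s != UB_FORCE && p->held + val > ceiling) {
		p->failcnt++;
		return -1;	/* graceful failure, nothing is killed */
	}
	p->held += val;
	if (p->held > p->maxheld)
		p->maxheld = p->held;
	return 0;
}

int main(void)
{
	struct ubparm priv = { .barrier = 100, .limit = 120 };
	int first = charge(&priv, 90, UB_SOFT);
	int second = charge(&priv, 20, UB_SOFT);

	printf("%d %d held=%lu failcnt=%lu\n",
	       first, second, priv.held, priv.failcnt);
	return 0;
}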
+ */ +#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) + + +/* + * Resource management structures + * Serialization issues: + * beancounter list management is protected via ub_hash_lock + * task pointers are set only for current task and only once + * refcount is managed atomically + * value and limit comparison and change are protected by per-ub spinlock + */ + +struct page_beancounter; +struct task_beancounter; +struct sock_beancounter; + +struct page_private { + unsigned long ubp_unused_privvmpages; + unsigned long ubp_tmpfs_respages; + unsigned long ubp_swap_pages; + unsigned long long ubp_held_pages; +}; + +struct sock_private { + unsigned long ubp_rmem_thres; + unsigned long ubp_wmem_pressure; + unsigned long ubp_maxadvmss; + unsigned long ubp_rmem_pressure; + int ubp_tw_count; +#define UB_RMEM_EXPAND 0 +#define UB_RMEM_KEEP 1 +#define UB_RMEM_SHRINK 2 + struct list_head ubp_other_socks; + struct list_head ubp_tcp_socks; + atomic_t ubp_orphan_count; +}; + +struct ub_percpu_struct { + unsigned long unmap; + unsigned long swapin; +#ifdef CONFIG_UBC_IO_ACCT + unsigned long long bytes_wrote; + unsigned long long bytes_read; + unsigned long long bytes_cancelled; +#endif +#ifdef CONFIG_UBC_DEBUG_KMEM + long pages_charged; + long vmalloc_charged; + long pbcs; +#endif + unsigned long sync; + unsigned long sync_done; + + unsigned long fsync; + unsigned long fsync_done; + + unsigned long fdsync; + unsigned long fdsync_done; + + unsigned long frsync; + unsigned long frsync_done; + + unsigned long write; + unsigned long read; + unsigned long long wchar; + unsigned long long rchar; +}; + +struct user_beancounter +{ + unsigned long ub_magic; + atomic_t ub_refcount; + struct list_head ub_list; + struct hlist_node ub_hash; + + union { + struct rcu_head rcu; + struct execute_work cleanup; + }; + + spinlock_t ub_lock; + uid_t ub_uid; + + struct ub_rate_info ub_limit_rl; + int ub_oom_noproc; + + struct page_private ppriv; +#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages +#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages +#define ub_swap_pages ppriv.ubp_swap_pages +#define ub_held_pages ppriv.ubp_held_pages + struct sock_private spriv; +#define ub_rmem_thres spriv.ubp_rmem_thres +#define ub_maxadvmss spriv.ubp_maxadvmss +#define ub_rmem_pressure spriv.ubp_rmem_pressure +#define ub_wmem_pressure spriv.ubp_wmem_pressure +#define ub_tcp_sk_list spriv.ubp_tcp_socks +#define ub_other_sk_list spriv.ubp_other_socks +#define ub_orphan_count spriv.ubp_orphan_count +#define ub_tw_count spriv.ubp_tw_count + struct ub_iopriv iopriv; + + struct user_beancounter *parent; + void *private_data; + unsigned long ub_aflags; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + + /* resources statistic and settings */ + struct ubparm ub_parms[UB_RESOURCES]; + /* resources statistic for last interval */ + struct ubparm ub_store[UB_RESOURCES]; + + struct ub_percpu_struct *ub_percpu; +#ifdef CONFIG_UBC_IO_ACCT + /* these are protected with pb_lock */ + unsigned long long bytes_wrote; + unsigned long long bytes_dirtied; + unsigned long long bytes_dirty_missed; + unsigned long io_pb_held; +#endif +#ifdef CONFIG_UBC_DEBUG_KMEM + struct list_head ub_cclist; +#endif +}; + +enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; + +#define UB_AFLAG_NOTIF_PAGEIN 0 + +static inline +struct user_beancounter *top_beancounter(struct user_beancounter *ub) +{ + while (ub->parent != NULL) + ub = ub->parent; + return ub; +} + +static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) +{ + 
return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; +} + +static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) +{ + return (ub->ub_parms[resource].held > + ((ub->ub_parms[resource].barrier) >> 1)); +} + +static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3); +} + +static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024; +} + +#ifndef CONFIG_USER_RESOURCE + +#define ub_percpu_add(ub, f, v) do { } while (0) +#define ub_percpu_sub(ub, f, v) do { } while (0) +#define ub_percpu_inc(ub, f) do { } while (0) +#define ub_percpu_dec(ub, f) do { } while (0) + +#define mm_ub(mm) (NULL) + +extern inline struct user_beancounter *get_beancounter_byuid + (uid_t uid, int create) { return NULL; } +extern inline struct user_beancounter *get_beancounter + (struct user_beancounter *ub) { return NULL; } +extern inline void put_beancounter(struct user_beancounter *ub) { } + +static inline void ub_init_late(void) { }; +static inline void ub_init_early(void) { }; + +static inline int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, + enum ub_severity strict) { return 0; } +static inline void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) { } + +#else /* CONFIG_USER_RESOURCE */ + +#define ub_percpu_add(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) + +#define ub_percpu_sub(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1) + +#define mm_ub(mm) ((mm)->mm_ub) +/* + * Charge/uncharge operations + */ + +extern int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict); + +extern void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val); + +extern void __put_beancounter(struct user_beancounter *ub); + +extern void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held); + +extern const char *ub_rnames[]; +/* + * Put a beancounter reference + */ + +static inline void put_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return; + + /* FIXME - optimize not to disable interrupts and make call */ + __put_beancounter(ub); +} + +/* fast put, refcount can't reach zero */ +static inline void __put_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_sub(n, &ub->ub_refcount); +} + +static inline void put_beancounter_batch(struct user_beancounter *ub, int n) +{ + if (n > 1) + __put_beancounter_batch(ub, n - 1); + __put_beancounter(ub); +} + +/* + * Create a new beancounter reference + */ +extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); + +static inline +struct user_beancounter *get_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return NULL; + + atomic_inc(&ub->ub_refcount); + return ub; +} + +static inline +struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub) +{ + return atomic_inc_not_zero(&ub->ub_refcount) ? 
ub : NULL;
+}
+
+static inline void get_beancounter_batch(struct user_beancounter *ub, int n)
+{
+	atomic_add(n, &ub->ub_refcount);
+}
+
+extern struct user_beancounter *get_subbeancounter_byid(
+		struct user_beancounter *,
+		int id, int create);
+
+extern void ub_init_late(void);
+extern void ub_init_early(void);
+
+extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size);
+
+/*
+ * Resource charging
+ * Change user's account and compare against limits
+ */
+
+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
+{
+	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+int charge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val, enum ub_severity strict);
+void uncharge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val);
+void __charge_beancounter_notop(struct user_beancounter *ub, int resource,
+		unsigned long val);
+void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource,
+		unsigned long val);
+
+static inline void charge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (ub->parent != NULL)
+		__charge_beancounter_notop(ub, resource, val);
+}
+
+static inline void uncharge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (ub->parent != NULL)
+		__uncharge_beancounter_notop(ub, resource, val);
+}
+
+#endif /* CONFIG_USER_RESOURCE */
+
+#ifndef CONFIG_USER_RSS_ACCOUNTING
+static inline void ub_init_pbc(void) { }
+#else
+extern void ub_init_pbc(void);
+#endif
+#endif /* __KERNEL__ */
+#endif /* _LINUX_BEANCOUNTER_H */
diff -uprN linux-2.6.18/include/ub/io_acct.h linux-2.6.18.ovz/include/ub/io_acct.h
--- linux-2.6.18/include/ub/io_acct.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/io_acct.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,113 @@
+/*
+ *  include/ub/io_acct.h
+ *
+ *  Copyright (C) 2006 SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ *  Pavel Emelianov
+ *
+ */
+
+#ifndef __UB_IO_ACCT_H_
+#define __UB_IO_ACCT_H_
+
+#ifdef CONFIG_UBC_IO_ACCT
+#include
+#include
+
+#define page_iopb(page) ({				\
+		struct page_beancounter *pb;		\
+		pb = page_pbc(page);			\
+		rmb();					\
+		pb;					\
+	})
+
+/*
+ * IO ub is required in task context only, so if exec_ub is set
+ * to NULL this means that the caller doesn't need to charge some
+ * resources.  Nevertheless IO activity must be accounted, so we
+ * account it to current's task beancounter.
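get_beancounter_rcu() above may take a reference only while ub_refcount is still nonzero, because a counter that has already hit zero belongs to a beancounter being freed under RCU. The same compare-and-swap loop in portable C11 atomics:

#include <stdio.h>
#include <stdatomic.h>

/* atomic_inc_not_zero(): succeed only if the refcount is still live. */
static int inc_not_zero(atomic_int *cnt)
{
	int v = atomic_load(cnt);

	while (v != 0)
		if (atomic_compare_exchange_weak(cnt, &v, v + 1))
			return 1;	/* reference taken */
	return 0;			/* object already dying */
}

int main(void)
{
	atomic_int ref = 1;

	printf("%d\n", inc_not_zero(&ref));	/* 1: count is now 2 */
	atomic_store(&ref, 0);
	printf("%d\n", inc_not_zero(&ref));	/* 0: lookup must fail */
	return 0;
}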
+ */ + +static inline struct user_beancounter *get_io_ub(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (unlikely(ub == NULL)) + ub = get_task_ub(current); + + return top_beancounter(ub); +} + +extern struct page_beancounter **page_pblist(struct page *); + +extern void ub_io_save_context(struct page *, size_t); +extern void ub_io_release_context(struct page *pg, size_t size); + +#define PAGE_IO_MARK (0x1UL) + +static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) +{ + if (!((unsigned long)pb & PAGE_IO_MARK)) + return NULL; + + return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK); +} + +static inline void ub_io_account_read(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_read, bytes); +} + +static inline void ub_io_account_write(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_wrote, bytes); +} + +static inline void ub_io_account_dirty(struct page *page, size_t bytes) +{ + ub_io_save_context(page, bytes); +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_cancelled, bytes); +} + +void ub_init_io(kmem_cache_t *); +#else /* UBC_IO_ACCT */ +#define page_iopb(page) (NULL) +#define page_pblist(page) (&page_pbc(page)) + +static inline void ub_io_release_context(struct page *pg, size_t bytes) +{ +} + +static inline void ub_io_account_dirty(struct page *p, size_t bytes) +{ +} + +static inline void ub_io_account_read(size_t bytes) +{ +} + +static inline void ub_io_account_write(size_t bytes) +{ +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ +} + +static inline void ub_init_io(kmem_cache_t *pb_cachep) { }; +#endif + +#ifdef CONFIG_UBC_DEBUG_IO +extern void ub_io_release_debug(struct page *pg); +#else +#define ub_io_release_debug(pg) do { } while (0) +#endif +#endif diff -uprN linux-2.6.18/include/ub/io_prio.h linux-2.6.18.ovz/include/ub/io_prio.h --- linux-2.6.18/include/ub/io_prio.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/io_prio.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,76 @@ +/* + * include/ub/io_prio.h + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
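The ub_io_account_*() helpers above route byte counts through ub_percpu_add(), so each CPU bumps a private slot and totals are only summed when reported. A single-threaded analogue with an explicit CPU index standing in for the get_cpu()/put_cpu() pinning the kernel performs:

#include <stdio.h>

#define NR_CPUS 4

struct ub_percpu {
	unsigned long long bytes_read;
	unsigned long long bytes_wrote;
};

static struct ub_percpu pcpu[NR_CPUS];

/* Each CPU touches only its own slot, so no shared lock is needed. */
static void io_account_read(int cpu, unsigned long bytes)
{
	pcpu[cpu].bytes_read += bytes;
}

static unsigned long long total_read(void)
{
	unsigned long long sum = 0;
	int i;

	for (i = 0; i < NR_CPUS; i++)	/* readers sum on demand */
		sum += pcpu[i].bytes_read;
	return sum;
}

int main(void)
{
	io_account_read(0, 4096);
	io_account_read(3, 512);
	printf("total bytes_read = %llu\n", total_read());
	return 0;
}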
+ * + * Vasily Tarasov + * + */ + +#ifndef _UB_IO_PRIO_H +#define _UB_IO_PRIO_H + +#include +#include +#include + +#define UB_IOPRIO_MIN 0 +#define UB_IOPRIO_MAX IOPRIO_BE_NR +#define UB_IOPRIO_BASE 4 + +struct ub_iopriv { + struct list_head cfq_bc_head; + rwlock_t cfq_bc_list_lock; + + unsigned int ioprio; +}; + +#ifdef CONFIG_UBC_IO_PRIO +extern void bc_init_ioprio(struct ub_iopriv *); +extern void bc_fini_ioprio(struct ub_iopriv *); +extern struct cfq_bc_data * bc_findcreate_cfq_bc(struct ub_iopriv *, + struct cfq_data *, gfp_t gfp_mask); +extern void bc_cfq_exit_queue(struct cfq_data *); +extern int bc_expired(struct cfq_data *); +extern void bc_schedule_active(struct cfq_data *); +extern void bc_inc_rqnum(struct cfq_queue *); +extern void bc_dec_rqnum(struct cfq_queue *); +extern unsigned long bc_set_ioprio(int, int); +extern struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd); +extern struct user_beancounter *bc_io_switch_context(struct page *); +extern void bc_io_restore_context(struct user_beancounter *); +#else +static inline void bc_init_ioprio(struct ub_iopriv *iopriv) { ; } +static inline void bc_fini_ioprio(struct ub_iopriv *iopriv) { ; } +static inline struct cfq_bc_data * +bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t mask) +{ + return &cfqd->cfq_bc; +} +static inline void bc_cfq_exit_queue(struct cfq_data *cfqd) { ; } +static inline int bc_expired(struct cfq_data *cfqd) { return 0; } +static inline void bc_schedule_active(struct cfq_data *cfqd) +{ + cfqd->active_cfq_bc = &cfqd->cfq_bc; +} +static inline void bc_inc_rqnum(struct cfq_queue *cfqq) { ; } +static inline void bc_dec_rqnum(struct cfq_queue *cfqq) { ; } +static inline unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + return -EINVAL; +} +static inline struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd) +{ + return &cfqd->cfq_bc; +} +static inline struct user_beancounter * +bc_io_switch_context(struct page *page) { return NULL; } +static inline void bc_io_restore_context(struct user_beancounter *ub) { ; } +#endif /* CONFIG_UBC_IO_PRIO */ +#endif /* _UB_IO_PRIO_H */ diff -uprN linux-2.6.18/include/ub/proc.h linux-2.6.18.ovz/include/ub/proc.h --- linux-2.6.18/include/ub/proc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/proc.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,40 @@ +/* + * include/ub/proc.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
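UB_IOPRIO_MIN, UB_IOPRIO_MAX and UB_IOPRIO_BASE above bound a beancounter's CFQ priority to the best-effort levels; IOPRIO_BE_NR is 8 in this kernel. A sketch of the range check bc_set_ioprio() presumably performs; only the constants come from the header, the validation itself is an assumption:

#include <stdio.h>

#define IOPRIO_BE_NR	8	/* best-effort levels in 2.6.18 */
#define UB_IOPRIO_MIN	0
#define UB_IOPRIO_MAX	IOPRIO_BE_NR
#define UB_IOPRIO_BASE	4	/* default priority */

/* Assumed validation: accept only [UB_IOPRIO_MIN, UB_IOPRIO_MAX). */
static int ub_ioprio_valid(int ioprio)
{
	return ioprio >= UB_IOPRIO_MIN && ioprio < UB_IOPRIO_MAX;
}

int main(void)
{
	printf("base=%d valid=%d\n", UB_IOPRIO_BASE,
	       ub_ioprio_valid(UB_IOPRIO_BASE));
	printf("8 valid=%d\n", ub_ioprio_valid(8));	/* out of range */
	return 0;
}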
+ *
+ */
+
+#ifndef __UB_PROC_H_
+#define __UB_PROC_H_
+
+#include
+
+struct bc_proc_entry {
+	char *name;
+	union {
+		int (*show)(struct seq_file *, void *);
+		struct file_operations *fops;
+	} u;
+	struct bc_proc_entry *next;
+	int cookie;
+};
+
+struct user_beancounter;
+
+void bc_register_proc_entry(struct bc_proc_entry *);
+void bc_register_proc_root_entry(struct bc_proc_entry *);
+
+static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
+{
+	return (struct user_beancounter *)(f->private);
+}
+
+extern const char *bc_proc_lu_fmt;
+extern const char *bc_proc_lu_lfmt;
+extern const char *bc_proc_llu_fmt;
+extern const char *bc_proc_lu_lu_fmt;
+#endif
diff -uprN linux-2.6.18/include/ub/ub_dcache.h linux-2.6.18.ovz/include/ub/ub_dcache.h
--- linux-2.6.18/include/ub/ub_dcache.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/ub_dcache.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,49 @@
+/*
+ * include/ub/ub_dcache.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_DCACHE_H_
+#define __UB_DCACHE_H_
+
+#include
+
+/*
+ * UB_DCACHESIZE accounting
+ */
+
+struct dentry_beancounter
+{
+	/*
+	 * d_inuse =
+	 *         <number of external refs> +
+	 *         <number of 'used' childs>
+	 *
+	 * d_inuse == -1 means that dentry is unused
+	 * state change -1 => 0 causes charge
+	 * state change 0 => -1 causes uncharge
+	 */
+	atomic_t d_inuse;
+	/* charged size, including name length if name is not inline */
+	unsigned long d_ubsize;
+	struct user_beancounter *d_ub;
+};
+
+#ifdef CONFIG_USER_RESOURCE
+#define ub_dget_testone(d)	(atomic_inc_and_test(&(d)->dentry_bc.d_inuse))
+#define ub_dput_testzero(d)	(atomic_add_negative(-1, &(d)->dentry_bc.d_inuse))
+#define INUSE_INIT		0
+
+extern int ub_dentry_on;
+extern void ub_dentry_checkup(void);
+#else
+#define ub_dget_testone(d)	(0)
+#define ub_dput_testzero(d)	(0)
+#define ub_dentry_checkup()	do { } while (0)
+#endif
+#endif
diff -uprN linux-2.6.18/include/ub/ub_dcache_op.h linux-2.6.18.ovz/include/ub/ub_dcache_op.h
--- linux-2.6.18/include/ub/ub_dcache_op.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/ub_dcache_op.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,100 @@
+/*
+ * include/ub/ub_dcache_op.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
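The d_inuse protocol above (start at -1, charge on the -1 to 0 transition, uncharge on 0 to -1) is exactly what the two kernel atomics deliver: atomic_inc_and_test() is true only when the increment lands on zero, and atomic_add_negative(-1, ...) is true only when the decrement drops below zero. A user-space rendering with C11 atomics, assuming a single dentry for brevity:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int d_inuse = -1;	/* -1 == dentry unused, as in the patch */

static void dget_like(void)
{
	/* inc-and-test: fires only on the -1 -> 0 transition */
	if (atomic_fetch_add(&d_inuse, 1) + 1 == 0)
		printf("charge dentry\n");
}

static void dput_like(void)
{
	/* add-negative: fires only on the 0 -> -1 transition */
	if (atomic_fetch_add(&d_inuse, -1) - 1 < 0)
		printf("uncharge dentry\n");
}

int main(void)
{
	dget_like();	/* -1 -> 0 : charge   */
	dget_like();	/*  0 -> 1 : no-op    */
	dput_like();	/*  1 -> 0 : no-op    */
	dput_like();	/*  0 -> -1: uncharge */
	return 0;
}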
+ * + */ + +#ifndef __UB_DCACHE_OP_H_ +#define __UB_DCACHE_OP_H_ + +struct dentry; + +#ifdef CONFIG_USER_RESOURCE + +#include +#include +#include + +extern int ub_dentry_alloc_barrier; +extern spinlock_t dcache_lock; + +static inline int ub_dentry_alloc(struct dentry *d) +{ + extern int __ub_dentry_alloc(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_alloc(d); +} + +static inline void ub_dentry_alloc_start(void) +{ + extern void __ub_dentry_alloc_start(void); + + if (ub_dentry_alloc_barrier) + __ub_dentry_alloc_start(); +} + +static inline void ub_dentry_alloc_end(void) +{ + extern void __ub_dentry_alloc_end(void); + + if (current->task_bc.dentry_alloc) + __ub_dentry_alloc_end(); +} + +static inline int ub_dentry_charge(struct dentry *d) +{ + extern int __ub_dentry_charge(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_charge(d); +} + +static inline void ub_dentry_charge_nofail(struct dentry *d) +{ + extern void __ub_dentry_charge_nofail(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_charge_nofail(d); +} + +static inline void ub_dentry_uncharge_locked(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_uncharge(d); +} + +static inline void ub_dentry_uncharge(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + spin_lock(&dcache_lock); + __ub_dentry_uncharge(d); + spin_unlock(&dcache_lock); +} + +#else /* CONFIG_USER_RESOURCE */ + +static inline int ub_dentry_alloc(struct dentry *d) { return 0; } +static inline void ub_dentry_alloc_start(void) { } +static inline void ub_dentry_alloc_end(void) { } +static inline int ub_dentry_charge(struct dentry *d) { return 0; } +static inline void ub_dentry_charge_nofail(struct dentry *d) { } +static inline void ub_dentry_uncharge_locked(struct dentry *d) { } +static inline void ub_dentry_uncharge(struct dentry *d) { } + +#endif /* CONFIG_USER_RESOURCE */ + +#endif /* __UB_DCACHE_OP_H_ */ diff -uprN linux-2.6.18/include/ub/ub_debug.h linux-2.6.18.ovz/include/ub/ub_debug.h --- linux-2.6.18/include/ub/ub_debug.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_debug.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,112 @@ +/* + * include/ub/ub_debug.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_DEBUG_H_ +#define __UB_DEBUG_H_ + +/* + * general debugging + */ + +#define UBD_ALLOC 0x1 +#define UBD_CHARGE 0x2 +#define UBD_LIMIT 0x4 +#define UBD_TRACE 0x8 + +/* + * ub_net debugging + */ + +#define UBD_NET_SOCKET 0x10 +#define UBD_NET_SLEEP 0x20 +#define UBD_NET_SEND 0x40 +#define UBD_NET_RECV 0x80 + +/* + * Main routines + */ + +#define UB_DEBUG (0) +#define DEBUG_RESOURCE (0ULL) + +#define ub_dbg_cond(__cond, __str, args...) \ + do { \ + if ((__cond) != 0) \ + printk(__str, ##args); \ + } while(0) + +#define ub_debug(__section, __str, args...) \ + ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) + +#define ub_debug_resource(__resource, __str, args...) 
\
+		ub_dbg_cond((UB_DEBUG & UBD_CHARGE) &&			\
+				(DEBUG_RESOURCE & (1 << (__resource))),	\
+				__str, ##args)
+
+#if UB_DEBUG & UBD_TRACE
+#define ub_debug_trace(__cond, __b, __r)			\
+	do {							\
+		static struct ub_rate_info ri = { __b, __r };	\
+		if ((__cond) != 0 && ub_ratelimit(&ri))		\
+			dump_stack();				\
+	} while(0)
+#else
+#define ub_debug_trace(__cond, __burst, __rate)
+#endif
+
+#include
+
+#ifdef CONFIG_UBC_DEBUG_KMEM
+#include
+#include
+
+struct user_beancounter;
+struct ub_cache_counter {
+	struct list_head ulist;
+	struct ub_cache_counter *next;
+	struct user_beancounter *ub;
+	kmem_cache_t *cachep;
+	unsigned long counter;
+};
+
+extern spinlock_t cc_lock;
+extern void init_cache_counters(void);
+extern void ub_free_counters(struct user_beancounter *);
+extern void ub_kmemcache_free(kmem_cache_t *cachep);
+
+struct vm_struct;
+#define inc_vmalloc_charged(vm, flags)	do {			\
+		if (flags & __GFP_UBC)				\
+			ub_percpu_add(get_exec_ub(), vmalloc_charged,	\
+					vm->nr_pages);		\
+	} while (0)
+#define dec_vmalloc_charged(vm)		do {			\
+		struct user_beancounter *ub;			\
+		ub = page_ub(vm->pages[0]);			\
+		if (ub != NULL)					\
+			ub_percpu_sub(ub, vmalloc_charged,	\
+					vm->nr_pages);		\
+	} while (0)
+
+#define inc_pbc_count(ub)	ub_percpu_inc(ub, pbcs)
+#define dec_pbc_count(ub)	ub_percpu_dec(ub, pbcs)
+#else
+#define init_cache_counters()		do { } while (0)
+#define inc_vmalloc_charged(vm, f)	do { } while (0)
+#define dec_vmalloc_charged(vm)		do { } while (0)
+
+#define inc_pbc_count(ub)	do { } while (0)
+#define dec_pbc_count(ub)	do { } while (0)
+
+#define ub_free_counters(ub)		do { } while (0)
+#define ub_kmemcache_free(cachep)	do { } while (0)
+#endif
+
+#endif
diff -uprN linux-2.6.18/include/ub/ub_decl.h linux-2.6.18.ovz/include/ub/ub_decl.h
--- linux-2.6.18/include/ub/ub_decl.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/ub_decl.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,42 @@
+/*
+ * include/ub/ub_decl.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_DECL_H_
+#define __UB_DECL_H_
+
+#ifdef __KERNEL__
+#include
+
+/*
+ * Naming convention:
+ * ub_<resource name>_<operation>
+ */
+
+#ifdef CONFIG_USER_RESOURCE
+
+#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
+#define UB_DECLARE_VOID_FUNC(decl)	extern void decl;
+
+#else /* CONFIG_USER_RESOURCE */
+
+#define UB_DECLARE_FUNC(ret_type, decl)		\
+	static inline ret_type decl		\
+	{					\
+		return (ret_type)0;		\
+	}
+#define UB_DECLARE_VOID_FUNC(decl)		\
+	static inline void decl			\
+	{					\
+	}
+
+#endif /* CONFIG_USER_RESOURCE */
+#endif
+
+#endif
diff -uprN linux-2.6.18/include/ub/ub_hash.h linux-2.6.18.ovz/include/ub/ub_hash.h
--- linux-2.6.18/include/ub/ub_hash.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/include/ub/ub_hash.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,36 @@
+/*
+ * include/ub/ub_hash.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
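The UB_DECLARE_FUNC/UB_DECLARE_VOID_FUNC pair above lets a single line in each header serve both configurations: an extern prototype when beancounters are compiled in, an empty inline returning zero of the right type when they are not (which is why PRIVVM_NO_CHARGE in ub_vmpages.h is defined as 0). Expanding a hypothetical declaration both ways:

/* the line
 *
 *	UB_DECLARE_FUNC(int, ub_foo_charge(struct foo *f, int amount))
 *
 * becomes, with CONFIG_USER_RESOURCE=y:
 *
 *	extern int ub_foo_charge(struct foo *f, int amount);
 *
 * and with the option off:
 *
 *	static inline int ub_foo_charge(struct foo *f, int amount)
 *	{
 *		return (int)0;
 *	}
 *
 * so every call site compiles unchanged and the stub folds to nothing.
 */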
+ * + */ + +#ifndef _LINUX_UBHASH_H +#define _LINUX_UBHASH_H + +#ifdef __KERNEL__ + +#define UB_HASH_SIZE 256 + +extern struct hlist_head ub_hash[]; +extern spinlock_t ub_hash_lock; +extern struct list_head ub_list_head; + +#ifdef CONFIG_USER_RESOURCE + +/* + * Iterate over beancounters + * @__ubp - beancounter ptr + * Can use break :) + */ +#define for_each_beancounter(__ubp) \ + list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) \ + +#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash) + +#endif /* CONFIG_USER_RESOURCE */ +#endif /* __KERNEL__ */ +#endif /* _LINUX_UBHASH_H */ diff -uprN linux-2.6.18/include/ub/ub_mem.h linux-2.6.18.ovz/include/ub/ub_mem.h --- linux-2.6.18/include/ub/ub_mem.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_mem.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,78 @@ +/* + * include/ub/ub_mem.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_SLAB_H_ +#define __UB_SLAB_H_ + +#include +#include +#include +#include + +/* + * UB_KMEMSIZE accounting + */ + +#ifdef CONFIG_UBC_DEBUG_ITEMS +#define CHARGE_ORDER(__o) (1 << (__o)) +#define CHARGE_SIZE(__s) 1 +#else +#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) +#define CHARGE_SIZE(__s) (__s) +#endif + +#define page_ub(__page) ((__page)->bc.page_ub) + +struct mm_struct; +struct page; +struct kmem_cache; + +UB_DECLARE_FUNC(struct user_beancounter *, slab_ub(void *obj)) +UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) +UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) + +UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask)) +UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) +UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep, + void *objp, gfp_t flags)) +UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj)) + +#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ + (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ + sizeof(void *)))) + +#ifdef CONFIG_USER_RESOURCE +extern struct user_beancounter *ub_select_worst(long *); + +/* mm/slab.c needed stuff */ +#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) +#define UB_EXTRA(flags) (flags & SLAB_UBC ? sizeof(void *) : 0) +#define set_cache_objuse(cachep) do { \ + (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ + (cachep)->num - 1) / (cachep)->num; \ + if (!OFF_SLAB(cachep)) \ + break; \ + (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ + (cachep)->num - 1) / (cachep)->num; \ + } while (0) +#define init_slab_ubps(cachep, slabp) do { \ + if (!((cachep)->flags & SLAB_UBC)) \ + break; \ + memset(slab_ubcs(cachep, slabp), 0, \ + (cachep)->num * sizeof(void *)); \ + } while (0) +#define kmem_obj_memusage(o) (virt_to_cache(o)->objuse) +#else +#define UB_ALIGN(flags) 1 +#define UB_EXTRA(flags) 0 +#define set_cache_objuse(c) do { } while (0) +#define init_slab_ubps(c, s) do { } while (0) +#endif +#endif /* __UB_SLAB_H_ */ diff -uprN linux-2.6.18/include/ub/ub_misc.h linux-2.6.18.ovz/include/ub/ub_misc.h --- linux-2.6.18/include/ub/ub_misc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_misc.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,55 @@ +/* + * include/ub/ub_misc.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
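set_cache_objuse() above prices one object of a slab cache: the slab's pages divided by the number of objects, rounded up, plus, for caches whose management structure lives off-slab, a rounded-up share of that structure's own objuse. The same arithmetic as a plain function (field names follow the macro; PAGE_SIZE fixed at 4096 here for illustration):

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* illustrative; arch-dependent in the kernel */

/* ceil(a / b), the rounding used throughout the macro */
static unsigned long div_up(unsigned long a, unsigned long b)
{
	return (a + b - 1) / b;
}

/* per-object charge: slab pages split across num objects, rounded up,
 * plus the off-slab descriptor's own objuse split the same way */
static unsigned long objuse(unsigned gfporder, unsigned num,
			    int off_slab, unsigned long mgmt_objuse)
{
	unsigned long u = div_up(PAGE_SIZE << gfporder, num);

	if (off_slab)
		u += div_up(mgmt_objuse, num);
	return u;
}

int main(void)
{
	/* e.g. an order-1 slab (8192 bytes) holding 60 on-slab objects */
	printf("%lu bytes charged per object\n", objuse(1, 60, 0, 0));
	return 0;	/* prints 137 */
}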
+ * + */ + +#ifndef __UB_MISC_H_ +#define __UB_MISC_H_ + +#include + +struct tty_struct; +struct file; +struct file_lock; +struct sigqueue; + +UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) +UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) +UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) +UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) +UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, + struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) +UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, + struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task)) +UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) +UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) + +#ifdef CONFIG_USER_RESOURCE +#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) +#define unset_flock_charged(fl) do { \ + WARN_ON((fl)->fl_charged == 0); \ + (fl)->fl_charged = 0; \ + } while (0) +#define set_mm_ub(mm, tsk) do { \ + (mm)->mm_ub = get_beancounter(tsk ? \ + tsk->task_bc.task_ub : get_exec_ub()); \ + } while (0) +#define put_mm_ub(mm) do { \ + put_beancounter((mm)->mm_ub); \ + (mm)->mm_ub = NULL; \ + } while (0) +#else +#define set_flock_charged(fl) do { } while (0) +#define unset_flock_charged(fl) do { } while (0) +#define set_mm_ub(mm, tsk) do { } while (0) +#define put_mm_ub(mm) do { } while (0) +#endif +#endif diff -uprN linux-2.6.18/include/ub/ub_net.h linux-2.6.18.ovz/include/ub/ub_net.h --- linux-2.6.18/include/ub/ub_net.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_net.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,215 @@ +/* + * include/ub/ub_net.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_NET_H_ +#define __UB_NET_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include +#include + +#define bid2sid(__bufid) \ + ((__bufid) == UB_TCPSNDBUF ? 
UB_NUMTCPSOCK : UB_NUMOTHERSOCK) + +#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ + ~(SMP_CACHE_BYTES-1))) +#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) + +static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask) +{ +#ifdef CONFIG_USER_RESOURCE + memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); +#endif + return 0; +} + +static inline void ub_skb_free_bc(struct sk_buff *skb) +{ +} + +#define IS_TCP_SOCK(__family, __type) \ + (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) + +/* number of sockets */ +UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) +UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) +UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) + +/* management of queue for send space */ +UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) + +/* send space */ +UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_unchargesend(struct sock *sk, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) + +UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) + +/* receive space */ +UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) + +/* skb destructor */ +UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) + +static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); +} + +static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, + unsigned long size)) + +static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize)) + +static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, + unsigned long ressize) +{ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); +} + +static inline void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); +} + +static inline void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); +} + +static inline int ub_tcpsndbuf_charge(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_HARD); +} + +static inline int ub_tcpsndbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_FORCE); +} + +static inline int 
ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT); +} + +static inline int ub_tcprcvbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE); +} + +/* Charge size */ +static inline unsigned long skb_charge_datalen(unsigned long chargesize) +{ +#ifdef CONFIG_USER_RESOURCE + unsigned long slabsize; + + chargesize -= sizeof(struct sk_buff); + slabsize = 64; + do { + slabsize <<= 1; + } while (slabsize <= chargesize); + + slabsize >>= 1; + return (slabsize - sizeof(struct skb_shared_info)) & + ~(SMP_CACHE_BYTES-1); +#else + return 0; +#endif +} + +static inline unsigned long skb_charge_size_gen(unsigned long size) +{ +#ifdef CONFIG_USER_RESOURCE + unsigned int slabsize; + + size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); + slabsize = 32; /* min size is 64 because of skb_shared_info */ + do { + slabsize <<= 1; + } while (slabsize < size); + + return slabsize + sizeof(struct sk_buff); +#else + return 0; +#endif + +} + +static inline unsigned long skb_charge_size_const(unsigned long size) +{ +#ifdef CONFIG_USER_RESOURCE + unsigned int ret; + if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) + ret = 64 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) + ret = 128 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) + ret = 256 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) + ret = 512 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) + ret = 1024 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) + ret = 2048 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) + ret = 4096 + sizeof(struct sk_buff); + else + ret = skb_charge_size_gen(size); + return ret; +#else + return 0; +#endif +} + + +#define skb_charge_size(__size) \ + (__builtin_constant_p(__size) ? 
\ + skb_charge_size_const(__size) : \ + skb_charge_size_gen(__size)) + +UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) +UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, + struct sock *sk, unsigned long size, int res)) + +#endif diff -uprN linux-2.6.18/include/ub/ub_oom.h linux-2.6.18.ovz/include/ub/ub_oom.h --- linux-2.6.18/include/ub/ub_oom.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_oom.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,26 @@ +#include +#include + +UB_DECLARE_FUNC(int, ub_oom_lock(void)) +UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) +UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_unlock(void)) +UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk)) +UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub, + struct task_struct *tsk)) + +#ifdef CONFIG_USER_RESOURCE +extern int oom_generation; +extern int oom_kill_counter; +#define ub_oom_start() do { \ + current->task_bc.oom_generation = oom_generation; \ + } while (0) +#define ub_oom_task_killed(p) do { \ + oom_kill_counter++; \ + wake_up_process(p); \ + } while (0) +#else +#define ub_oom_start() do { } while (0) +#define ub_oom_task_killed(p) do { } while (0) +#endif diff -uprN linux-2.6.18/include/ub/ub_orphan.h linux-2.6.18.ovz/include/ub/ub_orphan.h --- linux-2.6.18/include/ub/ub_orphan.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_orphan.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,106 @@ +/* + * include/ub/ub_orphan.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
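The charge-size helpers above bill an skb at what the slab allocator will actually hand out: data size aligned, plus skb_shared_info, rounded up to the next power-of-two slab (64 bytes minimum), plus the sk_buff header itself; skb_charge_size() picks the unrolled _const variant so the computation folds away for compile-time sizes. A standalone version of the generic path (the two overhead constants and ALIGN16 are stand-ins for the real struct sizes and SKB_DATA_ALIGN):

#include <assert.h>
#include <stdio.h>

#define SKB_OVERHEAD	192UL	/* stand-in for sizeof(struct sk_buff) */
#define SHINFO_OVERHEAD	64UL	/* stand-in for sizeof(struct skb_shared_info) */
#define ALIGN16(x)	(((x) + 15UL) & ~15UL)	/* stand-in SKB_DATA_ALIGN */

static unsigned long charge_size_gen(unsigned long size)
{
	unsigned long slabsize = 32;

	size = ALIGN16(size) + SHINFO_OVERHEAD;
	do {
		slabsize <<= 1;	/* first power of two >= size, min 64 */
	} while (slabsize < size);

	return slabsize + SKB_OVERHEAD;
}

int main(void)
{
	/* 100 data bytes -> 112 aligned + 64 shinfo = 176 -> 256-byte slab */
	assert(charge_size_gen(100) == 256 + SKB_OVERHEAD);
	printf("charge for 100 bytes: %lu\n", charge_size_gen(100));
	return 0;
}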
+ * + */ + +#ifndef __UB_ORPHAN_H_ +#define __UB_ORPHAN_H_ + +#include + +#include "ub/beancounter.h" +#include "ub/ub_net.h" + + +static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) +{ +#ifdef CONFIG_USER_RESOURCE + if (sock_has_ubc(sk)) + return &sock_bc(sk)->ub->ub_orphan_count; +#endif + return sk->sk_prot->orphan_count; +} + +static inline void ub_inc_orphan_count(struct sock *sk) +{ + atomic_inc(__ub_get_orphan_count_ptr(sk)); +} + +static inline void ub_dec_orphan_count(struct sock *sk) +{ + atomic_dec(__ub_get_orphan_count_ptr(sk)); +} + +static inline int ub_get_orphan_count(struct sock *sk) +{ + return atomic_read(__ub_get_orphan_count_ptr(sk)); +} + +extern int __ub_too_many_orphans(struct sock *sk, int count); +static inline int ub_too_many_orphans(struct sock *sk, int count) +{ +#ifdef CONFIG_USER_RESOURCE + if (__ub_too_many_orphans(sk, count)) + return 1; +#endif + return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); +} + +#include +#include + +struct inet_timewait_sock; + +static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec) +{ +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *ub; + + ub = slab_ub(tw); + if (ub != NULL) + ub->ub_tw_count += incdec; +#endif +} + +static inline int __ub_timewait_check(struct sock *sk) +{ +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *ub; + unsigned long mem_max, mem; + int tw_count; + + ub = sock_bc(sk)->ub; + if (ub == NULL) + return 1; + + tw_count = ub->ub_tw_count; + mem_max = sysctl_tcp_max_tw_kmem_fraction * + ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1); + mem = tw_count * sk->sk_prot_creator->twsk_prot->twsk_slab->objuse; + return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max; +#else + return 1; +#endif +} + +#define ub_timewait_inc(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, 1); \ + } while (0) + +#define ub_timewait_dec(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, -1); \ + } while (0) + +#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \ + __ub_timewait_check(sk)) + +#endif diff -uprN linux-2.6.18/include/ub/ub_page.h linux-2.6.18.ovz/include/ub/ub_page.h --- linux-2.6.18/include/ub/ub_page.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_page.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,59 @@ +/* + * include/ub/ub_page.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
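ub_too_many_orphans() above layers a per-beancounter verdict over the stock global test, and the supporting helpers reduce to one decision: a socket with a beancounter ticks its own ub_orphan_count, any other socket ticks the protocol-wide counter. The counter selection in isolation (types cut down to essentials, names illustrative):

#include <stdio.h>

struct counter { int val; };

static struct counter global_orphans;	/* plays sk->sk_prot->orphan_count */

struct sock_like {
	struct counter *bc_orphans;	/* per-beancounter count, or NULL */
};

/* mirrors __ub_get_orphan_count_ptr(): prefer the per-BC counter */
static struct counter *orphan_counter(struct sock_like *sk)
{
	return sk->bc_orphans ? sk->bc_orphans : &global_orphans;
}

int main(void)
{
	struct counter bc = { 0 };
	struct sock_like contained = { &bc }, host = { NULL };

	orphan_counter(&contained)->val++;	/* charged to the container */
	orphan_counter(&host)->val++;		/* charged globally */
	printf("bc=%d global=%d\n", bc.val, global_orphans.val);
	return 0;
}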
+ * + */ + +#ifndef __UB_PAGE_H_ +#define __UB_PAGE_H_ + +#include + +/* + * Page_beancounters + */ + +struct page; +struct user_beancounter; + +#define PB_MAGIC 0x62700001UL + +struct page_beancounter { + unsigned long pb_magic; + struct page *page; + struct user_beancounter *ub; + union { + struct page_beancounter *next_hash; + struct page_beancounter *page_pb_list; + }; + union { + unsigned refcount; + unsigned io_debug; + }; + union { + struct list_head page_list; + struct list_head io_list; + }; +}; + +#define PB_REFCOUNT_BITS 24 +#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) +#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) +#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) +#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) +#define PB_COUNT_INC(c) ((c)++) +#define PB_COUNT_DEC(c) ((c)--) +#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) + +#define page_pbc(__page) ((__page)->bc.page_pb) + +extern spinlock_t pb_lock; + +struct address_space; +extern int is_shmem_mapping(struct address_space *); + +#endif diff -uprN linux-2.6.18/include/ub/ub_sk.h linux-2.6.18.ovz/include/ub/ub_sk.h --- linux-2.6.18/include/ub/ub_sk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_sk.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,48 @@ +/* + * include/ub/ub_sk.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_SK_H_ +#define __UB_SK_H_ + +#include +#include + +struct sock; +struct sk_buff; + +struct skb_beancounter { + struct user_beancounter *ub; + unsigned long charged:27, resource:5; +}; + +struct sock_beancounter { + struct user_beancounter *ub; + /* + * poll_reserv accounts space already charged for future sends. + * It is required to make poll agree with sendmsg. + * Additionally, it makes real charges (with taking bc spinlock) + * in the send path rarer, speeding networking up. + * For TCP (only): changes are protected by socket lock (not bc!) + * For all proto: may be read without serialization in poll. + */ + unsigned long poll_reserv; + unsigned long forw_space; + /* fields below are protected by bc spinlock */ + unsigned long ub_waitspc; /* space waiting for */ + unsigned long ub_wcharged; + struct list_head ub_sock_list; +}; + +#define sock_bc(__sk) (&(__sk)->sk_bc) +#define skb_bc(__skb) (&(__skb)->skb_bc) +#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) +#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) + +#endif diff -uprN linux-2.6.18/include/ub/ub_stat.h linux-2.6.18.ovz/include/ub/ub_stat.h --- linux-2.6.18/include/ub/ub_stat.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_stat.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,70 @@ +/* + * include/ub/ub_stat.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
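struct page_beancounter above packs two counters into one unsigned word: the low 24 bits are a plain reference count, the bits above hold the shift used for fractional page weight, and the PB_* macros do all the slicing. A user-space check of that packing, using the macros verbatim from the header:

#include <assert.h>
#include <stdio.h>

#define PB_REFCOUNT_BITS 24
#define PB_SHIFT_GET(c)	((c) >> PB_REFCOUNT_BITS)
#define PB_COUNT_GET(c)	((c) & ((1u << PB_REFCOUNT_BITS) - 1))
#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c))

int main(void)
{
	unsigned c = PB_REFCOUNT_MAKE(3u, 7u);	/* shift 3, count 7 */

	assert(PB_SHIFT_GET(c) == 3 && PB_COUNT_GET(c) == 7);

	c += 1u << PB_REFCOUNT_BITS;	/* what PB_SHIFT_INC does */
	c--;				/* what PB_COUNT_DEC does */
	assert(PB_SHIFT_GET(c) == 4 && PB_COUNT_GET(c) == 6);

	printf("shift=%u count=%u\n", PB_SHIFT_GET(c), PB_COUNT_GET(c));
	return 0;
}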
+ * + */ + +#ifndef __UB_STAT_H_ +#define __UB_STAT_H_ + +/* sys_ubstat commands list */ +#define UBSTAT_READ_ONE 0x010000 +#define UBSTAT_READ_ALL 0x020000 +#define UBSTAT_READ_FULL 0x030000 +#define UBSTAT_UBLIST 0x040000 +#define UBSTAT_UBPARMNUM 0x050000 +#define UBSTAT_GETTIME 0x060000 + +#define UBSTAT_CMD(func) ((func) & 0xF0000) +#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) + +#define TIME_MAX_SEC (LONG_MAX / HZ) +#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) + +typedef unsigned long ubstattime_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstattime_t cur_time; +} ubgettime_t; + +typedef struct { + long maxinterval; + int signum; +} ubnotifrq_t; + +typedef struct { + unsigned long maxheld; + unsigned long failcnt; +} ubstatparm_t; + +typedef struct { + unsigned long barrier; + unsigned long limit; + unsigned long held; + unsigned long maxheld; + unsigned long minheld; + unsigned long failcnt; + unsigned long __unused1; + unsigned long __unused2; +} ubstatparmf_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[0]; +} ubstatfull_t; + +#ifdef __KERNEL__ +struct ub_stat_notify { + struct list_head list; + struct task_struct *task; + int signum; +}; +#endif +#endif diff -uprN linux-2.6.18/include/ub/ub_task.h linux-2.6.18.ovz/include/ub/ub_task.h --- linux-2.6.18/include/ub/ub_task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_task.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,69 @@ +/* + * include/ub/ub_task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_TASK_H_ +#define __UB_TASK_H_ + +struct user_beancounter; + + +#ifdef CONFIG_USER_RESOURCE +struct task_beancounter { + struct user_beancounter *exec_ub; + struct user_beancounter *saved_ub; + struct user_beancounter *task_ub; + struct user_beancounter *fork_sub; + unsigned long file_precharged, file_quant, file_count; + unsigned long kmem_precharged; + char dentry_alloc, pgfault_handle; + void *task_fnode, *task_freserv; + unsigned long oom_generation; + unsigned long task_data[4]; + unsigned long pgfault_allot; +}; + +#define get_task_ub(__task) ((__task)->task_bc.task_ub) + +extern struct user_beancounter ub0; +#define get_ub0() (&ub0) + +#define ub_save_context(t) do { \ + t->task_bc.saved_ub = t->task_bc.exec_ub; \ + t->task_bc.exec_ub = get_ub0(); \ + } while (0) +#define ub_restore_context(t) do { \ + t->task_bc.exec_ub = t->task_bc.saved_ub; \ + } while (0) + +#define get_exec_ub() (current->task_bc.exec_ub) +#define set_exec_ub(__newub) \ +({ \ + struct user_beancounter *old; \ + struct task_beancounter *tbc; \ + \ + tbc = ¤t->task_bc; \ + old = tbc->exec_ub; \ + tbc->exec_ub = __newub; \ + old; \ +}) + +void ub_init_task_bc(struct task_beancounter *); + +#else /* CONFIG_USER_RESOURCE */ + +#define get_ub0() (NULL) +#define get_exec_ub() (NULL) +#define get_task_ub(task) (NULL) +#define set_exec_ub(__ub) (NULL) +#define ub_save_context(t) do { } while (0) +#define ub_restore_context(t) do { } while (0) + +#endif /* CONFIG_USER_RESOURCE */ +#endif /* __UB_TASK_H_ */ diff -uprN linux-2.6.18/include/ub/ub_tcp.h linux-2.6.18.ovz/include/ub/ub_tcp.h --- linux-2.6.18/include/ub/ub_tcp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_tcp.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,76 @@ +/* + * include/ub/ub_tcp.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. 
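The sys_ubstat command constants above encode an operation and a parameter id in a single integer: bits 16 and up select the command, the low 16 bits name the parameter, and UBSTAT_CMD()/UBSTAT_PARMID() split them back apart. For example (masks copied from the header):

#include <assert.h>
#include <stdio.h>

#define UBSTAT_READ_ONE	 0x010000
#define UBSTAT_CMD(f)	 ((f) & 0xF0000)
#define UBSTAT_PARMID(f) ((f) & 0x0FFFF)

int main(void)
{
	int func = UBSTAT_READ_ONE | 5;	/* "read parameter #5" */

	assert(UBSTAT_CMD(func) == UBSTAT_READ_ONE);
	assert(UBSTAT_PARMID(func) == 5);
	printf("cmd=%#x parm=%d\n", (unsigned)UBSTAT_CMD(func),
	       UBSTAT_PARMID(func));
	return 0;
}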
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_TCP_H_ +#define __UB_TCP_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include + +static inline void ub_tcp_update_maxadvmss(struct sock *sk) +{ +#ifdef CONFIG_USER_RESOURCE + if (!sock_has_ubc(sk)) + return; + if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) + return; + + sock_bc(sk)->ub->ub_maxadvmss = + skb_charge_size(MAX_HEADER + sizeof(struct iphdr) + + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); +#endif +} + +static inline int ub_tcp_rmem_allows_expand(struct sock *sk) +{ + if (tcp_memory_pressure) + return 0; +#ifdef CONFIG_USER_RESOURCE + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + + ub = sock_bc(sk)->ub; + if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) + return 1; + if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) + return 0; + return sk->sk_rcvbuf <= ub->ub_rmem_thres; + } +#endif + return 1; +} + +static inline int ub_tcp_memory_pressure(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_USER_RESOURCE + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; +#endif + return 0; +} + +static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_USER_RESOURCE + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; +#endif + return 0; +} + +#endif diff -uprN linux-2.6.18/include/ub/ub_vmpages.h linux-2.6.18.ovz/include/ub/ub_vmpages.h --- linux-2.6.18/include/ub/ub_vmpages.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/include/ub/ub_vmpages.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,153 @@ +/* + * include/ub/ub_vmpages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PAGES_H_ +#define __UB_PAGES_H_ + +#include +#include +#include +#include + +/* + * Check whether vma has private or copy-on-write mapping. + * Should match checks in ub_protected_charge(). + */ +#define VM_UB_PRIVATE(__flags, __file) \ + ( ((__flags) & VM_WRITE) ? 
\ + (__file) == NULL || !((__flags) & VM_SHARED) : \ + 0 \ + ) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, + unsigned long size, + unsigned long newflags, + struct vm_area_struct *vma)) + +UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) +UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) + +UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, + long sz)) + +UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file, + int strict)) +UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file)) + +struct shmem_inode_info; +UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size)) +#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) + +#ifdef CONFIG_USER_RESOURCE +#define shmi_ub_set(shi, ub) do { \ + (shi)->shmi_ub = get_beancounter(ub); \ + } while (0) +#define shmi_ub_put(shi) do { \ + put_beancounter((shi)->shmi_ub); \ + (shi)->shmi_ub = NULL; \ + } while (0) +#else +#define shmi_ub_set(shi, ub) do { } while (0) +#define shmi_ub_put(shi) do { } while (0) +#endif + +UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, + unsigned long size)) + +UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end)) +#define pages_in_vma(vma) (pages_in_vma_range(vma, \ + vma->vm_start, vma->vm_end)) + +#define UB_PAGE_WEIGHT_SHIFT 24 +#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) + +struct page_beancounter; +#define PBC_COPY_SAME ((struct page_beancounter *) 1) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +extern void fastcall __ub_update_physpages(struct user_beancounter *ub); +extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub); +extern void fastcall __ub_update_privvm(struct user_beancounter *ub); + +#ifdef CONFIG_USER_RSS_ACCOUNTING +#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) +PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) +PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter 
**pbc)) +PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, + struct mm_struct *mm)) + +PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) +#endif + +#ifdef CONFIG_USER_SWAP_ACCOUNTING +#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +struct swap_info_struct; +SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) +SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, + struct user_beancounter *ub)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) diff -uprN linux-2.6.18/include/video/Kbuild linux-2.6.18.ovz/include/video/Kbuild --- linux-2.6.18/include/video/Kbuild 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/include/video/Kbuild 2007-06-13 06:55:07.000000000 -0400 @@ -1 +1 @@ -unifdef-y := sisfb.h +unifdef-y += sisfb.h diff -uprN linux-2.6.18/init/Kconfig linux-2.6.18.ovz/init/Kconfig --- linux-2.6.18/init/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/init/Kconfig 2007-06-13 06:55:07.000000000 -0400 @@ -1,5 +1,6 @@ config DEFCONFIG_LIST string + depends on !UML option defconfig_list default "/lib/modules/$UNAME_RELEASE/.config" default "/etc/kernel-config" @@ -115,6 +116,15 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from . +config IPC_NS + bool "IPC Namespaces" + depends on SYSVIPC + default n + help + Support ipc namespaces. This allows containers, i.e. virtual + environments, to use ipc namespaces to provide different ipc + objects for different servers. If unsure, say N. + config POSIX_MQUEUE bool "POSIX Message Queues" depends on NET && EXPERIMENTAL @@ -182,6 +192,14 @@ config TASK_DELAY_ACCT Say N if unsure. +config UTS_NS + bool "UTS Namespaces" + default n + help + Support uts namespaces. This allows containers, i.e. + vservers, to use uts namespaces to provide different + uts info for different servers. If unsure, say N. + config AUDIT bool "Auditing support" depends on NET @@ -244,6 +262,15 @@ config RELAY source "usr/Kconfig" +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" + depends on TASK_XACCT || USER_RESOURCE + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. 
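The VM_UB_PRIVATE test from the ub_vmpages.h hunk above decides which mappings are charged as potentially private memory: the vma must be writable, and either anonymous (no backing file) or mapped without VM_SHARED. A truth-table check of the predicate (flag values are stand-ins for the real mm.h bits):

#include <stdbool.h>
#include <stdio.h>

#define VM_WRITE  0x1u
#define VM_SHARED 0x2u	/* values illustrative; real ones live in mm.h */

/* mirrors VM_UB_PRIVATE: writable && (anonymous || !shared) */
static bool ub_private(unsigned flags, bool has_file)
{
	if (!(flags & VM_WRITE))
		return false;
	return !has_file || !(flags & VM_SHARED);
}

int main(void)
{
	printf("anon rw        -> %d\n", ub_private(VM_WRITE, false));		/* 1 */
	printf("file rw private-> %d\n", ub_private(VM_WRITE, true));		/* 1 */
	printf("file rw shared -> %d\n", ub_private(VM_WRITE | VM_SHARED, true)); /* 0 */
	printf("file ro        -> %d\n", ub_private(0, true));			/* 0 */
	return 0;
}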
+ config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (Look out for broken compilers!)" default y diff -uprN linux-2.6.18/init/calibrate.c linux-2.6.18.ovz/init/calibrate.c --- linux-2.6.18/init/calibrate.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/init/calibrate.c 2007-06-13 06:55:07.000000000 -0400 @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -105,6 +106,60 @@ static unsigned long __devinit calibrate static unsigned long __devinit calibrate_delay_direct(void) {return 0;} #endif +unsigned long cycles_per_jiffy, cycles_per_clock; + +static __devinit void calibrate_cycles(void) +{ + unsigned long ticks; + cycles_t time; + + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + time = get_cycles(); + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + + time = get_cycles() - time; + cycles_per_jiffy = time; + if ((time >> 32) != 0) { + printk("CPU too fast! timings are incorrect\n"); + cycles_per_jiffy = -1; + } +} + +EXPORT_SYMBOL(cycles_per_jiffy); +EXPORT_SYMBOL(cycles_per_clock); + +static __devinit void calc_cycles_per_jiffy(void) +{ +#if 0 + extern unsigned long fast_gettimeoffset_quotient; + unsigned long low, high; + + if (fast_gettimeoffset_quotient != 0) { + __asm__("divl %2" + :"=a" (low), "=d" (high) + :"r" (fast_gettimeoffset_quotient), + "0" (0), "1" (1000000/HZ)); + + cycles_per_jiffy = low; + } +#endif + if (cycles_per_jiffy == 0) + calibrate_cycles(); + + if (cycles_per_jiffy == 0) { + printk(KERN_WARNING "Cycles are stuck! " + "Some statistics will not be available."); + /* to prevent division by zero in cycles_to_(clocks|jiffies) */ + cycles_per_jiffy = 1; + cycles_per_clock = 1; + } else + cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); +} + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little @@ -170,4 +225,5 @@ void __devinit calibrate_delay(void) loops_per_jiffy); } + calc_cycles_per_jiffy(); } diff -uprN linux-2.6.18/init/main.c linux-2.6.18.ovz/init/main.c --- linux-2.6.18/init/main.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/init/main.c 2007-06-13 06:55:07.000000000 -0400 @@ -52,6 +52,8 @@ #include #include +#include + #include #include #include @@ -83,6 +85,7 @@ extern void mca_init(void); extern void sbus_init(void); extern void sysctl_init(void); extern void signals_init(void); +extern void fairsched_init_late(void); extern void pidhash_init(void); extern void pidmap_init(void); extern void prio_tree_init(void); @@ -107,6 +110,26 @@ extern void tc_init(void); enum system_states system_state; EXPORT_SYMBOL(system_state); +#ifdef CONFIG_VE +extern void init_ve_system(void); +extern void init_ve0(void); +extern void prepare_ve0_process(struct task_struct *tsk); +extern void prepare_ve0_proc_root(void); +extern void prepare_ve0_sysctl(void); +#else +#define init_ve_system() do { } while (0) +#define init_ve0() do { } while (0) +#define prepare_ve0_process(tsk) do { } while (0) +#define prepare_ve0_proc_root() do { } while (0) +#define prepare_ve0_sysctl() do { } while (0) +#endif + +#if defined(CONFIG_VE) && defined(CONFIG_NET) +extern void prepare_ve0_loopback(void); +#else +#define prepare_ve0_loopback() do { } while (0) +#endif + /* * Boot command-line arguments */ @@ -460,6 +483,9 @@ asmlinkage void __init start_kernel(void smp_setup_processor_id(); + prepare_ve0_process(&init_task); + init_ve0(); + /* * Need to run as early as possible, to initialize the * lockdep hash: @@ -475,6 +501,7 @@ asmlinkage void __init start_kernel(void * enable them */ lock_kernel(); + ub_init_early(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE); @@ -563,6 +590,7 @@ asmlinkage void __init start_kernel(void #endif fork_init(num_physpages); proc_caches_init(); + ub_init_late(); buffer_init(); unnamed_dev_init(); key_init(); @@ -573,6 +601,8 @@ asmlinkage void __init start_kernel(void /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS + prepare_ve0_proc_root(); + prepare_ve0_sysctl(); proc_root_init(); #endif cpuset_init(); @@ -583,6 +613,10 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_USER_RSS_ACCOUNTING + ub_init_pbc(); +#endif + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -652,6 +686,9 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + prepare_ve0_loopback(); + init_ve_system(); + /* drivers will send hotplug events */ init_workqueues(); usermodehelper_init(); @@ -667,7 +704,7 @@ static void __init do_basic_setup(void) static void do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) extern int migration_init(void); migration_init(); @@ -704,6 +741,12 @@ static int init(void * unused) do_pre_smp_initcalls(); smp_init(); + + /* + * This should be done after all cpus are known to + * be online. smp_init gives us confidence in it. 
+ */ + fairsched_init_late(); sched_init_smp(); cpuset_init_smp(); diff -uprN linux-2.6.18/init/version.c linux-2.6.18.ovz/init/version.c --- linux-2.6.18/init/version.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/init/version.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,22 +12,33 @@ #include #include #include +#include #define version(a) Version_ ## a #define version_string(a) version(a) int version_string(LINUX_VERSION_CODE); -struct new_utsname system_utsname = { - .sysname = UTS_SYSNAME, - .nodename = UTS_NODENAME, - .release = UTS_RELEASE, - .version = UTS_VERSION, - .machine = UTS_MACHINE, - .domainname = UTS_DOMAINNAME, +struct uts_namespace init_uts_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, }; +EXPORT_SYMBOL_GPL(init_uts_ns); -EXPORT_SYMBOL(system_utsname); +struct new_utsname virt_utsname = { + /* we need only this field */ + .release = UTS_RELEASE, +}; +EXPORT_SYMBOL(virt_utsname); const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" diff -uprN linux-2.6.18/ipc/msg.c linux-2.6.18.ovz/ipc/msg.c --- linux-2.6.18/ipc/msg.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/msg.c 2007-06-13 06:55:07.000000000 -0400 @@ -16,6 +16,10 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland + * + * namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov */ #include @@ -31,16 +35,12 @@ #include #include #include +#include #include #include #include "util.h" -/* sysctl: */ -int msg_ctlmax = MSGMAX; -int msg_ctlmnb = MSGMNB; -int msg_ctlmni = MSGMNI; - /* * one msg_receiver structure for each sleeping receiver: */ @@ -69,30 +69,76 @@ struct msg_sender { static atomic_t msg_bytes = ATOMIC_INIT(0); static atomic_t msg_hdrs = ATOMIC_INIT(0); -static struct ipc_ids msg_ids; +static struct ipc_ids init_msg_ids; + +#define msg_ids(ns) (*((ns)->ids[IPC_MSG_IDS])) -#define msg_lock(id) ((struct msg_queue *)ipc_lock(&msg_ids, id)) +#define msg_lock(ns, id) ((struct msg_queue*)ipc_lock(&msg_ids(ns), id)) #define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) -#define msg_rmid(id) ((struct msg_queue *)ipc_rmid(&msg_ids, id)) -#define msg_checkid(msq, msgid) ipc_checkid(&msg_ids, &msq->q_perm, msgid) -#define msg_buildid(id, seq) ipc_buildid(&msg_ids, id, seq) +#define msg_rmid(ns, id) ((struct msg_queue*)ipc_rmid(&msg_ids(ns), id)) +#define msg_checkid(ns, msq, msgid) \ + ipc_checkid(&msg_ids(ns), &msq->q_perm, msgid) +#define msg_buildid(ns, id, seq) \ + ipc_buildid(&msg_ids(ns), id, seq) -static void freeque(struct msg_queue *msq, int id); -static int newque(key_t key, int msgflg); +static void freeque (struct ipc_namespace *ns, struct msg_queue *msq, int id); +static int newque (struct ipc_namespace *ns, key_t key, int msgflg); #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it); #endif +static void __ipc_init __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_MSG_IDS] = ids; + ns->msg_ctlmax = MSGMAX; + ns->msg_ctlmnb = MSGMNB; + ns->msg_ctlmni = MSGMNI; + ipc_init_ids(ids, ns->msg_ctlmni); +} + +#ifdef CONFIG_IPC_NS +int msg_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; + + __msg_init_ns(ns, ids); + return 0; +} + +void msg_exit_ns(struct ipc_namespace 
*ns) +{ + int i; + struct msg_queue *msq; + + mutex_lock(&msg_ids(ns).mutex); + for (i = 0; i <= msg_ids(ns).max_id; i++) { + msq = msg_lock(ns, i); + if (msq == NULL) + continue; + + freeque(ns, msq, i); + } + mutex_unlock(&msg_ids(ns).mutex); + + ipc_fini_ids(ns->ids[IPC_MSG_IDS]); + kfree(ns->ids[IPC_MSG_IDS]); + ns->ids[IPC_MSG_IDS] = NULL; +} +#endif + void __init msg_init(void) { - ipc_init_ids(&msg_ids, msg_ctlmni); + __msg_init_ns(&init_ipc_ns, &init_msg_ids); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", - &msg_ids, - sysvipc_msg_proc_show); + IPC_MSG_IDS, sysvipc_msg_proc_show); } -static int newque(key_t key, int msgflg) +static int newque (struct ipc_namespace *ns, key_t key, int msgflg) { struct msg_queue *msq; int id, retval; @@ -111,18 +157,18 @@ static int newque(key_t key, int msgflg) return retval; } - id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, -1); if (id == -1) { security_msg_queue_free(msq); ipc_rcu_putref(msq); return -ENOSPC; } - msq->q_id = msg_buildid(id, msq->q_perm.seq); + msq->q_id = msg_buildid(ns, id, msq->q_perm.seq); msq->q_stime = msq->q_rtime = 0; msq->q_ctime = get_seconds(); msq->q_cbytes = msq->q_qnum = 0; - msq->q_qbytes = msg_ctlmnb; + msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = 0; INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); @@ -186,13 +232,13 @@ static void expunge_all(struct msg_queue * msg_ids.mutex and the spinlock for this message queue is hold * before freeque() is called. msg_ids.mutex remains locked on exit. */ -static void freeque(struct msg_queue *msq, int id) +static void freeque(struct ipc_namespace *ns, struct msg_queue *msq, int id) { struct list_head *tmp; expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); - msq = msg_rmid(id); + msq = msg_rmid(ns, id); msg_unlock(msq); tmp = msq->q_messages.next; @@ -212,24 +258,27 @@ asmlinkage long sys_msgget(key_t key, in { struct msg_queue *msq; int id, ret = -EPERM; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - mutex_lock(&msg_ids.mutex); + mutex_lock(&msg_ids(ns).mutex); if (key == IPC_PRIVATE) - ret = newque(key, msgflg); - else if ((id = ipc_findkey(&msg_ids, key)) == -1) { /* key not used */ + ret = newque(ns, key, msgflg); + else if ((id = ipc_findkey(&msg_ids(ns), key)) == -1) { /* key not used */ if (!(msgflg & IPC_CREAT)) ret = -ENOENT; else - ret = newque(key, msgflg); + ret = newque(ns, key, msgflg); } else if (msgflg & IPC_CREAT && msgflg & IPC_EXCL) { ret = -EEXIST; } else { - msq = msg_lock(id); + msq = msg_lock(ns, id); BUG_ON(msq == NULL); if (ipcperms(&msq->q_perm, msgflg)) ret = -EACCES; else { - int qid = msg_buildid(id, msq->q_perm.seq); + int qid = msg_buildid(ns, id, msq->q_perm.seq); ret = security_msg_queue_associate(msq, msgflg); if (!ret) @@ -237,7 +286,7 @@ asmlinkage long sys_msgget(key_t key, in } msg_unlock(msq); } - mutex_unlock(&msg_ids.mutex); + mutex_unlock(&msg_ids(ns).mutex); return ret; } @@ -341,11 +390,13 @@ asmlinkage long sys_msgctl(int msqid, in struct msq_setbuf setbuf; struct msg_queue *msq; int err, version; + struct ipc_namespace *ns; if (msqid < 0 || cmd < 0) return -EINVAL; version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: @@ -366,14 +417,14 @@ asmlinkage long sys_msgctl(int msqid, in return err; memset(&msginfo, 0, sizeof(msginfo)); - msginfo.msgmni = msg_ctlmni; - msginfo.msgmax = msg_ctlmax; 
- msginfo.msgmnb = msg_ctlmnb; + msginfo.msgmni = ns->msg_ctlmni; + msginfo.msgmax = ns->msg_ctlmax; + msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - mutex_lock(&msg_ids.mutex); + mutex_lock(&msg_ids(ns).mutex); if (cmd == MSG_INFO) { - msginfo.msgpool = msg_ids.in_use; + msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&msg_hdrs); msginfo.msgtql = atomic_read(&msg_bytes); } else { @@ -381,8 +432,8 @@ asmlinkage long sys_msgctl(int msqid, in msginfo.msgpool = MSGPOOL; msginfo.msgtql = MSGTQL; } - max_id = msg_ids.max_id; - mutex_unlock(&msg_ids.mutex); + max_id = msg_ids(ns).max_id; + mutex_unlock(&msg_ids(ns).mutex); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; @@ -395,20 +446,20 @@ asmlinkage long sys_msgctl(int msqid, in if (!buf) return -EFAULT; - if (cmd == MSG_STAT && msqid >= msg_ids.entries->size) + if (cmd == MSG_STAT && msqid >= msg_ids(ns).entries->size) return -EINVAL; memset(&tbuf, 0, sizeof(tbuf)); - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); if (msq == NULL) return -EINVAL; if (cmd == MSG_STAT) { - success_return = msg_buildid(msqid, msq->q_perm.seq); + success_return = msg_buildid(ns, msqid, msq->q_perm.seq); } else { err = -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock; success_return = 0; } @@ -446,14 +497,14 @@ asmlinkage long sys_msgctl(int msqid, in return -EINVAL; } - mutex_lock(&msg_ids.mutex); - msq = msg_lock(msqid); + mutex_lock(&msg_ids(ns).mutex); + msq = msg_lock(ns, msqid); err = -EINVAL; if (msq == NULL) goto out_up; err = -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock_up; ipcp = &msq->q_perm; @@ -469,7 +520,7 @@ asmlinkage long sys_msgctl(int msqid, in err = -EPERM; if (current->euid != ipcp->cuid && - current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) + current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) /* We _could_ check for CAP_CHOWN above, but we don't */ goto out_unlock_up; @@ -481,7 +532,7 @@ asmlinkage long sys_msgctl(int msqid, in case IPC_SET: { err = -EPERM; - if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) + if (setbuf.qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) goto out_unlock_up; msq->q_qbytes = setbuf.qbytes; @@ -503,12 +554,12 @@ asmlinkage long sys_msgctl(int msqid, in break; } case IPC_RMID: - freeque(msq, msqid); + freeque(ns, msq, msqid); break; } err = 0; out_up: - mutex_unlock(&msg_ids.mutex); + mutex_unlock(&msg_ids(ns).mutex); return err; out_unlock_up: msg_unlock(msq); @@ -562,7 +613,7 @@ static inline int pipelined_send(struct msr->r_msg = ERR_PTR(-E2BIG); } else { msr->r_msg = NULL; - msq->q_lrpid = msr->r_tsk->pid; + msq->q_lrpid = virt_pid(msr->r_tsk); msq->q_rtime = get_seconds(); wake_up_process(msr->r_tsk); smp_mb(); @@ -582,8 +633,11 @@ sys_msgsnd(int msqid, struct msgbuf __us struct msg_msg *msg; long mtype; int err; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - if (msgsz > msg_ctlmax || (long) msgsz < 0 || msqid < 0) + if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0) return -EINVAL; if (get_user(mtype, &msgp->mtype)) return -EFAULT; @@ -597,13 +651,13 @@ sys_msgsnd(int msqid, struct msgbuf __us msg->m_type = mtype; msg->m_ts = msgsz; - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); err = -EINVAL; if (msq == NULL) goto out_free; err= -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock_free; for (;;) { @@ 
-646,7 +700,7 @@ sys_msgsnd(int msqid, struct msgbuf __us } } - msq->q_lspid = current->tgid; + msq->q_lspid = virt_tgid(current); msq->q_stime = get_seconds(); if (!pipelined_send(msq, msg)) { @@ -694,17 +748,19 @@ asmlinkage long sys_msgrcv(int msqid, st struct msg_queue *msq; struct msg_msg *msg; int mode; + struct ipc_namespace *ns; if (msqid < 0 || (long) msgsz < 0) return -EINVAL; mode = convert_mode(&msgtyp, msgflg); + ns = current->nsproxy->ipc_ns; - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); if (msq == NULL) return -EINVAL; msg = ERR_PTR(-EIDRM); - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock; for (;;) { @@ -749,7 +805,7 @@ asmlinkage long sys_msgrcv(int msqid, st list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); - msq->q_lrpid = current->tgid; + msq->q_lrpid = virt_tgid(current); msq->q_cbytes -= msg->m_ts; atomic_sub(msg->m_ts, &msg_bytes); atomic_dec(&msg_hdrs); @@ -865,3 +921,30 @@ static int sysvipc_msg_proc_show(struct msq->q_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) +{ + int i; + int err = 0; + struct msg_queue * msq; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + mutex_lock(&msg_ids(ns).mutex); + for(i = 0; i <= msg_ids(ns).max_id; i++) { + if ((msq = msg_lock(ns, i)) == NULL) + continue; + err = func(msg_buildid(ns, i, msq->q_perm.seq), msq, arg); + msg_unlock(msq); + if (err) + break; + } + mutex_unlock(&msg_ids(ns).mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_msg); +#endif diff -uprN linux-2.6.18/ipc/msgutil.c linux-2.6.18.ovz/ipc/msgutil.c --- linux-2.6.18/ipc/msgutil.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/msgutil.c 2007-06-13 06:55:07.000000000 -0400 @@ -17,6 +17,8 @@ #include "util.h" +#include + struct msg_msgseg { struct msg_msgseg* next; /* the next part of the message follows immediately */ @@ -36,7 +38,7 @@ struct msg_msg *load_msg(const void __us if (alen > DATALEN_MSG) alen = DATALEN_MSG; - msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + msg = (struct msg_msg *)ub_kmalloc(sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) return ERR_PTR(-ENOMEM); @@ -56,7 +58,7 @@ struct msg_msg *load_msg(const void __us alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, + seg = (struct msg_msgseg *)ub_kmalloc(sizeof(*seg) + alen, GFP_KERNEL); if (seg == NULL) { err = -ENOMEM; diff -uprN linux-2.6.18/ipc/sem.c linux-2.6.18.ovz/ipc/sem.c --- linux-2.6.18/ipc/sem.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/sem.c 2007-06-13 06:55:07.000000000 -0400 @@ -64,6 +64,10 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland + * + * namespaces support + * OpenVZ, SWsoft Inc. 
+ * Pavel Emelianov */ #include @@ -78,22 +82,25 @@ #include #include #include +#include #include #include "util.h" +#define sem_ids(ns) (*((ns)->ids[IPC_SEM_IDS])) + +#define sem_lock(ns, id) ((struct sem_array*)ipc_lock(&sem_ids(ns), id)) +#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) +#define sem_rmid(ns, id) ((struct sem_array*)ipc_rmid(&sem_ids(ns), id)) +#define sem_checkid(ns, sma, semid) \ + ipc_checkid(&sem_ids(ns),&sma->sem_perm,semid) +#define sem_buildid(ns, id, seq) \ + ipc_buildid(&sem_ids(ns), id, seq) -#define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) -#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -#define sem_rmid(id) ((struct sem_array*)ipc_rmid(&sem_ids,id)) -#define sem_checkid(sma, semid) \ - ipc_checkid(&sem_ids,&sma->sem_perm,semid) -#define sem_buildid(id, seq) \ - ipc_buildid(&sem_ids, id, seq) -static struct ipc_ids sem_ids; +static struct ipc_ids init_sem_ids; -static int newary (key_t, int, int); -static void freeary (struct sem_array *sma, int id); +static int newary(struct ipc_namespace *, key_t, int, int, int); +static void freeary(struct ipc_namespace *ns, struct sem_array *sma, int id); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif @@ -110,22 +117,62 @@ static int sysvipc_sem_proc_show(struct * */ -int sem_ctls[4] = {SEMMSL, SEMMNS, SEMOPM, SEMMNI}; -#define sc_semmsl (sem_ctls[0]) -#define sc_semmns (sem_ctls[1]) -#define sc_semopm (sem_ctls[2]) -#define sc_semmni (sem_ctls[3]) +#define sc_semmsl sem_ctls[0] +#define sc_semmns sem_ctls[1] +#define sc_semopm sem_ctls[2] +#define sc_semmni sem_ctls[3] + +static void __ipc_init __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_SEM_IDS] = ids; + ns->sc_semmsl = SEMMSL; + ns->sc_semmns = SEMMNS; + ns->sc_semopm = SEMOPM; + ns->sc_semmni = SEMMNI; + ns->used_sems = 0; + ipc_init_ids(ids, ns->sc_semmni); +} + +#ifdef CONFIG_IPC_NS +int sem_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; + + __sem_init_ns(ns, ids); + return 0; +} + +void sem_exit_ns(struct ipc_namespace *ns) +{ + int i; + struct sem_array *sma; + + mutex_lock(&sem_ids(ns).mutex); + for (i = 0; i <= sem_ids(ns).max_id; i++) { + sma = sem_lock(ns, i); + if (sma == NULL) + continue; + + freeary(ns, sma, i); + } + mutex_unlock(&sem_ids(ns).mutex); -static int used_sems; + ipc_fini_ids(ns->ids[IPC_SEM_IDS]); + kfree(ns->ids[IPC_SEM_IDS]); + ns->ids[IPC_SEM_IDS] = NULL; +} +#endif void __init sem_init (void) { - used_sems = 0; - ipc_init_ids(&sem_ids,sc_semmni); + __sem_init_ns(&init_ipc_ns, &init_sem_ids); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", - &sem_ids, - sysvipc_sem_proc_show); + IPC_SEM_IDS, sysvipc_sem_proc_show); } /* @@ -162,7 +209,8 @@ void __init sem_init (void) */ #define IN_WAKEUP 1 -static int newary (key_t key, int nsems, int semflg) +static int newary (struct ipc_namespace *ns, key_t key, int semid, + int nsems, int semflg) { int id; int retval; @@ -171,7 +219,7 @@ static int newary (key_t key, int nsems, if (!nsems) return -EINVAL; - if (used_sems + nsems > sc_semmns) + if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; size = sizeof (*sma) + nsems * sizeof (struct sem); @@ -191,15 +239,15 @@ static int newary (key_t key, int nsems, return retval; } - id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, 
semid); if(id == -1) { security_sem_free(sma); ipc_rcu_putref(sma); return -ENOSPC; } - used_sems += nsems; + ns->used_sems += nsems; - sma->sem_id = sem_buildid(id, sma->sem_perm.seq); + sma->sem_id = sem_buildid(ns, id, sma->sem_perm.seq); sma->sem_base = (struct sem *) &sma[1]; /* sma->sem_pending = NULL; */ sma->sem_pending_last = &sma->sem_pending; @@ -215,29 +263,32 @@ asmlinkage long sys_semget (key_t key, i { int id, err = -EINVAL; struct sem_array *sma; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - if (nsems < 0 || nsems > sc_semmsl) + if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; - mutex_lock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); if (key == IPC_PRIVATE) { - err = newary(key, nsems, semflg); - } else if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ + err = newary(ns, key, -1, nsems, semflg); + } else if ((id = ipc_findkey(&sem_ids(ns), key)) == -1) { /* key not used */ if (!(semflg & IPC_CREAT)) err = -ENOENT; else - err = newary(key, nsems, semflg); + err = newary(ns, key, -1, nsems, semflg); } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { err = -EEXIST; } else { - sma = sem_lock(id); + sma = sem_lock(ns, id); BUG_ON(sma==NULL); if (nsems > sma->sem_nsems) err = -EINVAL; else if (ipcperms(&sma->sem_perm, semflg)) err = -EACCES; else { - int semid = sem_buildid(id, sma->sem_perm.seq); + int semid = sem_buildid(ns, id, sma->sem_perm.seq); err = security_sem_associate(sma, semflg); if (!err) err = semid; @@ -245,7 +296,7 @@ asmlinkage long sys_semget (key_t key, i sem_unlock(sma); } - mutex_unlock(&sem_ids.mutex); + mutex_unlock(&sem_ids(ns).mutex); return err; } @@ -444,7 +495,7 @@ static int count_semzcnt (struct sem_arr * the spinlock for this semaphore set hold. sem_ids.mutex remains locked * on exit. 
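 * With this patch freeary() also takes the ipc_namespace the set belongs to; * it is called both for IPC_RMID and, via sem_exit_ns() above, when a whole * namespace is torn down.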
*/ -static void freeary (struct sem_array *sma, int id) +static void freeary (struct ipc_namespace *ns, struct sem_array *sma, int id) { struct sem_undo *un; struct sem_queue *q; @@ -472,10 +523,10 @@ static void freeary (struct sem_array *s } /* Remove the semaphore set from the ID array*/ - sma = sem_rmid(id); + sma = sem_rmid(ns, id); sem_unlock(sma); - used_sems -= sma->sem_nsems; + ns->used_sems -= sma->sem_nsems; size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem); security_sem_free(sma); ipc_rcu_putref(sma); @@ -503,7 +554,8 @@ static unsigned long copy_semid_to_user( } } -static int semctl_nolock(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { int err = -EINVAL; struct sem_array *sma; @@ -520,24 +572,24 @@ static int semctl_nolock(int semid, int return err; memset(&seminfo,0,sizeof(seminfo)); - seminfo.semmni = sc_semmni; - seminfo.semmns = sc_semmns; - seminfo.semmsl = sc_semmsl; - seminfo.semopm = sc_semopm; + seminfo.semmni = ns->sc_semmni; + seminfo.semmns = ns->sc_semmns; + seminfo.semmsl = ns->sc_semmsl; + seminfo.semopm = ns->sc_semopm; seminfo.semvmx = SEMVMX; seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - mutex_lock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); if (cmd == SEM_INFO) { - seminfo.semusz = sem_ids.in_use; - seminfo.semaem = used_sems; + seminfo.semusz = sem_ids(ns).in_use; + seminfo.semaem = ns->used_sems; } else { seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } - max_id = sem_ids.max_id; - mutex_unlock(&sem_ids.mutex); + max_id = sem_ids(ns).max_id; + mutex_unlock(&sem_ids(ns).mutex); if (copy_to_user (arg.__buf, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 
0: max_id; @@ -547,12 +599,12 @@ static int semctl_nolock(int semid, int struct semid64_ds tbuf; int id; - if(semid >= sem_ids.entries->size) + if(semid >= sem_ids(ns).entries->size) return -EINVAL; memset(&tbuf,0,sizeof(tbuf)); - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma == NULL) return -EINVAL; @@ -564,7 +616,7 @@ static int semctl_nolock(int semid, int if (err) goto out_unlock; - id = sem_buildid(semid, sma->sem_perm.seq); + id = sem_buildid(ns, semid, sma->sem_perm.seq); kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); tbuf.sem_otime = sma->sem_otime; @@ -584,7 +636,8 @@ out_unlock: return err; } -static int semctl_main(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { struct sem_array *sma; struct sem* curr; @@ -593,14 +646,14 @@ static int semctl_main(int semid, int se ushort* sem_io = fast_sem_io; int nsems; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) return -EINVAL; nsems = sma->sem_nsems; err=-EIDRM; - if (sem_checkid(sma,semid)) + if (sem_checkid(ns,sma,semid)) goto out_unlock; err = -EACCES; @@ -746,7 +799,7 @@ static int semctl_main(int semid, int se for (un = sma->undo; un; un = un->id_next) un->semadj[semnum] = 0; curr->semval = val; - curr->sempid = current->tgid; + curr->sempid = virt_tgid(current); sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ update_queue(sma); @@ -802,7 +855,8 @@ static inline unsigned long copy_semid_f } } -static int semctl_down(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_down(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { struct sem_array *sma; int err; @@ -813,11 +867,11 @@ static int semctl_down(int semid, int se if(copy_semid_from_user (&setbuf, arg.buf, version)) return -EFAULT; } - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) return -EINVAL; - if (sem_checkid(sma,semid)) { + if (sem_checkid(ns,sma,semid)) { err=-EIDRM; goto out_unlock; } @@ -833,7 +887,7 @@ static int semctl_down(int semid, int se goto out_unlock; } if (current->euid != ipcp->cuid && - current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { + current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) { err=-EPERM; goto out_unlock; } @@ -844,7 +898,7 @@ static int semctl_down(int semid, int se switch(cmd){ case IPC_RMID: - freeary(sma, semid); + freeary(ns, sma, semid); err = 0; break; case IPC_SET: @@ -872,17 +926,19 @@ asmlinkage long sys_semctl (int semid, i { int err = -EINVAL; int version; + struct ipc_namespace *ns; if (semid < 0) return -EINVAL; version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch(cmd) { case IPC_INFO: case SEM_INFO: case SEM_STAT: - err = semctl_nolock(semid,semnum,cmd,version,arg); + err = semctl_nolock(ns,semid,semnum,cmd,version,arg); return err; case GETALL: case GETVAL: @@ -892,13 +948,13 @@ asmlinkage long sys_semctl (int semid, i case IPC_STAT: case SETVAL: case SETALL: - err = semctl_main(semid,semnum,cmd,version,arg); + err = semctl_main(ns,semid,semnum,cmd,version,arg); return err; case IPC_RMID: case IPC_SET: - mutex_lock(&sem_ids.mutex); - err = semctl_down(semid,semnum,cmd,version,arg); - mutex_unlock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); + err = semctl_down(ns,semid,semnum,cmd,version,arg); + mutex_unlock(&sem_ids(ns).mutex); return err; default: return -EINVAL; @@ -934,6 +990,7 @@ static 
inline void unlock_semundo(void) spin_unlock(&undo_list->lock); } +#include /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, @@ -954,7 +1011,8 @@ static inline int get_undo_list(struct s undo_list = current->sysvsem.undo_list; if (!undo_list) { size = sizeof(struct sem_undo_list); - undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL); + undo_list = (struct sem_undo_list *) ub_kmalloc(size, + GFP_KERNEL); if (undo_list == NULL) return -ENOMEM; memset(undo_list, 0, size); @@ -986,7 +1044,7 @@ static struct sem_undo *lookup_undo(stru return un; } -static struct sem_undo *find_undo(int semid) +static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; @@ -1005,12 +1063,12 @@ static struct sem_undo *find_undo(int se goto out; /* no undo structure around - allocate one. */ - sma = sem_lock(semid); + sma = sem_lock(ns, semid); un = ERR_PTR(-EINVAL); if(sma==NULL) goto out; un = ERR_PTR(-EIDRM); - if (sem_checkid(sma,semid)) { + if (sem_checkid(ns,sma,semid)) { sem_unlock(sma); goto out; } @@ -1018,7 +1076,8 @@ static struct sem_undo *find_undo(int se ipc_rcu_getref(sma); sem_unlock(sma); - new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = (struct sem_undo *) ub_kmalloc(sizeof(struct sem_undo) + + sizeof(short)*nsems, GFP_KERNEL); if (!new) { ipc_lock_by_ptr(&sma->sem_perm); ipc_rcu_putref(sma); @@ -1070,13 +1129,16 @@ asmlinkage long sys_semtimedop(int semid int undos = 0, alter = 0, max; struct sem_queue queue; unsigned long jiffies_left = 0; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; if (nsops < 1 || semid < 0) return -EINVAL; - if (nsops > sc_semopm) + if (nsops > ns->sc_semopm) return -E2BIG; if(nsops > SEMOPM_FAST) { - sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + sops = ub_kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); if(sops==NULL) return -ENOMEM; } @@ -1109,7 +1171,7 @@ asmlinkage long sys_semtimedop(int semid retry_undos: if (undos) { - un = find_undo(semid); + un = find_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out_free; @@ -1117,12 +1179,12 @@ retry_undos: } else un = NULL; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); error=-EINVAL; if(sma==NULL) goto out_free; error = -EIDRM; - if (sem_checkid(sma,semid)) + if (sem_checkid(ns,sma,semid)) goto out_unlock_free; /* * semid identifies are not unique - find_undo may have @@ -1145,7 +1207,7 @@ retry_undos: if (error) goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, current->tgid); + error = try_atomic_semop (sma, sops, nsops, un, virt_tgid(current)); if (error <= 0) { if (alter && error == 0) update_queue (sma); @@ -1160,7 +1222,7 @@ retry_undos: queue.sops = sops; queue.nsops = nsops; queue.undo = un; - queue.pid = current->tgid; + queue.pid = virt_tgid(current); queue.id = semid; queue.alter = alter; if (alter) @@ -1190,7 +1252,7 @@ retry_undos: goto out_free; } - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) { BUG_ON(queue.prev != NULL); error = -EIDRM; @@ -1267,6 +1329,7 @@ void exit_sem(struct task_struct *tsk) { struct sem_undo_list *undo_list; struct sem_undo *u, **up; + struct ipc_namespace *ns; undo_list = tsk->sysvsem.undo_list; if (!undo_list) @@ -1275,6 +1338,7 @@ void exit_sem(struct task_struct *tsk) if (!atomic_dec_and_test(&undo_list->refcnt)) return; + ns = tsk->nsproxy->ipc_ns; /* There's no need to hold the semundo list 
lock, as current * is the last task exiting for this undo list. */ @@ -1288,14 +1352,14 @@ void exit_sem(struct task_struct *tsk) if(semid == -1) continue; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if (sma == NULL) continue; if (u->semid == -1) goto next_entry; - BUG_ON(sem_checkid(sma,u->semid)); + BUG_ON(sem_checkid(ns,sma,u->semid)); /* remove u from the sma->undo list */ for (unp = &sma->undo; (un = *unp); unp = &un->id_next) { @@ -1329,7 +1393,7 @@ found: semaphore->semval = 0; if (semaphore->semval > SEMVMX) semaphore->semval = SEMVMX; - semaphore->sempid = current->tgid; + semaphore->sempid = virt_tgid(current); } } sma->sem_otime = get_seconds(); @@ -1360,3 +1424,54 @@ static int sysvipc_sem_proc_show(struct sma->sem_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) +{ + int err = 0; + struct sem_array *sma; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + mutex_lock(&sem_ids(ns).mutex); + sma = sem_lock(ns, semid); + if (!sma) { + err = newary(ns, key, semid, size, semflg); + if (err >= 0) + sma = sem_lock(ns, semid); + } + if (sma) + sem_unlock(sma); + mutex_unlock(&sem_ids(ns).mutex); + + return err > 0 ? 0 : err; +} +EXPORT_SYMBOL_GPL(sysvipc_setup_sem); + +int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) +{ + int i; + int err = 0; + struct sem_array *sma; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + mutex_lock(&sem_ids(ns).mutex); + for (i = 0; i <= sem_ids(ns).max_id; i++) { + if ((sma = sem_lock(ns, i)) == NULL) + continue; + err = func(sem_buildid(ns, i, sma->sem_perm.seq), sma, arg); + sem_unlock(sma); + if (err) + break; + } + mutex_unlock(&sem_ids(ns).mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_sem); +EXPORT_SYMBOL_GPL(exit_sem); +#endif diff -uprN linux-2.6.18/ipc/shm.c linux-2.6.18.ovz/ipc/shm.c --- linux-2.6.18/ipc/shm.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/shm.c 2007-06-13 06:55:07.000000000 -0400 @@ -15,10 +15,15 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland + * + * namespaces support + * OpenVZ, SWsoft Inc. 
+ * Pavel Emelianov */ #include #include +#include #include #include #include @@ -32,78 +37,196 @@ #include #include #include +#include +#include #include +#include +#include + #include "util.h" static struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; -static struct ipc_ids shm_ids; +static struct ipc_ids init_shm_ids; -#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id)) -#define shm_unlock(shp) ipc_unlock(&(shp)->shm_perm) -#define shm_get(id) ((struct shmid_kernel*)ipc_get(&shm_ids,id)) -#define shm_buildid(id, seq) \ - ipc_buildid(&shm_ids, id, seq) +#define shm_ids(ns) (*((ns)->ids[IPC_SHM_IDS])) -static int newseg (key_t key, int shmflg, size_t size); +#define shm_lock(ns, id) \ + ((struct shmid_kernel*)ipc_lock(&shm_ids(ns),id)) +#define shm_unlock(shp) \ + ipc_unlock(&(shp)->shm_perm) +#define shm_get(ns, id) \ + ((struct shmid_kernel*)ipc_get(&shm_ids(ns),id)) +#define shm_buildid(ns, id, seq) \ + ipc_buildid(&shm_ids(ns), id, seq) + +static int newseg (struct ipc_namespace *ns, key_t key, + int shmid, int shmflg, size_t size); static void shm_open (struct vm_area_struct *shmd); static void shm_close (struct vm_area_struct *shmd); +static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif -size_t shm_ctlmax = SHMMAX; -size_t shm_ctlall = SHMALL; -int shm_ctlmni = SHMMNI; +static void __ipc_init __shm_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_SHM_IDS] = ids; + ns->shm_ctlmax = SHMMAX; + ns->shm_ctlall = SHMALL; + ns->shm_ctlmni = SHMMNI; + ns->shm_tot = 0; + ipc_init_ids(ids, 1); +} + +static void do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + if (shp->shm_nattch){ + shp->shm_perm.mode |= SHM_DEST; + /* Do not find it any more */ + shp->shm_perm.key = IPC_PRIVATE; + shm_unlock(shp); + } else + shm_destroy(ns, shp); +} + +#ifdef CONFIG_IPC_NS +int shm_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; + + __shm_init_ns(ns, ids); + return 0; +} + +/* This function does not invalidate the ipc namespace; it just releases + * all of its content. Unless the caller takes some precautions, new objects + * can appear.
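+ * Callers that cannot tolerate that, such as shm_exit_ns() below (which + * frees the ids table right after cleaning), must provide the exclusion + * themselves.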
+ */ +void shm_clean_ns(struct ipc_namespace *ns) +{ + int i; + struct shmid_kernel *shp; + + mutex_lock(&shm_ids(ns).mutex); + for (i = 0; i <= shm_ids(ns).max_id; i++) { + shp = shm_lock(ns, i); + if (shp == NULL) + continue; + + do_shm_rmid(ns, shp); + } + mutex_unlock(&shm_ids(ns).mutex); +} +EXPORT_SYMBOL(shm_clean_ns); -static int shm_tot; /* total number of shared memory pages */ +void shm_exit_ns(struct ipc_namespace *ns) +{ + shm_clean_ns(ns); + + ipc_fini_ids(ns->ids[IPC_SHM_IDS]); + kfree(ns->ids[IPC_SHM_IDS]); + ns->ids[IPC_SHM_IDS] = NULL; +} +#endif void __init shm_init (void) { - ipc_init_ids(&shm_ids, 1); + __shm_init_ns(&init_ipc_ns, &init_shm_ids); ipc_init_proc_interface("sysvipc/shm", " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime\n", - &shm_ids, - sysvipc_shm_proc_show); + IPC_SHM_IDS, sysvipc_shm_proc_show); } -static inline int shm_checkid(struct shmid_kernel *s, int id) +static inline int shm_checkid(struct ipc_namespace *ns, + struct shmid_kernel *s, int id) { - if (ipc_checkid(&shm_ids,&s->shm_perm,id)) + if (ipc_checkid(&shm_ids(ns), &s->shm_perm, id)) return -EIDRM; return 0; } -static inline struct shmid_kernel *shm_rmid(int id) +static inline struct shmid_kernel *shm_rmid(struct ipc_namespace *ns, int id) { - return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); + return (struct shmid_kernel *)ipc_rmid(&shm_ids(ns), id); } -static inline int shm_addid(struct shmid_kernel *shp) +static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp, + int reqid) { - return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); + return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, reqid); } -static inline void shm_inc (int id) { +static inline void shm_inc(struct ipc_namespace *ns, int id) +{ struct shmid_kernel *shp; - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(!shp); shp->shm_atim = get_seconds(); - shp->shm_lprid = current->tgid; + shp->shm_lprid = virt_tgid(current); shp->shm_nattch++; shm_unlock(shp); } +#define shm_file_ns(file) (*((struct ipc_namespace **)&(file)->private_data)) + /* This is called by fork, once for every shm attach. 
*/ -static void shm_open (struct vm_area_struct *shmd) +static void shm_open(struct vm_area_struct *shmd) { - shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino); + shm_inc(shm_file_ns(shmd->vm_file), + shmd->vm_file->f_dentry->d_inode->i_ino); +} + +static int shmem_lock(struct shmid_kernel *shp, int lock, + struct user_struct *user) +{ + struct file *file = shp->shm_file; + struct inode *inode = file->f_dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long size; + + size = shp->shm_segsz + PAGE_SIZE - 1; + +#ifdef CONFIG_SHMEM + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (ub_lockedshm_charge(info, size) < 0) + goto out_ch; + + if (!user_shm_lock(inode->i_size, user)) + goto out_user; + info->flags |= VM_LOCKED; + } + if (!lock && (info->flags & VM_LOCKED) && user) { + ub_lockedshm_uncharge(info, size); + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + } + spin_unlock(&info->lock); + return 0; + +out_user: + ub_lockedshm_uncharge(info, size); +out_ch: + spin_unlock(&info->lock); + return -ENOMEM; +#else + if (lock && ub_lockedshm_charge(info, size)) + return -ENOMEM; + if (!lock) + ub_lockedshm_uncharge(info, size); + return 0; +#endif } /* @@ -114,13 +237,13 @@ static void shm_open (struct vm_area_str * It has to be called with shp and shm_ids.mutex locked, * but returns with shp unlocked and freed. */ -static void shm_destroy (struct shmid_kernel *shp) +static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { - shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid (shp->id); + ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; + shm_rmid(ns, shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); else user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size, shp->mlock_user); @@ -140,20 +263,23 @@ static void shm_close (struct vm_area_st struct file * file = shmd->vm_file; int id = file->f_dentry->d_inode->i_ino; struct shmid_kernel *shp; + struct ipc_namespace *ns; + + ns = shm_file_ns(file); - mutex_lock(&shm_ids.mutex); + mutex_lock(&shm_ids(ns).mutex); /* remove from the list of attaches of the shm segment */ - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(!shp); - shp->shm_lprid = current->tgid; + shp->shm_lprid = virt_tgid(current); shp->shm_dtim = get_seconds(); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_perm.mode & SHM_DEST) - shm_destroy (shp); + shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); } static int shm_mmap(struct file * file, struct vm_area_struct * vma) @@ -165,14 +291,25 @@ static int shm_mmap(struct file * file, vma->vm_ops = &shm_vm_ops; if (!(vma->vm_flags & VM_WRITE)) vma->vm_flags &= ~VM_MAYWRITE; - shm_inc(file->f_dentry->d_inode->i_ino); + shm_inc(shm_file_ns(file), file->f_dentry->d_inode->i_ino); } return ret; } +static int shm_release(struct inode *ino, struct file *file) +{ + struct ipc_namespace *ns; + + ns = shm_file_ns(file); + put_ipc_ns(ns); + shm_file_ns(file) = NULL; + return 0; +} + static struct file_operations shm_file_operations = { - .mmap = shm_mmap, + .mmap = shm_mmap, + .release = shm_release, #ifndef CONFIG_MMU .get_unmapped_area = shmem_get_unmapped_area, #endif @@ -188,7 +325,8 @@ static struct vm_operations_struct shm_v #endif }; -static int newseg (key_t key, int shmflg, size_t size) +static int newseg (struct ipc_namespace *ns, key_t 
key, int shmid, + int shmflg, size_t size) { int error; struct shmid_kernel *shp; @@ -197,10 +335,10 @@ static int newseg (key_t key, int shmflg char name[13]; int id; - if (size < SHMMIN || size > shm_ctlmax) + if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; - if (shm_tot + numpages >= shm_ctlall) + if (ns->shm_tot + numpages >= ns->shm_ctlall) return -ENOSPC; shp = ipc_rcu_alloc(sizeof(*shp)); @@ -239,25 +377,27 @@ static int newseg (key_t key, int shmflg goto no_file; error = -ENOSPC; - id = shm_addid(shp); + id = shm_addid(ns, shp, shmid); if(id == -1) goto no_id; - shp->shm_cprid = current->tgid; + shp->shm_cprid = virt_tgid(current); shp->shm_lprid = 0; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = get_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; - shp->id = shm_buildid(id,shp->shm_perm.seq); + shp->id = shm_buildid(ns, id, shp->shm_perm.seq); shp->shm_file = file; file->f_dentry->d_inode->i_ino = shp->id; + shm_file_ns(file) = get_ipc_ns(ns); + /* Hugetlb ops would have already been assigned. */ if (!(shmflg & SHM_HUGETLB)) file->f_op = &shm_file_operations; - shm_tot += numpages; + ns->shm_tot += numpages; shm_unlock(shp); return shp->id; @@ -273,33 +413,36 @@ asmlinkage long sys_shmget (key_t key, s { struct shmid_kernel *shp; int err, id = 0; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - mutex_lock(&shm_ids.mutex); + mutex_lock(&shm_ids(ns).mutex); if (key == IPC_PRIVATE) { - err = newseg(key, shmflg, size); - } else if ((id = ipc_findkey(&shm_ids, key)) == -1) { + err = newseg(ns, key, -1, shmflg, size); + } else if ((id = ipc_findkey(&shm_ids(ns), key)) == -1) { if (!(shmflg & IPC_CREAT)) err = -ENOENT; else - err = newseg(key, shmflg, size); + err = newseg(ns, key, -1, shmflg, size); } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { err = -EEXIST; } else { - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(shp==NULL); if (shp->shm_segsz < size) err = -EINVAL; else if (ipcperms(&shp->shm_perm, shmflg)) err = -EACCES; else { - int shmid = shm_buildid(id, shp->shm_perm.seq); + int shmid = shm_buildid(ns, id, shp->shm_perm.seq); err = security_shm_associate(shp, shmflg); if (!err) err = shmid; } shm_unlock(shp); } - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); return err; } @@ -395,18 +538,19 @@ static inline unsigned long copy_shminfo } } -static void shm_get_stat(unsigned long *rss, unsigned long *swp) +static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, + unsigned long *swp) { int i; *rss = 0; *swp = 0; - for (i = 0; i <= shm_ids.max_id; i++) { + for (i = 0; i <= shm_ids(ns).max_id; i++) { struct shmid_kernel *shp; struct inode *inode; - shp = shm_get(i); + shp = shm_get(ns, i); if(!shp) continue; @@ -430,6 +574,7 @@ asmlinkage long sys_shmctl (int shmid, i struct shm_setbuf setbuf; struct shmid_kernel *shp; int err, version; + struct ipc_namespace *ns; if (cmd < 0 || shmid < 0) { err = -EINVAL; @@ -437,6 +582,7 @@ asmlinkage long sys_shmctl (int shmid, i } version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch (cmd) { /* replace with proc interface ? 
*/ case IPC_INFO: @@ -448,15 +594,15 @@ asmlinkage long sys_shmctl (int shmid, i return err; memset(&shminfo,0,sizeof(shminfo)); - shminfo.shmmni = shminfo.shmseg = shm_ctlmni; - shminfo.shmmax = shm_ctlmax; - shminfo.shmall = shm_ctlall; + shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; + shminfo.shmmax = ns->shm_ctlmax; + shminfo.shmall = ns->shm_ctlall; shminfo.shmmin = SHMMIN; if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; /* reading a integer is always atomic */ - err= shm_ids.max_id; + err= shm_ids(ns).max_id; if(err<0) err = 0; goto out; @@ -470,14 +616,14 @@ asmlinkage long sys_shmctl (int shmid, i return err; memset(&shm_info,0,sizeof(shm_info)); - mutex_lock(&shm_ids.mutex); - shm_info.used_ids = shm_ids.in_use; - shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp); - shm_info.shm_tot = shm_tot; + mutex_lock(&shm_ids(ns).mutex); + shm_info.used_ids = shm_ids(ns).in_use; + shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); + shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; - err = shm_ids.max_id; - mutex_unlock(&shm_ids.mutex); + err = shm_ids(ns).max_id; + mutex_unlock(&shm_ids(ns).mutex); if(copy_to_user (buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -492,17 +638,17 @@ asmlinkage long sys_shmctl (int shmid, i struct shmid64_ds tbuf; int result; memset(&tbuf, 0, sizeof(tbuf)); - shp = shm_lock(shmid); + shp = shm_lock(ns, shmid); if(shp==NULL) { err = -EINVAL; goto out; } else if(cmd==SHM_STAT) { err = -EINVAL; - if (shmid > shm_ids.max_id) + if (shmid > shm_ids(ns).max_id) goto out_unlock; - result = shm_buildid(shmid, shp->shm_perm.seq); + result = shm_buildid(ns, shmid, shp->shm_perm.seq); } else { - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock; result = 0; @@ -534,12 +680,12 @@ asmlinkage long sys_shmctl (int shmid, i case SHM_LOCK: case SHM_UNLOCK: { - shp = shm_lock(shmid); + shp = shm_lock(ns, shmid); if(shp==NULL) { err = -EINVAL; goto out; } - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock; @@ -564,14 +710,14 @@ asmlinkage long sys_shmctl (int shmid, i if(cmd==SHM_LOCK) { struct user_struct * user = current->user; if (!is_file_hugepages(shp->shm_file)) { - err = shmem_lock(shp->shm_file, 1, user); + err = shmem_lock(shp, 1, user); if (!err) { shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } } } else if (!is_file_hugepages(shp->shm_file)) { - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; } @@ -590,12 +736,12 @@ asmlinkage long sys_shmctl (int shmid, i * Instead we set a destroyed flag, and then blow * the name away when the usage hits zero. 
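 * (This patch moves that logic into do_shm_rmid() above, so the namespace * cleanup path in shm_clean_ns() can share it.)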
*/ - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); err = -EINVAL; if (shp == NULL) goto out_up; - err = shm_checkid(shp, shmid); + err = shm_checkid(ns, shp, shmid); if(err) goto out_unlock_up; @@ -605,7 +751,7 @@ asmlinkage long sys_shmctl (int shmid, i if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_VE_SYS_ADMIN)) { err=-EPERM; goto out_unlock_up; } @@ -614,14 +760,8 @@ asmlinkage long sys_shmctl (int shmid, i if (err) goto out_unlock_up; - if (shp->shm_nattch){ - shp->shm_perm.mode |= SHM_DEST; - /* Do not find it any more */ - shp->shm_perm.key = IPC_PRIVATE; - shm_unlock(shp); - } else - shm_destroy (shp); - mutex_unlock(&shm_ids.mutex); + do_shm_rmid(ns, shp); + mutex_unlock(&shm_ids(ns).mutex); goto out; } @@ -631,12 +771,12 @@ asmlinkage long sys_shmctl (int shmid, i err = -EFAULT; goto out; } - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); err=-EINVAL; if(shp==NULL) goto out_up; - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock_up; err = audit_ipc_obj(&(shp->shm_perm)); @@ -648,7 +788,7 @@ asmlinkage long sys_shmctl (int shmid, i err=-EPERM; if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_VE_SYS_ADMIN)) { goto out_unlock_up; } @@ -673,7 +813,7 @@ asmlinkage long sys_shmctl (int shmid, i out_unlock_up: shm_unlock(shp); out_up: - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); goto out; out_unlock: shm_unlock(shp); @@ -699,6 +839,7 @@ long do_shmat(int shmid, char __user *sh unsigned long prot; int acc_mode; void *user_addr; + struct ipc_namespace *ns; if (shmid < 0) { err = -EINVAL; @@ -737,12 +878,13 @@ long do_shmat(int shmid, char __user *sh * We cannot rely on the fs check since SYSV IPC does have an * additional creator id... 
*/ - shp = shm_lock(shmid); + ns = current->nsproxy->ipc_ns; + shp = shm_lock(ns, shmid); if(shp == NULL) { err = -EINVAL; goto out; } - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if (err) { shm_unlock(shp); goto out; } @@ -783,16 +925,16 @@ long do_shmat(int shmid, char __user *sh invalid: up_write(&current->mm->mmap_sem); - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); BUG_ON(!shp); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_perm.mode & SHM_DEST) - shm_destroy (shp); + shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); *raddr = (unsigned long) user_addr; err = 0; @@ -931,3 +1073,61 @@ static int sysvipc_shm_proc_show(struct shp->shm_ctim); } #endif + +#ifdef CONFIG_VE +#include + +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) +{ + struct shmid_kernel *shp; + struct file *file; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); + if (!shp) { + int err; + + err = newseg(ns, key, shmid, shmflg, size); + file = ERR_PTR(err); + if (err < 0) + goto out; + shp = shm_lock(ns, shmid); + } + file = ERR_PTR(-EINVAL); + if (shp) { + file = shp->shm_file; + get_file(file); + shm_unlock(shp); + } +out: + mutex_unlock(&shm_ids(ns).mutex); + return file; +} +EXPORT_SYMBOL_GPL(sysvipc_setup_shm); + +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) +{ + int i; + int err = 0; + struct shmid_kernel* shp; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + mutex_lock(&shm_ids(ns).mutex); + for(i = 0; i <= shm_ids(ns).max_id; i++) { + if ((shp = shm_lock(ns, i)) == NULL) + continue; + err = func(shp, arg); + shm_unlock(shp); + if (err) + break; + } + mutex_unlock(&shm_ids(ns).mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_shm); +#endif diff -uprN linux-2.6.18/ipc/util.c linux-2.6.18.ovz/ipc/util.c --- linux-2.6.18/ipc/util.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/util.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,6 +12,9 @@ * Mingming Cao * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov */ #include @@ -29,18 +32,124 @@ #include #include #include +#include #include +#include + #include "util.h" struct ipc_proc_iface { const char *path; const char *header; - struct ipc_ids *ids; + int ids; int (*show)(struct seq_file *, void *); }; +struct ipc_namespace init_ipc_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, +}; + +#ifdef CONFIG_IPC_NS +static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) +{ + int err; + struct ipc_namespace *ns; + + err = -ENOMEM; + ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + if (ns == NULL) + goto err_mem; + + err = sem_init_ns(ns); + if (err) + goto err_sem; + err = msg_init_ns(ns); + if (err) + goto err_msg; + err = shm_init_ns(ns); + if (err) + goto err_shm; + + kref_init(&ns->kref); + return ns; + +err_shm: + msg_exit_ns(ns); +err_msg: + sem_exit_ns(ns); +err_sem: + kfree(ns); +err_mem: + return ERR_PTR(err); +} + +int unshare_ipcs(unsigned long unshare_flags, struct ipc_namespace **new_ipc) +{ + struct ipc_namespace *new; + + if (unshare_flags & CLONE_NEWIPC) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + new = clone_ipc_ns(current->nsproxy->ipc_ns); + if (IS_ERR(new)) + return PTR_ERR(new); + + *new_ipc = new; + } + + return 0; +} + +int copy_ipcs(unsigned long flags, struct task_struct *tsk) +{ + struct ipc_namespace *old_ns = tsk->nsproxy->ipc_ns; + struct ipc_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_ipc_ns(old_ns); + + if (!(flags & CLONE_NEWIPC)) + return 0; + +#ifndef CONFIG_VE + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } +#endif + + new_ns = clone_ipc_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy->ipc_ns = new_ns; +out: + put_ipc_ns(old_ns); + return err; +} + +void free_ipc_ns(struct kref *kref) +{ + struct ipc_namespace *ns; + + ns = container_of(kref, struct ipc_namespace, kref); + sem_exit_ns(ns); + msg_exit_ns(ns); + shm_exit_ns(ns); + kfree(ns); +} +#endif + /** * ipc_init - initialise IPC subsystem * @@ -67,7 +176,7 @@ __initcall(ipc_init); * array itself. */ -void __init ipc_init_ids(struct ipc_ids* ids, int size) +void __ipc_init ipc_init_ids(struct ipc_ids* ids, int size) { int i; @@ -110,8 +219,7 @@ static struct file_operations sysvipc_pr * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, - struct ipc_ids *ids, - int (*show)(struct seq_file *, void *)) + int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; @@ -197,7 +305,7 @@ static int grow_ary(struct ipc_ids* ids, */ rcu_assign_pointer(ids->entries, new); - ipc_rcu_putref(old); + __ipc_fini_ids(ids, old); return newsize; } @@ -215,10 +323,18 @@ static int grow_ary(struct ipc_ids* ids, * Called with ipc_ids.mutex held. 
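 * A non-negative reqid (new in this patch) requests a specific id, e.g. when * checkpoint/restore recreates objects with their old ids; it is decomposed * as the inverse of what ipc_buildid() encodes: * *	slot = reqid % SEQ_MULTIPLIER	(index into the entries array) *	seq  = reqid / SEQ_MULTIPLIER	(sequence counter to force)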
*/ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) { int id; + if (reqid >= 0) { + id = reqid%SEQ_MULTIPLIER; + size = grow_ary(ids,id+1); + if (ids->entries->p[id] == NULL) + goto found; + return -1; + } + size = grow_ary(ids,size); /* @@ -238,9 +354,13 @@ found: new->cuid = new->uid = current->euid; new->gid = new->cgid = current->egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; + if (reqid >= 0) { + new->seq = reqid/SEQ_MULTIPLIER; + } else { + new->seq = ids->seq++; + if(ids->seq > ids->seq_max) + ids->seq = 0; + } spin_lock_init(&new->lock); new->deleted = 0; @@ -302,9 +422,9 @@ void* ipc_alloc(int size) { void* out; if(size > PAGE_SIZE) - out = vmalloc(size); + out = ub_vmalloc(size); else - out = kmalloc(size, GFP_KERNEL); + out = ub_kmalloc(size, GFP_KERNEL); return out; } @@ -387,14 +507,14 @@ void* ipc_rcu_alloc(int size) * workqueue if necessary (for vmalloc). */ if (rcu_use_vmalloc(size)) { - out = vmalloc(HDRLEN_VMALLOC + size); + out = ub_vmalloc(HDRLEN_VMALLOC + size); if (out) { out += HDRLEN_VMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; } } else { - out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + out = ub_kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); if (out) { out += HDRLEN_KMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; @@ -635,6 +755,9 @@ static void *sysvipc_proc_next(struct se struct ipc_proc_iface *iface = s->private; struct kern_ipc_perm *ipc = it; loff_t p; + struct ipc_ids *ids; + + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) @@ -644,8 +767,8 @@ static void *sysvipc_proc_next(struct se * p = *pos - 1 (because id 0 starts at position 1) * + 1 (because we increment the position by one) */ - for (p = *pos; p <= iface->ids->max_id; p++) { - if ((ipc = ipc_lock(iface->ids, p)) != NULL) { + for (p = *pos; p <= ids->max_id; p++) { + if ((ipc = ipc_lock(ids, p)) != NULL) { *pos = p + 1; return ipc; } @@ -664,12 +787,15 @@ static void *sysvipc_proc_start(struct s struct ipc_proc_iface *iface = s->private; struct kern_ipc_perm *ipc; loff_t p; + struct ipc_ids *ids; + + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). 
*/ - mutex_lock(&iface->ids->mutex); + mutex_lock(&ids->mutex); /* pos < 0 is invalid */ if (*pos < 0) @@ -680,8 +806,8 @@ static void *sysvipc_proc_start(struct s return SEQ_START_TOKEN; /* Find the (pos-1)th ipc */ - for (p = *pos - 1; p <= iface->ids->max_id; p++) { - if ((ipc = ipc_lock(iface->ids, p)) != NULL) { + for (p = *pos - 1; p <= ids->max_id; p++) { + if ((ipc = ipc_lock(ids, p)) != NULL) { *pos = p + 1; return ipc; } @@ -693,13 +819,15 @@ static void sysvipc_proc_stop(struct seq { struct kern_ipc_perm *ipc = it; struct ipc_proc_iface *iface = s->private; + struct ipc_ids *ids; /* If we had a locked segment, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* Release the lock we took in start() */ - mutex_unlock(&iface->ids->mutex); + mutex_unlock(&ids->mutex); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff -uprN linux-2.6.18/ipc/util.h linux-2.6.18.ovz/ipc/util.h --- linux-2.6.18/ipc/util.h 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/ipc/util.h 2007-06-13 06:55:07.000000000 -0400 @@ -3,6 +3,8 @@ * Copyright (C) 1999 Christoph Rohland * * ipc helper functions (c) 1999 Manfred Spraul + * namespaces support. 2006 OpenVZ, SWsoft Inc. + * Pavel Emelianov */ #ifndef _IPC_UTIL_H @@ -15,6 +17,14 @@ void sem_init (void); void msg_init (void); void shm_init (void); +int sem_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); +int shm_init_ns(struct ipc_namespace *ns); + +void sem_exit_ns(struct ipc_namespace *ns); +void msg_exit_ns(struct ipc_namespace *ns); +void shm_exit_ns(struct ipc_namespace *ns); + struct ipc_id_ary { int size; struct kern_ipc_perm *p[0]; @@ -31,18 +41,26 @@ struct ipc_ids { }; struct seq_file; -void __init ipc_init_ids(struct ipc_ids* ids, int size); +#ifdef CONFIG_IPC_NS +#define __ipc_init +#else +#define __ipc_init __init +#endif +void __ipc_init ipc_init_ids(struct ipc_ids *ids, int size); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, - struct ipc_ids *ids, - int (*show)(struct seq_file *, void *)); + int ids, int (*show)(struct seq_file *, void *)); #else #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) #endif +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + /* must be called with ids->mutex acquired.*/ int ipc_findkey(struct ipc_ids* ids, key_t key); -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); +int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid); /* must be called with both locks acquired. 
*/ struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id); @@ -65,6 +83,18 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); +static inline void __ipc_fini_ids(struct ipc_ids *ids, + struct ipc_id_ary *entries) +{ + if (entries != &ids->nullentry) + ipc_rcu_putref(entries); +} + +static inline void ipc_fini_ids(struct ipc_ids *ids) +{ + __ipc_fini_ids(ids, ids->entries); +} + struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id); struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id); void ipc_lock_by_ptr(struct kern_ipc_perm *ipcp); diff -uprN linux-2.6.18/kernel/Kconfig.fairsched linux-2.6.18.ovz/kernel/Kconfig.fairsched --- linux-2.6.18/kernel/Kconfig.fairsched 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/Kconfig.fairsched 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,22 @@ +config SCHED_VCPU + bool "VCPU scheduler support" + default y + help + VCPU scheduler support adds an additional layer of abstraction + that virtualizes the notion of a CPU and separates physical CPUs + from virtual CPUs. It enables the CPU fair scheduler, dynamic + addition/removal of CPUs to/from a VPS, and so on. + +config FAIRSCHED + bool "Fair CPU scheduler (EXPERIMENTAL)" + depends on SCHED_VCPU + default SCHED_VCPU + help + Config option for the Fair CPU scheduler (fairsched). + This option allows processes to be grouped into scheduling nodes + that receive CPU time in proportion to their weight. + This is a very important feature for process group isolation and + QoS management. + + If unsure, say N. + diff -uprN linux-2.6.18/kernel/Kconfig.openvz linux-2.6.18.ovz/kernel/Kconfig.openvz --- linux-2.6.18/kernel/Kconfig.openvz 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/Kconfig.openvz 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,78 @@ +# Copyright (C) 2005 SWsoft +# All rights reserved. +# Licensing governed by "linux/COPYING.SWsoft" file. + +menu "OpenVZ" + +config VE + bool "Virtual Environment support" + default y + select IPC_NS + select UTS_NS + help + This option adds support for virtual Linux environments running on + the original box, with a fully supported virtual network driver, tty + subsystem and configurable access to hardware and other resources. + +config VE_CALLS + tristate "VE calls interface" + depends on VE + select VZ_DEV + default m + help + This option controls how to build the vzmon code containing VE calls. + By default it is built as the module vzmon.o. + +config VZ_GENCALLS + bool + default y + +config VE_NETDEV + tristate "VE network device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build the venet device. This is a + common interface for networking in a VE. + +config VE_ETHDEV + tristate "Virtual ethernet device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build the virtual ethernet device. + +config VZ_DEV + tristate "VE device" + default m + help + This option adds support for the vzdev device, which is used by + user-space applications to control Virtual Environments. + +config VE_IPTABLES + bool "VE netfiltering" + depends on VE && VE_NETDEV && INET && NETFILTER + default y + help + This option controls whether to build the VE netfiltering code. + +config VZ_WDOG + tristate "VE watchdog module" + depends on VE_CALLS + default m + help + This option controls building of the vzwdog module, which dumps + a lot of useful system info to the console periodically.
+ +config VZ_CHECKPOINT + tristate "Checkpointing & restoring Virtual Environments" + depends on VE_CALLS && INET + default m + help + This option adds two modules, "cpt" and "rst", which allow + a running Virtual Environment to be saved and later restored + on another host (live migration) or on the same host (checkpointing). + +endmenu diff -uprN linux-2.6.18/kernel/Makefile linux-2.6.18.ovz/kernel/Makefile --- linux-2.6.18/kernel/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/Makefile 2007-06-13 06:55:07.000000000 -0400 @@ -2,16 +2,21 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = sched.o fairsched.o \ + fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o + hrtimer.o rwsem.o nsproxy.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_USER_RESOURCE) += ub/ +obj-$(CONFIG_VE) += ve/ +obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -48,6 +53,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o diff -uprN linux-2.6.18/kernel/audit.c linux-2.6.18.ovz/kernel/audit.c --- linux-2.6.18/kernel/audit.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/audit.c 2007-06-13 06:55:07.000000000 -0400 @@ -488,6 +488,9 @@ static int audit_receive_msg(struct sk_b char *ctx; u32 len; + if (!ve_is_super(skb->owner_env)) + return -ECONNREFUSED; + err = audit_netlink_ok(skb, msg_type); if (err) return err; diff -uprN linux-2.6.18/kernel/capability.c linux-2.6.18.ovz/kernel/capability.c --- linux-2.6.18/kernel/capability.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/capability.c 2007-06-13 06:55:07.000000000 -0400 @@ -15,16 +15,18 @@ #include unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ -kernel_cap_t cap_bset = CAP_INIT_EFF_SET; - EXPORT_SYMBOL(securebits); +#ifndef CONFIG_VE +kernel_cap_t cap_bset = CAP_INIT_EFF_SET; EXPORT_SYMBOL(cap_bset); +#endif /* * This lock protects task->cap_* for all tasks including current. * Locking rule: acquire this prior to tasklist_lock.
*/ -static DEFINE_SPINLOCK(task_capability_lock); +DEFINE_SPINLOCK(task_capability_lock); +EXPORT_SYMBOL(task_capability_lock); /* * For sys_getproccap() and sys_setproccap(), any of the three @@ -67,8 +69,8 @@ asmlinkage long sys_capget(cap_user_head spin_lock(&task_capability_lock); read_lock(&tasklist_lock); - if (pid && pid != current->pid) { - target = find_task_by_pid(pid); + if (pid && pid != virt_pid(current)) { + target = find_task_by_pid_ve(pid); if (!target) { ret = -ESRCH; goto out; @@ -100,9 +102,13 @@ static inline int cap_set_pg(int pgrp, k int ret = -EPERM; int found = 0; - do_each_task_pid(pgrp, PIDTYPE_PGID, g) { + pgrp = vpid_to_pid(pgrp); + if (pgrp < 0) + return ret; + + do_each_task_pid_ve(pgrp, PIDTYPE_PGID, g) { target = g; - while_each_thread(g, target) { + while_each_thread_ve(g, target) { if (!security_capset_check(target, effective, inheritable, permitted)) { @@ -113,7 +119,7 @@ static inline int cap_set_pg(int pgrp, k } found = 1; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, g); + } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, g); if (!found) ret = 0; @@ -132,7 +138,7 @@ static inline int cap_set_all(kernel_cap int ret = -EPERM; int found = 0; - do_each_thread(g, target) { + do_each_thread_ve(g, target) { if (target == current || target->pid == 1) continue; found = 1; @@ -141,7 +147,7 @@ static inline int cap_set_all(kernel_cap continue; ret = 0; security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); + } while_each_thread_ve(g, target); if (!found) ret = 0; @@ -188,7 +194,7 @@ asmlinkage long sys_capset(cap_user_head if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && pid != current->pid && !capable(CAP_SETPCAP)) + if (pid && pid != virt_pid(current) && !capable(CAP_SETPCAP)) return -EPERM; if (copy_from_user(&effective, &data->effective, sizeof(effective)) || @@ -199,8 +205,8 @@ asmlinkage long sys_capset(cap_user_head spin_lock(&task_capability_lock); read_lock(&tasklist_lock); - if (pid > 0 && pid != current->pid) { - target = find_task_by_pid(pid); + if (pid > 0 && pid != virt_pid(current)) { + target = find_task_by_pid_ve(pid); if (!target) { ret = -ESRCH; goto out; diff -uprN linux-2.6.18/kernel/compat.c linux-2.6.18.ovz/kernel/compat.c --- linux-2.6.18/kernel/compat.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/compat.c 2007-06-13 06:55:07.000000000 -0400 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include @@ -39,61 +41,75 @@ int put_compat_timespec(const struct tim __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static long compat_nanosleep_restart(struct restart_block *restart) +long compat_nanosleep_restart(struct restart_block *restart) { - unsigned long expire = restart->arg0, now = jiffies; struct compat_timespec __user *rmtp; + struct timespec tu; + void *rfn_save = restart->fn; + struct hrtimer_sleeper sleeper; + ktime_t rem; - /* Did it expire while we handled signals? 
*/ - if (!time_after(expire, now)) - return 0; + restart->fn = do_no_restart_syscall; + + hrtimer_init(&sleeper.timer, (clockid_t) restart->arg3, HRTIMER_ABS); + + sleeper.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + hrtimer_init_sleeper(&sleeper, current); - expire = schedule_timeout_interruptible(expire - now); - if (expire == 0) + set_current_state(TASK_INTERRUPTIBLE); + rem = schedule_hrtimer(&sleeper.timer, HRTIMER_ABS); + + if (rem.tv64 <= 0) return 0; - rmtp = (struct compat_timespec __user *)restart->arg1; - if (rmtp) { - struct compat_timespec ct; - struct timespec t; - - jiffies_to_timespec(expire, &t); - ct.tv_sec = t.tv_sec; - ct.tv_nsec = t.tv_nsec; - if (copy_to_user(rmtp, &ct, sizeof(ct))) - return -EFAULT; - } - /* The 'restart' block is already filled in */ + rmtp = (struct compat_timespec __user *) restart->arg2; + tu = ktime_to_timespec(rem); + if (rmtp && put_compat_timespec(&tu, rmtp)) + return -EFAULT; + + restart->fn = rfn_save; + + /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; } +EXPORT_SYMBOL_GPL(compat_nanosleep_restart); asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) { struct timespec t; struct restart_block *restart; - unsigned long expire; + struct hrtimer_sleeper sleeper; + ktime_t rem; if (get_compat_timespec(&t, rqtp)) return -EFAULT; - if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) + if (!timespec_valid(&t)) return -EINVAL; - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - expire = schedule_timeout_interruptible(expire); - if (expire == 0) + hrtimer_init(&sleeper.timer, CLOCK_MONOTONIC, HRTIMER_REL); + + sleeper.timer.expires = timespec_to_ktime(t); + hrtimer_init_sleeper(&sleeper, current); + + set_current_state(TASK_INTERRUPTIBLE); + rem = schedule_hrtimer(&sleeper.timer, HRTIMER_REL); + if (rem.tv64 <= 0) return 0; - if (rmtp) { - jiffies_to_timespec(expire, &t); - if (put_compat_timespec(&t, rmtp)) - return -EFAULT; - } + t = ktime_to_timespec(rem); + + if (rmtp && put_compat_timespec(&t, rmtp)) + return -EFAULT; + restart = ¤t_thread_info()->restart_block; restart->fn = compat_nanosleep_restart; - restart->arg0 = jiffies + expire; - restart->arg1 = (unsigned long) rmtp; + restart->arg0 = sleeper.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg1 = sleeper.timer.expires.tv64 >> 32; + restart->arg2 = (unsigned long) rmtp; + restart->arg3 = (unsigned long) sleeper.timer.base->index; + return -ERESTART_RESTARTBLOCK; } diff -uprN linux-2.6.18/kernel/configs.c linux-2.6.18.ovz/kernel/configs.c --- linux-2.6.18/kernel/configs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/configs.c 2007-06-13 06:55:07.000000000 -0400 @@ -61,18 +61,9 @@ static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { - loff_t pos = *offset; - ssize_t count; - - if (pos >= kernel_config_data_size) - return 0; - - count = min(len, (size_t)(kernel_config_data_size - pos)); - if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) - return -EFAULT; - - *offset += count; - return count; + return simple_read_from_buffer(buf, len, offset, + kernel_config_data + MAGIC_SIZE, + kernel_config_data_size); } static struct file_operations ikconfig_file_ops = { @@ -88,8 +79,7 @@ static int __init ikconfig_init(void) struct proc_dir_entry *entry; /* create the current config file */ - entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, - &proc_root); + 
entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL); if (!entry) return -ENOMEM; diff -uprN linux-2.6.18/kernel/cpt/Makefile linux-2.6.18.ovz/kernel/cpt/Makefile --- linux-2.6.18/kernel/cpt/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/Makefile 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,51 @@ +# +# +# kernel/cpt/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o + +vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ + cpt_mm.o cpt_files.o cpt_kernel.o \ + cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ + cpt_conntrack.o cpt_epoll.o + +vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ + rst_mm.o rst_files.o \ + rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ + rst_conntrack.o rst_epoll.o + +ifeq ($(CONFIG_USER_RESOURCE), y) +vzcpt-objs += cpt_ubc.o +endif + +ifeq ($(CONFIG_USER_RESOURCE), y) +vzrst-objs += rst_ubc.o +endif + +vzrst-objs += cpt_exports.o + +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_obj.o cpt_kernel.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y) +vzcpt-objs += cpt_iterative.o +vzrst-objs += rst_iterative.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) +vzcpt-objs += cpt_pagein.o +vzrst-objs += rst_pagein.o +endif + +ifeq ($(CONFIG_X86_64), y) +vzcpt-objs += cpt_x8664.o +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_x8664.o +endif +endif diff -uprN linux-2.6.18/kernel/cpt/cpt_conntrack.c linux-2.6.18.ovz/kernel/cpt/cpt_conntrack.c --- linux-2.6.18/kernel/cpt/cpt_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_conntrack.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,366 @@ +/* + * + * kernel/cpt/cpt_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +/* How does it work? + * + * Network is disabled, so new conntrack entries will not appear. + * However, some of them can disappear because of timeouts. + * + * So, we take read_lock, collect all required information atomically, + * essentially, creating parallel "refcount" structures holding pointers. + * We delete conntrack timers as well, so the structures cannot disappear + * after releasing the lock. Now, after releasing lock we can dump everything + * safely. And on exit we restore timers to their original values. + * + * Note, this approach is not going to work in VE0. 
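 + * + * In outline: take the read lock, fill the ct_holder list while stopping + * each entry's timer, drop the lock, dump the records, and finally re-arm + * the timers with their saved expiry values.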
+ */
+
+struct ct_holder
+{
+	struct ct_holder *next;
+	struct ip_conntrack_tuple_hash *cth;
+	int index;
+};
+
+static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple)
+{
+	v->cpt_dst = tuple->dst.ip;
+	v->cpt_dstport = tuple->dst.u.all;
+	v->cpt_protonum = tuple->dst.protonum;
+	v->cpt_dir = tuple->dst.dir;
+
+	v->cpt_src = tuple->src.ip;
+	v->cpt_srcport = tuple->src.u.all;
+}
+
+static int dump_one_expect(struct cpt_ip_connexpect_image *v,
+			   struct ip_conntrack_expect *exp,
+			   int sibling, cpt_context_t *ctx)
+{
+	int err = 0;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	encode_tuple(&v->cpt_tuple, &exp->tuple);
+	encode_tuple(&v->cpt_mask, &exp->mask);
+	v->cpt_sibling_conntrack = sibling;
+	v->cpt_flags = exp->flags;
+	v->cpt_seq = exp->id;
+	v->cpt_dir = 0;
+	v->cpt_manip_proto = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+	v->cpt_manip_proto = exp->saved_proto.all;
+	v->cpt_dir = exp->dir;
+#endif
+	v->cpt_timeout = 0;
+	if (exp->master->helper->timeout)
+		v->cpt_timeout = exp->timeout.expires - jiffies;
+	return err;
+}
+
+/* NOTE. We use one page to dump the list of expectations. In theory this
+ * may not be enough; in practice there is only one expectation per
+ * conntrack record. Moreover, taking into account that _ALL_ expectations
+ * are kept on one global list, which is searched for each incoming and
+ * outgoing packet, the system would already be severely degraded if even
+ * one conntrack had that many expectations. In short, I am not going to
+ * repair this.
+ */
+
+static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
+			    cpt_context_t *ctx)
+{
+	int err = 0;
+	unsigned long pg;
+	struct cpt_ip_connexpect_image *v;
+	struct ip_conntrack_expect *exp;
+
+	if (ct->expecting == 0)
+		return err;
+	if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
+		return -ENOBUFS;
+
+	pg = __get_free_page(GFP_KERNEL);
+	if (!pg)
+		return -ENOMEM;
+	v = (struct cpt_ip_connexpect_image *)pg;
+
+	read_lock_bh(&ip_conntrack_lock);
+	list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
+		int sibling;
+
+		if (exp->master != ct)
+			continue;
+
+		if (ct->helper == NULL) {
+			eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+			err = -EINVAL;
+			break;
+		}
+
+		sibling = 0;
+#if 0
+		/* That's all? No need to calculate sibling? */
+		if (exp->sibling) {
+			struct ct_holder *c;
+			for (c = list; c; c = c->next) {
+				if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
+					sibling = c->index;
+					break;
+				}
+			}
+			/* NOTE: exp->sibling might not be "confirmed" yet and,
+			 * hence, out of the hash table. We should just ignore
+			 * such a sibling; the connection is going to be
+			 * retried, the packet apparently was lost somewhere.
+			 */
+			if (sibling == 0)
+				dprintk_ctx("sibling conntrack is not found\n");
+		}
+#endif
+
+		/* If the expectation still does not have exp->sibling
+		 * and its timer is not running, it is about to die on
+		 * another cpu. Skip it.
*/ + if (!sibling && + ct->helper->timeout && + !timer_pending(&exp->timeout)) { + dprintk_ctx("conntrack: expectation: no timer\n"); + continue; + } + + err = dump_one_expect(v, exp, sibling, ctx); + if (err) + break; + + v++; + } + read_unlock_bh(&ip_conntrack_lock); + + if (err == 0 && (unsigned long)v != pg) + ctx->write((void*)pg, (unsigned long)v - pg, ctx); + + free_page(pg); + return err; +} + +static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, + cpt_context_t *ctx) +{ + struct ip_conntrack_tuple_hash *h = c->cth; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + struct cpt_ip_conntrack_image v; + int err = 0; + + if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_CONNTRACK; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + read_lock_bh(&ip_conntrack_lock); + v.cpt_status = ct->status; + v.cpt_timeout = ct->timeout.expires - jiffies; + v.cpt_ct_helper = (ct->helper != NULL); + v.cpt_index = c->index; + v.cpt_id = ct->id; + v.cpt_mark = 0; +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + v.cpt_mark = ct->mark; +#endif + encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); + encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); + memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); + memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); + + v.cpt_masq_index = 0; + v.cpt_initialized = 0; + v.cpt_num_manips = 0; + v.cpt_nat_helper = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + v.cpt_masq_index = ct->nat.masq_index; +#endif + /* "help" data is used by pptp, difficult to support */ + v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; + v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; + v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; + v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; + v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; + v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; +#endif + read_unlock_bh(&ip_conntrack_lock); + + ctx->write(&v, sizeof(v), ctx); + + err = dump_expect_list(ct, list, ctx); + + cpt_close_object(ctx); + return err; +} + +int cpt_dump_ip_conntrack(cpt_context_t * ctx) +{ + struct ct_holder *ct_list = NULL; + struct ct_holder *c, **cp; + int err = 0; + int index = 0; + int idx; + + if (get_exec_env()->_ip_conntrack == NULL) + return 0; + + for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) { + err = -ENOMEM; + goto done; + } + memset(c, 0, sizeof(struct ct_holder)); + c->next = ct_list; + ct_list = c; + } + + c = ct_list; + + read_lock_bh(&ip_conntrack_lock); + for (idx = 0; idx < ip_conntrack_htable_size; idx++) { + struct ip_conntrack_tuple_hash *h; + list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { + /* Skip reply tuples, they are covered by original + * direction. */ + if (DIRECTION(h)) + continue; + + /* Oops, we have not enough of holders... + * It is impossible. 
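+ * (The holder list was preallocated from the conntrack count sampled
+ * before this scan, and with the network stopped entries can only time
+ * out, never appear; so running out of holders means a conntrack was
+ * created unexpectedly, which is exactly what the error below reports.)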
*/ + if (unlikely(c == NULL)) { + read_unlock_bh(&ip_conntrack_lock); + eprintk_ctx("unexpected conntrack appeared\n"); + err = -ENOMEM; + goto done; + } + + /* If timer is not running, it means that it + * has just been scheduled on another cpu. + * We should skip this conntrack, it is about to be + * destroyed. */ + if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { + dprintk_ctx("conntrack: no timer\n"); + continue; + } + + /* Timer is deleted. refcnt is _not_ decreased. + * We are going to restore the timer on exit + * from this function. */ + c->cth = h; + c->index = ++index; + c = c->next; + } + } + read_unlock_bh(&ip_conntrack_lock); + + /* No conntracks? Good. */ + if (index == 0) + goto done; + + /* Comb the list a little. */ + cp = &ct_list; + while ((c = *cp) != NULL) { + /* Discard unused entries; they can appear, if some + * entries were timed out since we preallocated the list. + */ + if (c->cth == NULL) { + *cp = c->next; + kfree(c); + continue; + } + + /* Move conntracks attached to expectations to the beginning + * of the list. */ + if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { + *cp = c->next; + c->next = ct_list; + ct_list = c; + dprintk_ctx("conntrack: %d moved in list\n", c->index); + continue; + } + cp = &c->next; + } + + cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); + + for (c = ct_list; c; c = c->next) { + err = dump_one_ct(c, ct_list, ctx); + if (err) + goto done; + } + + cpt_close_section(ctx); + +done: + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->cth) { + /* Restore timer. refcnt is preserved. */ + add_timer(&tuplehash_to_ctrack(c->cth)->timeout); + } + kfree(c); + } + return err; +} + +#endif diff -uprN linux-2.6.18/kernel/cpt/cpt_context.c linux-2.6.18.ovz/kernel/cpt/cpt_context.c --- linux-2.6.18/kernel/cpt/cpt_context.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_context.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,258 @@ +/* + * + * kernel/cpt/cpt_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +static void file_write(const void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? err : -EIO; +} + +static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? 
err : -EIO; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +void cpt_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->write = file_write; + ctx->pwrite = file_pwrite; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +int cpt_open_dumpfile(struct cpt_context *ctx) +{ + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + return -ENOMEM; + __cpt_release_buf(ctx); + return 0; +} + +int cpt_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + if (ctx->write_error) + eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); + return ctx->write_error; +} + +int cpt_major_hdr_out(struct cpt_context *ctx) +{ + struct cpt_major_hdr hdr; + + if (ctx->file == NULL) + return 0; + + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_image_version = CPT_VERSION_18; +#ifdef CONFIG_X86_64 + hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; +#elif defined(CONFIG_X86_32) + hdr.cpt_os_arch = CPT_OS_ARCH_I386; +#elif defined(CONFIG_IA64) + hdr.cpt_os_arch = CPT_OS_ARCH_IA64; +#else +#error Arch is not supported +#endif + hdr.cpt_ve_features = (__u32)ctx->features; + hdr.cpt_ve_features2 = (__u32)(ctx->features>>32); + hdr.cpt_pagesize = (__u16)PAGE_SIZE; + hdr.cpt_hz = HZ; + hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; + hdr.cpt_start_sec = ctx->start_time.tv_sec; + hdr.cpt_start_nsec = ctx->start_time.tv_nsec; + hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; + hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; + hdr.cpt_iptables_mask = ctx->iptables_mask; + + ctx->write(&hdr, sizeof(hdr), ctx); + return 0; +} + +int cpt_close_section(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_section >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_section; + ctx->pwrite(&next, 8, ctx, ctx->current_section); + ctx->current_section = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_section); + +int cpt_open_section(struct cpt_context *ctx, __u32 type) +{ + struct cpt_section_hdr hdr; + + if (ctx->file == NULL) + return 0; + + cpt_close_section(ctx); + + ctx->current_section = ctx->file->f_pos; + ctx->sections[type] = ctx->current_section; + + hdr.cpt_next = 0; + hdr.cpt_section = type; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_align = 0; + ctx->write(&hdr, sizeof(hdr), ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_section); + + +int cpt_close_object(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_object >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_object; + ctx->pwrite(&next, 8, ctx, ctx->current_object); + ctx->current_object = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_object); + +int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) +{ + if (ctx->file == NULL) + return 0; + + cpt_close_object(ctx); + + ctx->current_object = ctx->file->f_pos; + if (obj) + cpt_obj_setpos(obj, 
ctx->current_object, ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_object); + +int cpt_push_object(loff_t *saved, struct cpt_context *ctx) +{ + if (ctx->file) { + *saved = ctx->current_object; + ctx->current_object = ctx->file->f_pos; + } + return 0; +} +EXPORT_SYMBOL(cpt_push_object); + +int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) +{ + ctx->current_object = *saved; + return 0; +} +EXPORT_SYMBOL(cpt_pop_object); + +int cpt_dump_tail(struct cpt_context *ctx) +{ + struct cpt_major_tail hdr; + int i; + + if (ctx->file == NULL) + return 0; + + cpt_open_section(ctx, CPT_SECT_TRAILER); + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_next = sizeof(hdr); + hdr.cpt_object = CPT_OBJ_TRAILER; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_VOID; + hdr.cpt_lazypages = 0; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + hdr.cpt_lazypages = ctx->lazypages; +#endif + hdr.cpt_64bit = ctx->tasks64; + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_nsect = CPT_SECT_MAX_INDEX; + for (i = 0; i < CPT_SECT_MAX_INDEX; i++) + hdr.cpt_sections[i] = ctx->sections[i]; + + ctx->write(&hdr, sizeof(hdr), ctx); + cpt_close_section(ctx); + return 0; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_context.h linux-2.6.18.ovz/kernel/cpt/cpt_context.h --- linux-2.6.18/kernel/cpt/cpt_context.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_context.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,208 @@ +#include +#include +#include + +#define CPT_CTX_ERROR -1 +#define CPT_CTX_IDLE 0 +#define CPT_CTX_SUSPENDING 1 +#define CPT_CTX_SUSPENDED 2 +#define CPT_CTX_DUMPING 3 +#define CPT_CTX_UNDUMPING 4 +#define CPT_CTX_UNDUMPED 5 + +#define CPT_TID(tsk) (tsk)->pid, virt_pid(tsk), (tsk)->comm +#define CPT_FID "%d,%d(%s)" + + +typedef struct cpt_context +{ + struct list_head ctx_list; + int refcount; + int ctx_state; + int objcount; + int sticky; + struct semaphore main_sem; + + struct file *errorfile; + struct file *statusfile; + struct file *lockfile; + + int errno; + char *error_msg; + loff_t err_offset; + + struct file *file; + char *tmpbuf; + int pagesize; +#ifdef CONFIG_VZ_CHECKPOINT_ITER + int iter_done; + void *iter_dir; + struct user_beancounter *iter_ub; +#endif + loff_t current_section; + loff_t current_object; + + loff_t sections[CPT_SECT_MAX]; + + __u32 errormask; + __u32 write_error; + + struct list_head object_array[CPT_OBJ_MAX]; + + void (*write)(const void *addr, size_t count, struct cpt_context *ctx); + void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); + ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + void (*align)(struct cpt_context *ctx); + int ve_id; + int contextid; + struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst + * corresponging to start_time */ + __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when + * VE did not migrate. */ + struct timespec start_time; + struct timespec delta_time; + __s64 delta_nsec; + int image_version; + __u16 image_arch; + __u64 iptables_mask; + __u64 features; + +#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) +#define CPT_ANONVMA_HSIZE (1<ve_id, ##arg) + +#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) +#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) + +#define eprintk(a...) 
cpt_printk(1, "CPT ERR: " a) +#define eprintk_ctx(f, arg...) \ +do { \ + eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ + if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ + ctx->err_offset += snprintf((char*)(ctx->error_msg + \ + ctx->err_offset), \ + PAGE_SIZE - ctx->err_offset, f, ##arg); \ +} while(0) + +#define CPT_TMPBUF_FREE 0x789adf12 +#define CPT_TMPBUF_BUSY 0xabcd9876 + +static inline void *cpt_get_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; + return buf; +} + +static inline void __cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_flush_error(cpt_context_t *ctx) +{ + mm_segment_t oldfs; + + if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { + if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { + oldfs = get_fs(); + set_fs(KERNEL_DS); + ctx->errorfile->f_op->write(ctx->errorfile, + ctx->error_msg, ctx->err_offset, + &ctx->errorfile->f_pos); + set_fs(oldfs); + } + ctx->error_msg[0] = 0; + ctx->err_offset = 0; + } +} diff -uprN linux-2.6.18/kernel/cpt/cpt_dump.c linux-2.6.18.ovz/kernel/cpt/cpt_dump.c --- linux-2.6.18/kernel/cpt/cpt_dump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_dump.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1125 @@ +/* + * + * kernel/cpt/cpt_dump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_net.h" +#include "cpt_socket.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + + +static int vps_child_level(struct task_struct *root, struct task_struct *c) +{ + int level = 0; + int veid = VE_TASK_INFO(c)->owner_env->veid; + + while (VE_TASK_INFO(c)->owner_env->veid == veid) { + if (c->pid != c->tgid) + c = c->group_leader; + if (c == root) + return level; + + c = c->real_parent; + level++; + } + return -1; +} + +static inline int freezable(struct task_struct * p) +{ + if (p->exit_state) + return 0; + + switch (p->state) { + case EXIT_ZOMBIE: + case EXIT_DEAD: + case TASK_STOPPED: +#if TASK_TRACED != TASK_STOPPED + case TASK_TRACED: +#endif + return 0; + default: + return 1; + } +} + +static void wake_ve(cpt_context_t *ctx) +{ + struct task_struct *p, *g; + + do_each_thread_ve(g, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->flags & PF_FROZEN) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread_ve(g, p); +} + +/* + * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... + * + * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context + * of another process. Apparently, it is unacceptable on SMP. + * Let's take freeze_processes() in kernel/power/process.c as an example. 
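+ * A minimal sketch of the resulting lost update (an illustrative
+ * interleaving, not actual kernel code):
+ *
+ *	CPU0 (freeze_processes)		CPU1 (task entering refrigerator)
+ *	tmp = p->flags;
+ *					p->flags |= PF_FROZEN;
+ *	p->flags = tmp | PF_FREEZE;	// the PF_FROZEN set above is lost
+ *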
+ * Unserialized modification of tsk->flags
+ * (believe it or not, it happens with probability of almost 100% :-))
+ * easily creates the situation where setting PF_FREEZE in freeze_processes(),
+ * which quickly spins raising PF_FREEZE on all the processes,
+ * _clears_ the PF_FROZEN just set in refrigerator(), so that suspend deadlocks.
+ *
+ * So, to make things clean, we require that these flags be modified
+ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE
+ * is just a kind of signal.
+ *
+ * That alone is not enough: we are still not allowed to change tsk->flags
+ * in the context of another process, since we could corrupt other flags
+ * while the process running on another cpu modifies them. So we use
+ * TIF_FREEZE in the thread flags, which can be changed atomically.
+ *
+ * PF_FROZEN also changes in the context of another process, but this happens
+ * only when the process is already in refrigerator(), which does not modify
+ * tsk->flags.
+ */
+
+enum
+{
+	OBSTACLE_NOGO = -1,
+	OBSTACLE_TIMEOUT = -2,
+	OBSTACLE_TRYAGAIN = -3,
+};
+
+#define SUSPEND_TIMEOUT	(10UL*HZ)
+
+static int vps_stop_tasks(struct cpt_context *ctx)
+{
+	unsigned long start_time = jiffies;
+	unsigned long target, timeout;
+	struct task_struct *p, *g;
+	int todo;
+	int round = 0;
+
+	do_gettimespec(&ctx->start_time);
+	do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
+	ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup;
+
+	read_lock(&tasklist_lock);
+
+	atomic_inc(&get_exec_env()->suspend);
+	timeout = HZ/5;
+	target = jiffies + timeout;
+
+	for(;;) {
+		struct task_struct *root;
+		todo = 0;
+
+		root = find_task_by_pid_ve(1);
+		if (!root) {
+			read_unlock(&tasklist_lock);
+			eprintk_ctx("cannot find ve init\n");
+			atomic_dec(&get_exec_env()->suspend);
+			return -ESRCH;
+		}
+
+		do_each_thread_ve(g, p) {
+			struct mm_struct *mm = get_task_mm(p);
+			if (mm) {
+				if (!mm->vps_dumpable && (p != current)) {
+					mmput(mm);
+					wprintk_ctx("Killing external process " CPT_FID "\n", CPT_TID(p));
+					send_sig(SIGKILL, p, 1);
+					continue;
+				}
+				mmput(mm);
+			}
+			if (vps_child_level(root, p) >= 0) {
+				if (!is_virtual_pid(virt_pid(p))) {
+					wprintk_ctx("Killing external process " CPT_FID "\n", CPT_TID(p));
+					send_sig(SIGKILL, p, 1);
+					continue;
+				}
+				if (!is_virtual_pid(virt_pgid(p))) {
+					eprintk_ctx("external process group %d/%d(%s) inside VE (e.g. vzctl enter or vzctl exec).\n", virt_pgid(p), p->pid, p->comm);
+					todo = OBSTACLE_NOGO;
+					goto out;
+				}
+				if (!is_virtual_pid(virt_sid(p))) {
+					eprintk_ctx("external process session %d/%d(%s) inside VE (e.g. vzctl enter or vzctl exec).\n", virt_sid(p), p->pid, p->comm);
+					todo = OBSTACLE_NOGO;
+					goto out;
+				}
+				if (p->vfork_done) {
+					/* A task between vfork()...exec()
+					 * cannot be frozen, because the parent
+					 * waits in an uninterruptible state.
+					 * So, we do nothing, waiting for
+					 * exec(), unless:
+					 */
+					if (p->state == TASK_STOPPED ||
+					    p->state == TASK_TRACED) {
+						eprintk_ctx("task " CPT_FID " is stopped while vfork(). Checkpointing is impossible.\n", CPT_TID(p));
+						todo = OBSTACLE_NOGO;
+						/* This is fatal: the _user_
+						 * stopped the vfork()ing task,
+						 * so we cannot suspend now.
+ */ + } else { + todo = OBSTACLE_TRYAGAIN; + } + goto out; + } + if (p->signal->group_exit_task && + p->signal->notify_count) { + /* exec() waits for threads' death */ + wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p)); + todo = OBSTACLE_TRYAGAIN; + goto out; + } + if (p->state == TASK_TRACED +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) + && !p->stopped_state +#endif + ) { + int ptrace_id = p->pn_state; + /* Debugger waits for signal. */ + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + case PN_STOP_ENTRY: + case PN_STOP_FORK: + case PN_STOP_VFORK: + case PN_STOP_SIGNAL: + case PN_STOP_EXIT: + case PN_STOP_LEAVE: + break; + default: + eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id); + todo = OBSTACLE_NOGO; + goto out; + } + } + if (p->flags & PF_NOFREEZE) { + eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } + + if (!freezable(p)) + continue; + + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_FROZEN)) { + set_tsk_thread_flag(p, TIF_FREEZE); + signal_wake_up(p, 0); + } + spin_unlock_irq(&p->sighand->siglock); + + if (p->flags & PF_FROZEN) { + if (p->state != TASK_UNINTERRUPTIBLE) + printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p)); + continue; + } + + if (round == 10) + wprintk_ctx(CPT_FID " is running\n", CPT_TID(p)); + + todo++; + } else { + if (p != current) { + wprintk_ctx("Killing foreign process " CPT_FID "\n", CPT_TID(p)); + send_sig(SIGKILL, p, 1); + continue; + } + } + } while_each_thread_ve(g, p); + + if (todo > 0) { + /* No visible obstacles, but VE did not freeze + * for timeout. Interrupt suspend, if it is major + * timeout or signal; if it is minor timeout + * we will wake VE and restart suspend. + */ + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) + || signal_pending(current)) + todo = OBSTACLE_TIMEOUT; + else if (time_after(jiffies, target)) + todo = OBSTACLE_TRYAGAIN; + } + +out: + if (todo < 0) { + atomic_dec(&get_exec_env()->suspend); + + wake_ve(ctx); + +#if 0 + /* This is sign of failure of printk(), which is not + * ours. So, no prefixes. */ + printk(">\n"); +#endif + } + + read_unlock(&tasklist_lock); + + if (!todo) { + atomic_dec(&get_exec_env()->suspend); + return 0; + } + + switch (todo) { + case OBSTACLE_NOGO: + eprintk_ctx("suspend is impossible now.\n"); + return -EAGAIN; + + case OBSTACLE_TIMEOUT: + eprintk_ctx("interrupted or timed out.\n"); + return -EINTR; + + case OBSTACLE_TRYAGAIN: + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) || + signal_pending(current)) { + wprintk_ctx("suspend timed out\n"); + return -EAGAIN; + } + + wprintk_ctx("minor suspend timeout (%lu) expired, " + "trying again\n", timeout); + + /* Try again. VE is awake, give it some time to run. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + + /* After a short wait restart suspend + * with longer timeout */ + atomic_inc(&get_exec_env()->suspend); + timeout = min(timeout<<1, SUSPEND_TIMEOUT); + target = jiffies + timeout; + break; + + default: + if (round > 0) { + /* VE is partially frozen, give processes + * a chance to enter to refrigerator(). 
*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/20); + } else { + yield(); + } + } + + read_lock(&tasklist_lock); + round++; + } +} + +static int cpt_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int cpt_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + cpt_unlock_sockets(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->flags & PF_FROZEN) { + tsk->flags &= ~PF_FROZEN; + wake_up_process(tsk); + } else if (freezable(tsk)) { + eprintk_ctx("strange, %s not frozen\n", tsk->comm ); + } + spin_unlock_irq(&tsk->sighand->siglock); + put_task_struct(tsk); + } + + cpt_resume_network(ctx); + + cpt_unlock_ve(ctx); + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + return 0; +} + +int cpt_kill(struct cpt_context *ctx) +{ + int err = 0; + struct ve_struct *env; + cpt_object_t *obj; + struct task_struct *root_task = NULL; + long delay; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + /* from here cpt_kill succeeds */ + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + if (current->ve_task_info.owner_env == env) { + wprintk_ctx("attempt to kill ve from inside, escaping...\n"); + ve_move_task(current, get_ve0()); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + cpt_kill_sockets(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk->exit_state) { + put_task_struct(tsk); + continue; + } + + if (virt_pid(tsk) == 1) { + root_task = tsk; + continue; + } + + tsk->robust_list = NULL; +#ifdef CONFIG_COMPAT + tsk->compat_robust_list = NULL; +#endif + tsk->clear_child_tid = NULL; + + if (tsk->ptrace) { + write_lock_irq(&tasklist_lock); + tsk->ptrace = 0; + if (!list_empty(&tsk->ptrace_list)) { + list_del_init(&tsk->ptrace_list); + remove_parent(tsk); + tsk->parent = tsk->real_parent; + add_parent(tsk); + } + write_unlock_irq(&tasklist_lock); + } + + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + put_task_struct(tsk); + } + + yield(); + + if (root_task != NULL) { + send_sig(SIGKILL, root_task, 1); + + spin_lock_irq(&root_task->sighand->siglock); + sigfillset(&root_task->blocked); + sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(root_task, TIF_SIGPENDING); + clear_tsk_thread_flag(root_task, TIF_FREEZE); + if (root_task->flags & PF_FROZEN) + root_task->flags &= ~PF_FROZEN; + spin_unlock_irq(&root_task->sighand->siglock); + + wake_up_process(root_task); + put_task_struct(root_task); + } + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + + delay = 1; + while (atomic_read(&env->counter) != 1) { + if (signal_pending(current)) + 
break; + current->state = TASK_INTERRUPTIBLE; + delay = (delay < HZ) ? (delay << 1) : HZ; + schedule_timeout(delay); + } + put_ve(env); + + return err; +} + +#ifdef CONFIG_USER_RESOURCE +static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = &(t->task_bc); + cpt_add_ubc(tbc->exec_ub, ctx); + cpt_add_ubc(tbc->task_ub, ctx); + cpt_add_ubc(tbc->fork_sub, ctx); +} +#else +static void inline collect_task_ubc(struct task_struct *t, + struct cpt_context *ctx) +{ return; } +#endif + +static cpt_object_t * remember_task(struct task_struct * child, + cpt_object_t * head, cpt_context_t * ctx) +{ + cpt_object_t *cobj; + + if (freezable(child) && !(child->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); + put_task_struct(child); + return NULL; + } + + if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); + if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(child); + return NULL; + } + cobj->o_count = 1; + cpt_obj_setobj(cobj, child, ctx); + insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); + collect_task_ubc(child, ctx); + return cobj; +} + +static int vps_collect_tasks(struct cpt_context *ctx) +{ + int err = -ESRCH; + cpt_object_t *obj; + struct task_struct *root; + read_lock(&tasklist_lock); + root = find_task_by_pid_ve(1); + if (root) + get_task_struct(root); + read_unlock(&tasklist_lock); + + if (!root) { + err = -ESRCH; + eprintk_ctx("vps_collect_tasks: cannot find root\n"); + goto out; + } + + if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(root); + return -ENOMEM; + } + obj->o_count = 1; + cpt_obj_setobj(obj, root, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + collect_task_ubc(root, ctx); + + /* Collect process subtree recursively */ + for_each_object(obj, CPT_OBJ_TASK) { + cpt_object_t *head = obj; + struct task_struct *tsk = obj->o_obj; + struct task_struct *child; + + if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); + err = -EINVAL; + goto out; + } + + if (tsk->state == TASK_RUNNING) + printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk)); + + wait_task_inactive(tsk); + + if (tsk->pid == tsk->tgid) { + child = tsk; + for (;;) { + read_lock(&tasklist_lock); + child = next_thread(child); + if (child != tsk) + get_task_struct(child); + read_unlock(&tasklist_lock); + + if (child == tsk) + break; + + if (child->real_parent != tsk->real_parent) { + put_task_struct(child); + eprintk_ctx("illegal thread structure, kernel bug\n"); + return -EINVAL; + } + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + } + } + + /* About locking. VE is frozen. But lists of children + * may change at least for init, when entered task reparents + * to init and when reparented task exits. If we take care + * of this case, we still can unlock while scanning + * tasklists. 
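+ * (Dropping tasklist_lock inside the loops below is safe because each
+ * child is pinned with get_task_struct() before the unlock, so it cannot
+ * be freed meanwhile; and with the VE frozen it cannot exit and unlink
+ * itself from the list either.)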
+ */ + read_lock(&tasklist_lock); + list_for_each_entry(child, &tsk->children, sibling) { + if (child->real_parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + + list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) { + if (child->real_parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + read_unlock(&tasklist_lock); + } + + return 0; + +out: + while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) { + struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next; + cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); + struct task_struct *tsk; + + list_del(head); + tsk = obj->o_obj; + put_task_struct(tsk); + free_cpt_object(obj, ctx); + } + return err; +} + +static int cpt_collect(struct cpt_context *ctx) +{ + int err; + + if ((err = cpt_collect_mm(ctx)) != 0) + return err; + + if ((err = cpt_collect_sysv(ctx)) != 0) + return err; + + if ((err = cpt_collect_files(ctx)) != 0) + return err; + + if ((err = cpt_collect_fs(ctx)) != 0) + return err; + + if ((err = cpt_collect_namespace(ctx)) != 0) + return err; + + if ((err = cpt_collect_signals(ctx)) != 0) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL) + return -ECHRNG; + + return 0; +} + +static int cpt_dump_veinfo(cpt_context_t *ctx) +{ + struct cpt_veinfo_image i; + struct ve_struct *ve; + struct timespec delta; + struct ipc_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_VEINFO); + cpt_open_object(NULL, ctx); + + i.cpt_next = CPT_NULL; + i.cpt_object = CPT_OBJ_VEINFO; + i.cpt_hdrlen = sizeof(i); + i.cpt_content = CPT_CONTENT_VOID; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + i.shm_ctl_all = ns->shm_ctlall; + i.shm_ctl_max = ns->shm_ctlmax; + i.shm_ctl_mni = ns->shm_ctlmni; + + i.msg_ctl_max = ns->msg_ctlmax; + i.msg_ctl_mni = ns->msg_ctlmni; + i.msg_ctl_mnb = ns->msg_ctlmnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i.sem_ctl_arr)); + i.sem_ctl_arr[0] = ns->sem_ctls[0]; + i.sem_ctl_arr[1] = ns->sem_ctls[1]; + i.sem_ctl_arr[2] = ns->sem_ctls[2]; + i.sem_ctl_arr[3] = ns->sem_ctls[3]; + + do_posix_clock_monotonic_gettime(&delta); + _set_normalized_timespec(&delta, + delta.tv_sec - ve->start_timespec.tv_sec, + delta.tv_nsec - ve->start_timespec.tv_nsec); + i.start_timespec_delta = cpt_timespec_export(&delta); + i.start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; + + ctx->write(&i, sizeof(i), ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_dump_utsname(cpt_context_t *ctx) +{ + int len; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_UTSNAME); + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + cpt_open_object(NULL, ctx); + len = strlen(ns->name.nodename); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.nodename, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_open_object(NULL, ctx); + len = 
strlen(ns->name.domainname); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.domainname, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_close_section(ctx); + return 0; +} + +int cpt_dump(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err, err2 = 0; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + down_read(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + if (!env->is_locked) + goto out_noenv; + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 2: real checkpointing */ + err = cpt_open_dumpfile(ctx); + if (err) + goto out; + + cpt_major_hdr_out(ctx); + + if (!err) + err = cpt_dump_veinfo(ctx); + if (!err) + err = cpt_dump_ubc(ctx); + if (!err) + err = cpt_dump_ifinfo(ctx); + if (!err) + err = cpt_dump_files(ctx); + if (!err) + err = cpt_dump_files_struct(ctx); + if (!err) + err = cpt_dump_fs_struct(ctx); + if (!err) + err = cpt_dump_namespace(ctx); + if (!err) + err = cpt_dump_sighand(ctx); + if (!err) + err = cpt_dump_vm(ctx); + if (!err) + err = cpt_dump_sysvsem(ctx); + if (!err) + err = cpt_dump_tasks(ctx); + if (!err) + err = cpt_dump_orphaned_sockets(ctx); +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + if (!err) + err = cpt_dump_ip_conntrack(ctx); +#endif + if (!err) { + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL) + err = -ECHRNG; + } + if (!err) + err = cpt_dump_utsname(ctx); + + if (!err) + err = cpt_dump_tail(ctx); + + err2 = cpt_close_dumpfile(ctx); + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); +out_noenv: + up_read(&env->op_sem); + put_ve(env); + return err ? : err2; +} + +int cpt_vps_suspend(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err = 0; + + ctx->kernel_config_flags = test_kernel_config(); + cpt_object_init(ctx); + + if (!ctx->ve_id) { + env = get_exec_env(); + if (env == get_ve0()) + return -EINVAL; + wprintk("undefined ve_id\n"); + ctx->ve_id = env->veid; + get_ve(env); + } else { + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + } + +#ifdef CONFIG_VE_IPTABLES + ctx->iptables_mask = env->_iptables_modules; +#endif + ctx->features = env->features; + + down_write(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + + err = -EBUSY; + if (env->is_locked) + goto out_noenv; + env->is_locked = 1; + downgrade_write(&env->op_sem); + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 0: find and stop all the tasks */ + if ((err = vps_stop_tasks(ctx)) != 0) + goto out; + + if ((err = cpt_suspend_network(ctx)) != 0) + goto out_wake; + + /* At the moment all the state is frozen. We do not need to lock + * the state, which can be changed only if the tasks are running. 
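+ * (For reference, the overall order is: phase 0 stops the tasks and the
+ * network, phase 1 collects the task tree, phase 1' collects the
+ * resources they reference; the actual write-out happens later, as
+ * "Phase 2" in cpt_dump().)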
+ */ + + /* Phase 1: collect task tree */ + if ((err = vps_collect_tasks(ctx)) != 0) + goto out_wake; + + /* Phase 1': collect all the resources */ + if ((err = cpt_collect(ctx)) != 0) + goto out; + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); + up_read(&env->op_sem); + put_ve(env); + return err; + +out_noenv: + up_write(&env->op_sem); + put_ve(env); + return err; + +out_wake: + read_lock(&tasklist_lock); + wake_ve(ctx); + read_unlock(&tasklist_lock); + goto out; +} + +static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + if (dev != get_exec_env()->_loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + *caps |= (1<thread_info->flags & _TIF_IA32)) + *caps |= (1<= 0) { + if (!is_virtual_pid(virt_pgid(p))) { + eprintk_ctx("external process group %d/%d(%s) inside VE (e.g. vzctl enter or vzctl exec).\n", virt_pgid(p), p->pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<nsproxy->namespace) { + eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", virt_pid(p), p->pid, p->comm); + *caps |= (1<policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", virt_pid(p), p->pid, p->comm); + *caps |= (1<parent) { + if (p->parent != p->real_parent && + VE_TASK_INFO(p->parent)->owner_env != env) { + eprintk_ctx("task %d/%d(%s) is ptraced from VE0\n", p->pid, virt_pid(p), p->comm); + *caps |= (1<list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + path = __d_path(mnt->mnt_root, mnt, + env->fs_root, env->fs_rootmnt, + path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name); + *caps |= (1<ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (env == NULL) + return -ESRCH; + + *caps = flags & (1<nsproxy; + current->nsproxy = env->ve_ns; + + check_unsupported_netdevices(ctx, caps); + + read_lock(&tasklist_lock); + root = find_task_by_pid_ve(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + err = -ESRCH; + goto out; + } + get_task_struct(root); + for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) + check_one_process(ctx, caps, flags, env, root, p); + read_unlock(&tasklist_lock); + + n = get_task_mnt_ns(root); + if (n) { + char *path_buf; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) { + put_namespace(n); + err = -ENOMEM; + goto out_root; + } + + check_unsupported_mounts(ctx, caps, env, n, path_buf); + + free_page((unsigned long) path_buf); + put_namespace(n); + } + + err = 0; + +out_root: + put_task_struct(root); +out: + current->nsproxy = old_ns; + set_exec_env(old_env); + put_ve(env); + + return err; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_dump.h linux-2.6.18.ovz/kernel/cpt/cpt_dump.h --- linux-2.6.18/kernel/cpt/cpt_dump.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_dump.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,16 @@ +int cpt_dump(struct cpt_context *cpt); +int rst_undump(struct cpt_context *cpt); +int cpt_suspend(struct cpt_context *cpt); +int cpt_resume(struct cpt_context *cpt); +int 
cpt_kill(struct cpt_context *cpt); +int rst_clean(struct cpt_context *cpt); +int rst_resume(struct cpt_context *cpt); +int rst_kill(struct cpt_context *cpt); + +int cpt_freeze_one(pid_t pid, int freeze); +int cpt_vps_suspend(struct cpt_context *ctx); +int vps_rst_undump(struct cpt_context *ctx); + +int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); + +int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx); diff -uprN linux-2.6.18/kernel/cpt/cpt_epoll.c linux-2.6.18.ovz/kernel/cpt/cpt_epoll.c --- linux-2.6.18/kernel/cpt/cpt_epoll.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_epoll.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,116 @@ +/* + * + * kernel/cpt/cpt_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations eventpoll_fops; + +int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err = 0; + struct file *file = obj->o_obj; + struct eventpoll *ep; + struct rb_node *rbp; + struct cpt_epoll_image ei; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + /* eventpoll.c does not protect open /proc/N/fd, silly. + * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_EPOLL; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_ARRAY; + ei.cpt_file = obj->o_pos; + + ctx->write(&ei, sizeof(ei), ctx); + + mutex_lock(&epmutex); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + loff_t saved_obj; + cpt_object_t *tobj; + struct cpt_epoll_file_image efi; + struct epitem *epi; + epi = rb_entry(rbp, struct epitem, rbn); + tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); + if (tobj == NULL) { + eprintk_ctx("epoll device refers to an external file\n"); + err = -EBUSY; + break; + } + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + efi.cpt_next = CPT_NULL; + efi.cpt_object = CPT_OBJ_EPOLL_FILE; + efi.cpt_hdrlen = sizeof(efi); + efi.cpt_content = CPT_CONTENT_VOID; + efi.cpt_file = tobj->o_pos; + efi.cpt_fd = epi->ffd.fd; + efi.cpt_events = epi->event.events; + efi.cpt_data = epi->event.data; + efi.cpt_revents = epi->revents; + efi.cpt_ready = 0; + if (!list_empty(&epi->rdllink)) + efi.cpt_ready = 1; + + ctx->write(&efi, sizeof(efi), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&epmutex); + + cpt_close_object(ctx); + + return err; +} + diff -uprN linux-2.6.18/kernel/cpt/cpt_exports.c linux-2.6.18.ovz/kernel/cpt/cpt_exports.c --- linux-2.6.18/kernel/cpt/cpt_exports.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_exports.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,12 @@ +#include + +#include "cpt_obj.h" + +EXPORT_SYMBOL(alloc_cpt_object); +EXPORT_SYMBOL(intern_cpt_object); +EXPORT_SYMBOL(insert_cpt_object); +EXPORT_SYMBOL(__cpt_object_add); 
+EXPORT_SYMBOL(cpt_object_add); +EXPORT_SYMBOL(cpt_object_get); +EXPORT_SYMBOL(lookup_cpt_object); +EXPORT_SYMBOL(lookup_cpt_obj_bypos); diff -uprN linux-2.6.18/kernel/cpt/cpt_files.c linux-2.6.18.ovz/kernel/cpt/cpt_files.c --- linux-2.6.18/kernel/cpt/cpt_files.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_files.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1599 @@ +/* + * + * kernel/cpt/cpt_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) +{ + char *path; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) + return; + + path = d_path(d, mnt, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) + eprintk("<%s>", path); + free_page(pg); +} + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx) +{ + if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { + struct nameidata nd; + if (path_lookup(path, 0, &nd)) { + eprintk_ctx("d_path cannot be looked up %s\n", path); + return -EINVAL; + } + if (nd.dentry != d || nd.mnt != mnt) { + eprintk_ctx("d_path is invisible %s\n", path); + path_release(&nd); + return -EINVAL; + } + path_release(&nd); + } + return 0; +} + +static int +cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx) +{ + int result = 0; + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) + char *path; + unsigned long pg; + struct dentry * renamed_dentry; + + if (de->d_sb->s_magic != FSMAGIC_VEFS) + return 0; + if (de->d_inode->i_nlink != 0 || + atomic_read(&de->d_inode->i_writecount) > 0) + return 0; + + renamed_dentry = vefs_replaced_dentry(de); + if (renamed_dentry == NULL) + return 0; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return 0; + + path = d_path(de, mnt, (char *)pg, PAGE_SIZE); + if (!IS_ERR(path)) { + int len; + struct nameidata nd; + + len = pg + PAGE_SIZE - 1 - (unsigned long)path; + if (len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + + if (path_lookup(path, 0, &nd) == 0) { + if (mnt == nd.mnt && + vefs_is_renamed_dentry(nd.dentry, renamed_dentry)) + result = 1; + path_release(&nd); + } + } + free_page(pg); +#endif + return result; +} + +static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, + int replaced, cpt_context_t *ctx) +{ + int len; + char *path; + char *pg = cpt_get_buf(ctx); + loff_t saved; + + path = d_path(d, mnt, pg, PAGE_SIZE); + len = PTR_ERR(path); + + if (IS_ERR(path)) { + struct cpt_object_hdr o; + char tmp[1]; + + /* VZ changes d_path() to return EINVAL, when path + * is not supposed to be visible inside VE. + * This changes behaviour of d_path() comparing + * to mainstream kernel, f.e. d_path() fails + * on any kind of shared memory. Maybe, there are + * another cases, but I am aware only about this one. + * So, we just ignore error on shmem mounts and proceed. 
+ * Otherwise, checkpointing is prohibited because + * of reference to an invisible file. + */ + if (len != -EINVAL || + mnt != get_exec_env()->shmem_mnt) + eprintk_ctx("d_path err=%d\n", len); + else + len = 0; + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + tmp[0] = 0; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(tmp, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + + __cpt_release_buf(ctx); + return len; + } else { + struct cpt_object_hdr o; + + len = pg + PAGE_SIZE - 1 - path; + if (replaced && + len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + path[len] = 0; + + if (cpt_verify_overmount(path, d, mnt, ctx)) { + __cpt_release_buf(ctx); + return -EINVAL; + } + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + ctx->write(&o, sizeof(o), ctx); + ctx->write(path, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + __cpt_release_buf(ctx); + } + return 0; +} + +int cpt_dump_string(const char *s, struct cpt_context *ctx) +{ + int len; + struct cpt_object_hdr o; + + cpt_open_object(NULL, ctx); + len = strlen(s); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(s, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +static int +cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) +{ + return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx); +} + +int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err; + struct cpt_inode_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_INODE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { + cpt_release_buf(ctx); + return err; + } + + v->cpt_dev = d->d_inode->i_sb->s_dev; + v->cpt_ino = d->d_inode->i_ino; + v->cpt_mode = sbuf.mode; + v->cpt_nlink = sbuf.nlink; + v->cpt_uid = sbuf.uid; + v->cpt_gid = sbuf.gid; + v->cpt_rdev = d->d_inode->i_rdev; + v->cpt_size = sbuf.size; + v->cpt_atime = cpt_timespec_export(&sbuf.atime); + v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); + v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); + v->cpt_blksize = sbuf.blksize; + v->cpt_blocks = sbuf.blocks; + v->cpt_sb = d->d_inode->i_sb->s_magic; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + +int cpt_collect_files(cpt_context_t * ctx) +{ + int err; + cpt_object_t *obj; + int index = 0; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) + return -ENOMEM; + } + + /* Collect files from fd sets */ + for_each_object(obj, CPT_OBJ_FILES) { + int fd; + struct files_struct *f = obj->o_obj; + + cpt_obj_setindex(obj, index++, ctx); + + if (obj->o_count != atomic_read(&f->count)) { + eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); + return -EBUSY; + } + + for (fd = 0; fd < 
f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) + return -ENOMEM; + } + } + + /* Collect files queued by AF_UNIX sockets. */ + if ((err = cpt_collect_passedfds(ctx)) < 0) + return err; + + /* OK. At this point we should count all the references. */ + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct file *parent; + cpt_object_t *ino_obj; + + if (obj->o_count != atomic_read(&file->f_count)) { + eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count)); + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + return -EBUSY; + } + + switch (file->f_dentry->d_inode->i_sb->s_magic) { + case FSMAGIC_FUTEX: + case FSMAGIC_MQUEUE: + case FSMAGIC_BDEV: + eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); + return -EBUSY; + } + + /* Collect inode. It is necessary mostly to resolve deleted + * hard links. */ + ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (ino_obj == NULL) + return -ENOMEM; + + parent = ino_obj->o_parent; + if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) + ino_obj->o_parent = file; + + if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { + int maj = imajor(file->f_dentry->d_inode); + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + err = cpt_collect_tty(file, ctx); + if (err) + return err; + } + } + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + err = cpt_collect_socket(file, ctx); + if (err) + return err; + } + } + + err = cpt_index_sockets(ctx); + + return err; +} + +/* /dev/ptmx is special, all the files share one inode, but real tty backend + * is attached via file->private_data. 
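+ * So two files opened on /dev/ptmx may report the same inode while
+ * referring to different ttys; dump_one_file() below therefore marks
+ * such files CPT_DENTRY_CLONING and records the tty object reached via
+ * private_data instead of relying on the shared inode.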
+ */ + +static inline int is_cloning_inode(struct inode *ino) +{ + return S_ISCHR(ino->i_mode) && + ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); +} + +static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) +{ + pid_t pid; + struct cpt_flock_image *v = cpt_get_buf(ctx); + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_FLOCK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_owner = owner; + + pid = fl->fl_pid; + if (pid && !is_virtual_pid(fl->fl_pid)) { + pid = _pid_to_vpid(fl->fl_pid); + if (pid == -1) { + if (!(fl->fl_flags&FL_FLOCK)) { + eprintk_ctx("posix lock from another VE?\n"); + cpt_release_buf(ctx); + return -EBUSY; + } + pid = 0; + } + } + + v->cpt_pid = pid; + v->cpt_start = fl->fl_start; + v->cpt_end = fl->fl_end; + v->cpt_flags = fl->fl_flags; + v->cpt_type = fl->fl_type; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int cpt_dump_flock(struct file *file, struct cpt_context *ctx) +{ + int err = 0; + struct file_lock *fl; + + lock_kernel(); + for (fl = file->f_dentry->d_inode->i_flock; + fl; fl = fl->fl_next) { + if (file != fl->fl_file) + continue; + if (fl->fl_flags & FL_LEASE) { + eprintk_ctx("lease lock is not supported\n"); + err = -EINVAL; + break; + } + if (fl->fl_flags & FL_POSIX) { + cpt_object_t *obj; + obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); + if (obj) { + dump_one_flock(fl, obj->o_index, ctx); + continue; + } else { + eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); + err = -EINVAL; + } + } + if (fl->fl_flags & FL_FLOCK) { + dump_one_flock(fl, -1, ctx); + continue; + } + } + unlock_kernel(); + return err; +} + +static int __comb_pid_to_vpid(int pid) +{ + int vpid = pid; + + if (pid > 0) { + vpid = _pid_to_vpid(pid); + if (unlikely(vpid < 0)) { + dprintk("pid %d does not exist amymore.\n", pid); + return 0; + } + } else if (pid < 0) { + vpid = _pid_to_vpid(-pid); + if (unlikely(vpid < 0)) { + dprintk("pgid %d does not exist amymore.\n", -pid); + return 0; + } + vpid = -vpid; + } + return vpid; +} + +static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) +{ + int err = 0; + cpt_object_t *iobj; + struct cpt_file_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + int replaced = 0; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_flags = file->f_flags; + v->cpt_mode = file->f_mode; + v->cpt_pos = file->f_pos; + v->cpt_uid = file->f_uid; + v->cpt_gid = file->f_gid; + + vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); + + v->cpt_i_mode = sbuf.mode; + v->cpt_lflags = 0; + if (IS_ROOT(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_ROOT; + else if (d_unhashed(file->f_dentry)) { + if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { + v->cpt_lflags |= CPT_DENTRY_REPLACED; + replaced = 1; + } else { + v->cpt_lflags |= CPT_DENTRY_DELETED; + } + } + if (is_cloning_inode(file->f_dentry->d_inode)) + v->cpt_lflags |= CPT_DENTRY_CLONING; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) + v->cpt_lflags |= CPT_DENTRY_PROC; + v->cpt_inode = CPT_NULL; + if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { + iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (iobj) + v->cpt_inode = iobj->o_pos; + } + v->cpt_priv = CPT_NULL; + v->cpt_fown_fd = -1; + if (S_ISCHR(v->cpt_i_mode)) { + iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); + if (iobj) { + v->cpt_priv = iobj->o_pos; + if 
+
+static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx)
+{
+	int err = 0;
+	cpt_object_t *iobj;
+	struct cpt_file_image *v = cpt_get_buf(ctx);
+	struct kstat sbuf;
+	int replaced = 0;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_FILE;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_flags = file->f_flags;
+	v->cpt_mode = file->f_mode;
+	v->cpt_pos = file->f_pos;
+	v->cpt_uid = file->f_uid;
+	v->cpt_gid = file->f_gid;
+
+	vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
+
+	v->cpt_i_mode = sbuf.mode;
+	v->cpt_lflags = 0;
+	if (IS_ROOT(file->f_dentry))
+		v->cpt_lflags |= CPT_DENTRY_ROOT;
+	else if (d_unhashed(file->f_dentry)) {
+		if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) {
+			v->cpt_lflags |= CPT_DENTRY_REPLACED;
+			replaced = 1;
+		} else {
+			v->cpt_lflags |= CPT_DENTRY_DELETED;
+		}
+	}
+	if (is_cloning_inode(file->f_dentry->d_inode))
+		v->cpt_lflags |= CPT_DENTRY_CLONING;
+	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC)
+		v->cpt_lflags |= CPT_DENTRY_PROC;
+	v->cpt_inode = CPT_NULL;
+	if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) {
+		iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+		if (iobj)
+			v->cpt_inode = iobj->o_pos;
+	}
+	v->cpt_priv = CPT_NULL;
+	v->cpt_fown_fd = -1;
+	if (S_ISCHR(v->cpt_i_mode)) {
+		iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx);
+		if (iobj) {
+			v->cpt_priv = iobj->o_pos;
+			if (file->f_flags&FASYNC)
+				v->cpt_fown_fd = cpt_tty_fasync(file, ctx);
+		}
+	}
+	if (S_ISSOCK(v->cpt_i_mode)) {
+		if (obj->o_index < 0) {
+			eprintk_ctx("BUG: no socket index\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_priv = obj->o_index;
+		if (file->f_flags&FASYNC)
+			v->cpt_fown_fd = cpt_socket_fasync(file, ctx);
+	}
+	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) {
+		v->cpt_priv = file->f_dentry->d_inode->i_ino;
+		v->cpt_lflags |= CPT_DENTRY_EPOLL;
+	}
+
+	v->cpt_fown_pid = __comb_pid_to_vpid((int)file->f_owner.pid);
+	v->cpt_fown_uid = file->f_owner.uid;
+	v->cpt_fown_euid = file->f_owner.euid;
+	v->cpt_fown_signo = file->f_owner.signum;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (!S_ISSOCK(v->cpt_i_mode)) {
+		err = cpt_dump_filename(file, replaced, ctx);
+		if (err)
+			return err;
+		if ((file->f_mode & FMODE_WRITE) &&
+		    file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS)
+			vefs_track_notify(file->f_dentry, 1);
+	}
+
+	if (file->f_dentry->d_inode->i_flock)
+		err = cpt_dump_flock(file, ctx);
+
+	cpt_close_object(ctx);
+
+	return err;
+}
+
+/* About this weird function...  Crappy code dealing with SYSV shared memory
+ * defines TMPFS inode and file with f_op doing only mmap.  So...
+ * Maybe, this is wrong and leaks something.  It is clear that access to
+ * SYSV shmem via mmap is quite unusual and impossible from user space.
+ */
+static int dump_content_shm(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_obj_bits *v;
+	loff_t saved_pos;
+	unsigned long addr;
+
+	addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size,
+			     PROT_READ, MAP_SHARED, 0);
+	if (IS_ERR((void*)addr))
+		return PTR_ERR((void*)addr);
+
+	cpt_push_object(&saved_pos, ctx);
+	cpt_open_object(NULL, ctx);
+	v = cpt_get_buf(ctx);
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_BITS;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_DATA;
+	v->cpt_size = file->f_dentry->d_inode->i_size;
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx);
+	ctx->align(ctx);
+	do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size);
+
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_pos, ctx);
+	return 0;
+}
+
+static int data_is_zero(char *addr, int len)
+{
+	int i;
+	unsigned long zerolong = 0;
+
+	for (i = 0; i < (len & ~(sizeof(unsigned long)-1));
+	     i += sizeof(unsigned long)) {
+		if (*(unsigned long*)(addr+i) != 0)
+			return 0;
+	}
+	i = len & ~(sizeof(unsigned long)-1);
+	if (i != len && memcmp(addr + i, &zerolong, len - i))
+		return 0;
+	return 1;
+}
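+
+/* A sketch of the image layout the loop below produces (illustrative
+ * only): the file body becomes a sequence of CPT_OBJ_PAGES blocks, one
+ * per run of PAGE_SIZE chunks that data_is_zero() reports as non-zero;
+ * all-zero runs are skipped, so sparse files stay sparse in the image.
+ * For a hypothetical file with a hole at [0x1000, 0x3000):
+ *
+ *	cpt_page_block { cpt_start = 0x0000, cpt_end = 0x1000 }  + data
+ *	cpt_page_block { cpt_start = 0x3000, cpt_end = 0x4000 }  + data
+ *
+ * cpt_end is patched in place via ctx->pwrite() when a run is closed.
+ */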
+static int dump_content_regular(struct file *file, struct cpt_context *ctx)
+{
+	loff_t saved_pos;
+	loff_t pos = 0;
+	loff_t obj_opened = CPT_NULL;
+	struct cpt_page_block pgb;
+	ssize_t (*do_read)(struct file *, char __user *, size_t, loff_t *);
+
+	if (file->f_op == NULL)
+		return -EINVAL;
+
+	if ((do_read = file->f_op->read) == NULL) {
+		if (file->f_op->mmap == NULL)
+			return -EINVAL;
+		if (file->f_dentry->d_inode->i_sb->s_magic != FSMAGIC_TMPFS) {
+			eprintk_ctx("unreadable, but not SYSV SHM file\n");
+			return -EINVAL;
+		}
+
+		do_read = file->f_dentry->d_inode->i_fop->read;
+		cpt_dump_content_sysvshm(file, ctx);
+		if (!do_read) {
+			wprintk_ctx("TMPFS is not configured?\n");
+			return dump_content_shm(file, ctx);
+		}
+	}
+
+	if (!(file->f_mode & FMODE_READ) ||
+	    (file->f_flags & O_DIRECT)) {
+		file = dentry_open(dget(file->f_dentry),
+				   mntget(file->f_vfsmnt), O_RDONLY);
+		if (IS_ERR(file)) {
+			cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
+			eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file));
+			return PTR_ERR(file);
+		}
+	} else {
+		atomic_inc(&file->f_count);
+	}
+
+	for (;;) {
+		mm_segment_t oldfs;
+		int err;
+
+		(void)cpt_get_buf(ctx);
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos);
+		set_fs(oldfs);
+		if (err < 0) {
+			eprintk_ctx("dump_content_regular: do_read: %d\n", err);
+			fput(file);
+			__cpt_release_buf(ctx);
+			return err;
+		}
+		if (err == 0) {
+			__cpt_release_buf(ctx);
+			break;
+		}
+		if (data_is_zero(ctx->tmpbuf, err)) {
+			if (obj_opened != CPT_NULL) {
+				ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
+				ctx->align(ctx);
+				cpt_close_object(ctx);
+				cpt_pop_object(&saved_pos, ctx);
+				obj_opened = CPT_NULL;
+			}
+		} else {
+			if (obj_opened == CPT_NULL) {
+				cpt_push_object(&saved_pos, ctx);
+				cpt_open_object(NULL, ctx);
+				obj_opened = ctx->file->f_pos;
+				pgb.cpt_next = CPT_NULL;
+				pgb.cpt_object = CPT_OBJ_PAGES;
+				pgb.cpt_hdrlen = sizeof(pgb);
+				pgb.cpt_content = CPT_CONTENT_DATA;
+				pgb.cpt_start = pos - err;
+				pgb.cpt_end = pgb.cpt_start;
+				ctx->write(&pgb, sizeof(pgb), ctx);
+			}
+			ctx->write(ctx->tmpbuf, err, ctx);
+			pgb.cpt_end += err;
+		}
+		__cpt_release_buf(ctx);
+	}
+
+	fput(file);
+
+	if (obj_opened != CPT_NULL) {
+		ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_pos, ctx);
+		obj_opened = CPT_NULL;
+	}
+	return 0;
+}
+
+
+static int dump_content_chrdev(struct file *file, struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	int maj;
+
+	maj = imajor(ino);
+	if (maj == MEM_MAJOR) {
+		/* Well, OK. */
+		return 0;
+	}
+	if (maj == PTY_MASTER_MAJOR ||
+	    (maj >= UNIX98_PTY_MASTER_MAJOR &&
+	     maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
+	    maj == PTY_SLAVE_MAJOR ||
+	    maj == UNIX98_PTY_SLAVE_MAJOR ||
+	    maj == TTYAUX_MAJOR) {
+		return cpt_dump_content_tty(file, ctx);
+	}
+	eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino));
+	return -EINVAL;
+}
+
+static int dump_content_blkdev(struct file *file, struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+
+	/* We are not going to transfer them. */
+	eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino));
+	return -EINVAL;
+}
+
+static int dump_content_fifo(struct file *file, struct cpt_context *ctx)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	cpt_object_t *obj;
+	loff_t saved_pos;
+	int readers;
+	int writers;
+	int anon = 0;
+
+	mutex_lock(&ino->i_mutex);
+	readers = ino->i_pipe->readers;
+	writers = ino->i_pipe->writers;
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file1 = obj->o_obj;
+		if (file1->f_dentry->d_inode == ino) {
+			if (file1->f_mode & FMODE_READ)
+				readers--;
+			if (file1->f_mode & FMODE_WRITE)
+				writers--;
+		}
+	}
+	mutex_unlock(&ino->i_mutex);
+	if (readers || writers) {
+		struct dentry *dr = file->f_dentry->d_sb->s_root;
+		if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0)
+			anon = 1;
+
+		if (anon) {
+			eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers);
+			return -EBUSY;
+		}
+		/* If the fifo has external readers/writers, we are in trouble.
+		 * If the buffer is not empty, we must move its content.
+		 * But if the fifo is owned by a service, we cannot do
+		 * this.  See?
+		 *
+		 * For now we assume that if the fifo is opened by another
+		 * process, we do not own it and, hence, migrate without
+		 * data.
+		 */
+		return 0;
+	}
+
+	/* OK, we must save fifo state. No semaphores required.
*/ + + if (ino->i_pipe->nrbufs) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + struct pipe_inode_info *info; + int count, buf, nrbufs; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + if (!info->bufs[buf].ops->can_merge) { + mutex_unlock(&ino->i_mutex); + eprintk_ctx("unknown format of pipe buffer\n"); + return -EINVAL; + } + count += info->bufs[buf].len; + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + if (!count) { + mutex_unlock(&ino->i_mutex); + return 0; + } + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = count; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + struct pipe_buffer *b = info->bufs + buf; + /* need to ->pin first? */ + void * addr = b->ops->map(info, b, 0); + ctx->write(addr + b->offset, b->len, ctx); + b->ops->unmap(info, b, addr); + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + mutex_unlock(&ino->i_mutex); + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + return 0; +} + +static int dump_content_socket(struct file *file, struct cpt_context *ctx) +{ + return 0; +} + +struct cpt_dirent { + unsigned long ino; + char *name; + int namelen; + int found; +}; + +static int cpt_filldir(void * __buf, const char * name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct cpt_dirent * dirent = __buf; + + if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) { + memcpy(dirent->name, name, namelen); + dirent->name[namelen] = '\0'; + dirent->namelen = namelen; + dirent->found = 1; + return 1; + } + return 0; +} + +static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err = -EBUSY; + struct file *f = NULL; + struct cpt_dirent entry; + struct dentry *de, *found = NULL; + + dprintk_ctx("deleted reference to existing inode, try to find file\n"); + /* 1. Try to find not deleted dentry in ino->i_dentry list */ + spin_lock(&dcache_lock); + list_for_each_entry(de, &ino->i_dentry, d_alias) { + if (!IS_ROOT(de) && d_unhashed(de)) + continue; + found = de; + dget_locked(found); + break; + } + spin_unlock(&dcache_lock); + if (found) { + err = cpt_dump_dentry(found, mnt, 0, ctx); + dput(found); + if (!err) { + dprintk_ctx("dentry found in aliases\n"); + return 0; + } + } + + /* 2. Try to find file in current dir */ + de = dget_parent(d); + if (!de) + return -EINVAL; + + mntget(mnt); + f = dentry_open(de, mnt, O_RDONLY); + if (IS_ERR(f)) + return PTR_ERR(f); + + entry.ino = ino->i_ino; + entry.name = cpt_get_buf(ctx); + entry.found = 0; + err = vfs_readdir(f, cpt_filldir, &entry); + if (err || !entry.found) { + err = err ? 
err : -ENOENT; + goto err_readdir; + } + + found = lookup_one_len(entry.name, de, entry.namelen); + if (IS_ERR(found)) { + err = PTR_ERR(found); + goto err_readdir; + } + + err = -ENOENT; + if (found->d_inode != ino) + goto err_lookup; + + dprintk_ctx("dentry found in dir\n"); + __cpt_release_buf(ctx); + err = cpt_dump_dentry(found, mnt, 0, ctx); + +err_lookup: + dput(found); +err_readdir: + fput(f); + __cpt_release_buf(ctx); + return err; +} + +static int dump_one_inode(struct file *file, struct dentry *d, + struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct inode *ino = d->d_inode; + cpt_object_t *iobj; + int dump_it = 0; + + iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); + if (!iobj) + return -EINVAL; + + if (iobj->o_pos >= 0) + return 0; + + if ((!IS_ROOT(d) && d_unhashed(d)) && + !cpt_replaced(d, mnt, ctx)) + dump_it = 1; + if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { + /* One more bug in epoll: invalid inode mode. + * What a load of crap... + */ + if (ino->i_sb->s_magic == FSMAGIC_EPOLL && + (ino->i_mode & S_IFMT) == 0) + return 0; + dump_it = 1; + } + + if (!dump_it) + return 0; + + cpt_open_object(iobj, ctx); + cpt_dump_inode(d, mnt, ctx); + + if (!IS_ROOT(d) && d_unhashed(d)) { + struct file *parent; + parent = iobj->o_parent; + if (!parent || + (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { + /* Inode is not deleted, but it does not + * have references from inside checkpointed + * process group. */ + if (ino->i_nlink != 0) { + err = find_linked_dentry(d, mnt, ino, ctx); + if (err) { + eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); + return -EBUSY; + } + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } else { + /* Refer to _another_ file name. */ + err = cpt_dump_filename(parent, 0, ctx); + if (err) + return err; + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } + if (dump_it) { + if (S_ISREG(ino->i_mode)) { + if ((err = dump_content_regular(file, ctx)) != 0) { + eprintk_ctx("dump_content_regular "); + cpt_printk_dentry(d, mnt); + } + } else if (S_ISDIR(ino->i_mode)) { + /* We cannot do anything. The directory should be + * empty, so it is not a big deal. 
+ */ + } else if (S_ISCHR(ino->i_mode)) { + err = dump_content_chrdev(file, ctx); + } else if (S_ISBLK(ino->i_mode)) { + err = dump_content_blkdev(file, ctx); + } else if (S_ISFIFO(ino->i_mode)) { + err = dump_content_fifo(file, ctx); + } else if (S_ISSOCK(ino->i_mode)) { + err = dump_content_socket(file, ctx); + } else { + eprintk_ctx("unknown inode mode %o\n", ino->i_mode & S_IFMT); + err = -EINVAL; + } + } + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_files(struct cpt_context *ctx) +{ + int epoll_nr; + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TTY); + for_each_object(obj, CPT_OBJ_TTY) { + int err; + + if ((err = cpt_dump_tty(obj, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_INODE); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_inode(file, file->f_dentry, + file->f_vfsmnt, ctx)) != 0) + return err; + } + for_each_object(obj, CPT_OBJ_FS) { + struct fs_struct *fs = obj->o_obj; + int err; + + if (fs->root && + (err = dump_one_inode(NULL, fs->root, fs->rootmnt, ctx)) != 0) + return err; + if (fs->pwd && + (err = dump_one_inode(NULL, fs->pwd, fs->pwdmnt, ctx)) != 0) + return err; + if (fs->altroot && + (err = dump_one_inode(NULL, fs->altroot, fs->altrootmnt, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + epoll_nr = 0; + cpt_open_section(ctx, CPT_SECT_FILES); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_file(obj, file, ctx)) != 0) + return err; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) + epoll_nr++; + } + cpt_close_section(ctx); + + if (epoll_nr) { + cpt_open_section(ctx, CPT_SECT_EPOLL); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { + int err; + if ((err = cpt_dump_epolldev(obj, ctx)) != 0) + return err; + } + } + cpt_close_section(ctx); + } + + cpt_open_section(ctx, CPT_SECT_SOCKET); + for_each_object(obj, CPT_OBJ_SOCKET) { + int err; + + if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + return 0; +} + +static int dump_filedesc(int fd, struct file *file, + struct files_struct *f, struct cpt_context *ctx) +{ + struct cpt_fd_image *v = cpt_get_buf(ctx); + cpt_object_t *obj; + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILEDESC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_fd = fd; + obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); + if (!obj) BUG(); + v->cpt_file = obj->o_pos; + v->cpt_flags = 0; + if (FD_ISSET(fd, f->fdt->close_on_exec)) + v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + + return 0; +} + +static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct files_struct *f = obj->o_obj; + struct cpt_files_struct_image *v = cpt_get_buf(ctx); + int fd; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILES; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = obj->o_index; + v->cpt_max_fds = f->fdt->max_fds; + v->cpt_next_fd = f->next_fd; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file) + 
dump_filedesc(fd, file, f, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_files_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); + + for_each_object(obj, CPT_OBJ_FILES) { + int err; + + if ((err = dump_one_file_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_fs(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->fs) { + if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->pwd && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->root && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->root->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->altroot && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot->d_inode, ctx) == NULL) + return -ENOMEM; + } + } + return 0; +} + +static int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + struct file file; + + memset(&file, 0, sizeof(file)); + + file.f_dentry = d; + file.f_vfsmnt = mnt; + file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; + return dump_one_file(NULL, &file, ctx); +} + +static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct fs_struct *fs = obj->o_obj; + struct cpt_fs_struct_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + int err; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_umask = fs->umask; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + err = cpt_dump_dir(fs->root, fs->rootmnt, ctx); + if (!err) + err = cpt_dump_dir(fs->pwd, fs->pwdmnt, ctx); + if (!err && fs->altroot) + err = cpt_dump_dir(fs->altroot, fs->altrootmnt, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_fs_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FS); + + for_each_object(obj, CPT_OBJ_FS) { + int err; + + if ((err = dump_one_fs(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct namespace *n = obj->o_obj; + struct list_head *p; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); + err = -EINVAL; + break; + } + } + up_read(&namespace_sem); + + free_page((unsigned long) path_buf); + + return err; +} + +int cpt_collect_namespace(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->nsproxy && tsk->nsproxy->namespace && + cpt_object_add(CPT_OBJ_NAMESPACE, + tsk->nsproxy->namespace, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + if ((err = check_one_namespace(obj, ctx)) != 0) + return err; + } + + return 0; +} + +struct args_t +{ + int* pfd; + char* path; +}; + +static int 
dumptmpfs(void *arg)
+{
+	int i;
+	struct args_t *args = arg;
+	int *pfd = args->pfd;
+	int fd0, fd2;
+	char *path = args->path;
+	char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL };
+
+	i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+	if (i < 0) {
+		eprintk("cannot enter ve to dump tmpfs\n");
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+
+	if (pfd[1] != 1)
+		sc_dup2(pfd[1], 1);
+	set_fs(KERNEL_DS);
+	fd0 = sc_open("/dev/null", O_RDONLY, 0);
+	fd2 = sc_open("/dev/null", O_WRONLY, 0);
+	if (fd0 < 0 || fd2 < 0) {
+		eprintk("cannot open /dev/null for tar: %d %d\n", fd0, fd2);
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+	if (fd0 != 0)
+		sc_dup2(fd0, 0);
+	if (fd2 != 2)
+		sc_dup2(fd2, 2);
+
+	for (i = 3; i < current->files->fdt->max_fds; i++) {
+		sc_close(i);
+	}
+
+	module_put(THIS_MODULE);
+
+	i = sc_execve("/bin/tar", argv, NULL);
+	eprintk("failed to exec /bin/tar: %d\n", i);
+	return 255 << 8;
+}
+
+static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx)
+{
+	int err;
+	int pid;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	char buf[16];
+	int n;
+	loff_t saved_obj;
+	struct args_t args;
+	int status;
+	mm_segment_t oldfs;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	args.pfd = pfd;
+	args.path = path;
+	err = pid = local_kernel_thread(dumptmpfs, (void*)&args, SIGCHLD, 0);
+	if (err < 0)
+		goto out;
+	f = fget(pfd[0]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	cpt_push_object(&saved_obj, ctx);
+	cpt_open_object(NULL, ctx);
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NAME;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	do {
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+		set_fs(oldfs);
+		if (n > 0)
+			ctx->write(buf, n, ctx);
+	} while (n > 0);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("tar exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("tar terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+
+	buf[0] = 0;
+	ctx->write(buf, 1, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	return n ?
: err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + return err; +} + +static int loopy_root(struct vfsmount *mnt) +{ + struct list_head *p; + + list_for_each(p, &mnt->mnt_namespace->list) { + struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list); + if (m == mnt) + return 0; + if (m->mnt_sb == mnt->mnt_sb) + return 1; + } + /* Cannot happen */ + return 0; +} + +static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx) +{ + struct list_head *p; + int err = -EINVAL; + + /* One special case: mount --bind /a /a */ + if (mnt->mnt_root == mnt->mnt_mountpoint) + return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx); + + list_for_each_prev(p, &mnt->mnt_list) { + struct vfsmount * m; + + if (p == &mnt->mnt_namespace->list) + break; + + m = list_entry(p, struct vfsmount, mnt_list); + + if (m->mnt_sb != mnt->mnt_sb) + continue; + + err = cpt_dump_dentry(mnt->mnt_root, m, 0, ctx); + if (err == 0) + break; + } + return err; +} + +static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_vfsmount_image v; + loff_t saved_obj; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + free_page((unsigned long) path_buf); + return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_VFSMOUNT; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_mntflags = mnt->mnt_flags; + if (slab_ub(mnt) != get_exec_ub()) { + v.cpt_mntflags |= CPT_MNT_EXT; + } else { + if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt)) + v.cpt_mntflags |= CPT_MNT_BIND; + } + v.cpt_flags = mnt->mnt_sb->s_flags; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_dump_string(mnt->mnt_devname ? 
: "none", ctx); + cpt_dump_string(path, ctx); + cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); + + if (v.cpt_mntflags & CPT_MNT_BIND) + err = cpt_dump_bind_mnt(mnt, ctx); + else if (!(v.cpt_mntflags & CPT_MNT_EXT) && + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { + mntget(mnt); + up_read(&namespace_sem); + err = cpt_dump_tmpfs(path, ctx); + down_read(&namespace_sem); + if (!err) { + if (list_empty(&mnt->mnt_list)) + err = -EBUSY; + } + mntput(mnt); + } + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS) + vefs_track_force_stop(mnt->mnt_sb); + + free_page((unsigned long) path_buf); + + return err; +} + +static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct namespace *n = obj->o_obj; + struct cpt_object_hdr v; + struct list_head *p; + loff_t saved_obj; + int err = 0; + + cpt_open_object(obj, ctx); + + v.cpt_next = -1; + v.cpt_object = CPT_OBJ_NAMESPACE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); + if (err) + break; + } + up_read(&namespace_sem); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_namespace(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_NAMESPACE); + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + + if ((err = dump_one_namespace(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_files.h linux-2.6.18.ovz/kernel/cpt/cpt_files.h --- linux-2.6.18/kernel/cpt/cpt_files.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_files.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,57 @@ +int cpt_collect_files(cpt_context_t *); +int cpt_collect_fs(cpt_context_t *); +int cpt_collect_namespace(cpt_context_t *); +int cpt_collect_sysvsem_undo(cpt_context_t *); +int cpt_collect_tty(struct file *, cpt_context_t *); +int cpt_dump_files(struct cpt_context *ctx); +int cpt_dump_files_struct(struct cpt_context *ctx); +int cpt_dump_fs_struct(struct cpt_context *ctx); +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); +int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); +struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx); +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); + +int rst_posix_locks(struct cpt_context *ctx); + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_restore_fs(struct cpt_context *ctx); + +int cpt_collect_sysv(cpt_context_t *); +int cpt_dump_sysvsem(struct cpt_context *ctx); +int rst_sysv_ipc(struct cpt_context *ctx); +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_dump_namespace(struct cpt_context *ctx); +int rst_root_namespace(struct cpt_context *ctx); + +int 
rst_stray_files(struct cpt_context *ctx); +int rst_tty_jobcontrol(struct cpt_context *ctx); + +void rst_flush_filejobs(struct cpt_context *); +int rst_do_filejobs(struct cpt_context *); + +int rst_eventpoll(struct cpt_context *); +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); +int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx); + +#define check_one_vfsmount(mnt) \ + (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) diff -uprN linux-2.6.18/kernel/cpt/cpt_fsmagic.h linux-2.6.18.ovz/kernel/cpt/cpt_fsmagic.h --- linux-2.6.18/kernel/cpt/cpt_fsmagic.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_fsmagic.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,16 @@ +/* Collected from kernel sources. */ + +#define FSMAGIC_TMPFS 0x01021994 +#define FSMAGIC_PIPEFS 0x50495045 +#define FSMAGIC_SOCKFS 0x534F434B +#define FSMAGIC_PFMFS 0xa0b4d889 +#define FSMAGIC_BDEV 0x62646576 +#define FSMAGIC_EPOLL 0x03111965 +#define FSMAGIC_FUTEX 0x0BAD1DEA +#define FSMAGIC_MQUEUE 0x19800202 +#define FSMAGIC_PROC 0x9fa0 +#define FSMAGIC_DEVPTS 0x1CD1 +#define FSMAGIC_AUTOFS 0x0187 +#define FSMAGIC_EXT2 0xEF53 +#define FSMAGIC_REISER 0x52654973 +#define FSMAGIC_VEFS 0x565a4653 diff -uprN linux-2.6.18/kernel/cpt/cpt_kernel.c linux-2.6.18.ovz/kernel/cpt/cpt_kernel.c --- linux-2.6.18/kernel/cpt/cpt_kernel.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_kernel.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,166 @@ +/* + * + * kernel/cpt/cpt_kernel.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#define __KERNEL_SYSCALLS__ 1 + +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +int debug_level = 1; + +#ifdef CONFIG_X86_32 + +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __USER_DS; + regs.xes = __USER_DS; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS; + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. 
*/ + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL, pid); +} +#endif + +#ifdef CONFIG_IA64 +pid_t +asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid) +{ + extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/); + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL, pid); +} +#endif + +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + pid_t ret; + + if (current->fs == NULL) { + /* do_fork_pid() hates processes without fs, oopses. */ + printk("CPT BUG: local_kernel_thread: current->fs==NULL\n"); + return -EINVAL; + } + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ret = asm_kernel_thread(fn, arg, flags, pid); + if (ret < 0) + module_put(THIS_MODULE); + return ret; +} + +#ifdef __i386__ +int __execve(const char *file, char **argv, char **envp) +{ + long res; + __asm__ volatile ("int $0x80" + : "=a" (res) + : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)), + "d" ((long)(envp)) : "memory"); + return (int)res; +} +#endif + +int sc_execve(char *cmd, char **argv, char **env) +{ + int ret; +#ifndef __i386__ + ret = execve(cmd, argv, env); +#else + ret = __execve(cmd, argv, env); +#endif + return ret; +} + +unsigned int test_cpu_caps(void) +{ + unsigned int flags = 0; + +#ifdef CONFIG_X86 + if (boot_cpu_has(X86_FEATURE_CMOV)) + flags |= 1 << CPT_CPU_X86_CMOV; + if (cpu_has_fxsr) + flags |= 1 << CPT_CPU_X86_FXSR; + if (cpu_has_xmm) + flags |= 1 << CPT_CPU_X86_SSE; +#ifndef CONFIG_X86_64 + if (cpu_has_xmm2) +#endif + flags |= 1 << CPT_CPU_X86_SSE2; + if (cpu_has_mmx) + flags |= 1 << CPT_CPU_X86_MMX; + if (boot_cpu_has(X86_FEATURE_3DNOW)) + flags |= 1 << CPT_CPU_X86_3DNOW; + if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) + flags |= 1 << CPT_CPU_X86_3DNOW2; + if (boot_cpu_has(X86_FEATURE_SEP)) + flags |= 1 << CPT_CPU_X86_SEP; +#ifdef CONFIG_X86_64 + flags |= 1 << CPT_CPU_X86_EMT64; +#endif +#endif +#ifdef CONFIG_IA64 + flags |= 1 << CPT_CPU_X86_IA64; + flags |= 1 << CPT_CPU_X86_FXSR; +#endif + return flags; +} + +unsigned int test_kernel_config(void) +{ + unsigned int flags = 0; +#ifdef CONFIG_X86 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + flags |= 1 << CPT_KERNEL_CONFIG_PAE; +#endif +#endif + return flags; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_kernel.h linux-2.6.18.ovz/kernel/cpt/cpt_kernel.h --- linux-2.6.18/kernel/cpt/cpt_kernel.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_kernel.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,94 @@ +/* Interface to kernel vars which we had to _add_. 
*/ + +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) +#define TASK_TRACED TASK_STOPPED +#define unix_peer(sk) ((sk)->sk_pair) +#define page_mapcount(pg) ((pg)->mapcount) +#else +#define unix_peer(sk) (unix_sk(sk)->peer) +#endif + +#ifdef CONFIG_X86_64 +#define cpu_has_fxsr 1 +#endif +#ifdef CONFIG_IA64 +#define cpu_has_fxsr 1 +#endif + +static inline void do_gettimespec(struct timespec *ts) +{ + struct timeval tv; + do_gettimeofday(&tv); + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = tv.tv_usec*1000; +} + +int local_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); +int asm_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) +void vefs_track_force_stop(struct super_block *super); + +void vefs_track_notify(struct dentry *vdentry, int track_cow); + +struct dentry * vefs_replaced_dentry(struct dentry *de); +int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde); +#else +static inline void vefs_track_force_stop(struct super_block *super) { }; + +static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { }; +#endif + +unsigned int test_cpu_caps(void); +unsigned int test_kernel_config(void); + +#define test_one_flag_old(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + wprintk("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } +#define test_one_flag(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + eprintk_ctx("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } + +static inline void +_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static inline struct timespec +_ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); + if (unlikely(nsec < 0)) + _set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); + + return ts; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_mm.c linux-2.6.18.ovz/kernel/cpt/cpt_mm.c --- linux-2.6.18/kernel/cpt/cpt_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_mm.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,907 @@ +/* + * + * kernel/cpt/cpt_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif +#include "cpt_ubc.h" + +static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + if (!list_empty(&aio_ctx->run_list)) { + /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ + eprintk_ctx("run list is not empty, cannot suspend AIO\n"); + return -EBUSY; + } + + /* Wait for pending IOCBs. Linux AIO is mostly _fake_. 
+	 * It is actually synchronous, except for direct IO and
+	 * some funny raw USB things, which cannot happen inside VE.
+	 * However, we do this for the future.
+	 *
+	 * Later note: in 2.6.16 we may allow O_DIRECT, so that
+	 * it is not meaningless code.
+	 */
+	wait_for_all_aios(aio_ctx);
+
+	if (!list_empty(&aio_ctx->run_list) ||
+	    !list_empty(&aio_ctx->active_reqs) ||
+	    aio_ctx->reqs_active) {
+		eprintk_ctx("were not able to suspend AIO\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_file) {
+			if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
+				return -ENOMEM;
+		}
+	}
+#ifdef CONFIG_USER_RESOURCE
+	if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
+		return -ENOMEM;
+#endif
+
+	if (mm->ioctx_list) {
+		struct kioctx *aio_ctx;
+		int err;
+
+		for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
+			if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
+				return err;
+	}
+
+	return 0;
+}
+
+int cpt_collect_mm(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	int err;
+	int index;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
+			return -ENOMEM;
+	}
+
+	index = 1;
+	for_each_object(obj, CPT_OBJ_MM) {
+		struct mm_struct *mm = obj->o_obj;
+		if (obj->o_count != atomic_read(&mm->mm_users)) {
+			eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users));
+			return -EAGAIN;
+		}
+		cpt_obj_setindex(obj, index++, ctx);
+
+		if ((err = collect_one_mm(mm, ctx)) != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int zcnt, scnt, scnt0, ucnt;
+
+/* Function where_is_anon_page() returns the address of an anonymous page
+ * in the mm of an already dumped process. This happens e.g. after fork().
+ * We do not use this right now, just keep statistics; it is difficult to
+ * restore such state, but the most direct use is to save space in the
+ * dumped image.
+ */
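+
+/* The situation the helpers below detect, with hypothetical addresses:
+ * after fork(), parent and child map the same physical anonymous page
+ * until COW breaks the sharing.  If the parent's mm (lower o_index) was
+ * dumped first, the child's copy can be emitted as a reference whose
+ * cpt_source points at the parent's image position instead of duplicating
+ * the data:
+ *
+ *	parent mm, o_index 1: 0x08048000 -> page P   (CPT_OBJ_PAGES + data)
+ *	child  mm, o_index 2: 0x08048000 -> page P   (CPT_OBJ_COPYPAGES,
+ *	                                              cpt_source = parent)
+ */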
+
+
+static inline unsigned long
+vma_address0(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		address |= 1;
+	return address;
+}
+
+static int really_this_one(struct vm_area_struct *vma, unsigned long address,
+			   struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int result;
+
+	pgd = pgd_offset(mm, address);
+	if (unlikely(!pgd_present(*pgd)))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return 0;
+
+	pmd = pmd_offset(pud, address);
+	if (unlikely(!pmd_present(*pmd)))
+		return 0;
+
+	result = 0;
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte)) {
+		pte_unmap(pte);
+		return 0;
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
+		result = 1;
+	pte_unmap_unlock(pte, ptl);
+	return result;
+}
+
+static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
+				 struct page *page, cpt_context_t * ctx)
+{
+	loff_t mmptr = CPT_NULL;
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	int idx = mmobj->o_index;
+
+	if (!PageAnon(page))
+		return CPT_NULL;
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return CPT_NULL;
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		unsigned long addr = vma_address0(page, vma);
+		cpt_object_t *obj;
+
+		/* We do not try to support mremapped regions (addr != mapaddr),
+		 * only mmaps directly inherited via fork().
+		 * With this limitation we may check self-consistency of
+		 * vmas (vm_start, vm_pgoff, anon_vma) before
+		 * doing __copy_page_range() in rst_mm.
+		 */
+		if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
+			obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
+			if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
+				if (really_this_one(vma, addr, page)) {
+					mmptr = obj->o_pos;
+					idx = obj->o_index;
+				}
+			}
+		}
+	}
+	spin_unlock(&anon_vma->lock);
+
+	return mmptr;
+}
+
+struct page_area
+{
+	int type;
+	unsigned long start;
+	unsigned long end;
+	pgoff_t pgoff;
+	loff_t mm;
+	__u64 list[16];
+};
+
+struct page_desc
+{
+	int type;
+	pgoff_t index;
+	loff_t mm;
+	int shared;
+};
+
+enum {
+	PD_ABSENT,
+	PD_COPY,
+	PD_ZERO,
+	PD_CLONE,
+	PD_FUNKEY,
+	PD_LAZY,
+	PD_ITER,
+	PD_ITERYOUNG,
+};
+
+/* 0: page can be obtained from backstore, or still not mapped anonymous
+      page, or something else, which does not require a copy.
+   1: page requires a copy
+   2: page requires a copy but its content is zero. Quite useless.
+   3: wp page is shared after fork(). It is to be COWed when modified.
+   4: page is something unsupported... We copy it right now.
+ */
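+
+/* Illustrative mapping of the cases above onto page_get_desc() results
+ * (hypothetical examples):
+ *
+ *	file-backed page present in the page cache    -> PD_ABSENT
+ *	dirty private anonymous page                  -> PD_COPY
+ *	anonymous page still in swap, lazy mode on    -> PD_LAZY
+ *	anon page shared with an already dumped mm    -> PD_CLONE
+ *	huge page or other unsupported oddity         -> PD_FUNKEY
+ */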
+
+
+
+static void page_get_desc(cpt_object_t *mmobj,
+			  struct vm_area_struct *vma, unsigned long addr,
+			  struct page_desc *pdesc, cpt_context_t * ctx)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	struct page *pg = NULL;
+	pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
+
+	pdesc->index = linear_index;
+	pdesc->shared = 0;
+	pdesc->mm = CPT_NULL;
+
+	if (vma->vm_flags & VM_IO) {
+		pdesc->type = PD_ABSENT;
+		return;
+	}
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out_absent;
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out_absent;
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out_absent;
+#ifdef CONFIG_X86
+	if (pmd_huge(*pmd)) {
+		eprintk_ctx("page_huge\n");
+		goto out_unsupported;
+	}
+#endif
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+retry:
+#endif
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = *ptep;
+	pte_unmap(ptep);
+
+	if (pte_none(pte))
+		goto out_absent_unlock;
+
+	if (!pte_present(pte)) {
+		if (pte_file(pte)) {
+			pdesc->index = pte_to_pgoff(pte);
+			goto out_absent_unlock;
+		}
+		if (vma->vm_flags & VM_SHARED) {
+			/* It is impossible: shared mappings cannot be in swap */
+			eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
+			goto out_unsupported_unlock;
+		}
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+		/* Otherwise it is in swap. */
+		if (!ctx->lazy_vm) {
+			int err;
+			/* If lazy transfer is not enabled,
+			 * raise it from swap now, so that we
+			 * at least save space when the page is shared.
+			 */
+			spin_unlock(ptl);
+			err = handle_mm_fault(mm, vma, addr, 0);
+			if (err == VM_FAULT_SIGBUS)
+				goto out_absent;
+			if (err == VM_FAULT_OOM)
+				goto out_absent;
+			err = 0;
+			goto retry;
+		}
+#endif
+		pdesc->type = PD_LAZY;
+		goto out_unlock;
+	}
+
+	if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
+		pdesc->type = PD_COPY;
+		goto out_unlock;
+	}
+
+	get_page(pg);
+	spin_unlock(ptl);
+
+	if (pg->mapping && !PageAnon(pg)) {
+		if (vma->vm_file == NULL) {
+			eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
+			goto out_unsupported;
+		}
+		if (vma->vm_file->f_mapping != pg->mapping) {
+			eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n",
+				    addr, vma->vm_file->f_mapping, pg->mapping,
+				    mmobj->o_pos);
+			goto out_unsupported;
+		}
+		pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+		/* Page is in backstore. For us it is like
+		 * it is not present.
+		 */
+		goto out_absent;
+	}
+
+	if (PageReserved(pg)) {
+		/* Special case: ZERO_PAGE is used when an
+		 * anonymous page is accessed but not written.
+		 */
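+		/* Illustrative note: a page typically reaches this state
+		 * after a read fault with no write, e.g. from user space
+		 * (hypothetical snippet):
+		 *
+		 *	char *p = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
+		 *	               MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+		 *	char x = p[0];	// read fault maps ZERO_PAGE read-only
+		 *
+		 * Such a page carries no data, so it is dumped as absent.
+		 */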
+		if (pg == ZERO_PAGE(addr)) {
+			if (pte_write(pte)) {
+				eprintk_ctx("not funny already, writable ZERO_PAGE\n");
+				goto out_unsupported;
+			}
+			zcnt++;
+			goto out_absent;
+		}
+		eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
+			    addr, mmobj->o_pos);
+		goto out_unsupported;
+	}
+
+	if (pg == ZERO_PAGE(addr)) {
+		wprintk_ctx("that's how it works now\n");
+	}
+
+	if (!pg->mapping) {
+		eprintk_ctx("page without mapping at %08lx@%Ld\n", addr,
+			    mmobj->o_pos);
+		goto out_unsupported;
+	}
+
+	if (pg->mapping && page_mapcount(pg) > 1) {
+		pdesc->shared = 1;
+		pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
+		if (pdesc->mm != CPT_NULL) {
+			scnt0++;
+			pdesc->type = PD_CLONE;
+			goto out_put;
+		} else {
+			scnt++;
+		}
+	}
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	if (ctx->iter_done &&
+	    test_bit(PG_checkpointed, &pg->flags)) {
+		if (pte_write(pte)) {
+			wprintk_ctx("writable PG_checkpointed page\n");
+		}
+		pdesc->index = page_to_pfn(pg);
+		pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
+		goto out_put;
+	}
+#endif
+	pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY;
+
+out_put:
+	if (pg)
+		put_page(pg);
+	return;
+
+out_unlock:
+	spin_unlock(ptl);
+	goto out_put;
+
+out_absent_unlock:
+	spin_unlock(ptl);
+out_absent:
+	pdesc->type = PD_ABSENT;
+	goto out_put;
+
+out_unsupported_unlock:
+	spin_unlock(ptl);
+out_unsupported:
+	ucnt++;
+	pdesc->type = PD_FUNKEY;
+	goto out_put;
+}
+
+/* ATTN: We give "current" to get_user_pages(). This is wrong, but
+ * get_user_pages() does not really need this thing. It just stores some
+ * page fault stats there.
+ *
+ * BUG: some archs (e.g. sparc64, but not Intel*) require flushing the
+ * cache pages before accessing the vma.
+ */
+void dump_pages(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, struct cpt_context *ctx)
+{
+#define MAX_PAGE_BATCH 16
+	struct page *pg[MAX_PAGE_BATCH];
+	int npages = (end - start)/PAGE_SIZE;
+	int count = 0;
+
+	while (count < npages) {
+		int copy = npages - count;
+		int n;
+
+		if (copy > MAX_PAGE_BATCH)
+			copy = MAX_PAGE_BATCH;
+		n = get_user_pages(current, vma->vm_mm, start, copy,
+				   0, 1, pg, NULL);
+		if (n == copy) {
+			int i;
+			for (i = 0; i < n; i++) {
+				char *maddr = kmap(pg[i]);
+				ctx->write(maddr, PAGE_SIZE, ctx);
+				kunmap(pg[i]);
+			}
+		} else {
+			eprintk_ctx("get_user_pages fault\n");
+			for ( ; n > 0; n--)
+				page_cache_release(pg[n-1]);
+			return;
+		}
+		start += n*PAGE_SIZE;
+		count += n;
+		for ( ; n > 0; n--)
+			page_cache_release(pg[n-1]);
+	}
+	return;
+}
+
+int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb,
+		    int copy,
+		    struct cpt_context *ctx)
+{
+	loff_t saved_object;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES;
+	pgb->cpt_hdrlen = sizeof(*pgb);
+	pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ?
CPT_CONTENT_DATA : CPT_CONTENT_VOID; + + ctx->write(pgb, sizeof(*pgb), ctx); + if (copy == PD_COPY || copy == PD_LAZY) + dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_remappage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_REMAPPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_copypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_COPYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_source = pa->mm; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_lazypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_LAZYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, + (pa->end-pa->start)/PAGE_SIZE, ctx); +#endif + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_iterpage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = pa->type == PD_ITER ? 
CPT_OBJ_ITERPAGES :
+			  CPT_OBJ_ITERYOUNGPAGES;
+	pgb.cpt_hdrlen = sizeof(pgb);
+	pgb.cpt_content = CPT_CONTENT_VOID;
+	pgb.cpt_start = pa->start;
+	pgb.cpt_end = pa->end;
+	ctx->write(&pgb, sizeof(pgb), ctx);
+
+	ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx);
+
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_object, ctx);
+	return 0;
+}
+
+
+static int can_expand(struct page_area *pa, struct page_desc *pd)
+{
+	if (pa->start == pa->end)
+		return 1;
+	if (pa->type != pd->type)
+		return 0;
+	if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
+		if (pa->end - pa->start >= PAGE_SIZE*16)
+			return 0;
+		pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index;
+	}
+	if (pa->type == PD_ABSENT)
+		return pd->index == pa->pgoff + 1;
+	if (pa->type == PD_CLONE)
+		return pd->mm == pa->mm;
+	return 1;
+}
+
+static int dump_one_vma(cpt_object_t *mmobj,
+			struct vm_area_struct *vma, struct cpt_context *ctx)
+{
+	struct cpt_vma_image *v = cpt_get_buf(ctx);
+	unsigned long addr;
+	loff_t saved_object;
+	struct cpt_page_block pgb;
+	struct page_area pa;
+	int cloned_pages = 0;
+
+	cpt_push_object(&saved_object, ctx);
+
+	v->cpt_object = CPT_OBJ_VMA;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_start = vma->vm_start;
+	v->cpt_end = vma->vm_end;
+	v->cpt_flags = vma->vm_flags;
+	if (vma->vm_flags&VM_HUGETLB) {
+		eprintk_ctx("huge TLB VMAs are still not supported\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+	v->cpt_pgprot = vma->vm_page_prot.pgprot;
+	v->cpt_pgoff = vma->vm_pgoff;
+	v->cpt_file = CPT_NULL;
+	v->cpt_type = CPT_VMA_TYPE_0;
+	v->cpt_anonvma = 0;
+
+	/* We have to remember which VMAs are bound to one anon_vma, so we
+	 * store an identifier for the group of VMAs.  It is handy to use
+	 * the absolute address of the anon_vma as this identifier.
*/ + v->cpt_anonvmaid = (unsigned long)vma->anon_vma; + + if (vma->vm_file) { + struct file *filp; + cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); + if (obj == NULL) BUG(); + filp = obj->o_obj; + if (filp->f_op && + filp->f_op->read == NULL && + filp->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_TMPFS) + v->cpt_type = CPT_VMA_TYPE_SHM; + v->cpt_file = obj->o_pos; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + pa.type = PD_ABSENT; + pa.pgoff = vma->vm_pgoff; + pa.mm = CPT_NULL; + pa.start = vma->vm_start; + pa.end = vma->vm_start; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page_desc pd; + + page_get_desc(mmobj, vma, addr, &pd, ctx); + cloned_pages += pd.shared; + + if (pd.type == PD_FUNKEY) { + eprintk_ctx("dump_one_vma: funkey page\n"); + return -EINVAL; + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (pd.type == PD_LAZY && + (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) + pd.type = PD_COPY; +#else + if (pd.type == PD_LAZY) + pd.type = PD_COPY; +#endif + + if (!can_expand(&pa, &pd)) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + pa.start = addr; + } + pa.type = pd.type; + pa.end = addr + PAGE_SIZE; + pa.pgoff = pd.index; + if (addr == pa.start) + pa.list[0] = pd.index; + pa.mm = pd.mm; + } + + if (pa.end > pa.start) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + } + + if (cloned_pages) { + __u32 anonvma = 1; + loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); + ctx->pwrite(&anonvma, 4, ctx, anonpos); + } + + cpt_close_object(ctx); + + cpt_pop_object(&saved_object, ctx); + + return 0; +} + +static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + loff_t saved_object; + struct cpt_aio_ctx_image aimg; + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("AIO is active after suspend\n"); + return -EBUSY; + } + + cpt_push_object(&saved_object, ctx); + + aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); + aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; + aimg.cpt_hdrlen = sizeof(aimg); + aimg.cpt_content = CPT_CONTENT_ARRAY; + + aimg.cpt_max_reqs = aio_ctx->max_reqs; + aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; + aimg.cpt_nr = aio_ctx->ring_info.nr; + aimg.cpt_tail = aio_ctx->ring_info.tail; + aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; + + ctx->write(&aimg, sizeof(aimg), ctx); + + cpt_pop_object(&saved_object, 
ctx); + return 0; +} + +static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mm_struct *mm = obj->o_obj; + struct vm_area_struct *vma; + struct cpt_mm_image *v = cpt_get_buf(ctx); + + cpt_open_object(obj, ctx); + + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_MM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start_code = mm->start_code; + v->cpt_end_code = mm->end_code; + v->cpt_start_data = mm->start_data; + v->cpt_end_data = mm->end_data; + v->cpt_start_brk = mm->start_brk; + v->cpt_brk = mm->brk; + v->cpt_start_stack = mm->start_stack; + v->cpt_start_arg = mm->arg_start; + v->cpt_end_arg = mm->arg_end; + v->cpt_start_env = mm->env_start; + v->cpt_end_env = mm->env_end; + v->cpt_def_flags = mm->def_flags; +#ifdef CONFIG_USER_RESOURCE + v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); +#endif + v->cpt_dumpable = mm->dumpable; + v->cpt_vps_dumpable = mm->vps_dumpable; + v->cpt_used_hugetlb = 0; /* not used */ + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + +#ifdef CONFIG_X86 + if (mm->context.size) { + loff_t saved_object; + struct cpt_obj_bits b; + int size; + + dprintk_ctx("nontrivial LDT\n"); + + cpt_push_object(&saved_object, ctx); + + cpt_open_object(NULL, ctx); + b.cpt_next = CPT_NULL; + b.cpt_object = CPT_OBJ_BITS; + b.cpt_hdrlen = sizeof(b); + b.cpt_content = CPT_CONTENT_MM_CONTEXT; + b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; + + ctx->write(&b, sizeof(b), ctx); + + size = mm->context.size*LDT_ENTRY_SIZE; + +#if defined(CONFIG_X86_64) || LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15) + ctx->write(mm->context.ldt, size, ctx); +#else + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + ctx->write(kaddr, bytes, ctx); + kunmap(mm->context.ldt_pages[nr]); + } +#endif + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + } +#endif + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int err; + +#ifdef CONFIG_X86_64 + if (vma->vm_start == 0xFFFFE000 && + vma->vm_end == 0xFFFFF000) + continue; +#endif + + if ((err = dump_one_vma(obj, vma, ctx)) != 0) + return err; + } + + if (mm->ioctx_list) { + struct kioctx *aio_ctx; + int err; + + for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) + if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_vm(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + scnt = scnt0 = zcnt = 0; + + cpt_open_section(ctx, CPT_SECT_MM); + + for_each_object(obj, CPT_OBJ_MM) { + int err; + + if ((err = dump_one_mm(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + + if (scnt) + dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); + if (scnt0) + dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); + if (zcnt) + dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); + return 0; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_mm.h linux-2.6.18.ovz/kernel/cpt/cpt_mm.h --- linux-2.6.18/kernel/cpt/cpt_mm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_mm.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,22 @@ +int cpt_collect_mm(cpt_context_t *); + +int cpt_dump_vm(struct cpt_context *ctx); + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_complete(struct cpt_task_image *ti, struct 
cpt_context *ctx); + +int cpt_mm_prepare(unsigned long veid); + +int cpt_free_pgin_dir(struct cpt_context *); +int cpt_start_pagein(struct cpt_context *); +int rst_setup_pagein(struct cpt_context *); +int rst_complete_pagein(struct cpt_context *, int); +int rst_pageind(struct cpt_context *); +int cpt_iteration(cpt_context_t *ctx); +int rst_iteration(cpt_context_t *ctx); +void rst_drop_iter_dir(cpt_context_t *ctx); +int rst_iter(struct vm_area_struct *vma, u64 pfn, + unsigned long addr, cpt_context_t * ctx); + +int rst_swapoff(struct cpt_context *); diff -uprN linux-2.6.18/kernel/cpt/cpt_net.c linux-2.6.18.ovz/kernel/cpt/cpt_net.c --- linux-2.6.18/kernel/cpt/cpt_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_net.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,457 @@ +/* + * + * kernel/cpt/cpt_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +int cpt_dump_link(struct cpt_context * ctx) +{ + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_DEVICE); + for (dev = dev_base; dev; dev = dev->next) { + struct cpt_netdev_image v; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_DEVICE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_flags = dev->flags; + memcpy(v.cpt_name, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + + if (dev != get_exec_env()->_loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + cpt_close_section(ctx); + return -EBUSY; + } + } + cpt_close_section(ctx); + return 0; +} + +int cpt_suspend_network(struct cpt_context *ctx) +{ + get_exec_env()->disable_net = 1; + synchronize_net(); + return 0; +} + +int cpt_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +int cpt_dump_ifaddr(struct cpt_context * ctx) +{ + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_IFADDR); + for (dev = dev_base; dev; dev = dev->next) { + struct in_device *idev = in_dev_get(dev); + struct in_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { + struct cpt_ifaddr_image v; + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET; + v.cpt_masklen = ifa->ifa_prefixlen; + v.cpt_flags = ifa->ifa_flags; + v.cpt_scope = ifa->ifa_scope; + memset(&v.cpt_address, 0, sizeof(v.cpt_address)); + memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + v.cpt_address[0] = ifa->ifa_local; + v.cpt_peer[0] = ifa->ifa_address; + v.cpt_broadcast[0] = ifa->ifa_broadcast; + memcpy(v.cpt_label, 
ifa->ifa_label, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in_dev_put(idev); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + for (dev = dev_base; dev; dev = dev->next) { + struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct cpt_ifaddr_image v; + + if (dev == &loopback_dev && + ifa->prefix_len == 128 && + ifa->addr.s6_addr32[0] == 0 && + ifa->addr.s6_addr32[1] == 0 && + ifa->addr.s6_addr32[2] == 0 && + ifa->addr.s6_addr32[3] == htonl(1)) + continue; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET6; + v.cpt_masklen = ifa->prefix_len; + v.cpt_flags = ifa->flags; + v.cpt_scope = ifa->scope; + v.cpt_valid_lft = ifa->valid_lft; + v.cpt_prefered_lft = ifa->prefered_lft; + memcpy(&v.cpt_address, &ifa->addr, 16); + memcpy(&v.cpt_peer, &ifa->addr, 16); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + memcpy(v.cpt_label, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in6_dev_put(idev); + } +#endif + cpt_close_section(ctx); + return 0; +} + +static int cpt_dump_route(struct cpt_context * ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + struct sockaddr_nl nladdr; + struct cpt_object_hdr v; + mm_segment_t oldfs; + char *pg; + + err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETROUTE; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.g.rtgen_family = AF_INET; + + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err < 0) + goto out_sock; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + cpt_open_section(ctx, CPT_SECT_NET_ROUTE); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_ROUTE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NLMARRAY; + + ctx->write(&v, sizeof(v), ctx); + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +restart: +#endif + for (;;) { + struct nlmsghdr *h; + + iov.iov_base = pg; + iov.iov_len = PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + + if (err < 0) + goto out_sock_pg; + if (msg.msg_flags & MSG_TRUNC) { + err = -ENOBUFS; + goto out_sock_pg; + } + + h = (struct nlmsghdr*)pg; + while (NLMSG_OK(h, err)) { + if (h->nlmsg_type == NLMSG_DONE) { + err = 0; + goto done; + } + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); + err = errm->error; + eprintk_ctx("NLMSG error: %d\n", errm->error); + goto done; + } + if (h->nlmsg_type != RTM_NEWROUTE) { + eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); + err = -EINVAL; + goto done; + } + ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); + 
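+			/* The raw RTM_NEWROUTE records are stored back to
+			 * back as a netlink message array
+			 * (CPT_CONTENT_NLMARRAY), so restore can presumably
+			 * replay them into a NETLINK_ROUTE socket as-is. */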
+			h = NLMSG_NEXT(h, err);
+		}
+		if (err) {
+			eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
+			err = -EINVAL;
+			break;
+		}
+	}
+done:
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (!err && req.g.rtgen_family == AF_INET) {
+		req.g.rtgen_family = AF_INET6;
+		iov.iov_base=&req;
+		iov.iov_len=sizeof(req);
+		msg.msg_name=&nladdr;
+		msg.msg_namelen=sizeof(nladdr);
+		msg.msg_iov=&iov;
+		msg.msg_iovlen=1;
+		msg.msg_control=NULL;
+		msg.msg_controllen=0;
+		msg.msg_flags=MSG_DONTWAIT;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = sock_sendmsg(sock, &msg, sizeof(req));
+		set_fs(oldfs);
+
+		if (err > 0)
+			goto restart;
+	}
+#endif
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	cpt_close_section(ctx);
+
+out_sock_pg:
+	free_page((unsigned long)pg);
+out_sock:
+	sock_release(sock);
+	return err;
+}
+
+static int dumpfn(void *arg)
+{
+	int i;
+	int *pfd = arg;
+	char *argv[] = { "iptables-save", "-c", NULL };
+
+	i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+	if (i < 0) {
+		eprintk("cannot enter ve to dump iptables\n");
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+
+	if (pfd[1] != 1)
+		sc_dup2(pfd[1], 1);
+
+	for (i=0; i<current->files->fdt->max_fds; i++) {
+		if (i != 1)
+			sc_close(i);
+	}
+
+	module_put(THIS_MODULE);
+
+	set_fs(KERNEL_DS);
+	i = sc_execve("/sbin/iptables-save", argv, NULL);
+	if (i == -ENOENT)
+		i = sc_execve("/usr/sbin/iptables-save", argv, NULL);
+	eprintk("failed to exec iptables-save: %d\n", i);
+	return 255 << 8;
+}
+
+
+static int cpt_dump_iptables(struct cpt_context * ctx)
+{
+	int err;
+	int pid;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	char buf[16];
+	loff_t pos;
+	int n;
+	int status;
+	mm_segment_t oldfs;
+
+	if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD))
+		return 0;
+
+	err = sc_pipe(pfd);
+	if (err < 0) {
+		eprintk_ctx("sc_pipe: %d\n", err);
+		return err;
+	}
+	err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
+	if (err < 0) {
+		eprintk_ctx("local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[0]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	cpt_open_section(ctx, CPT_SECT_NET_IPTABLES);
+
+	cpt_open_object(NULL, ctx);
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_NAME;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_NAME;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	pos = ctx->file->f_pos;
+	do {
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+		set_fs(oldfs);
+		if (n > 0)
+			ctx->write(buf, n, ctx);
+	} while (n > 0);
+
+	if (n < 0)
+		eprintk_ctx("read: %d\n", n);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("iptables-save exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("iptables-save terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+
+	if (ctx->file->f_pos != pos) {
+		buf[0] = 0;
+		ctx->write(buf, 1, ctx);
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_close_section(ctx);
+	} else {
+		pos = ctx->current_section;
+		cpt_close_object(ctx);
+		cpt_close_section(ctx);
+		ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL;
+		ctx->file->f_pos = pos;
+	}
+	return n ? : err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	return err;
+}
+
+int cpt_dump_ifinfo(struct cpt_context * ctx)
+{
+	int err;
+
+	rtnl_lock();
+	err = cpt_dump_link(ctx);
+	if (!err)
+		err = cpt_dump_ifaddr(ctx);
+	rtnl_unlock();
+	if (!err)
+		err = cpt_dump_route(ctx);
+	if (!err)
+		err = cpt_dump_iptables(ctx);
+	return err;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_net.h linux-2.6.18.ovz/kernel/cpt/cpt_net.h
--- linux-2.6.18/kernel/cpt/cpt_net.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_net.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,7 @@
+int cpt_dump_ifinfo(struct cpt_context *ctx);
+int rst_restore_net(struct cpt_context *ctx);
+int cpt_suspend_network(struct cpt_context *ctx);
+int cpt_resume_network(struct cpt_context *ctx);
+int rst_resume_network(struct cpt_context *ctx);
+int cpt_dump_ip_conntrack(struct cpt_context *ctx);
+int rst_restore_ip_conntrack(struct cpt_context * ctx);
diff -uprN linux-2.6.18/kernel/cpt/cpt_obj.c linux-2.6.18.ovz/kernel/cpt/cpt_obj.c
--- linux-2.6.18/kernel/cpt/cpt_obj.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_obj.c	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,163 @@
+/*
+ *
+ *  kernel/cpt/cpt_obj.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = kmalloc(sizeof(cpt_object_t), gfp);
+	if (obj) {
+		INIT_LIST_HEAD(&obj->o_list);
+		INIT_LIST_HEAD(&obj->o_hash);
+		INIT_LIST_HEAD(&obj->o_alist);
+		obj->o_count = 1;
+		obj->o_pos = CPT_NULL;
+		obj->o_lock = 0;
+		obj->o_parent = NULL;
+		obj->o_index = CPT_NOINDEX;
+		obj->o_obj = NULL;
+		obj->o_image = NULL;
+		ctx->objcount++;
+	}
+	return obj;
+}
+
+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_del(&obj->o_alist);
+	kfree(obj);
+	ctx->objcount--;
+}
+
+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_add_tail(&obj->o_list, &ctx->object_array[type]);
+}
+
+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
+		cpt_object_t *head, cpt_context_t *ctx)
+{
+	list_add(&obj->o_list, &head->o_list);
+}
+
+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
+		unsigned gfp_mask, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj) {
+		obj->o_count++;
+		return obj;
+	}
+
+	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
+		if (p)
+			cpt_obj_setobj(obj, p, ctx);
+		intern_cpt_object(type, obj, ctx);
+		return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
+}
+
+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj)
+		obj->o_count++;
+
+	return obj;
+}
+
+int cpt_object_init(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		INIT_LIST_HEAD(&ctx->object_array[i]);
+	}
+	return 0;
+}
+
+int cpt_object_destroy(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i=0; i<CPT_OBJ_MAX; i++) {
+		while (!list_empty(&ctx->object_array[i])) {
+			struct list_head *head = ctx->object_array[i].next;
+			cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+			list_del(head);
+			if 
(obj->o_image) + kfree(obj->o_image); + free_cpt_object(obj, ctx); + } + } + if (ctx->objcount != 0) + eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount); + return 0; +} + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_obj == p) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_pos == pos) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_index == index) + return obj; + } + return NULL; +} diff -uprN linux-2.6.18/kernel/cpt/cpt_obj.h linux-2.6.18.ovz/kernel/cpt/cpt_obj.h --- linux-2.6.18/kernel/cpt/cpt_obj.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_obj.h 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,62 @@ +#ifndef __CPT_OBJ_H_ +#define __CPT_OBJ_H_ 1 + +#include +#include + +typedef struct _cpt_object +{ + struct list_head o_list; + struct list_head o_hash; + int o_count; + int o_index; + int o_lock; + loff_t o_pos; + loff_t o_ppos; + void *o_obj; + void *o_image; + void *o_parent; + struct list_head o_alist; +} cpt_object_t; + +struct cpt_context; + +#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list) + + +extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); +extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); + +static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx) +{ + cpt->o_pos = pos; + /* Add to pos hash table */ +} + +static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) +{ + cpt->o_obj = ptr; + /* Add to hash table */ +} + +static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) +{ + cpt->o_index = index; + /* Add to index hash table */ +} + + +extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); +extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); + +extern int cpt_object_init(struct cpt_context *ctx); +extern int cpt_object_destroy(struct cpt_context *ctx); + +#endif /* __CPT_OBJ_H_ */ diff -uprN linux-2.6.18/kernel/cpt/cpt_proc.c linux-2.6.18.ovz/kernel/cpt/cpt_proc.c --- linux-2.6.18/kernel/cpt/cpt_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_proc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,588 @@ +/* + * + * kernel/cpt/cpt_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void cpt_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + cpt_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); + if (ctx->pgin_dir) + cpt_free_pgin_dir(ctx); + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); +#endif + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + if (ctx->file) + fput(ctx->file); + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } + if (ctx->statusfile) + fput(ctx->statusfile); + if (ctx->lockfile) + fput(ctx->lockfile); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + cpt_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * cpt_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + cpt_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +static cpt_context_t * cpt_context_lookup(unsigned int contextid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == contextid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +int cpt_context_lookup_veid(unsigned int veid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->ve_id == veid && ctx->ctx_state > 0) { + spin_unlock(&cpt_context_lock); + return 1; + } + } + spin_unlock(&cpt_context_lock); + return 0; +} + +static int cpt_ioctl(struct inode * inode, struct file 
* file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + int try; + + unlock_kernel(); + + if (cmd == CPT_VMPREP) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = cpt_mm_prepare(arg); +#else + err = -EINVAL; +#endif + goto out_lock; + } + + if (cmd == CPT_TEST_CAPS) { + unsigned int src_flags, dst_flags = arg; + + err = 0; + src_flags = test_cpu_caps(); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = cpt_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + if (ctx->contextid && ctx->contextid != contextid) { + err = -EINVAL; + goto out_nosem; + } + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state == CPT_CTX_DUMPING) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->write == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; 
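+	/* Symmetric to CPT_SET_PAGEINFDIN above: attach the file used by
+	 * the lazy page-in machinery in the opposite direction. */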
+ case CPT_SET_PAGEINFDOUT: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_SET_LAZY: + ctx->lazy_vm = arg; + break; + case CPT_ITER: + err = cpt_iteration(ctx); + break; + case CPT_PAGEIND: + err = cpt_start_pagein(ctx); + break; +#endif + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_SET_CPU_FLAGS: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->dst_cpu_flags = arg; + ctx->src_cpu_flags = test_cpu_caps(); + break; + case CPT_SUSPEND: + if (cpt_context_lookup_veid(ctx->ve_id) || + ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ctx_state = CPT_CTX_SUSPENDING; + try = 0; + do { + err = cpt_vps_suspend(ctx); + if (err) + cpt_resume(ctx); + if (err == -EAGAIN) + msleep(1000); + try++; + } while (err == -EAGAIN && try < 3); + if (err) { + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_SUSPENDED; + } + break; + case CPT_DUMP: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = cpt_dump(ctx); + break; + case CPT_RESUME: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_TEST_VECAPS: + { + __u32 dst_flags = arg; + __u32 src_flags; + + err = cpt_vps_caps(ctx, &src_flags); + if (err) + break; + + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); + if (src_flags & CPT_UNSUPPORTED_MASK) + err = 1; + break; + } + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + return err; +} + +static int cpt_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int cpt_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + module_put(THIS_MODULE); + return 0; +} + + +static struct file_operations cpt_fops = { + .owner = THIS_MODULE, + .open = cpt_open, + .release = cpt_release, + .ioctl = cpt_ioctl, +}; + +static struct proc_dir_entry *proc_ent; + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .ctl_name = 9475, + .procname = "cpt", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + 
{ + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_cpt(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table, 0); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = create_proc_entry_mod("cpt", 0600, NULL, THIS_MODULE); + if (!proc_ent) + goto err_out; + + cpt_fops.read = proc_ent->proc_fops->read; + cpt_fops.write = proc_ent->proc_fops->write; + cpt_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &cpt_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_cpt); + +static void __exit exit_cpt(void) +{ + remove_proc_entry("cpt", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_cpt); diff -uprN linux-2.6.18/kernel/cpt/cpt_process.c linux-2.6.18.ovz/kernel/cpt/cpt_process.c --- linux-2.6.18/kernel/cpt/cpt_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/cpt_process.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1313 @@ +/* + * + * kernel/cpt/cpt_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + +#ifdef CONFIG_X86_32 +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) +#endif + +#ifdef CONFIG_X86 + +static u32 encode_segment(u32 segreg) +{ + segreg &= 0xFFFF; + + if (segreg == 0) + return CPT_SEG_ZERO; + if ((segreg & 3) != 3) { + wprintk("Invalid RPL of a segment reg %x\n", segreg); + return CPT_SEG_ZERO; + } + + /* LDT descriptor, it is just an index to LDT array */ + if (segreg & 4) + return CPT_SEG_LDT + (segreg >> 3); + + /* TLS descriptor. 
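+	 * Encoded below as CPT_SEG_TLS1 plus the slot number relative
+	 * to GDT_ENTRY_TLS_MIN.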
+	 */
+	if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
+	    (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
+		return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
+
+	/* One of the standard descriptors */
+#ifdef CONFIG_X86_64
+	if (segreg == __USER32_DS)
+		return CPT_SEG_USER32_DS;
+	if (segreg == __USER32_CS)
+		return CPT_SEG_USER32_CS;
+	if (segreg == __USER_DS)
+		return CPT_SEG_USER64_DS;
+	if (segreg == __USER_CS)
+		return CPT_SEG_USER64_CS;
+#else
+	if (segreg == __USER_DS)
+		return CPT_SEG_USER32_DS;
+	if (segreg == __USER_CS)
+		return CPT_SEG_USER32_CS;
+#endif
+	wprintk("Invalid segment reg %x\n", segreg);
+	return CPT_SEG_ZERO;
+}
+
+#ifdef CONFIG_X86_64
+static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s,
+		struct task_struct *tsk)
+{
+	d->cpt_ebp = s->rbp;
+	d->cpt_ebx = s->rbx;
+	d->cpt_eax = s->rax;
+	d->cpt_ecx = s->rcx;
+	d->cpt_edx = s->rdx;
+	d->cpt_esi = s->rsi;
+	d->cpt_edi = s->rdi;
+	d->cpt_orig_eax = s->orig_rax;
+	d->cpt_eip = s->rip;
+	d->cpt_xcs = encode_segment(s->cs);
+	d->cpt_eflags = s->eflags;
+	d->cpt_esp = s->rsp;
+	d->cpt_xss = encode_segment(s->ss);
+	d->cpt_xds = encode_segment(tsk->thread.ds);
+	d->cpt_xes = encode_segment(tsk->thread.es);
+}
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	cpt_open_object(NULL, ctx);
+
+	if (tsk->thread_info->flags&_TIF_IA32) {
+		struct cpt_x86_regs ri;
+		ri.cpt_next = sizeof(ri);
+		ri.cpt_object = CPT_OBJ_X86_REGS;
+		ri.cpt_hdrlen = sizeof(ri);
+		ri.cpt_content = CPT_CONTENT_VOID;
+
+		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+		ri.cpt_debugreg[4] = 0;
+		ri.cpt_debugreg[5] = 0;
+		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+		ri.cpt_fs = encode_segment(tsk->thread.fsindex);
+		ri.cpt_gs = encode_segment(tsk->thread.gsindex);
+
+		xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
+
+		ctx->write(&ri, sizeof(ri), ctx);
+	} else {
+		struct cpt_x86_64_regs ri;
+		ri.cpt_next = sizeof(ri);
+		ri.cpt_object = CPT_OBJ_X86_64_REGS;
+		ri.cpt_hdrlen = sizeof(ri);
+		ri.cpt_content = CPT_CONTENT_VOID;
+
+		ri.cpt_fsbase = tsk->thread.fs;
+		ri.cpt_gsbase = tsk->thread.gs;
+		ri.cpt_fsindex = encode_segment(tsk->thread.fsindex);
+		ri.cpt_gsindex = encode_segment(tsk->thread.gsindex);
+		ri.cpt_ds = encode_segment(tsk->thread.ds);
+		ri.cpt_es = encode_segment(tsk->thread.es);
+		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+		ri.cpt_debugreg[4] = 0;
+		ri.cpt_debugreg[5] = 0;
+		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+		memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs));
+
+		ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs);
+		ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss);
+
+		ctx->write(&ri, sizeof(ri), ctx);
+
+#if 0
+		if (ri.cpt_rip >= VSYSCALL_START && ri.cpt_rip < VSYSCALL_END) {
+			eprintk_ctx(CPT_FID "cannot be checkpointed while in vsyscall, try later\n", CPT_TID(tsk));
+			return -EAGAIN;
+		}
+#endif
+	}
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+#else
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	struct cpt_x86_regs ri;
+
+	cpt_open_object(NULL, ctx);
+
+	ri.cpt_next = sizeof(ri);
+	ri.cpt_object = CPT_OBJ_X86_REGS;
+	ri.cpt_hdrlen = sizeof(ri);
+	ri.cpt_content = CPT_CONTENT_VOID;
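+	/* Hardware debug registers and segment selectors are saved field
+	 * by field; the general-purpose registers are then copied below
+	 * in one block straight from the task's kernel-mode pt_regs. */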
+ + ri.cpt_debugreg[0] = tsk->thread.debugreg[0]; + ri.cpt_debugreg[1] = tsk->thread.debugreg[1]; + ri.cpt_debugreg[2] = tsk->thread.debugreg[2]; + ri.cpt_debugreg[3] = tsk->thread.debugreg[3]; + ri.cpt_debugreg[4] = tsk->thread.debugreg[4]; + ri.cpt_debugreg[5] = tsk->thread.debugreg[5]; + ri.cpt_debugreg[6] = tsk->thread.debugreg[6]; + ri.cpt_debugreg[7] = tsk->thread.debugreg[7]; + ri.cpt_fs = encode_segment(tsk->thread.fs); + ri.cpt_gs = encode_segment(tsk->thread.gs); + + memcpy(&ri.cpt_ebx, task_pt_regs(tsk), sizeof(struct pt_regs)); + + ri.cpt_xcs = encode_segment(task_pt_regs(tsk)->xcs); + ri.cpt_xss = encode_segment(task_pt_regs(tsk)->xss); + ri.cpt_xds = encode_segment(task_pt_regs(tsk)->xds); + ri.cpt_xes = encode_segment(task_pt_regs(tsk)->xes); + + ctx->write(&ri, sizeof(ri), ctx); + cpt_close_object(ctx); + + return 0; +} +#endif +#endif + +#ifdef CONFIG_IA64 + +/* + PMD? + */ + +#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \ + CPT_TID(tsk), err); return -EINVAL; } } while (0) + +static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk, + struct cpt_context *ctx) +{ + int err; + struct unw_frame_info info; + struct ia64_fpreg fpval; + int i; + + unw_init_from_blocked_task(&info, tsk); + _C(unw_unwind_to_user(&info)); + + /* NAT_BITS */ + do { + unsigned long scratch_unat; + + scratch_unat = info.sw->caller_unat; + if (info.pri_unat_loc) + scratch_unat = *info.pri_unat_loc; + + r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat); + /* Just to be on safe side. */ + r->nat[0] &= 0xFFFFFFFFUL; + } while (0); + + /* R4-R7 */ + for (i = 4; i <= 7; i++) { + char nat = 0; + _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0)); + r->nat[0] |= (nat != 0) << i; + } + + /* B1-B5 */ + for (i = 1; i <= 5; i++) { + _C(unw_access_br(&info, i, &r->br[i], 0)); + } + + /* AR_EC, AR_LC */ + _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0)); + _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0)); + + /* F2..F5, F16..F31 */ + for (i = 2; i <= 5; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + for (i = 16; i <= 31; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + return 0; +} + +#undef _C + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + int err; + unsigned long pg; + struct cpt_ia64_regs *r; + struct ia64_psr *psr; + struct switch_stack *sw; + struct pt_regs *pt; + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (tsk->exit_state) + return 0; + + pt = task_pt_regs(tsk); + + sw = (struct switch_stack *) (tsk->thread.ksp + 16); + + if ((pg = __get_free_page(GFP_KERNEL)) == 0) + return -ENOMEM; + + r = (void*)pg; + /* To catch if we forgot some register */ + memset(r, 0xA5, sizeof(*r)); + + r->gr[0] = 0; + r->fr[0] = r->fr[1] = 0; + r->fr[2] = 0x8000000000000000UL; + r->fr[3] = 0xffff; + + r->nat[0] = r->nat[1] = 0; + + err = ass_to_mouth(r, tsk, ctx); + if (err) { + printk("ass_to_mouth error %d\n", err); + goto out; + } + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&r->gr[1], &pt->r1, 8*(2-1)); + memcpy(&r->gr[2], &pt->r2, 8*(4-2)); + memcpy(&r->gr[8], &pt->r8, 8*(12-8)); + memcpy(&r->gr[12], &pt->r12, 8*(14-12)); + memcpy(&r->gr[14], &pt->r14, 8*(15-14)); + memcpy(&r->gr[15], &pt->r15, 8*(16-15)); + memcpy(&r->gr[16], &pt->r16, 8*(32-16)); + + r->br[0] = pt->b0; + r->br[6] = pt->b6; + r->br[7] = pt->b7; + + r->ar_bspstore = pt->ar_bspstore; + r->ar_unat = pt->ar_unat; + r->ar_pfs = pt->ar_pfs; + 
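+	/* The application registers kept in pt_regs are copied one by
+	 * one below; the stacked RSE registers are handled further down. */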
+	r->ar_ccv = pt->ar_ccv;
+	r->ar_fpsr = pt->ar_fpsr;
+	r->ar_csd = pt->ar_csd;
+	r->ar_ssd = pt->ar_ssd;
+	r->ar_rsc = pt->ar_rsc;
+
+	r->cr_iip = pt->cr_iip;
+	r->cr_ipsr = pt->cr_ipsr;
+
+	r->pr = pt->pr;
+
+	r->cfm = pt->cr_ifs;
+	r->ar_rnat = pt->ar_rnat;
+
+	/* fpregs 6..9,10..11 are in pt_regs */
+	memcpy(&r->fr[2*6], &pt->f6, 16*(10-6));
+	memcpy(&r->fr[2*10], &pt->f10, 16*(12-10));
+	/* fpreg 12..15 are on switch stack */
+	memcpy(&r->fr[2*12], &sw->f12, 16*(16-12));
+	/* fpregs 32...127 */
+	psr = ia64_psr(task_pt_regs(tsk));
+	preempt_disable();
+	if (ia64_is_local_fpu_owner(tsk) && psr->mfh) {
+		psr->mfh = 0;
+		tsk->thread.flags |= IA64_THREAD_FPH_VALID;
+		ia64_save_fpu(&tsk->thread.fph[0]);
+	}
+	preempt_enable();
+	memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32));
+
+	if (tsk->thread.flags & IA64_THREAD_DBG_VALID) {
+		memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr));
+		memcpy(r->dbr, tsk->thread.dbr, sizeof(r->dbr));
+	} else {
+		memset(r->ibr, 0, sizeof(r->ibr));
+		memset(r->dbr, 0, sizeof(r->dbr));
+	}
+
+	r->loadrs = pt->loadrs;
+	r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19));
+	if ((long)pt->cr_ifs > 0)
+		r->num_regs += (pt->cr_ifs & 0x7f);
+
+	if (r->num_regs > 96) {
+		eprintk_ctx(CPT_FID " too many RSE regs %lu\n",
+			    CPT_TID(tsk), r->num_regs);
+		err = -EINVAL;
+		goto out;
+	}
+
+	for (reg = 0; reg < r->num_regs; reg++) {
+		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+		unsigned long *rnatp = ia64_rse_rnat_addr(ptr);
+
+		r->gr[32+reg] = *ptr;
+
+		if ((unsigned long)rnatp >= sw->ar_bspstore)
+			rnatp = &sw->ar_rnat;
+		if (*rnatp & (1UL<<ia64_rse_slot_num(ptr))) {
+			if (reg < 32)
+				r->nat[0] |= (1UL<<(reg+32));
+			else
+				r->nat[1] |= (1UL<<(reg-32));
+		}
+	}
+	if (r->nat[0] | r->nat[1])
+		wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk),
+			    r->nat[1], r->nat[0]);
+
+	cpt_open_object(NULL, ctx);
+	r->cpt_next = sizeof(*r);
+	r->cpt_object = CPT_OBJ_IA64_REGS;
+	r->cpt_hdrlen = sizeof(*r);
+	r->cpt_content = CPT_CONTENT_VOID;
+	ctx->write(r, sizeof(*r), ctx);
+	cpt_close_object(ctx);
+	err = 0;
+
+out:
+	free_page(pg);
+	return err;
+}
+#endif
+
+static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx)
+{
+	struct cpt_obj_bits hdr;
+	unsigned long size;
+	void *start;
+
+	cpt_open_object(NULL, ctx);
+
+#ifdef CONFIG_X86_64
+	size = tsk->thread.rsp0 - tsk->thread.rsp;
+	start = (void*)tsk->thread.rsp;
+#elif defined(CONFIG_X86_32)
+	size = tsk->thread.esp0 - tsk->thread.esp;
+	start = (void*)tsk->thread.esp;
+#elif defined(CONFIG_IA64)
+	size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp;
+	start = (void*)tsk->thread.ksp;
+#else
+#error Arch is not supported
+#endif
+
+	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
+	hdr.cpt_object = CPT_OBJ_BITS;
+	hdr.cpt_hdrlen = sizeof(hdr);
+	hdr.cpt_content = CPT_CONTENT_STACK;
+	hdr.cpt_size = size;
+
+	ctx->write(&hdr, sizeof(hdr), ctx);
+	ctx->write(start, size, ctx);
+	ctx->align(ctx);
+	cpt_close_object(ctx);
+	return 0;
+}
+
+#ifdef CONFIG_X86
+/* Formats of i387_fxsave_struct are the same for x86_64
+ * and i386. Plain luck. 
*/ + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + int type; + + cpt_open_object(NULL, ctx); + + type = CPT_CONTENT_X86_FPUSTATE; + size = sizeof(struct i387_fxsave_struct); +#ifndef CONFIG_X86_64 + if (!cpu_has_fxsr) { + size = sizeof(struct i387_fsave_struct); + type = CPT_CONTENT_X86_FPUSTATE_OLD; + } +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = type; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(&tsk->thread.i387, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} +#endif + +#ifdef CONFIG_IA64 + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + return 0; +} +#endif + +static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) +{ + si->cpt_signo = info->si_signo; + si->cpt_errno = info->si_errno; + si->cpt_code = info->si_code; + + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + si->cpt_pid = info->si_tid; + si->cpt_uid = info->si_overrun; + si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); + si->cpt_utime = info->si_sys_private; + break; + case __SI_POLL: + si->cpt_pid = info->si_band; + si->cpt_uid = info->si_fd; + break; + case __SI_FAULT: + si->cpt_sigval = cpt_ptr_export(info->si_addr); +#ifdef __ARCH_SI_TRAPNO + si->cpt_pid = info->si_trapno; +#endif + break; + case __SI_CHLD: + si->cpt_pid = is_virtual_pid(info->si_pid) ? info->si_pid : pid_to_vpid(info->si_pid); + si->cpt_uid = info->si_uid; + si->cpt_sigval = info->si_status; + si->cpt_stime = info->si_stime; + si->cpt_utime = info->si_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + si->cpt_pid = is_virtual_pid(info->si_pid) ? 
info->si_pid : pid_to_vpid(info->si_pid); + si->cpt_uid = info->si_uid; + si->cpt_sigval = cpt_ptr_export(info->si_ptr); + break; + } + return 0; +} + +static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) +{ + struct sigqueue *q; + loff_t saved_obj; + + if (list_empty(&list->list)) + return 0; + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(q, &list->list, list) { + struct cpt_siginfo_image si; + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_qflags = q->flags; + si.cpt_user = q->user->uid; + + if (encode_siginfo(&si, &q->info)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + } + cpt_pop_object(&saved_obj, ctx); + return 0; +} + + + +static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct signal_struct *sig = obj->o_obj; + struct cpt_signal_image *v = cpt_get_buf(ctx); + struct task_struct *tsk; + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if (sig->pgrp <= 0) { + eprintk_ctx("bad pgid\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->pgrp); + if (tsk == NULL) + v->cpt_pgrp_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_pgrp = pid_to_vpid(sig->pgrp); + + v->cpt_old_pgrp = 0; + if (sig->tty_old_pgrp < 0) { + eprintk_ctx("bad tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + if (sig->tty_old_pgrp > 0) { + v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->tty_old_pgrp); + if (tsk == NULL) { + v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; + tsk = find_task_by_pid_type_ve(PIDTYPE_PGID, sig->tty_old_pgrp); + } + read_unlock(&tasklist_lock); + if (tsk == NULL) { + eprintk_ctx("tty_old_pgrp does not exist anymore\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_old_pgrp = _pid_to_vpid(sig->tty_old_pgrp); + if ((int)v->cpt_old_pgrp < 0) { + dprintk_ctx("stray tty_old_pgrp %d\n", sig->tty_old_pgrp); + v->cpt_old_pgrp = -1; + v->cpt_old_pgrp_type = CPT_PGRP_STRAY; + } + } + + if (sig->session <= 0) { + eprintk_ctx("bad session\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_session_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->session); + if (tsk == NULL) + v->cpt_session_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_session = pid_to_vpid(sig->session); + + v->cpt_leader = sig->leader; + v->cpt_ctty = CPT_NULL; + if (sig->tty) { + cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); + if (cobj) + v->cpt_ctty = cobj->o_pos; + else { + eprintk_ctx("controlling tty is not found\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); + + v->cpt_curr_target = 0; + if (sig->curr_target) + v->cpt_curr_target = virt_pid(sig->curr_target); + v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); + v->cpt_group_exit_code = sig->group_exit_code; + v->cpt_group_exit_task = 0; + if (sig->group_exit_task) + v->cpt_group_exit_task = virt_pid(sig->group_exit_task); + v->cpt_notify_count = sig->notify_count; + v->cpt_group_stop_count = sig->group_stop_count; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) + v->cpt_utime = 
sig->utime;
+	v->cpt_stime = sig->stime;
+	v->cpt_cutime = sig->cutime;
+	v->cpt_cstime = sig->cstime;
+	v->cpt_nvcsw = sig->nvcsw;
+	v->cpt_nivcsw = sig->nivcsw;
+	v->cpt_cnvcsw = sig->cnvcsw;
+	v->cpt_cnivcsw = sig->cnivcsw;
+	v->cpt_min_flt = sig->min_flt;
+	v->cpt_maj_flt = sig->maj_flt;
+	v->cpt_cmin_flt = sig->cmin_flt;
+	v->cpt_cmaj_flt = sig->cmaj_flt;
+
+	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	dump_sigqueue(&sig->shared_pending, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+
+int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
+{
+	if (tsk->splice_pipe) {
+		eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#ifdef CONFIG_KEYS
+	if (tsk->request_key_auth || tsk->thread_keyring) {
+		eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_NUMA
+	if (tsk->mempolicy) {
+		eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_TUX
+	if (tsk->tux_info) {
+		eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+	return 0;
+}
+
+static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct task_struct *tsk = obj->o_obj;
+	int last_thread;
+	struct cpt_task_image *v = cpt_get_buf(ctx);
+	cpt_object_t *tobj;
+	cpt_object_t *tg_obj;
+	loff_t saved_obj;
+	int i;
+	int err;
+	struct timespec delta;
+	struct mm_struct * tsk_mm;
+	struct files_struct * tsk_files;
+	struct fs_struct * tsk_fs;
+	struct namespace * tsk_ns;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_signal = CPT_NULL;
+	tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
+	if (!tg_obj) BUG();
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_TASK;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_state = tsk->state;
+	if (tsk->state == EXIT_ZOMBIE) {
+		eprintk_ctx("invalid zombie state on " CPT_FID "\n", CPT_TID(tsk));
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	} else if (tsk->state == EXIT_DEAD) {
+		if (tsk->exit_state != EXIT_DEAD &&
+		    tsk->exit_state != EXIT_ZOMBIE) {
+			eprintk_ctx("invalid exit_state %ld on " CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	if (tsk->exit_state) {
+		v->cpt_state = tsk->exit_state;
+		if (tsk->state != EXIT_DEAD) {
+			eprintk_ctx("invalid tsk->state %ld/%ld on " CPT_FID "\n",
+				tsk->state, tsk->exit_state, CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	if (cpt_check_unsupported(tsk, ctx)) {
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+
+	v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART);
+	v->cpt_ptrace = tsk->ptrace;
+	v->cpt_prio = tsk->prio;
+	v->cpt_exit_code = tsk->exit_code;
+	v->cpt_exit_signal = tsk->exit_signal;
+	v->cpt_pdeath_signal = tsk->pdeath_signal;
+	v->cpt_static_prio = tsk->static_prio;
+	v->cpt_rt_priority = tsk->rt_priority;
+	v->cpt_policy = tsk->policy;
+	if (v->cpt_policy != SCHED_NORMAL) {
+		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	/* Unpleasant moment. When the leader of a thread group exits,
+	 * it remains in zombie state until the whole group exits.
+	 * We save non-NULL pointers to the group's mm/files/fs, so
+	 * that we can restore this thread group.
+	 */
+	tsk_mm = tsk->mm;
+	tsk_files = tsk->files;
+	tsk_fs = tsk->fs;
+	tsk_ns = tsk->nsproxy ? tsk->nsproxy->namespace : NULL;
+
+	if (tsk->exit_state && !thread_group_empty(tsk) &&
+	    thread_group_leader(tsk)) {
+		struct task_struct * p = tsk;
+
+		read_lock(&tasklist_lock);
+		do {
+			if (p->mm)
+				tsk_mm = p->mm;
+			if (p->files)
+				tsk_files = p->files;
+			if (p->fs)
+				tsk_fs = p->fs;
+			if (p->nsproxy && p->nsproxy->namespace)
+				tsk_ns = p->nsproxy->namespace;
+			p = next_thread(p);
+		} while (p != tsk);
+		read_unlock(&tasklist_lock);
+	}
+
+	v->cpt_mm = CPT_NULL;
+	if (tsk_mm) {
+		tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx);
+		if (!tobj) BUG();
+		v->cpt_mm = tobj->o_pos;
+	}
+	v->cpt_files = CPT_NULL;
+	if (tsk_files) {
+		tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx);
+		if (!tobj) BUG();
+		v->cpt_files = tobj->o_pos;
+	}
+	v->cpt_fs = CPT_NULL;
+	if (tsk_fs) {
+		tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx);
+		if (!tobj) BUG();
+		v->cpt_fs = tobj->o_pos;
+	}
+	v->cpt_namespace = CPT_NULL;
+	if (tsk_ns) {
+		tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx);
+		if (!tobj) BUG();
+		v->cpt_namespace = tobj->o_pos;
+
+		if (tsk_ns != current->nsproxy->namespace)
+			eprintk_ctx("namespaces are not supported: "
+					"process " CPT_FID "\n", CPT_TID(tsk));
+	}
+	v->cpt_sysvsem_undo = CPT_NULL;
+	if (tsk->sysvsem.undo_list && !tsk->exit_state) {
+		tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx);
+		if (!tobj) BUG();
+		v->cpt_sysvsem_undo = tobj->o_pos;
+	}
+	v->cpt_sighand = CPT_NULL;
+	if (tsk->sighand) {
+		tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx);
+		if (!tobj) BUG();
+		v->cpt_sighand = tobj->o_pos;
+	}
+	v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked);
+	v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked);
+	v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask);
+
+	v->cpt_pid = virt_pid(tsk);
+	v->cpt_tgid = virt_tgid(tsk);
+	v->cpt_ppid = 0;
+	if (tsk->parent) {
+		if (tsk->parent != tsk->real_parent &&
+		    !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) {
+			eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, virt_pid(tsk), tsk->comm);
+			cpt_release_buf(ctx);
+			return -EBUSY;
+		}
+		v->cpt_ppid = virt_pid(tsk->parent);
+	}
+	v->cpt_rppid = tsk->real_parent ? virt_pid(tsk->real_parent) : 0;
+	v->cpt_pgrp = virt_pgid(tsk);
+	v->cpt_session = virt_sid(tsk);
+	v->cpt_old_pgrp = 0;
+	if (tsk->signal->tty_old_pgrp)
+		v->cpt_old_pgrp = _pid_to_vpid(tsk->signal->tty_old_pgrp);
+	v->cpt_leader = tsk->group_leader ? virt_pid(tsk->group_leader) : 0;
+	v->cpt_set_tid = (unsigned long)tsk->set_child_tid;
+	v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid;
+	memcpy(v->cpt_comm, tsk->comm, 16);
+	v->cpt_user = tsk->user->uid;
+	v->cpt_uid = tsk->uid;
+	v->cpt_euid = tsk->euid;
+	v->cpt_suid = tsk->suid;
+	v->cpt_fsuid = tsk->fsuid;
+	v->cpt_gid = tsk->gid;
+	v->cpt_egid = tsk->egid;
+	v->cpt_sgid = tsk->sgid;
+	v->cpt_fsgid = tsk->fsgid;
+	v->cpt_ngids = 0;
+	if (tsk->group_info && tsk->group_info->ngroups != 0) {
+		int i = tsk->group_info->ngroups;
+		if (i > 32) {
+			/* Shame... I did a simplified version and _forgot_
+			 * about this. Later, later.
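+			 * For now a task with more than 32 supplementary
+			 * groups simply cannot be dumped.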
+			 */
+			eprintk_ctx("too many groups " CPT_FID "\n", CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_ngids = i;
+		for (i--; i>=0; i--)
+			v->cpt_gids[i] = tsk->group_info->small_block[i];
+	}
+	v->cpt_prctl_uac = 0;
+	v->cpt_prctl_fpemu = 0;
+	v->__cpt_pad1 = 0;
+#ifdef CONFIG_IA64
+	v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
+	v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
+#endif
+	memcpy(&v->cpt_ecap, &tsk->cap_effective, 8);
+	memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8);
+	memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8);
+	v->cpt_keepcap = tsk->keep_capabilities;
+
+	v->cpt_did_exec = tsk->did_exec;
+	v->cpt_exec_domain = -1;
+	v->cpt_thrflags = tsk->thread_info->flags & ~(1<<TIF_FREEZE);
+	v->cpt_64bit = 0;
+#ifdef CONFIG_X86_64
+	/* Clear x86_64 specific flags */
+	v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+	if (!(tsk->thread_info->flags & _TIF_IA32)) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+#ifdef CONFIG_IA64
+	/* Clear ia64 specific flags */
+	//// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+	if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+	v->cpt_thrstatus = tsk->thread_info->status;
+	v->cpt_addr_limit = -1;
+
+	v->cpt_personality = tsk->personality;
+
+#ifdef CONFIG_X86
+	for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
+		if (i>=3) {
+			eprintk_ctx("too many tls descs\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+#ifndef CONFIG_X86_64
+		v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
+#else
+		v->cpt_tls[i] = tsk->thread.tls_array[i];
+#endif
+	}
+#endif
+
+	v->cpt_restart.fn = CPT_RBL_0;
+	if (tsk->thread_info->restart_block.fn != current->thread_info->restart_block.fn) {
+		ktime_t e;
+
+		if (tsk->thread_info->restart_block.fn != nanosleep_restart
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+		    && tsk->thread_info->restart_block.fn != compat_nanosleep_restart
+#endif
+		    ) {
+			eprintk_ctx("unknown restart block %p\n", tsk->thread_info->restart_block.fn);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+		if (tsk->thread_info->restart_block.fn == compat_nanosleep_restart)
+			v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
+#endif
+
+		e.tv64 = ((u64)tsk->thread_info->restart_block.arg1 << 32) |
+			 (u64) tsk->thread_info->restart_block.arg0;
+		e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+		v->cpt_restart.arg0 = ktime_to_ns(e);
+		v->cpt_restart.arg1 = 0;
+		v->cpt_restart.arg2 = tsk->thread_info->restart_block.arg2;
+		v->cpt_restart.arg3 = tsk->thread_info->restart_block.arg3;
+		dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+	}
+
+	v->cpt_it_real_incr = 0;
+	v->cpt_it_prof_incr = 0;
+	v->cpt_it_virt_incr = 0;
+	v->cpt_it_real_value = 0;
+	v->cpt_it_prof_value = 0;
+	v->cpt_it_virt_value = 0;
+	if (thread_group_leader(tsk) && tsk->exit_state == 0) {
+		ktime_t rem;
+
+		v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
+		v->cpt_it_prof_incr = tsk->signal->it_prof_incr;
+		v->cpt_it_virt_incr = tsk->signal->it_virt_incr;
+
+		rem = hrtimer_get_remaining(&tsk->signal->real_timer);
+
+		if (hrtimer_active(&tsk->signal->real_timer)) {
+			if (rem.tv64 <= 0)
+				rem.tv64 = NSEC_PER_USEC;
+			v->cpt_it_real_value = ktime_to_ns(rem);
+			dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
+		}
+		v->cpt_it_prof_value = tsk->signal->it_prof_expires;
+		v->cpt_it_virt_value = tsk->signal->it_virt_expires;
+	}
+	v->cpt_used_math = (tsk_used_math(tsk) != 0);
+
+	if (tsk->notifier) {
+		eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	v->cpt_utime = tsk->utime;
+	v->cpt_stime = tsk->stime;
+	delta = tsk->start_time;
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
+			delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
+	v->cpt_starttime = cpt_timespec_export(&delta);
+	v->cpt_nvcsw = tsk->nvcsw;
+	v->cpt_nivcsw = tsk->nivcsw;
+	v->cpt_min_flt = tsk->min_flt;
+	v->cpt_maj_flt = tsk->maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+	v->cpt_cutime = tsk->cutime;
+	v->cpt_cstime = tsk->cstime;
+	v->cpt_cnvcsw = tsk->cnvcsw;
+	v->cpt_cnivcsw = tsk->cnivcsw;
+	v->cpt_cmin_flt = tsk->cmin_flt;
+	v->cpt_cmaj_flt = tsk->cmaj_flt;
+
+	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#else
+	v->cpt_cutime = tsk->signal->cutime;
+	v->cpt_cstime = tsk->signal->cstime;
+	v->cpt_cnvcsw = tsk->signal->cnvcsw;
+	v->cpt_cnivcsw = tsk->signal->cnivcsw;
+	v->cpt_cmin_flt = tsk->signal->cmin_flt;
+	v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
+
+	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+#ifdef CONFIG_USER_RESOURCE
+	if (tsk->mm)
+		v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
+	else
+		v->cpt_mm_ub = CPT_NULL;
+	v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
+	v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
+	v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
+#endif
+
+	v->cpt_ptrace_message = tsk->ptrace_message;
+	v->cpt_pn_state = tsk->pn_state;
+	v->cpt_stopped_state = tsk->stopped_state;
+	v->cpt_sigsuspend_state = 0;
+
+#ifdef CONFIG_X86_32
+	if (tsk->thread.vm86_info) {
+		eprintk_ctx("vm86 task is running\n");
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+#endif
+
+	v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	dump_kstack(tsk, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	err = dump_registers(tsk, ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	if (err)
+		return err;
+
+	if (tsk_used_math(tsk)) {
+		cpt_push_object(&saved_obj, ctx);
+		dump_fpustate(tsk, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	if (tsk->last_siginfo) {
+		struct cpt_siginfo_image si;
+		cpt_push_object(&saved_obj, ctx);
+
+		si.cpt_next = sizeof(si);
+		si.cpt_object = CPT_OBJ_LASTSIGINFO;
+		si.cpt_hdrlen = sizeof(si);
+		si.cpt_content = CPT_CONTENT_VOID;
+
+		if (encode_siginfo(&si, tsk->last_siginfo))
+			return -EINVAL;
+
+		ctx->write(&si, sizeof(si), ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	if (tsk->sas_ss_size) {
+		struct cpt_sigaltstack_image si;
+		cpt_push_object(&saved_obj, ctx);
+
+		si.cpt_next = sizeof(si);
+		si.cpt_object = CPT_OBJ_SIGALTSTACK;
+		si.cpt_hdrlen = sizeof(si);
+		si.cpt_content = CPT_CONTENT_VOID;
+
+		si.cpt_stack = tsk->sas_ss_sp;
+		si.cpt_stacksize = tsk->sas_ss_size;
+	if (tsk->robust_list
+#ifdef CONFIG_COMPAT
+	    || tsk->compat_robust_list
+#endif
+	    ) {
+		struct cpt_task_aux_image ai;
+		cpt_push_object(&saved_obj, ctx);
+
+		ai.cpt_next = sizeof(ai);
+		ai.cpt_object = CPT_OBJ_TASK_AUX;
+		ai.cpt_hdrlen = sizeof(ai);
+		ai.cpt_content = CPT_CONTENT_VOID;
+
+		ai.cpt_robust_list = (unsigned long)tsk->robust_list;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_COMPAT
+		if (tsk->thread_info->flags&_TIF_IA32)
+			ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list;
+#endif
+#endif
+		ctx->write(&ai, sizeof(ai), ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+
+	dump_sigqueue(&tsk->pending, ctx);
+
+	last_thread = 1;
+	read_lock(&tasklist_lock);
+	do {
+		struct task_struct * next = next_thread(tsk);
+		if (next != tsk && !thread_group_leader(next))
+			last_thread = 0;
+	} while (0);
+	read_unlock(&tasklist_lock);
+
+	if (last_thread) {
+		struct task_struct *prev_tsk;
+		int err;
+		loff_t pos = ctx->file->f_pos;
+
+		cpt_push_object(&saved_obj, ctx);
+		err = dump_one_signal_struct(tg_obj, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+		if (err)
+			return err;
+
+		prev_tsk = tsk;
+		for (;;) {
+			if (prev_tsk->tgid == tsk->tgid) {
+				loff_t tg_pos;
+
+				tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal);
+				ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos);
+				if (thread_group_leader(prev_tsk))
+					break;
+			}
+
+			if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) {
+				eprintk_ctx("bug: thread group leader is lost\n");
+				return -EINVAL;
+			}
+
+			obj = list_entry(obj->o_list.prev, cpt_object_t, o_list);
+			prev_tsk = obj->o_obj;
+		}
+	}
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_tasks(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_TASKS);
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		int err;
+
+		if ((err = dump_one_process(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
+
+int cpt_collect_signals(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	/* Collect process fd sets */
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) {
+			eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, virt_pid(tsk), tsk->comm);
+			return -EBUSY;
+		}
+		if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL)
+			return -ENOMEM;
+		if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+
+static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct sighand_struct *sig = obj->o_obj;
+	struct cpt_sighand_image *v = cpt_get_buf(ctx);
+	int i;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SIGHAND_STRUCT;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	for (i=0; i< _NSIG; i++) {
+		if (sig->action[i].sa.sa_handler != SIG_DFL ||
+		    sig->action[i].sa.sa_flags) {
+			loff_t saved_obj;
+			struct cpt_sighandler_image *o = cpt_get_buf(ctx);
+
+			cpt_push_object(&saved_obj, ctx);
+			cpt_open_object(NULL, ctx);
+
+			o->cpt_next = CPT_NULL;
+			o->cpt_object = CPT_OBJ_SIGHANDLER;
+			o->cpt_hdrlen = sizeof(*o);
+			o->cpt_content = CPT_CONTENT_VOID;
+
+			o->cpt_signo = i;
+			o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler;
+			o->cpt_restorer = 0;
+#ifdef CONFIG_X86
+			o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer;
+#endif
+			o->cpt_flags = sig->action[i].sa.sa_flags;
+			memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8);
+			ctx->write(o, sizeof(*o), ctx);
+			cpt_release_buf(ctx);
+			cpt_close_object(ctx);
+			cpt_pop_object(&saved_obj, ctx);
+		}
+	}
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_sighand(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT);
+
+	for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) {
+		int err;
+
+		if ((err = dump_one_sighand_struct(obj, ctx)) != 0)
+			return err;
+	}
+
+	cpt_close_section(ctx);
+	return 0;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_process.h linux-2.6.18.ovz/kernel/cpt/cpt_process.h
--- linux-2.6.18/kernel/cpt/cpt_process.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_process.h 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,10 @@
+int cpt_collect_signals(cpt_context_t *);
+int cpt_dump_signal(struct cpt_context *);
+int cpt_dump_sighand(struct cpt_context *);
+int cpt_dump_tasks(struct cpt_context *);
+
+int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx);
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int rst_restore_process(struct cpt_context *ctx);
+int rst_process_linkage(struct cpt_context *ctx);
diff -uprN linux-2.6.18/kernel/cpt/cpt_socket.c linux-2.6.18.ovz/kernel/cpt/cpt_socket.c
--- linux-2.6.18/kernel/cpt/cpt_socket.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_socket.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,783 @@
+/*
+ *
+ *  kernel/cpt/cpt_socket.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+
+static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
+
+
+/* Sockets are quite different from other kinds of files.
+ * There is one simplification: only one struct file can refer to a socket,
+ * so we could store information about a socket directly in section FILES as
+ * a description of a file and append e.g. an array of the not-yet-accepted
+ * connections of a listening socket as auxiliary data.
+ *
+ * Complications are:
+ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
+ *    so we have to create a special section for orphans.
+ * 2. AF_UNIX sockets are distinguished objects: the set of links between
+ *    AF_UNIX sockets is quite arbitrary.
+ *    A. Each socket can refer to many files due to FD passing.
+ *    B. Each socket except for connected ones can have skbs in its queue
+ *       sent by any of the sockets.
+ *
+ * 2A is relatively easy: after our tasks are frozen we make an additional
+ * recursive pass through the set of collected files and pick up references
+ * to FD-passed files. After the end of the recursion, all the files are
+ * treated in the same way. They will all be stored in section FILES.
+ *
+ * 2B. We have to resolve all those references at some point.
+ *     It is the place where the pipe-like approach to the image fails.
+ *
+ * All this makes socket checkpointing quite cumbersome.
+ * Right now we collect all the sockets and assign some numeric index value
+ * to each of them. The socket section is separate and put after section FILES,
+ * so section FILES refers to sockets by index, section SOCKET refers to FILES
+ * as usual by position in the image. All the refs inside the socket section are
+ * by index. When restoring we read the socket section and create objects to hold
+ * the index <-> pos mappings. At the second pass we open sockets (simultaneously
+ * with their pairs) and create FILE objects.
+ */
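[Editor's illustration, hypothetical code, not from the patch: a toy version of the two-level referencing the comment above describes. During dump every socket gets a small integer index; during restore a side table maps index to the image position filled in while reading the SOCKET section, so FILES-by-index references can be resolved. All names here are invented.]

#include <stdio.h>

struct obj_slot {
	void *obj;  /* kernel object (e.g. struct sock *) during dump     */
	long  pos;  /* position of its image, filled in during restore    */
};

static struct obj_slot table[128];
static int nr_objs;

static int obj_index(void *obj)           /* dump pass: assign an index */
{
	for (int i = 0; i < nr_objs; i++)
		if (table[i].obj == obj)
			return i;
	table[nr_objs].obj = obj;
	return nr_objs++;
}

static long obj_pos(int index)            /* restore pass: index -> pos */
{
	return table[index].pos;
}

int main(void)
{
	int a = obj_index((void *)0x1000);
	int b = obj_index((void *)0x2000);
	table[a].pos = 4096;                  /* learned from SOCKET section */
	table[b].pos = 8192;
	printf("socket %d image at offset %ld\n", b, obj_pos(b));
	return 0;
}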
+
+/* ====== FD passing ====== */
+
+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
+ * have to implement this. A problem is that in the general case we receive
+ * skbs from an unknown context, so new files can arrive to the checkpointed
+ * set of processes even after they are stopped. Well, we are just going
+ * to ignore unknown fds while doing real checkpointing. It is fair because
+ * links outside the checkpointed set are going to fail anyway.
+ *
+ * ATTN: the procedure is recursive. We linearize the recursion by adding
+ * newly found files to the end of the file list, so they will be analyzed
+ * in the same loop.
+ */
+
+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return -ENOTSOCK;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	if (sock->ops->family != AF_UNIX)
+		return 0;
+
+	sk = sock->sk;
+
+	/* Subtle locking issue. skbs cannot be removed while
+	 * we are scanning, because all the processes are stopped.
+	 * They still can be added to the tail of the queue. Locking while
+	 * we dereference skb->next is enough to resolve this.
+	 * See above about the collision with skbs added after we started
+	 * checkpointing.
+	 */
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		if (UNIXCB(skb).fp && skb->sk &&
+		    (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
+			struct scm_fp_list *fpl = UNIXCB(skb).fp;
+			int i;
+
+			for (i = fpl->count-1; i >= 0; i--) {
+				if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
+					return -ENOMEM;
+			}
+		}
+
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+
+	return 0;
+}
+
+int cpt_collect_passedfds(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_FILE) {
+		struct file *file = obj->o_obj;
+
+		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
+			int err;
+
+			if ((err = collect_one_passedfd(file, ctx)) < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* ====== End of FD passing ====== */
+
+/* Must be called under bh_lock_sock() */
+
+void clear_backlog(struct sock *sk)
+{
+	struct sk_buff *skb = sk->sk_backlog.head;
+
+	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+	while (skb) {
+		struct sk_buff *next = skb->next;
+
+		skb->next = NULL;
+		kfree_skb(skb);
+		skb = next;
+	}
+}
+
+void release_sock_nobacklog(struct sock *sk)
+{
+	spin_lock_bh(&(sk->sk_lock.slock));
+	clear_backlog(sk);
+	sk->sk_lock.owner = NULL;
+	if (waitqueue_active(&(sk->sk_lock.wq)))
+		wake_up(&(sk->sk_lock.wq));
+	spin_unlock_bh(&(sk->sk_lock.slock));
+}
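[Editor's illustration, hypothetical code, not from the patch: cpt_dump_skb() below records buffer geometry as offsets relative to skb->head (headroom, tailroom, header positions) so the skb can be rebuilt byte-for-byte on restore. A toy buffer shows the same bookkeeping; the field names mirror the image fields, not real kernel structures.]

#include <stdio.h>

struct toy_skb { unsigned char buf[64]; int data, tail, end; };
struct toy_image { int hspace, tspace, len; };

int main(void)
{
	struct toy_skb skb = { .data = 16, .tail = 48, .end = 64 };
	struct toy_image img = {
		.hspace = skb.data,             /* skb->data - skb->head */
		.tspace = skb.end - skb.tail,   /* skb->end - skb->tail  */
		.len    = skb.tail - skb.data,  /* payload bytes         */
	};
	printf("headroom %d, tailroom %d, len %d\n",
	       img.hspace, img.tspace, img.len);
	return 0;
}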
+
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
+		 struct cpt_context *ctx)
+{
+	struct cpt_skb_image *v = cpt_get_buf(ctx);
+	loff_t saved_obj;
+	struct timeval tmptv;
+
+	cpt_push_object(&saved_obj, ctx);
+	cpt_open_object(NULL, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SKB;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_owner = owner;
+	v->cpt_queue = type;
+	skb_get_timestamp(skb, &tmptv);
+	v->cpt_stamp = cpt_timeval_export(&tmptv);
+	v->cpt_hspace = skb->data - skb->head;
+	v->cpt_tspace = skb->end - skb->tail;
+	v->cpt_h = skb->h.raw - skb->head;
+	v->cpt_nh = skb->nh.raw - skb->head;
+	v->cpt_mac = skb->mac.raw - skb->head;
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb));
+	memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
+	if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
+		int i;
+		for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
+			if (skb->cb[i]) {
+				wprintk_ctx("dirty skb cb");
+				break;
+			}
+		}
+	}
+	v->cpt_len = skb->len;
+	v->cpt_mac_len = skb->mac_len;
+	v->cpt_csum = skb->csum;
+	v->cpt_local_df = skb->local_df;
+	v->cpt_pkt_type = skb->pkt_type;
+	v->cpt_ip_summed = skb->ip_summed;
+	v->cpt_priority = skb->priority;
+	v->cpt_protocol = skb->protocol;
+	v->cpt_security = 0;
+	v->cpt_gso_segs = skb_shinfo(skb)->gso_segs;
+	v->cpt_gso_size = skb_shinfo(skb)->gso_size;
+	if (skb_shinfo(skb)->gso_type) {
+		eprintk_ctx("skb ufo is not supported\n");
+		return -EINVAL;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (skb->len + (skb->data - skb->head) > 0) {
+		struct cpt_obj_bits ob;
+		loff_t saved_obj2;
+
+		cpt_push_object(&saved_obj2, ctx);
+		cpt_open_object(NULL, ctx);
+		ob.cpt_next = CPT_NULL;
+		ob.cpt_object = CPT_OBJ_BITS;
+		ob.cpt_hdrlen = sizeof(ob);
+		ob.cpt_content = CPT_CONTENT_DATA;
+		ob.cpt_size = skb->len + v->cpt_hspace;
+
+		ctx->write(&ob, sizeof(ob), ctx);
+
+		ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
+		if (skb->data_len) {
+			int offset = skb->len - skb->data_len;
+			while (offset < skb->len) {
+				int copy = skb->len - offset;
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				(void)cpt_get_buf(ctx);
+				if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
+					BUG();
+				ctx->write(ctx->tmpbuf, copy, ctx);
+				__cpt_release_buf(ctx);
+				offset += copy;
+			}
+		}
+
+		ctx->align(ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj2, ctx);
+	}
+
+	if (skb->sk && skb->sk->sk_family == AF_UNIX) {
+		struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+		if (fpl) {
+			int i;
+
+			for (i = 0; i < fpl->count; i++) {
+				struct cpt_fd_image v;
+				cpt_object_t *obj;
+				loff_t saved_obj2;
+
+				obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx);
+
+				if (!obj) {
+					eprintk_ctx("lost passed FD\n");
+					return -EINVAL;
+				}
+
+				cpt_push_object(&saved_obj2, ctx);
+				cpt_open_object(NULL, ctx);
+				v.cpt_next = CPT_NULL;
+				v.cpt_object = CPT_OBJ_FILEDESC;
+				v.cpt_hdrlen = sizeof(v);
+				v.cpt_content = CPT_CONTENT_VOID;
+
+				v.cpt_fd = i;
+				v.cpt_file = obj->o_pos;
+				v.cpt_flags = 0;
+				ctx->write(&v, sizeof(v), ctx);
+				cpt_close_object(ctx);
+				cpt_pop_object(&saved_obj2, ctx);
+			}
+		}
+	}
+
+	cpt_close_object(ctx);
+	cpt_pop_object(&saved_obj, ctx);
+	return 0;
+}
+
+static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+	struct sock *sk_cache = NULL;
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		int err;
+
+		if (sk->sk_family == AF_UNIX) {
+			cpt_object_t *obj;
+			if (skb->sk != sk_cache) {
+				idx = -1;
+				sk_cache = NULL;
+				obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx);
+				if (obj) {
+					idx = obj->o_index;
+					sk_cache = skb->sk;
+				} else if (unix_peer(sk) != skb->sk)
+					goto next_skb;
+			}
+		}
+
+		err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx);
+		if (err)
+			return err;
+
+next_skb:
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+	return 0;
+}
+
+static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+
+	skb = skb_peek(&sk->sk_write_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) {
+		int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx);
+		if (err)
+			return err;
+
+		spin_lock_irq(&sk->sk_write_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_write_queue.lock);
+	}
+	return 0;
+}
+
+void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx)
+{
+	loff_t saved_obj;
+	if (sk->sk_filter) {
+		struct cpt_obj_bits v;
+
+		cpt_push_object(&saved_obj, ctx);
+		cpt_open_object(NULL, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SKFILTER;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_DATA;
+		v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter);
+
+		ctx->write(&v, sizeof(v), ctx);
+		ctx->write(sk->sk_filter->insns, v.cpt_size, ctx);
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		cpt_push_object(&saved_obj, ctx);
+		cpt_dump_mcfilter(sk, ctx);
+		cpt_pop_object(&saved_obj, ctx);
+	}
+}
+
+/* Dump socket content */
+
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
+{
+	struct cpt_sock_image *v = cpt_get_buf(ctx);
+	struct socket *sock;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_SOCKET;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_file = CPT_NULL;
+	sock = sk->sk_socket;
+	if (sock && sock->file) {
+		cpt_object_t *tobj;
+		tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
+		if (tobj)
+			v->cpt_file = tobj->o_pos;
+	}
+	v->cpt_index = index;
+	v->cpt_parent = parent;
+
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		if (sock && !obj->o_lock) {
+			lockdep_off();
+			lock_sock(sk);
+			lockdep_on();
+			obj->o_lock = 1;
+		}
+	}
+
+	/* Some bits stored in inode */
+	v->cpt_ssflags = sock ? sock->flags : 0;
+	v->cpt_sstate = sock ? sock->state : 0;
+	v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
+
+	/* Common data */
+	v->cpt_family = sk->sk_family;
+	v->cpt_type = sk->sk_type;
+	v->cpt_state = sk->sk_state;
+	v->cpt_reuse = sk->sk_reuse;
+	v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
+	v->cpt_shutdown = sk->sk_shutdown;
+	v->cpt_userlocks = sk->sk_userlocks;
+	v->cpt_no_check = sk->sk_no_check;
+	v->cpt_zapped = sock_flag(sk, SOCK_DBG);
+	v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+	v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
+	v->cpt_protocol = sk->sk_protocol;
+	v->cpt_err = sk->sk_err;
+	v->cpt_err_soft = sk->sk_err_soft;
+	v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
+	v->cpt_priority = sk->sk_priority;
+	v->cpt_rcvlowat = sk->sk_rcvlowat;
+	v->cpt_rcvtimeo = CPT_NULL;
+	if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo;
+	v->cpt_sndtimeo = CPT_NULL;
+	if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo;
+	v->cpt_rcvbuf = sk->sk_rcvbuf;
+	v->cpt_sndbuf = sk->sk_sndbuf;
+	v->cpt_bound_dev_if = sk->sk_bound_dev_if;
+	v->cpt_flags = sk->sk_flags;
+	v->cpt_lingertime = CPT_NULL;
+	if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT)
+		v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime;
+	v->cpt_peer_pid = sk->sk_peercred.pid;
+	v->cpt_peer_uid = sk->sk_peercred.uid;
+	v->cpt_peer_gid = sk->sk_peercred.gid;
+	v->cpt_stamp = cpt_timeval_export(&sk->sk_stamp);
+
+	v->cpt_peer = -1;
+	v->cpt_socketpair = 0;
+	v->cpt_deleted = 0;
+
+	v->cpt_laddrlen = 0;
+	if (sock) {
+		int alen = sizeof(v->cpt_laddr);
+		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		v->cpt_laddrlen = alen;
+	}
+	v->cpt_raddrlen = 0;
+	if (sock) {
+		int alen = sizeof(v->cpt_raddr);
+		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2);
+		if (!err)
+			v->cpt_raddrlen = alen;
+	}
+
+	if (sk->sk_family == AF_UNIX) {
+		if (unix_sk(sk)->dentry) {
+			struct dentry *d = unix_sk(sk)->dentry;
+			v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d);
+			if (!v->cpt_deleted) {
+				int err = 0;
+				char *path;
+				unsigned long pg = __get_free_page(GFP_KERNEL);
+
+				if (!pg) {
+					cpt_release_buf(ctx);
+					return -ENOMEM;
+				}
+
+				path = d_path(d, unix_sk(sk)->mnt, (char *)pg, PAGE_SIZE);
+
+				if (!IS_ERR(path)) {
+					int len = strlen(path);
+					if (len < 126) {
+						strcpy(((char*)v->cpt_laddr)+2, path);
+						v->cpt_laddrlen = len + 2;
+					} else {
+						wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2);
+					}
+					err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx);
+				} else {
+					eprintk_ctx("cannot get path of an af_unix socket\n");
+					err = PTR_ERR(path);
+				}
+				free_page(pg);
+				if (err) {
+					cpt_release_buf(ctx);
+					return err;
+				}
+			}
+		}
+
+		/* If the socket is connected, find its peer. If the peer is not
+		 * in our table, the socket is connected to an external process
+		 * and we consider it disconnected.
+		 */
+		if (unix_peer(sk)) {
+			cpt_object_t *pobj;
+			pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx);
+			if (pobj)
+				v->cpt_peer = pobj->o_index;
+			else
+				v->cpt_shutdown = SHUTDOWN_MASK;
+
+			if (unix_peer(unix_peer(sk)) == sk)
+				v->cpt_socketpair = 1;
+		}
+
+		/* If the socket shares an address with another socket, it is a
+		 * child of some listening socket. Find and record it.
+		 */
+		if (unix_sk(sk)->addr &&
+		    atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
+		    sk->sk_state != TCP_LISTEN) {
+			cpt_object_t *pobj;
+			for_each_object(pobj, CPT_OBJ_SOCKET) {
+				struct sock *psk = pobj->o_obj;
+				if (psk->sk_family == AF_UNIX &&
+				    psk->sk_state == TCP_LISTEN &&
+				    unix_sk(psk)->addr == unix_sk(sk)->addr) {
+					v->cpt_parent = pobj->o_index;
+					break;
+				}
+			}
+		}
+	}
+
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+		cpt_dump_socket_in(v, sk, ctx);
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_dump_sock_attr(sk, ctx);
+
+	dump_rqueue(index, sk, ctx);
+	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+		dump_wqueue(index, sk, ctx);
+		cpt_dump_ofo_queue(index, sk, ctx);
+	}
+
+	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+	    && sk->sk_state == TCP_LISTEN)
+		cpt_dump_synwait_queue(sk, index, ctx);
+
+	cpt_close_object(ctx);
+
+	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+	    && sk->sk_state == TCP_LISTEN)
+		cpt_dump_accept_queue(sk, index, ctx);
+
+	return 0;
+}
+
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
+{
+	int i;
+
+	cpt_open_section(ctx, CPT_SECT_ORPHANS);
+
+	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+		struct sock *sk;
+		struct hlist_node *node;
+
+retry:
+		read_lock_bh(&tcp_hashinfo.ehash[i].lock);
+		sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
+
+			if (sk->owner_env != get_exec_env())
+				continue;
+			if (sk->sk_socket)
+				continue;
+			if (!sock_flag(sk, SOCK_DEAD))
+				continue;
+			if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
+				continue;
+			sock_hold(sk);
+			read_unlock_bh(&tcp_hashinfo.ehash[i].lock);
+
+			local_bh_disable();
+			bh_lock_sock(sk);
+			if (sock_owned_by_user(sk))
+				eprintk_ctx("BUG: sk locked by whom?\n");
+			sk->sk_lock.owner = (void *)1;
+			bh_unlock_sock(sk);
+			local_bh_enable();
+
+			cpt_dump_socket(NULL, sk, -1, -1, ctx);
+
+			local_bh_disable();
+			bh_lock_sock(sk);
+			sk->sk_lock.owner = NULL;
+			clear_backlog(sk);
+			tcp_done(sk);
+			bh_unlock_sock(sk);
+			local_bh_enable();
+			sock_put(sk);
+
+			goto retry;
+		}
+		read_unlock_bh(&tcp_hashinfo.ehash[i].lock);
+	}
+	cpt_close_section(ctx);
+	return 0;
+}
+
+static int can_dump(struct sock *sk, cpt_context_t *ctx)
+{
+	switch (sk->sk_family) {
+	case AF_NETLINK:
+		if (((struct netlink_sock *)sk)->cb) {
+			eprintk_ctx("netlink socket has active callback\n");
+			return 0;
+		}
+		break;
+	}
+	return 1;
+}
+
+/* We are not going to block suspend when we have external AF_UNIX connections.
+ * But we cannot stop the feed of new packets/connections to our environment
+ * from outside. Taking into account that it is intrinsically unreliable,
+ * we collect some amount of data, but when checkpointing/restoring we
+ * are going to drop everything that does not make sense: skbs sent
+ * by outside processes, connections from outside etc. etc.
+ */
+
+/* The first pass. When we see a socket referenced by a file, we just
+ * add it to the socket table */
+int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	struct socket *sock;
+	struct sock *sk;
+
+	if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
+		return -ENOTSOCK;
+	sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
+	sk = sock->sk;
+	if (!can_dump(sk, ctx))
+		return -EAGAIN;
+	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
+		return -ENOMEM;
+	obj->o_parent = file;
+
+	return 0;
+}
+
+/*
+ * We should end up with a table containing:
+ * * all sockets opened by our processes;
+ * * all the sockets queued in listening queues on _our_ listening sockets,
+ *   which are connected to our opened sockets.
+ */
+
+static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
+{
+	struct sock *sk = obj->o_obj;
+	cpt_object_t *cobj;
+	struct sk_buff *skb;
+
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+		struct sock *lsk = skb->sk;
+		if (unix_peer(lsk) &&
+		    lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
+			if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
+				return -ENOMEM;
+			cobj->o_parent = obj->o_parent;
+		}
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+	}
+
+	return 0;
+}
+
+int cpt_index_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+	unsigned long index = 0;
+
+	/* Collect not-yet-accepted children of listening sockets. */
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+
+		if (sk->sk_state != TCP_LISTEN)
+			continue;
+
+		if (sk->sk_family == AF_UNIX)
+			collect_one_unix_listening_sock(obj, ctx);
+	}
+
+	/* Assign indices to all the sockets. */
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		cpt_obj_setindex(obj, index++, ctx);
+
+		if (sk->sk_socket && sk->sk_socket->file) {
+			cpt_object_t *tobj;
+			tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx);
+			if (tobj)
+				cpt_obj_setindex(tobj, obj->o_index, ctx);
+		}
+	}
+
+	return 0;
+}
+
+void cpt_unlock_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	lockdep_off();
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		if (sk && obj->o_lock) {
+			if (sk->sk_socket)
+				release_sock(sk);
+		}
+	}
+	lockdep_on();
+}
+
+void cpt_kill_sockets(cpt_context_t * ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_SOCKET) {
+		struct sock *sk = obj->o_obj;
+		if (sk && obj->o_lock) {
+			cpt_kill_socket(sk, ctx);
+			if (sk->sk_socket)
+				release_sock_nobacklog(sk);
+		}
+	}
+}
+
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx)
+{
+	struct fasync_struct *fa;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	for (fa = sock->fasync_list; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	return -1;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_socket.h linux-2.6.18.ovz/kernel/cpt/cpt_socket.h
--- linux-2.6.18/kernel/cpt/cpt_socket.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_socket.h 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,33 @@
+struct sock;
+
+int cpt_collect_passedfds(cpt_context_t *);
+int cpt_index_sockets(cpt_context_t *);
+int cpt_collect_socket(struct file *, cpt_context_t *);
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx);
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int rst_sockets(struct cpt_context *ctx);
+int rst_sockets_complete(struct cpt_context *ctx);
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx);
+
+int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx);
+struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx);
+
+void cpt_unlock_sockets(cpt_context_t *);
+void cpt_kill_sockets(cpt_context_t *);
+
+
+int cpt_kill_socket(struct sock *, cpt_context_t *);
+int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*);
+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx);
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx);
+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *);
+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx);
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx);
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx);
+int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx);
+
+int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
+		loff_t pos, cpt_context_t *ctx);
+int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
+		loff_t pos, cpt_context_t *ctx);
diff -uprN linux-2.6.18/kernel/cpt/cpt_socket_in.c linux-2.6.18.ovz/kernel/cpt/cpt_socket_in.c
--- linux-2.6.18/kernel/cpt/cpt_socket_in.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_socket_in.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,443 @@
+/*
+ *
+ *  kernel/cpt/cpt_socket_in.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
+static inline __u32 jiffies_export(unsigned long tmo)
+{
+	__s32 delta = (long)(tmo - jiffies);
+	return delta;
+}
+
+static inline __u32 tcp_jiffies_export(__u32 tmo)
+{
+	__s32 delta = tmo - tcp_time_stamp;
+	return delta;
+}
+
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp;
+
+	if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP)
+		return 0;
+
+	tp = tcp_sk(sk);
+
+	skb = skb_peek(&tp->out_of_order_queue);
+	while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) {
+		int err;
+
+		err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx);
+		if (err)
+			return err;
+
+		spin_lock_irq(&tp->out_of_order_queue.lock);
+		skb = skb->next;
+		spin_unlock_irq(&tp->out_of_order_queue.lock);
+	}
+	return 0;
+}
+
+static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk,
+			       struct cpt_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	si->cpt_pred_flags = tp->pred_flags;
+	si->cpt_rcv_nxt = tp->rcv_nxt;
+	si->cpt_snd_nxt = tp->snd_nxt;
+	si->cpt_snd_una = tp->snd_una;
+	si->cpt_snd_sml = tp->snd_sml;
+	si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp);
+	si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime);
+	si->cpt_tcp_header_len = tp->tcp_header_len;
+	si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending;
+	si->cpt_quick = inet_csk(sk)->icsk_ack.quick;
+	si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong;
+	si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked;
+	si->cpt_ato = inet_csk(sk)->icsk_ack.ato;
+	si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout);
+	si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime);
+	si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size;
+	si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss;
+	si->cpt_snd_wl1 = tp->snd_wl1;
+	si->cpt_snd_wnd = tp->snd_wnd;
+	si->cpt_max_window = tp->max_window;
+	si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie;
+	si->cpt_mss_cache = tp->mss_cache;
+	si->cpt_mss_cache_std = tp->mss_cache; /* FIXME: was tp->mss_cache_std */
+	si->cpt_mss_clamp = tp->rx_opt.mss_clamp;
+	si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len;
+	si->cpt_ext2_header_len = 0;
+	si->cpt_ca_state = inet_csk(sk)->icsk_ca_state;
+	si->cpt_retransmits = inet_csk(sk)->icsk_retransmits;
+	si->cpt_reordering = tp->reordering;
+	si->cpt_frto_counter = tp->frto_counter;
+	si->cpt_frto_highmark = tp->frto_highmark;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+	// // si->cpt_adv_cong = tp->adv_cong;
+#endif
+	si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept;
+	si->cpt_backoff = inet_csk(sk)->icsk_backoff;
+	si->cpt_srtt = tp->srtt;
+	si->cpt_mdev = tp->mdev;
+	si->cpt_mdev_max = tp->mdev_max;
+	si->cpt_rttvar = tp->rttvar;
+	si->cpt_rtt_seq = tp->rtt_seq;
+	si->cpt_rto = inet_csk(sk)->icsk_rto;
+	si->cpt_packets_out = tp->packets_out;
+	si->cpt_left_out = tp->left_out;
+	si->cpt_retrans_out = tp->retrans_out;
+	si->cpt_lost_out = tp->lost_out;
+	si->cpt_sacked_out = tp->sacked_out;
+	si->cpt_fackets_out = tp->fackets_out;
+	si->cpt_snd_ssthresh = tp->snd_ssthresh;
+	si->cpt_snd_cwnd = tp->snd_cwnd;
+	si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt;
+	si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp;
+	si->cpt_snd_cwnd_used = tp->snd_cwnd_used;
+	si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp);
+	si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout);
+	si->cpt_ka_timeout = 0;
+	si->cpt_rcv_wnd = tp->rcv_wnd;
+	si->cpt_rcv_wup = tp->rcv_wup;
+	si->cpt_write_seq = tp->write_seq;
+	si->cpt_pushed_seq = tp->pushed_seq;
+	si->cpt_copied_seq = tp->copied_seq;
+	si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok;
+	si->cpt_wscale_ok = tp->rx_opt.wscale_ok;
+	si->cpt_sack_ok = tp->rx_opt.sack_ok;
+	si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp;
+	si->cpt_snd_wscale = tp->rx_opt.snd_wscale;
+	si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale;
+	si->cpt_nonagle = tp->nonagle;
+	si->cpt_keepalive_probes = tp->keepalive_probes;
+	si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval;
+	si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr;
+	si->cpt_ts_recent = tp->rx_opt.ts_recent;
+	si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+	si->cpt_user_mss = tp->rx_opt.user_mss;
+	si->cpt_dsack = tp->rx_opt.dsack;
+	si->cpt_eff_sacks = tp->rx_opt.eff_sacks;
+	si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq;
+	si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq;
+	si->cpt_sack_array[2] = tp->selective_acks[0].start_seq;
+	si->cpt_sack_array[3] = tp->selective_acks[0].end_seq;
+	si->cpt_sack_array[4] = tp->selective_acks[1].start_seq;
+	si->cpt_sack_array[5] = tp->selective_acks[1].end_seq;
+	si->cpt_sack_array[6] = tp->selective_acks[2].start_seq;
+	si->cpt_sack_array[7] = tp->selective_acks[2].end_seq;
+	si->cpt_sack_array[8] = tp->selective_acks[3].start_seq;
+	si->cpt_sack_array[9] = tp->selective_acks[3].end_seq;
+	si->cpt_window_clamp = tp->window_clamp;
+	si->cpt_rcv_ssthresh = tp->rcv_ssthresh;
+	si->cpt_probes_out = inet_csk(sk)->icsk_probes_out;
+	si->cpt_num_sacks = tp->rx_opt.num_sacks;
+	si->cpt_advmss = tp->advmss;
+	si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries;
+	si->cpt_ecn_flags = tp->ecn_flags;
+	si->cpt_prior_ssthresh = tp->prior_ssthresh;
+	si->cpt_high_seq = tp->high_seq;
+	si->cpt_retrans_stamp = tp->retrans_stamp;
+	si->cpt_undo_marker = tp->undo_marker;
+	si->cpt_undo_retrans = tp->undo_retrans;
+	si->cpt_urg_seq = tp->urg_seq;
+	si->cpt_urg_data = tp->urg_data;
+	si->cpt_pending = inet_csk(sk)->icsk_pending;
+	si->cpt_urg_mode = tp->urg_mode;
+	si->cpt_snd_up = tp->snd_up;
+	si->cpt_keepalive_time = tp->keepalive_time;
+	si->cpt_keepalive_intvl = tp->keepalive_intvl;
+	si->cpt_linger2 = tp->linger2;
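[Editor's illustration, hypothetical code, not from the patch: the jiffies_export() helper above, used for icsk timeouts and the keepalive timer just below, stores timeouts as signed deltas from "now" rather than as absolute jiffies, because an absolute tick count is meaningless on the restore host. A toy version of the export/import pair:]

#include <stdio.h>

typedef unsigned long u_jiffies_t;   /* stand-in for the kernel's jiffies */

static int export_tmo(u_jiffies_t tmo, u_jiffies_t now)
{
	return (int)(tmo - now);         /* signed delta; may be negative */
}

static u_jiffies_t import_tmo(int delta, u_jiffies_t now)
{
	return now + delta;              /* re-anchor on the restore host */
}

int main(void)
{
	u_jiffies_t dump_now = 1000000, restore_now = 42;
	u_jiffies_t expires = dump_now + 250;   /* fires 250 ticks from now */

	int d = export_tmo(expires, dump_now);
	printf("delta=%d, restored expiry=%lu\n",
	       d, import_tmo(d, restore_now));
	return 0;
}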
+	if (sk->sk_state != TCP_LISTEN &&
+	    sk->sk_state != TCP_CLOSE &&
+	    sock_flag(sk, SOCK_KEEPOPEN)) {
+		si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires);
+	}
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	{
+		extern struct inet_connection_sock_af_ops ipv6_mapped;
+		if (sk->sk_family == AF_INET6 &&
+		    inet_csk(sk)->icsk_af_ops == &ipv6_mapped)
+			si->cpt_mapped = 1;
+	}
+#endif
+
+	return 0;
+}
+
+
+int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk,
+		       struct cpt_context *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	if (sk->sk_family == AF_INET) {
+		struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr);
+		sin->sin_family = AF_INET;
+		sin->sin_port = inet->sport;
+		sin->sin_addr.s_addr = inet->rcv_saddr;
+		si->cpt_laddrlen = sizeof(*sin);
+	} else if (sk->sk_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr);
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = inet->sport;
+		memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16);
+		si->cpt_laddrlen = sizeof(*sin6);
+	}
+	if (!inet->num)
+		si->cpt_laddrlen = 0;
+
+	si->cpt_daddr = inet->daddr;
+	si->cpt_dport = inet->dport;
+	si->cpt_saddr = inet->saddr;
+	si->cpt_rcv_saddr = inet->rcv_saddr;
+	si->cpt_sport = inet->sport;
+	si->cpt_uc_ttl = inet->uc_ttl;
+	si->cpt_tos = inet->tos;
+	si->cpt_cmsg_flags = inet->cmsg_flags;
+	si->cpt_mc_index = inet->mc_index;
+	si->cpt_mc_addr = inet->mc_addr;
+	si->cpt_hdrincl = inet->hdrincl;
+	si->cpt_mc_ttl = inet->mc_ttl;
+	si->cpt_mc_loop = inet->mc_loop;
+	si->cpt_pmtudisc = inet->pmtudisc;
+	si->cpt_recverr = inet->recverr;
+	si->cpt_freebind = inet->freebind;
+	si->cpt_idcounter = inet->id;
+
+	si->cpt_cork_flags = inet->cork.flags;
+	si->cpt_cork_fragsize = 0;
+	si->cpt_cork_length = inet->cork.length;
+	si->cpt_cork_addr = inet->cork.addr;
+	si->cpt_cork_saddr = inet->cork.fl.fl4_src;
+	si->cpt_cork_daddr = inet->cork.fl.fl4_dst;
+	si->cpt_cork_oif = inet->cork.fl.oif;
+	if (inet->cork.rt) {
+		si->cpt_cork_fragsize = inet->cork.fragsize;
+		si->cpt_cork_saddr = inet->cork.rt->fl.fl4_src;
+		si->cpt_cork_daddr = inet->cork.rt->fl.fl4_dst;
+		si->cpt_cork_oif = inet->cork.rt->fl.oif;
+	}
+
+	if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
+		struct udp_sock *up = udp_sk(sk);
+		si->cpt_udp_pending = up->pending;
+		si->cpt_udp_corkflag = up->corkflag;
+		si->cpt_udp_encap = up->encap_type;
+		si->cpt_udp_len = up->len;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		memcpy(si->cpt_saddr6, &np->saddr, 16);
+		memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16);
+		memcpy(si->cpt_daddr6, &np->daddr, 16);
+		si->cpt_flow_label6 = np->flow_label;
+		si->cpt_frag_size6 = np->frag_size;
+		si->cpt_hop_limit6 = np->hop_limit;
+		si->cpt_mcast_hops6 = np->mcast_hops;
+		si->cpt_mcast_oif6 = np->mcast_oif;
+		si->cpt_rxopt6 = np->rxopt.all;
+		si->cpt_mc_loop6 = np->mc_loop;
+		si->cpt_recverr6 = np->recverr;
+		si->cpt_sndflow6 = np->sndflow;
+		si->cpt_pmtudisc6 = np->pmtudisc;
+		si->cpt_ipv6only6 = np->ipv6only;
+		si->cpt_mapped = 0;
+	}
+
+	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
+		cpt_dump_socket_tcp(si, sk, ctx);
+
+	return 0;
+}
+
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+	struct request_sock *req;
+
+	for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next)
+		cpt_dump_socket(NULL, req->sk, -1, index, ctx);
+	return 0;
+}
+
+
+static int dump_openreq(struct request_sock *req, struct sock *sk, int index,
+			struct cpt_context *ctx)
+{
+	struct cpt_openreq_image *v = cpt_get_buf(ctx);
+
+	cpt_open_object(NULL, ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_OPENREQ;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn;
+	v->cpt_snt_isn = tcp_rsk(req)->snt_isn;
+	v->cpt_rmt_port = inet_rsk(req)->rmt_port;
+	v->cpt_mss = req->mss;
+	// // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6);
+	v->cpt_retrans = req->retrans;
+	v->cpt_snd_wscale = inet_rsk(req)->snd_wscale;
+	v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale;
+	v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok;
+	v->cpt_sack_ok = inet_rsk(req)->sack_ok;
+	v->cpt_wscale_ok = inet_rsk(req)->wscale_ok;
+	v->cpt_ecn_ok = inet_rsk(req)->ecn_ok;
+	v->cpt_acked = inet_rsk(req)->acked;
+	v->cpt_window_clamp = req->window_clamp;
+	v->cpt_rcv_wnd = req->rcv_wnd;
+	v->cpt_ts_recent = req->ts_recent;
+	v->cpt_expires = jiffies_export(req->expires);
+
+	if (v->cpt_family == AF_INET) {
+		memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4);
+		memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4);
+	} else {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16);
+		memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16);
+		v->cpt_iif = inet6_rsk(req)->iif;
+#endif
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+	struct listen_sock *lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+	struct request_sock *req;
+	int i;
+
+	for (i=0; i<lopt->nr_table_entries; i++) {
+		for (req=lopt->syn_table[i]; req; req=req->dl_next) {
+			loff_t saved_obj;
+			cpt_push_object(&saved_obj, ctx);
+			dump_openreq(req, sk, index, ctx);
+			cpt_pop_object(&saved_obj, ctx);
+		}
+	}
+	return 0;
+}
+
+
+int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx)
+{
+	if (sk->sk_state != TCP_CLOSE &&
+	    (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
+	    sk->sk_protocol == IPPROTO_TCP) {
+		if (sk->sk_state != TCP_LISTEN)
+			tcp_set_state(sk, TCP_CLOSE);
+		else
+			sk->sk_prot->disconnect(sk, 0);
+	}
+	return 0;
+}
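[Editor's illustration, hypothetical userspace code, not from the patch: the state that cpt_dump_mcfilter() below records per socket (family, filter mode, interface index, group address, source list) is exactly what a process sets up with IP_ADD_MEMBERSHIP; on restore those subscriptions must be replayed, since they live in the socket, not in the file image. The userspace side of one such record:]

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct ip_mreqn mr;

	if (fd < 0) { perror("socket"); return 1; }

	memset(&mr, 0, sizeof(mr));
	inet_pton(AF_INET, "239.1.2.3", &mr.imr_multiaddr);
	mr.imr_ifindex = 0;                  /* any interface */

	/* This membership is what ends up in a CPT_OBJ_SOCK_MCADDR record. */
	if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)))
		perror("IP_ADD_MEMBERSHIP");
	return 0;
}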
+int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+
+	for (iml = inet->mc_list; iml; iml = iml->next) {
+		struct cpt_sockmc_image smi;
+		int scnt = 0;
+		int i;
+
+		if (iml->sflist)
+			scnt = iml->sflist->sl_count*16;
+
+		smi.cpt_next = sizeof(smi) + scnt;
+		smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+		smi.cpt_hdrlen = sizeof(smi);
+		smi.cpt_content = CPT_CONTENT_DATA;
+
+		smi.cpt_family = AF_INET;
+		smi.cpt_mode = iml->sfmode;
+		smi.cpt_ifindex = iml->multi.imr_ifindex;
+		memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
+		smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
+
+		ctx->write(&smi, sizeof(smi), ctx);
+
+		for (i = 0; i < scnt; i++) {
+			u32 addr[4];
+			memset(&addr, 0, sizeof(addr));
+			addr[0] = iml->sflist->sl_addr[i];
+			ctx->write(&addr, sizeof(addr), ctx);
+		}
+	}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_mc_socklist *mcl;
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
+			struct cpt_sockmc_image smi;
+			int scnt = 0;
+			int i;
+
+			if (mcl->sflist)
+				scnt = mcl->sflist->sl_count*16;
+
+			smi.cpt_next = sizeof(smi) + scnt;
+			smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+			smi.cpt_hdrlen = sizeof(smi);
+			smi.cpt_content = CPT_CONTENT_DATA;
+
+			smi.cpt_family = AF_INET6;
+			smi.cpt_mode = mcl->sfmode;
+			smi.cpt_ifindex = mcl->ifindex;
+			memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
+
+			ctx->write(&smi, sizeof(smi), ctx);
+			for (i = 0; i < scnt; i++)
+				ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
+		}
+	}
+#endif
+	return 0;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_syscalls.h linux-2.6.18.ovz/kernel/cpt/cpt_syscalls.h
--- linux-2.6.18/kernel/cpt/cpt_syscalls.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_syscalls.h 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,100 @@
+#include
+#include
+#include
+
+#define WRAP(c, args) return sys_##c args
+#define WRAP2(c, args) int err; mm_segment_t oldfs; \
+		oldfs = get_fs(); set_fs(KERNEL_DS); \
+		err = sys_##c args ;\
+		set_fs(oldfs); \
+		return err
+
+static inline int sc_close(int fd)
+{
+	WRAP(close, (fd));
+}
+
+static inline int sc_dup2(int fd1, int fd2)
+{
+	WRAP(dup2, (fd1, fd2));
+}
+
+static inline int sc_unlink(char *name)
+{
+	WRAP2(unlink, (name));
+}
+
+static inline int sc_pipe(int *pfd)
+{
+	return do_pipe(pfd);
+}
+
+static inline int sc_mknod(char *name, int mode, int dev)
+{
+	WRAP2(mknod, (name, mode, dev));
+}
+
+static inline int sc_chmod(char *name, int mode)
+{
+	WRAP2(chmod, (name, mode));
+}
+
+static inline int sc_chown(char *name, int uid, int gid)
+{
+	WRAP2(chown, (name, uid, gid));
+}
+
+static inline int sc_mkdir(char *name, int mode)
+{
+	WRAP2(mkdir, (name, mode));
+}
+
+static inline int sc_rmdir(char *name)
+{
+	WRAP2(rmdir, (name));
+}
+
+static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
+{
+	WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
+}
+
+static inline int sc_mprotect(unsigned long start, size_t len,
+			      unsigned long prot)
+{
+	WRAP(mprotect, (start, len, prot));
+}
+
+static inline int sc_mlock(unsigned long start, size_t len)
+{
+	WRAP(mlock, (start, len));
+}
+
+static inline int sc_munlock(unsigned long start, size_t len)
+{
+	WRAP(munlock, (start, len));
+}
+
+static inline int sc_remap_file_pages(unsigned long start, size_t len,
+				      unsigned long prot, unsigned long pgoff,
+				      unsigned long flags)
+{
+	WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
+}
+
+static inline int sc_waitx(int pid, int opt, int *stat_addr)
+{
+	WRAP(wait4, (pid, stat_addr, opt, NULL));
+}
+
+static inline int sc_flock(int fd, int flags)
+{
+	WRAP(flock, (fd, flags));
+}
+
+static inline int sc_open(char* path, int flags, int mode)
+{
+	WRAP(open, (path, flags, mode));
+}
+
+extern int sc_execve(char *cms, char **argv, char **env);
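[Editor's note: hand-expanded for readability, mirroring the WRAP2() macro text above; treat it as illustrative. The set_fs(KERNEL_DS) dance widens the address-limit check so the syscall accepts pointers into kernel memory, which is what lets the checkpointer call path-based syscalls with kernel buffers. This is what sc_unlink() expands to:]

static inline int sc_unlink_expanded(char *name)
{
	int err;
	mm_segment_t oldfs;

	oldfs = get_fs();        /* remember the current address limit     */
	set_fs(KERNEL_DS);       /* allow kernel pointers in the syscall   */
	err = sys_unlink(name);  /* "name" may live in kernel memory here  */
	set_fs(oldfs);           /* always restore before returning        */
	return err;
}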
diff -uprN linux-2.6.18/kernel/cpt/cpt_sysvipc.c linux-2.6.18.ovz/kernel/cpt/cpt_sysvipc.c
--- linux-2.6.18/kernel/cpt/cpt_sysvipc.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_sysvipc.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,317 @@
+/*
+ *
+ *  kernel/cpt/cpt_sysvipc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_kernel.h"
+
+struct _warg {
+	struct file *file;
+	struct cpt_sysvshm_image *v;
+};
+
+static int dump_one_shm(struct shmid_kernel *shp, void *arg)
+{
+	struct _warg *warg = arg;
+	struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
+
+	if (shp->shm_file != warg->file)
+		return 0;
+
+	v->cpt_key = shp->shm_perm.key;
+	v->cpt_uid = shp->shm_perm.uid;
+	v->cpt_gid = shp->shm_perm.gid;
+	v->cpt_cuid = shp->shm_perm.cuid;
+	v->cpt_cgid = shp->shm_perm.cgid;
+	v->cpt_mode = shp->shm_perm.mode;
+	v->cpt_seq = shp->shm_perm.seq;
+
+	v->cpt_id = shp->id;
+	v->cpt_segsz = shp->shm_segsz;
+	v->cpt_atime = shp->shm_atim;
+	v->cpt_ctime = shp->shm_ctim;
+	v->cpt_dtime = shp->shm_dtim;
+	v->cpt_creator = shp->shm_cprid;
+	v->cpt_last = shp->shm_lprid;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+	v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
+#else
+	v->cpt_mlockuser = -1;
+#endif
+	return 1;
+}
+
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
+{
+	struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
+	struct _warg warg;
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SHM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	warg.file = file;
+	warg.v = v;
+	if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
+		cpt_release_buf(ctx);
+		return -ESRCH;
+	}
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+
+int match_sem(int id, struct sem_array *sema, void *arg)
+{
+	if (id != (unsigned long)arg)
+		return 0;
+	return sema->sem_nsems + 1;
+}
+
+static int get_sem_nsem(int id, cpt_context_t *ctx)
+{
+	int res;
+	res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
+	if (res > 0)
+		return res - 1;
+	eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
+	return -ESRCH;
+}
+
+static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
+{
+	struct cpt_sysvsem_undo_image v;
+	loff_t saved_obj;
+
+	cpt_open_object(NULL, ctx);
+
+	v.cpt_next = CPT_NULL;
+	v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
+	v.cpt_hdrlen = sizeof(v);
+	v.cpt_content = CPT_CONTENT_SEMUNDO;
+	v.cpt_id = su->semid;
+	v.cpt_nsem = get_sem_nsem(su->semid, ctx);
+	if ((int)v.cpt_nsem < 0)
+		return -ESRCH;
+
+	ctx->write(&v, sizeof(v), ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
+	cpt_pop_object(&saved_obj, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+struct sem_warg {
+	int last_id;
+	struct cpt_sysvsem_image *v;
+};
+
+static int dump_one_sem(int id, struct sem_array *sma, void *arg)
+{
+	struct sem_warg * warg = (struct sem_warg *)arg;
+	struct cpt_sysvsem_image *v = warg->v;
+	int i;
+
+	if (warg->last_id != -1) {
+		if ((id % IPCMNI) <= warg->last_id)
+			return 0;
+	}
+
+	v->cpt_next = sizeof(*v);
+	v->cpt_object = CPT_OBJ_SYSV_SEM;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_SEMARRAY;
+
+	v->cpt_key = sma->sem_perm.key;
+	v->cpt_uid = sma->sem_perm.uid;
+	v->cpt_gid = sma->sem_perm.gid;
+	v->cpt_cuid = sma->sem_perm.cuid;
+	v->cpt_cgid = sma->sem_perm.cgid;
+	v->cpt_mode = sma->sem_perm.mode;
+	v->cpt_seq = sma->sem_perm.seq;
+
+	v->cpt_id = id;
+	v->cpt_ctime = sma->sem_ctime;
+	v->cpt_otime = sma->sem_otime;
+
+	for (i=0; i<sma->sem_nsems; i++) {
+		struct {
+			__u32 semval;
+			__u32 sempid;
+		} *s = (void*)v + v->cpt_next;
+		if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
+			return -EINVAL;
+		s->semval = sma->sem_base[i].semval;
+		s->sempid = sma->sem_base[i].sempid;
+		v->cpt_next += sizeof(*s);
+	}
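[Editor's illustration, hypothetical userspace code, not from the patch: the loop above records each semaphore's <semval, sempid> pair; semctl(GETALL)/semctl(SETALL) is the userspace analogue of that dump/restore path.]

#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

union semun { int val; struct semid_ds *buf; unsigned short *array; };

int main(void)
{
	int id = semget(IPC_PRIVATE, 4, 0600);
	unsigned short vals[4] = { 1, 0, 3, 2 };
	union semun arg = { .array = vals };

	semctl(id, 0, SETALL, arg);          /* "restore": write all semvals */
	semctl(id, 0, GETALL, arg);          /* "dump": read them back       */
	for (int i = 0; i < 4; i++)
		printf("sem[%d] = %u\n", i, vals[i]);
	semctl(id, 0, IPC_RMID);
	return 0;
}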
+
+	warg->last_id = id % IPCMNI;
+	return 1;
+}
+
+
+int cpt_dump_sysvsem(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	struct sem_warg warg;
+
+	/* Dumping semaphores is quite tricky because we cannot
+	 * write to the dump file under lock inside sysvipc_walk_sem().
+	 */
+	cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
+	warg.last_id = -1;
+	warg.v = cpt_get_buf(ctx);
+	for (;;) {
+		if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
+			break;
+		ctx->write(warg.v, warg.v->cpt_next, ctx);
+	}
+	cpt_release_buf(ctx);
+	cpt_close_section(ctx);
+
+	cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
+	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+		struct sem_undo_list *semu = obj->o_obj;
+		struct sem_undo *su;
+		struct cpt_object_hdr v;
+		loff_t saved_obj;
+
+		cpt_open_object(obj, ctx);
+
+		v.cpt_next = CPT_NULL;
+		v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
+		v.cpt_hdrlen = sizeof(v);
+		v.cpt_content = CPT_CONTENT_ARRAY;
+
+		ctx->write(&v, sizeof(v), ctx);
+
+		cpt_push_object(&saved_obj, ctx);
+		for (su = semu->proc_list; su; su = su->proc_next) {
+			if (su->semid != -1) {
+				int err;
+				err = dump_one_semundo(su, ctx);
+				if (err < 0)
+					return err;
+			}
+		}
+		cpt_pop_object(&saved_obj, ctx);
+
+		cpt_close_object(ctx);
+	}
+	cpt_close_section(ctx);
+	return 0;
+}
+
+static int collect_one_msg(int id, struct msg_queue *msq, void *arg)
+{
+	int *retp = arg;
+	(*retp)++;
+	return 0;
+}
+
+int cpt_collect_sysvmsg(cpt_context_t * ctx)
+{
+	int ret = 0;
+	sysvipc_walk_msg(collect_one_msg, &ret);
+	if (ret) {
+		eprintk_ctx("SYSV msgqueues are not supported, found %d\n", ret);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static int cpt_collect_sysvsem_undo(cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		if (tsk->exit_state) {
+			/* ipc/sem.c forgets to clear tsk->sysvsem.undo_list
+			 * on exit. Grrr... */
+			continue;
+		}
+		if (tsk->sysvsem.undo_list &&
+		    cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL)
+			return -ENOMEM;
+	}
+
+	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+		struct sem_undo_list *semu = obj->o_obj;
+
+		if (atomic_read(&semu->refcnt) != obj->o_count) {
+			eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt));
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+static int collect_one_shm(struct shmid_kernel *shp, void *arg)
+{
+	cpt_context_t *ctx = arg;
+
+	if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+int cpt_collect_sysvshm(cpt_context_t * ctx)
+{
+	int err;
+
+	err = sysvipc_walk_shm(collect_one_shm, ctx);
+
+	return err < 0 ? err : 0;
+}
+
+int cpt_collect_sysv(cpt_context_t * ctx)
+{
+	int err;
+
+	err = cpt_collect_sysvsem_undo(ctx);
+	if (err)
+		return err;
+	err = cpt_collect_sysvmsg(ctx);
+	if (err)
+		return err;
+	err = cpt_collect_sysvshm(ctx);
+	if (err)
+		return err;
+
+	return 0;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_tty.c linux-2.6.18.ovz/kernel/cpt/cpt_tty.c
--- linux-2.6.18/kernel/cpt/cpt_tty.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_tty.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,216 @@
+/*
+ *
+ *  kernel/cpt/cpt_tty.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+/* We must support at least N_TTY. */
+
+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx)
+{
+	struct tty_struct *tty = file->private_data;
+	cpt_object_t *obj;
+	struct cpt_obj_ref o;
+	loff_t saved_pos;
+
+	obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx);
+	if (!obj)
+		return -EINVAL;
+
+	cpt_push_object(&saved_pos, ctx);
+
+	o.cpt_next = sizeof(o);
+	o.cpt_object = CPT_OBJ_REF;
+	o.cpt_hdrlen = sizeof(o);
+	o.cpt_content = CPT_CONTENT_VOID;
+	o.cpt_pos = obj->o_pos;
+	ctx->write(&o, sizeof(o), ctx);
+
+	cpt_pop_object(&saved_pos, ctx);
+
+	return 0;
+}
+
+int cpt_collect_tty(struct file *file, cpt_context_t * ctx)
+{
+	struct tty_struct *tty = file->private_data;
+
+	if (tty) {
+		if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL)
+			return -ENOMEM;
+		if (tty->link) {
+			cpt_object_t *obj;
+
+			obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx);
+			if (obj == NULL)
+				return -ENOMEM;
+			/* Undo o_count, tty->link is not a reference */
+			obj->o_count--;
+		}
+	}
+	return 0;
+}
+
+int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct tty_struct *tty = obj->o_obj;
+	struct cpt_tty_image *v;
+
+	if (tty->link) {
+		if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) {
+			eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE);
+			return -EINVAL;
+		}
+		if (tty->link->link != tty) {
+			eprintk_ctx("bad pty pair\n");
+			return -EINVAL;
+		}
+		if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+		    tty->driver->subtype == PTY_TYPE_SLAVE &&
+		    tty->link->count)
+			obj->o_count++;
+	}
+	if (obj->o_count != tty->count) {
+		eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count);
+		return -EBUSY;
+	}
+
+	cpt_open_object(obj, ctx);
+
+	v = cpt_get_buf(ctx);
+	v->cpt_next = -1;
+	v->cpt_object = CPT_OBJ_TTY;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_index = tty->index;
+	v->cpt_link = -1;
+	if (tty->link)
+		v->cpt_link = tty->link->index;
+	v->cpt_drv_type = tty->driver->type;
+	v->cpt_drv_subtype = tty->driver->subtype;
+	v->cpt_drv_flags = tty->driver->flags;
+	v->cpt_packet = tty->packet;
+	v->cpt_stopped = tty->stopped;
+	v->cpt_hw_stopped = tty->hw_stopped;
+	v->cpt_flow_stopped = tty->flow_stopped;
+	v->cpt_flags = tty->flags;
+	v->cpt_ctrl_status = tty->ctrl_status;
+	v->cpt_canon_data = tty->canon_data;
+	v->cpt_canon_head = tty->canon_head - tty->read_tail;
+	v->cpt_canon_column = tty->canon_column;
+	v->cpt_column = tty->column;
+	v->cpt_erasing = tty->erasing;
+	v->cpt_lnext = tty->lnext;
+	v->cpt_icanon = tty->icanon;
+	v->cpt_raw = tty->raw;
+	v->cpt_real_raw = tty->real_raw;
+	v->cpt_closing = tty->closing;
+	v->cpt_minimum_to_wake = tty->minimum_to_wake;
+	v->cpt_pgrp = 0;
+	if (tty->pgrp > 0) {
+		v->cpt_pgrp = _pid_to_vpid(tty->pgrp);
+		if ((int)v->cpt_pgrp < 0) {
+			dprintk_ctx("cannot map tty->pgrp %d -> %d\n", tty->pgrp, (int)v->cpt_pgrp);
+			v->cpt_pgrp = -1;
+		}
+	}
+	v->cpt_session = 0;
+	if (tty->session > 0) {
+		v->cpt_session = _pid_to_vpid(tty->session);
+		if ((int)v->cpt_session < 0) {
+			eprintk_ctx("cannot map tty->session %d -> %d\n", tty->session, (int)v->cpt_session);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	memcpy(v->cpt_name, tty->name, 64);
+	v->cpt_ws_row = tty->winsize.ws_row;
+	v->cpt_ws_col = tty->winsize.ws_col;
+	v->cpt_ws_prow = tty->winsize.ws_ypixel;
+	v->cpt_ws_pcol = tty->winsize.ws_xpixel;
+	if (tty->termios == NULL) {
+		eprintk_ctx("NULL termios");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+	v->cpt_c_line = tty->termios->c_line;
+	v->cpt_c_iflag = tty->termios->c_iflag;
+	v->cpt_c_oflag = tty->termios->c_oflag;
+	v->cpt_c_cflag = tty->termios->c_cflag;
+	v->cpt_c_lflag = tty->termios->c_lflag;
+	memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS);
+	if (NCCS < 32)
+		memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS);
+	memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags));
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	if (tty->read_buf && tty->read_cnt) {
+		struct cpt_obj_bits *v = cpt_get_buf(ctx);
+		loff_t saved_pos;
+
+		cpt_push_object(&saved_pos, ctx);
+		cpt_open_object(NULL, ctx);
+		v->cpt_next = CPT_NULL;
+		v->cpt_object = CPT_OBJ_BITS;
+		v->cpt_hdrlen = sizeof(*v);
+		v->cpt_content = CPT_CONTENT_DATA;
+		v->cpt_size = tty->read_cnt;
+		ctx->write(v, sizeof(*v), ctx);
+		cpt_release_buf(ctx);
+
+		if (tty->read_cnt) {
+			int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail);
+			ctx->write(tty->read_buf + tty->read_tail, n, ctx);
+			if (tty->read_cnt > n)
+				ctx->write(tty->read_buf, tty->read_cnt-n, ctx);
+			ctx->align(ctx);
+		}
+
+		cpt_close_object(ctx);
+		cpt_pop_object(&saved_pos, ctx);
+	}
+
+	cpt_close_object(ctx);
+
+	return 0;
+}
+
+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx)
+{
+	struct tty_struct * tty;
+	struct fasync_struct *fa;
+
+	tty = (struct tty_struct *)file->private_data;
+
+	for (fa = tty->fasync; fa; fa = fa->fa_next) {
+		if (fa->fa_file == file)
+			return fa->fa_fd;
+	}
+	return -1;
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_ubc.c linux-2.6.18.ovz/kernel/cpt/cpt_ubc.c
--- linux-2.6.18/kernel/cpt/cpt_ubc.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_ubc.c 2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,132 @@
+/*
+ *
+ *  kernel/cpt/cpt_ubc.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx);
+	if (obj != NULL) {
+		if (obj->o_count == 1)
+			get_beancounter(bc);
+		if (bc->parent != NULL && obj->o_parent == NULL)
+			obj->o_parent = cpt_add_ubc(bc->parent, ctx);
+	}
+	return obj;
+}
+
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx);
+	if (obj == NULL) {
+		char buf[48];
+		print_ub_uid(bc, buf, sizeof(buf));
+		eprintk("CPT: unknown ub %s (%p)\n", buf, bc);
+		dump_stack();
+		return CPT_NULL;
+	}
+	return obj->o_pos;
+}
+
+static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
+		int held)
+{
+	dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL);
+	dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL);
+	dmp->held = (held ? prm->held : CPT_NULL);
prm->held : CPT_NULL);
+	dmp->maxheld = prm->maxheld;
+	dmp->minheld = prm->minheld;
+	dmp->failcnt = prm->failcnt;
+}
+
+static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct user_beancounter *bc;
+	struct cpt_beancounter_image *v;
+	int i;
+
+	bc = obj->o_obj;
+	v = cpt_get_buf(ctx);
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_UBC;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_VOID;
+
+	if (obj->o_parent != NULL)
+		v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos;
+	else
+		v->cpt_parent = CPT_NULL;
+	v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0;
+	for (i = 0; i < UB_RESOURCES; i++) {
+		dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
+		dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1);
+	}
+	memset(v->cpt_parms + UB_RESOURCES * 2, 0,
+	       sizeof(v->cpt_parms)
+	       - UB_RESOURCES * 2 * sizeof(v->cpt_parms[0]));
+
+	cpt_open_object(obj, ctx);
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_close_object(ctx);
+
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+int cpt_dump_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+	int skipped;
+	int top;
+
+	cpt_open_section(ctx, CPT_SECT_UBC);
+
+	do {
+		skipped = 0;
+		top = 0;
+		for_each_object(obj, CPT_OBJ_UBC) {
+			if (obj->o_parent == NULL)
+				top++;
+			if (obj->o_pos != CPT_NULL)
+				continue;
+			if (obj->o_parent != NULL &&
+			    ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
+				skipped++;
+			else
+				dump_one_bc(obj, ctx);
+		}
+	} while (skipped && (top < 2));
+
+	cpt_close_section(ctx);
+	if (top > 1) {
+		eprintk_ctx("More than one top-level ub exists\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void cpt_finish_ubc(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_UBC)
+		put_beancounter(obj->o_obj);
+}
diff -uprN linux-2.6.18/kernel/cpt/cpt_ubc.h linux-2.6.18.ovz/kernel/cpt/cpt_ubc.h
--- linux-2.6.18/kernel/cpt/cpt_ubc.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_ubc.h	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,23 @@
+#ifdef CONFIG_USER_RESOURCE
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+int cpt_dump_ubc(struct cpt_context *ctx);
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
+int rst_undump_ubc(struct cpt_context *ctx);
+
+void cpt_finish_ubc(struct cpt_context *ctx);
+void rst_finish_ubc(struct cpt_context *ctx);
+void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id);
+void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id);
+#else
+static inline int cpt_dump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static inline int rst_undump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static inline void cpt_finish_ubc(struct cpt_context *ctx)
+{ return; }
+static inline void rst_finish_ubc(struct cpt_context *ctx)
+{ return; }
+#endif
+
diff -uprN linux-2.6.18/kernel/cpt/cpt_x8664.S linux-2.6.18.ovz/kernel/cpt/cpt_x8664.S
--- linux-2.6.18/kernel/cpt/cpt_x8664.S	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/cpt_x8664.S	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,69 @@
+#define ASSEMBLY 1
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+	.code64
+
+	.macro FAKE_STACK_FRAME child_rip
+	/* push in order ss, rsp, eflags, cs, rip */
+	xorq %rax, %rax
+	pushq %rax /* ss */
+	pushq %rax /* rsp */
+	pushq $(1<<9) /* eflags - interrupts on */
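+	/* After the remaining pushes (cs, rip, orig rax) the stack mirrors
+	 * the tail of struct pt_regs: ss, rsp, eflags, cs, rip, orig_rax.
+	 * SAVE_ALL then completes the frame, so do_fork_pid() below is
+	 * handed what looks like an ordinary interrupt frame. */
+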
pushq $__KERNEL_CS /* cs */ + pushq \child_rip /* rip */ + pushq %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + .endm + +ENTRY(asm_kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq $0x00800000,%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + pushq %rcx + call do_fork_pid + addq $8, %rsp + /* call do_fork */ + movq %rax,RAX(%rsp) + xorl %edi,%edi + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(asm_kernel_thread) + +child_rip: + pushq $0 # fake return address + CFI_STARTPROC + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + movq %rax, %rdi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + diff -uprN linux-2.6.18/kernel/cpt/rst_conntrack.c linux-2.6.18.ovz/kernel/cpt/rst_conntrack.c --- linux-2.6.18/kernel/cpt/rst_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_conntrack.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,286 @@ +/* + * + * kernel/cpt/rst_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) do { } while (0) +#define ASSERT_WRITE_LOCK(x) do { } while (0) + +#include + + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack *ct; + int index; +}; + +static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) +{ + tuple->dst.ip = v->cpt_dst; + tuple->dst.u.all = v->cpt_dstport; + tuple->dst.protonum = v->cpt_protonum; + tuple->dst.dir = v->cpt_dir; + if (dir != tuple->dst.dir) + wprintk("dir != tuple->dst.dir\n"); + + tuple->src.ip = v->cpt_src; + tuple->src.u.all = v->cpt_srcport; +} + + +static int undump_expect_list(struct ip_conntrack *ct, + struct cpt_ip_conntrack_image *ci, + loff_t pos, struct ct_holder *ct_list, + cpt_context_t *ctx) +{ + loff_t end; + int err; + + end = pos + ci->cpt_next; + pos += ci->cpt_hdrlen; + while (pos < end) { + struct cpt_ip_connexpect_image v; + struct ip_conntrack_expect *exp; + struct ip_conntrack *sibling; + + err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); + if (err) + return err; + + sibling = NULL; + if (v.cpt_sibling_conntrack) { + struct ct_holder *c; + + for (c = ct_list; c; c = c->next) { + if (c->index == v.cpt_sibling_conntrack) { + sibling = c->ct; + break; + } + } + if (!sibling) { + eprintk_ctx("lost sibling of expectation\n"); + return -EINVAL; + } + } + + write_lock_bh(&ip_conntrack_lock); + + /* It is possible. Helper module could be just unregistered, + * if expectation were on the list, it would be destroyed. */ + if (ct->helper == NULL) { + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); + continue; + } + + exp = ip_conntrack_expect_alloc(NULL); + if (exp == NULL) { + write_unlock_bh(&ip_conntrack_lock); + return -ENOMEM; + } + + if (ct->helper->timeout && !del_timer(&exp->timeout)) { + /* Dying already. We can do nothing. 
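			 * (The timer could not be stopped, so the expectation
			 * is already on its way out; the record is skipped
			 * rather than restored.)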
*/ + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack expectation is dying\n"); + continue; + } + + decode_tuple(&v.cpt_tuple, &exp->tuple, 0); + decode_tuple(&v.cpt_mask, &exp->mask, 0); + + exp->master = ct; + nf_conntrack_get(&ct->ct_general); + ip_conntrack_expect_insert(exp); +#if 0 + if (sibling) { + exp->sibling = sibling; + sibling->master = exp; + LIST_DELETE(&ve_ip_conntrack_expect_list, exp); + ct->expecting--; + nf_conntrack_get(&master_ct(sibling)->infos[0]); + } else +#endif + if (ct->helper->timeout) { + exp->timeout.expires = jiffies + v.cpt_timeout; + add_timer(&exp->timeout); + } + write_unlock_bh(&ip_conntrack_lock); + + pos += v.cpt_next; + } + return 0; +} + +static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, + struct ct_holder **ct_list, cpt_context_t *ctx) +{ + int err = 0; + struct ip_conntrack *conntrack; + struct ct_holder *c; + struct ip_conntrack_tuple orig, repl; + + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) + return -ENOMEM; + + decode_tuple(&ci->cpt_tuple[0], &orig, 0); + decode_tuple(&ci->cpt_tuple[1], &repl, 1); + + conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); + if (!conntrack || IS_ERR(conntrack)) { + kfree(c); + return -ENOMEM; + } + + c->ct = conntrack; + c->next = *ct_list; + *ct_list = c; + c->index = ci->cpt_index; + + decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); + decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); + + conntrack->status = ci->cpt_status; + + memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); + memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); + +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + conntrack->nat.masq_index = ci->cpt_masq_index; +#endif + if (ci->cpt_initialized) { + conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; + conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; + conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; + conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; + conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; + conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; + } + if (conntrack->status & IPS_NAT_DONE_MASK) + ip_nat_hash_conntrack(conntrack); +#endif + + if (ci->cpt_ct_helper) { + conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); + if (conntrack->helper == NULL) { + eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); + err = -EINVAL; + } + } + + ip_conntrack_hash_insert(conntrack); + conntrack->timeout.expires = jiffies + ci->cpt_timeout; + + if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) + err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); + + return err; +} + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ip_conntrack_image ci; + struct ct_holder *c; + struct ct_holder *ct_list = NULL; + + if (sec == CPT_NULL) + return 0; + + if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != 
CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); + if (err) + break; + err = undump_one_ct(&ci, sec, &ct_list, ctx); + if (err) + break; + sec += ci.cpt_next; + } + + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->ct) + add_timer(&c->ct->timeout); + kfree(c); + } + + return err; +} + +#else + +#include "cpt_obj.h" +#include "cpt_context.h" + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) + return -EINVAL; + return 0; +} + +#endif diff -uprN linux-2.6.18/kernel/cpt/rst_context.c linux-2.6.18.ovz/kernel/cpt/rst_context.c --- linux-2.6.18/kernel/cpt/rst_context.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_context.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,324 @@ +/* + * + * kernel/cpt/rst_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? -EIO : err; + return 0; +} + +static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? 
-EIO : err; + return 0; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) +{ + struct cpt_section_hdr hdr; + int err; + loff_t pos; + + pos = ctx->sections[type]; + *start = *end = pos; + + if (pos != CPT_NULL) { + if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) + return err; + if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) + return -EINVAL; + *start = pos + hdr.cpt_hdrlen; + *end = pos + hdr.cpt_next; + } + return 0; +} +EXPORT_SYMBOL(rst_get_section); + +void rst_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->read = file_read; + ctx->pread = file_pread; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) +{ + struct cpt_section_hdr h; + + while (start < end) { + int err; + + err = ctx->pread(&h, sizeof(h), ctx, start); + if (err) + return err; + if (h.cpt_hdrlen < sizeof(h) || + h.cpt_next < h.cpt_hdrlen || + start + h.cpt_next > end) + return -EINVAL; + if (h.cpt_section >= CPT_SECT_MAX) + return -EINVAL; + ctx->sections[h.cpt_section] = start; + start += h.cpt_next; + } + return 0; +} + +int rst_open_dumpfile(struct cpt_context *ctx) +{ + int err; + struct cpt_major_tail *v; + struct cpt_major_hdr h; + unsigned long size; + + err = -EBADF; + if (!ctx->file) + goto err_out; + + err = -ENOMEM; + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + goto err_out; + __cpt_release_buf(ctx); + + size = ctx->file->f_dentry->d_inode->i_size; + + if (size & 7) { + err = -EINVAL; + goto err_out; + } + if (size < sizeof(struct cpt_major_hdr) + + sizeof(struct cpt_major_tail)) { + err = -EINVAL; + goto err_out; + } + err = ctx->pread(&h, sizeof(h), ctx, 0); + if (err) { + eprintk_ctx("too short image 1 %d\n", err); + goto err_out; + } + if (h.cpt_signature[0] != CPT_SIGNATURE0 || + h.cpt_signature[1] != CPT_SIGNATURE1 || + h.cpt_signature[2] != CPT_SIGNATURE2 || + h.cpt_signature[3] != CPT_SIGNATURE3) { + err = -EINVAL; + goto err_out; + } + if (h.cpt_hz != HZ) { + err = -EINVAL; + eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); + goto err_out; + } + ctx->virt_jiffies64 = h.cpt_start_jiffies64; + ctx->start_time.tv_sec = h.cpt_start_sec; + ctx->start_time.tv_nsec = h.cpt_start_nsec; + ctx->kernel_config_flags = h.cpt_kernel_config[0]; + ctx->iptables_mask = h.cpt_iptables_mask; + if (h.cpt_image_version > CPT_VERSION_18 || + CPT_VERSION_MINOR(h.cpt_image_version) > 1) { + eprintk_ctx("Unknown image version: %x. 
Can't restore.\n", + h.cpt_image_version); + err = -EINVAL; + goto err_out; + } + ctx->image_version = h.cpt_image_version; + ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features); + ctx->image_arch = h.cpt_os_arch; + + v = cpt_get_buf(ctx); + err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); + if (err) { + eprintk_ctx("too short image 2 %d\n", err); + cpt_release_buf(ctx); + goto err_out; + } + if (v->cpt_signature[0] != CPT_SIGNATURE0 || + v->cpt_signature[1] != CPT_SIGNATURE1 || + v->cpt_signature[2] != CPT_SIGNATURE2 || + v->cpt_signature[3] != CPT_SIGNATURE3 || + v->cpt_nsect != CPT_SECT_MAX_INDEX) { + err = -EINVAL; + cpt_release_buf(ctx); + goto err_out; + } + if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) { + cpt_release_buf(ctx); + goto err_out; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + ctx->lazypages = v->cpt_lazypages; +#endif + ctx->tasks64 = v->cpt_64bit; + cpt_release_buf(ctx); + return 0; + +err_out: + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + return err; +} + +void rst_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } +} + +int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr *hdr = tmp; + err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); + if (err) + return err; + if (type > 0 && type != hdr->cpt_object) + return -EINVAL; + if (hdr->cpt_hdrlen > hdr->cpt_next) + return -EINVAL; + if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return -EINVAL; + if (size < sizeof(*hdr)) + return -EINVAL; + if (size > hdr->cpt_hdrlen) + size = hdr->cpt_hdrlen; + if (size > sizeof(*hdr)) + err = ctx->pread(hdr+1, size - sizeof(*hdr), + ctx, pos + sizeof(*hdr)); + return err; +} +EXPORT_SYMBOL(_rst_get_object); + +void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) +{ + int err; + void *tmp; + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); + if (err) + return NULL; + if (type > 0 && type != hdr.cpt_object) + return NULL; + if (hdr.cpt_hdrlen > hdr.cpt_next) + return NULL; + if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return NULL; + tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); + if (!tmp) + return NULL; + err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); + if (!err) + return tmp; + kfree(tmp); + return NULL; +} +EXPORT_SYMBOL(__rst_get_object); + +__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr hdr; + __u8 *name; + + err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); + if (err) + return NULL; + if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) + return NULL; + name = (void*)__get_free_page(GFP_KERNEL); + if (!name) + return NULL; + err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, + ctx, *pos_p + hdr.cpt_hdrlen); + if (err) { + free_page((unsigned long)name); + return NULL; + } + *pos_p += hdr.cpt_next; + return name; +} + +__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) +{ + return __rst_get_name(&pos, ctx); +} + +void rst_put_name(__u8 *name, struct cpt_context *ctx) +{ + unsigned long addr = (unsigned long)name; + + if (addr) + free_page(addr&~(PAGE_SIZE-1)); +} diff -uprN linux-2.6.18/kernel/cpt/rst_epoll.c linux-2.6.18.ovz/kernel/cpt/rst_epoll.c --- linux-2.6.18/kernel/cpt/rst_epoll.c 1969-12-31 19:00:00.000000000 
-0500
+++ linux-2.6.18.ovz/kernel/cpt/rst_epoll.c	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,173 @@
+/*
+ *
+ * kernel/cpt/rst_epoll.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+/* These functions are static in fs/eventpoll.c */
+extern struct file_operations eventpoll_fops;
+extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+		     struct file *tfile, int fd);
+extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+extern void ep_release_epitem(struct epitem *epi);
+
+
+struct file *cpt_open_epolldev(struct cpt_file_image *fi,
+			       unsigned flags,
+			       struct cpt_context *ctx)
+{
+	struct file *file;
+	int efd;
+
+	/* Argument "size" is ignored, use just 1 */
+	efd = sys_epoll_create(1);
+	if (efd < 0)
+		return ERR_PTR(efd);
+
+	file = fget(efd);
+	sys_close(efd);
+	return file;
+}
+
+static int restore_one_epoll(cpt_object_t *obj,
+			     loff_t pos,
+			     struct cpt_epoll_image *ebuf,
+			     cpt_context_t *ctx)
+{
+	int err = 0;
+	loff_t endpos;
+	struct file *file = obj->o_obj;
+	struct eventpoll *ep;
+
+	if (file->f_op != &eventpoll_fops) {
+		eprintk_ctx("bad epoll file\n");
+		return -EINVAL;
+	}
+
+	ep = file->private_data;
+
+	if (unlikely(ep == NULL)) {
+		eprintk_ctx("bad epoll device\n");
+		return -EINVAL;
+	}
+
+	endpos = pos + ebuf->cpt_next;
+	pos += ebuf->cpt_hdrlen;
+	while (pos < endpos) {
+		struct cpt_epoll_file_image efi;
+		struct epoll_event epds;
+
+		cpt_object_t *tobj;
+
+		err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
+		if (err)
+			return err;
+		tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
+		if (!tobj) {
+			eprintk_ctx("epoll file not found\n");
+			return -EINVAL;
+		}
+		epds.events = efi.cpt_events;
+		epds.data = efi.cpt_data;
+		down_write(&ep->sem);
+		err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd);
+		if (!err) {
+			struct epitem *epi;
+			epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
+			if (epi) {
+				epi->revents = efi.cpt_revents;
+				if (efi.cpt_ready) {
+					unsigned long flags;
+					write_lock_irqsave(&ep->lock, flags);
+					if (list_empty(&epi->rdllink))
+						list_add_tail(&epi->rdllink, &ep->rdllist);
+					write_unlock_irqrestore(&ep->lock, flags);
+				}
+				ep_release_epitem(epi);
+			}
+		}
+		up_write(&ep->sem);
+		if (err)
+			break;
+		pos += efi.cpt_next;
+	}
+	return err;
+}
+
+int rst_eventpoll(cpt_context_t *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_EPOLL];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		cpt_object_t *obj;
+		struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
+		err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
+		if (err) {
+			cpt_release_buf(ctx);
+			return err;
+		}
+		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
+		if (obj == NULL) {
+			eprintk_ctx("cannot find epoll file object\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+		err =
restore_one_epoll(obj, sec, ebuf, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += ebuf->cpt_next; + } + + return 0; + +} diff -uprN linux-2.6.18/kernel/cpt/rst_files.c linux-2.6.18.ovz/kernel/cpt/rst_files.c --- linux-2.6.18/kernel/cpt/rst_files.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_files.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1635 @@ +/* + * + * kernel/cpt/rst_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" + +#include "cpt_syscalls.h" + + +struct filejob { + struct filejob *next; + int pid; + loff_t fdi; +}; + +static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) +{ + struct filejob *j; + + j = kmalloc(sizeof(*j), GFP_KERNEL); + if (j == NULL) + return -ENOMEM; + j->pid = current->pid; + j->fdi = pos; + j->next = ctx->filejob_queue; + ctx->filejob_queue = j; + return 0; +} + +static void _anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. (Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); + + module_put(THIS_MODULE); +} + +static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} + +static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} + +static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + +static int _anon_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + +static struct pipe_buf_operations _anon_pipe_buf_ops = { + .can_merge = 1, + .map = _anon_pipe_buf_map, + .unmap = _anon_pipe_buf_unmap, + .release = _anon_pipe_buf_release, + .pin = _anon_pipe_buf_pin, + .get = _anon_pipe_buf_get, + .steal = _anon_pipe_buf_steal, +}; + +/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer + * many times. We need to mark it in CPT_OBJ_INODE table in some way. 
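+ * (As dumped, the buffered bytes travel as a CPT_OBJ_BITS block right
+ * after the inode image; fixup_pipe_data() below reads them back in
+ * PAGE_SIZE chunks.)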
+ */ +static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, + struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + struct cpt_inode_image ii; + struct cpt_obj_bits b; + struct pipe_inode_info *info; + int err; + int count; + + if (!S_ISFIFO(ino->i_mode)) { + eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode); + return -EINVAL; + } + if (fi->cpt_inode == CPT_NULL) + return 0; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return err; + + if (ii.cpt_next <= ii.cpt_hdrlen) + return 0; + + err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); + if (err) + return err; + + if (b.cpt_size == 0) + return 0; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + if (info->nrbufs) { + mutex_unlock(&ino->i_mutex); + eprintk("pipe buffer is restored already\n"); + return -EINVAL; + } + info->curbuf = 0; + count = 0; + while (count < b.cpt_size) { + struct pipe_buffer *buf = info->bufs + info->nrbufs; + void * addr; + int chars; + + chars = b.cpt_size - count; + if (chars > PAGE_SIZE) + chars = PAGE_SIZE; + if (!try_module_get(THIS_MODULE)) { + err = -EBUSY; + break; + } + + buf->page = alloc_page(GFP_HIGHUSER); + if (buf->page == NULL) { + err = -ENOMEM; + break; + } + buf->ops = &_anon_pipe_buf_ops; + buf->offset = 0; + buf->len = chars; + info->nrbufs++; + addr = kmap(buf->page); + err = ctx->pread(addr, chars, ctx, + fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); + if (err) + break; + count += chars; + } + mutex_unlock(&ino->i_mutex); + + return err; +} + +static int make_flags(struct cpt_file_image *fi) +{ + int flags = O_NOFOLLOW; + switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + flags |= O_RDWR; break; + case FMODE_WRITE: + flags |= O_WRONLY; break; + case FMODE_READ: + flags |= O_RDONLY; break; + default: break; + } + flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); + flags |= O_NONBLOCK|O_NOCTTY; + return flags; +} + +static struct file *open_pipe(char *name, + struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct cpt_inode_image ii; + struct file *rf, *wf; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return ERR_PTR(err); + + if (ii.cpt_sb == FSMAGIC_PIPEFS) { + int pfd[2]; + + if ((err = sc_pipe(pfd)) < 0) + return ERR_PTR(err); + + rf = fcheck(pfd[0]); + wf = fcheck(pfd[1]); + get_file(rf); + get_file(wf); + sc_close(pfd[0]); + sc_close(pfd[1]); + + if (fi->cpt_mode&FMODE_READ) { + struct file *tf; + tf = wf; wf = rf; rf = tf; + } + } else { + if (fi->cpt_mode&FMODE_READ) { + rf = filp_open(name, flags, 0); + if (IS_ERR(rf)) { + dprintk_ctx("filp_open\n"); + return rf; + } + dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), + (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); + return rf; + } + + dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode); + + rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); + if (IS_ERR(rf)) + return rf; + wf = dentry_open(dget(rf->f_dentry), + mntget(rf->f_vfsmnt), flags); + } + + /* Add pipe inode to obj table. 
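+	 * A later rst_file() for the same cpt_inode can then take the
+	 * "easy way" and clone this end via dentry_open() from o_parent.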
	 */
+	obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
+	if (obj == NULL) {
+		fput(rf); fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	cpt_obj_setpos(obj, fi->cpt_inode, ctx);
+	obj->o_parent = rf;
+
+	/* Add the other side of the pipe to the obj table; it will not be
+	 * used (o_pos == CPT_NULL), other processes opening the pipe will
+	 * find the inode and open it with dentry_open(). */
+	obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
+	if (obj == NULL) {
+		fput(wf);
+		return ERR_PTR(-ENOMEM);
+	}
+	return wf;
+}
+
+static struct file *open_special(struct cpt_file_image *fi,
+				 unsigned flags,
+				 int deleted,
+				 struct cpt_context *ctx)
+{
+	struct cpt_inode_image *ii;
+	struct file *file;
+
+	/* Directories and named pipes are not actually special */
+	if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
+		return NULL;
+
+	/* No support for block devices at the moment. */
+	if (S_ISBLK(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	if (S_ISSOCK(fi->cpt_i_mode)) {
+		eprintk_ctx("bug: socket is not open\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Support only (some) character devices at the moment. */
+	if (!S_ISCHR(fi->cpt_i_mode))
+		return ERR_PTR(-EINVAL);
+
+	ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
+	if (ii == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Do not worry about this right now. /dev/null,zero,*random are here.
+	 * Should we prohibit at least /dev/mem?
+	 */
+	if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
+		kfree(ii);
+		return NULL;
+	}
+
+	file = rst_open_tty(fi, ii, flags, ctx);
+	kfree(ii);
+	return file;
+}
+
+static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx)
+{
+	struct file_lock lock;
+	cpt_object_t *obj;
+
+	memset(&lock, 0, sizeof(lock));
+	lock.fl_type = fli->cpt_type;
+	lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
+	lock.fl_start = fli->cpt_start;
+	lock.fl_end = fli->cpt_end;
+	obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
+	if (!obj) {
+		eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
+		return -EINVAL;
+	}
+	lock.fl_owner = obj->o_obj;
+	lock.fl_pid = vpid_to_pid(fli->cpt_pid);
+	if (lock.fl_pid < 0) {
+		eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
+		return -EINVAL;
+	}
+	lock.fl_file = file;
+
+	if (lock.fl_owner == NULL)
+		eprintk_ctx("no lock owner\n");
+	return posix_lock_file(file, &lock);
+}
+
+static int restore_flock(struct file *file, struct cpt_flock_image *fli,
+			 cpt_context_t *ctx)
+{
+	int cmd, err, fd;
+	fd = get_unused_fd();
+	if (fd < 0) {
+		eprintk_ctx("BSD flock cannot be restored\n");
+		return fd;
+	}
+	get_file(file);
+	fd_install(fd, file);
+	if (fli->cpt_type == F_RDLCK) {
+		cmd = LOCK_SH;
+	} else if (fli->cpt_type == F_WRLCK) {
+		cmd = LOCK_EX;
+	} else {
+		eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
+		sc_close(fd);
+		return -EINVAL;
+	}
+
+	err = sc_flock(fd, LOCK_NB | cmd);
+	sc_close(fd);
+	return err;
+}
+
+
+static int fixup_posix_locks(struct file *file,
+			     struct cpt_file_image *fi,
+			     loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	loff_t end;
+	struct cpt_flock_image fli;
+
+	end = pos + fi->cpt_next;
+	pos += fi->cpt_hdrlen;
+	while (pos < end) {
+		err = rst_get_object(-1, pos, &fli, ctx);
+		if (err)
+			return err;
+		if (fli.cpt_object == CPT_OBJ_FLOCK &&
+		    (fli.cpt_flags&FL_POSIX)) {
+			err = restore_posix_lock(file, &fli, ctx);
+			if (err)
+				return err;
+			dprintk_ctx("posix lock restored\n");
+		}
+		pos += fli.cpt_next;
+	}
+	return 0;
+}
+
+int rst_posix_locks(struct cpt_context *ctx)
+{
+	int err;
+	cpt_object_t *obj;
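+
+	/* Image layout walked here (all sizes come from the image headers):
+	 *
+	 *   obj->o_pos -> [cpt_file_image: cpt_hdrlen .. cpt_next]
+	 *                 [cpt_flock_image] ... [cpt_flock_image]
+	 *
+	 * Each record advances by its own cpt_next.  fixup_posix_locks()
+	 * replays only FL_POSIX records; FL_FLOCK records are handled by
+	 * fixup_flocks() when the file itself is restored. */
+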
for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct cpt_file_image fi; + + if (obj->o_pos == CPT_NULL) + continue; + + err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); + if (err < 0) + return err; + if (fi.cpt_next > fi.cpt_hdrlen) + fixup_posix_locks(file, &fi, obj->o_pos, ctx); + } + return 0; +} + +static int fixup_flocks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_FLOCK)) { + err = restore_flock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("bsd lock restored\n"); + } + pos += fli.cpt_next; + } + return 0; +} + + +static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + int err; + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_op->write; + if (do_write == NULL) { + eprintk_ctx("no write method. Cannot restore contents of the file.\n"); + return -EINVAL; + } + + atomic_inc(&file->f_count); + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + goto out; + dprintk_ctx("restoring file data block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + goto out; + } + if (!(file->f_mode & FMODE_WRITE) || + (file->f_flags&O_DIRECT)) { + fput(file); + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_WRONLY); + if (IS_ERR(file)) { + __cpt_release_buf(ctx); + return PTR_ERR(file); + } + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + if (err >= 0) + err = -EIO; + goto out; + } + count -= copy; + } + pos += pgb.cpt_next; + } + err = 0; + +out: + fput(file); + return err; +} + + +static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, + struct cpt_inode_image *ii, + struct cpt_context *ctx) +{ + int err; + struct file *file = *file_p; + struct iattr newattrs; + + if (!S_ISREG(fi->cpt_i_mode)) + return 0; + + if (file == NULL) { + file = shmem_file_setup("dev/zero", ii->cpt_size, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + *file_p = file; + } + + if (ii->cpt_next > ii->cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen); + if (err) + return err; + if (hdr.cpt_object == CPT_OBJ_PAGES) { + err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen, + fi->cpt_inode+ii->cpt_next, ctx); + if (err) + return err; + } + } + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + /* stage 1 - update size like do_truncate does */ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_size = ii->cpt_size; + cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime); + err = notify_change(file->f_dentry, &newattrs); + if (err) + goto out; 
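+
+	/* The size is restored first and on its own because notify_change()
+	 * with ATTR_SIZE truncates the file and touches the inode times as
+	 * a side effect; the saved times, owner and mode are applied in the
+	 * second pass below, once the size has settled. */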
+ + /* stage 2 - update times, owner and mode */ + newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME_SET | + ATTR_MODE | ATTR_UID | ATTR_GID; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT; + newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT); + cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime); + cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime); + err = notify_change(file->f_dentry, &newattrs); + +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return err; +} + +static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, + int was_dentry_open, loff_t pos, + cpt_context_t *ctx) +{ + if (fi->cpt_pos != file->f_pos) { + int err = -ESPIPE; + if (file->f_op->llseek) + err = file->f_op->llseek(file, fi->cpt_pos, 0); + if (err < 0) { + dprintk_ctx("file %Ld lseek %Ld - %Ld\n", + (long long)pos, + (long long)file->f_pos, + (long long)fi->cpt_pos); + file->f_pos = fi->cpt_pos; + } + } + file->f_uid = fi->cpt_uid; + file->f_gid = fi->cpt_gid; + file->f_owner.pid = 0; + if (fi->cpt_fown_pid) { + file->f_owner.pid = comb_vpid_to_pid(fi->cpt_fown_pid); + if (file->f_owner.pid == 0) { + wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", file->f_owner.pid); + return -EINVAL; + } + } + file->f_owner.uid = fi->cpt_fown_uid; + file->f_owner.euid = fi->cpt_fown_euid; + file->f_owner.signum = fi->cpt_fown_signo; + + if (file->f_mode != fi->cpt_mode) { + if (was_dentry_open && + ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { + file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); + file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); + } + if (file->f_mode != fi->cpt_mode) + wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); + } + if (file->f_flags != fi->cpt_flags) { + if (!(fi->cpt_flags&O_NOFOLLOW)) + file->f_flags &= ~O_NOFOLLOW; + if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { + file->f_flags &= ~O_NONBLOCK; + file->f_flags |= fi->cpt_flags&O_NONBLOCK; + } + if (fi->cpt_flags&FASYNC) { + if (fi->cpt_fown_fd == -1) { + wprintk_ctx("No fd for FASYNC\n"); + return -EINVAL; + } else if (file->f_op && file->f_op->fasync) { + if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { + wprintk_ctx("FASYNC problem\n"); + return -EINVAL; + } else { + file->f_flags |= FASYNC; + } + } + } + if (file->f_flags != fi->cpt_flags) { + eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); + return -EINVAL; + } + } + return 0; +} + +static struct file * +open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, + struct cpt_inode_image *ii, cpt_context_t *ctx) +{ + struct file * file; + char *suffix = NULL; + int attempt = 0; + int tmp_pass = 0; + mode_t mode = fi->cpt_i_mode; + + /* Strip (deleted) part... 
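+	 * the image may carry either a " (deleted)" suffix or a
+	 * "(deleted) " prefix; both forms are handled here.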
*/ + if (strlen(name) > strlen(" (deleted)")) { + if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { + suffix = &name[strlen(name) - strlen(" (deleted)")]; + *suffix = 0; + } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { + memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); + suffix = name + strlen(name); + } + } + +try_again: + for (;;) { + if (attempt) { + if (attempt > 1000) { + eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); + return ERR_PTR(-EEXIST); + } + if (suffix == NULL) { + eprintk_ctx("open_deleted: no suffix\n"); + return ERR_PTR(-EEXIST); + } + sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); + } + attempt++; + + if (S_ISFIFO(mode)) { + int err; + err = sc_mknod(name, S_IFIFO|(mode&017777), 0); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = open_pipe(name, fi, flags, ctx); + sc_unlink(name); + } else if (S_ISCHR(mode)) { + int err; + err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); + kfree(ii); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_unlink(name); + } else if (S_ISDIR(mode)) { + int err; + err = sc_mkdir(name, mode&017777); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_rmdir(name); + } else { + file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); + if (IS_ERR(file)) { + if (PTR_ERR(file) == -EEXIST) + continue; + if (!tmp_pass) + goto change_dir; + } else { + sc_unlink(name); + } + } + break; + } + + if (IS_ERR(file)) { + eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); + return file; + } else { + dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); + } + return file; + +change_dir: + sprintf(name, "/tmp/rst%u", current->pid); + suffix = name + strlen(name); + attempt = 1; + tmp_pass = 1; + goto try_again; +} + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) +{ + int err; + int was_dentry_open = 0; + cpt_object_t *obj; + cpt_object_t *iobj; + struct cpt_file_image fi; + __u8 *name = NULL; + struct file *file; + int flags; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); + if (obj) { + file = obj->o_obj; + if (obj->o_index >= 0) { + dprintk_ctx("file is attached to a socket\n"); + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + fixup_file_flags(file, &fi, 0, pos, ctx); + } + get_file(file); + return file; + } + + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + + flags = make_flags(&fi); + + /* Easy way, inode has been already open. 
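+	 * The matching CPT_OBJ_INODE entry keeps an opened struct file in
+	 * o_parent, so dentry_open() can clone it without a path lookup.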
*/ + if (fi.cpt_inode != CPT_NULL && + !(fi.cpt_lflags & CPT_DENTRY_CLONING) && + (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && + iobj->o_parent) { + struct file *filp = iobj->o_parent; + file = dentry_open(dget(filp->f_dentry), + mntget(filp->f_vfsmnt), flags); + dprintk_ctx("rst_file: file obtained by dentry_open\n"); + was_dentry_open = 1; + goto map_file; + } + + if (fi.cpt_next > fi.cpt_hdrlen) + name = rst_get_name(pos + sizeof(fi), ctx); + + if (!name) { + eprintk_ctx("no name for file?\n"); + err = -EINVAL; + goto err_out; + } + + if (fi.cpt_lflags & CPT_DENTRY_DELETED) { + struct cpt_inode_image ii; + if (fi.cpt_inode == CPT_NULL) { + eprintk_ctx("deleted file and no inode.\n"); + err = -EINVAL; + goto err_out; + } + + err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx); + if (err) + goto err_out; + + if (ii.cpt_next > ii.cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, + fi.cpt_inode + ii.cpt_hdrlen); + if (err) + goto err_out; + if (hdr.cpt_object == CPT_OBJ_NAME) { + rst_put_name(name, ctx); + name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen, + ctx); + if (!name) { + eprintk_ctx("no name for link?\n"); + err = -EINVAL; + goto err_out; + } + goto open_file; + } + } + + /* One very special case... */ + if (S_ISREG(fi.cpt_i_mode) && + (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { + /* MAP_ANON|MAP_SHARED mapping. + * kernel makes this damn ugly way, when file which + * is passed to mmap by user does not match + * file finally attached to VMA. Ok, rst_mm + * has to take care of this. Otherwise, it will fail. + */ + file = NULL; + } else if (S_ISREG(fi.cpt_i_mode) || + S_ISCHR(fi.cpt_i_mode) || + S_ISFIFO(fi.cpt_i_mode) || + S_ISDIR(fi.cpt_i_mode)) { + if (S_ISCHR(fi.cpt_i_mode)) { + file = open_special(&fi, flags, 1, ctx); + if (file != NULL) + goto map_file; + } + file = open_deleted(name, flags, &fi, &ii, ctx); + if (IS_ERR(file)) + goto out; + } else { + eprintk_ctx("not a regular deleted file.\n"); + err = -EINVAL; + goto err_out; + } + + err = fixup_file_content(&file, &fi, &ii, ctx); + if (err) + goto err_put; + goto map_file; + } else { +open_file: + if (!name[0]) { + eprintk_ctx("empty name for file?\n"); + err = -EINVAL; + goto err_out; + } + if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && + (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) + goto map_file; + if (S_ISFIFO(fi.cpt_i_mode) && + (file = open_pipe(name, &fi, flags, ctx)) != NULL) + goto map_file; + if (!S_ISREG(fi.cpt_i_mode) && + (file = open_special(&fi, flags, 0, ctx)) != NULL) + goto map_file; + } + + file = filp_open(name, flags, 0); + +map_file: + if (!IS_ERR(file)) { + fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); + + if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { + err = fixup_pipe_data(file, &fi, ctx); + if (err) + goto err_put; + } + + /* This is very special hack. Logically, cwd/root are + * nothing but open directories. Nevertheless, this causes + * failures of restores, when number of open files in VE + * is close to limit. So, if it is rst_file() of cwd/root + * (fd = -2) and the directory is not deleted, we skip + * adding files to object table. If the directory is + * not unlinked, this cannot cause any problems. 
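+	 * (These cwd/root opens are the rst_file(pos, -2, ctx) calls made
+	 * by get_dir() while fs_struct is being restored.)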
+ */ + if (fd != -2 || + !S_ISDIR(file->f_dentry->d_inode->i_mode) || + (fi.cpt_lflags & CPT_DENTRY_DELETED)) { + obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); + if (!obj) { + obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); + if (obj) + get_file(file); + } + if (obj) + cpt_obj_setpos(obj, pos, ctx); + + obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (obj) { + cpt_obj_setpos(obj, fi.cpt_inode, ctx); + if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED)) + obj->o_parent = file; + } + } + + if (fi.cpt_next > fi.cpt_hdrlen) { + err = fixup_flocks(file, &fi, pos, ctx); + if (err) + goto err_put; + } + } else { + if (fi.cpt_lflags & CPT_DENTRY_PROC) { + dprintk_ctx("rst_file /proc delayed\n"); + file = NULL; + } else if (name) + eprintk_ctx("can't open file %s\n", name); + } + +out: + if (name) + rst_put_name(name, ctx); + return file; + +err_put: + if (file) + fput(file); +err_out: + if (name) + rst_put_name(name, ctx); + return ERR_PTR(err); +} + + +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (ti->cpt_files == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) + flag |= CLONE_FILES; + if (ti->cpt_fs == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) + flag |= CLONE_FS; + return flag; +} + +static void local_close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->fdt->max_fdset || i >= files->fdt->max_fds) + break; + set = files->fdt->open_fds->fds_bits[j]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fdt->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + files->fdt->open_fds->fds_bits[j] = 0; + files->fdt->close_on_exec->fds_bits[j] = 0; + j++; + } +} + +extern int expand_fdtable(struct files_struct *files, int nr); + + +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct cpt_files_struct_image fi; + struct files_struct *f = current->files; + cpt_object_t *obj; + loff_t pos, endpos; + int err; + + if (ti->cpt_files == CPT_NULL) { + current->files = NULL; + if (f) + put_files_struct(f); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); + if (obj) { + if (obj->o_obj != f) { + put_files_struct(f); + f = obj->o_obj; + atomic_inc(&f->count); + current->files = f; + } + return 0; + } + + err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); + if (err) + return err; + + local_close_files(f); + + if (fi.cpt_max_fds > f->fdt->max_fds) { + spin_lock(&f->file_lock); + err = expand_fdtable(f, fi.cpt_max_fds-1); + spin_unlock(&f->file_lock); + if (err) + return err; + } + + pos = ti->cpt_files + fi.cpt_hdrlen; + endpos = ti->cpt_files + fi.cpt_next; + while (pos < endpos) { + struct cpt_fd_image fdi; + struct file *filp; + + err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); + if (err) + return err; + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), + (long long)fdi.cpt_file); + return PTR_ERR(filp); + } + if (filp == NULL) { + int err = rst_filejob_queue(pos, ctx); + if (err) + return err; + } else { + if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); + f->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, f->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); + } + pos += fdi.cpt_next; + } + f->next_fd = fi.cpt_next_fd; + + obj = 
cpt_object_add(CPT_OBJ_FILES, f, ctx); + if (obj) { + cpt_obj_setpos(obj, ti->cpt_files, ctx); + cpt_obj_setindex(obj, fi.cpt_index, ctx); + } + return 0; +} + +int rst_do_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + int err; + struct task_struct *tsk; + struct cpt_fd_image fdi; + struct file *filp; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ve(j->pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (!tsk) + return -EINVAL; + + err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); + if (err) { + put_task_struct(tsk); + return err; + } + + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + if (tsk->files->fdt->fd[fdi.cpt_fd] || + FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { + eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi); + put_task_struct(tsk); + return -EBUSY; + } + + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file); + put_task_struct(tsk); + return PTR_ERR(filp); + } + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + tsk->files->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); + + dprintk_ctx("filejob %Ld done\n", j->fdi); + + put_task_struct(tsk); + ctx->filejob_queue = j->next; + kfree(j); + } + return 0; +} + +void rst_flush_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + ctx->filejob_queue = j->next; + kfree(j); + } +} + +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct fs_struct *f = current->fs; + cpt_object_t *obj; + + if (ti->cpt_fs == CPT_NULL) { + exit_fs(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_fs(current); + f = obj->o_obj; + atomic_inc(&f->count); + current->fs = f; + } + return 0; + } + + /* Do _not_ restore root. Image contains absolute pathnames. + * So, we fix it in context of rst process. 
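+	 * rst_restore_fs() replays root, pwd and altroot explicitly via
+	 * __set_fs_root() and __set_fs_pwd().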
+ */ + + obj = cpt_object_add(CPT_OBJ_FS, f, ctx); + if (obj) + cpt_obj_setpos(obj, ti->cpt_fs, ctx); + + return 0; +} + +static int get_dir(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx) +{ + struct cpt_file_image fi; + struct file * file; + int err; + + err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); + if (err) + return err; + + file = rst_file(*pos, -2, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + + *dp = dget(file->f_dentry); + *mp = mntget(file->f_vfsmnt); + *pos += fi.cpt_next; + fput(file); + return 0; +} + +static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_root; + struct vfsmount *old_rootmnt; + write_lock(&fs->lock); + old_root = fs->root; + old_rootmnt = fs->rootmnt; + fs->rootmnt = mnt; + fs->root = dentry; + write_unlock(&fs->lock); + if (old_root) { + dput(old_root); + mntput(old_rootmnt); + } +} + +static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_pwd; + struct vfsmount *old_pwdmnt; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + old_pwdmnt = fs->pwdmnt; + fs->pwdmnt = mnt; + fs->pwd = dentry; + write_unlock(&fs->lock); + + if (old_pwd) { + dput(old_pwd); + mntput(old_pwdmnt); + } +} + + +int rst_restore_fs(struct cpt_context *ctx) +{ + loff_t pos; + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FS) { + struct cpt_fs_struct_image fi; + struct fs_struct *fs = obj->o_obj; + int i; + struct dentry *d[3]; + struct vfsmount *m[3]; + + err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); + if (err) + return err; + + fs->umask = fi.cpt_umask; + + pos = obj->o_pos + fi.cpt_hdrlen; + d[0] = d[1] = d[2] = NULL; + m[0] = m[1] = m[2] = NULL; + i = 0; + while (pos < obj->o_pos + fi.cpt_next && i<3) { + err = get_dir(d+i, m+i, &pos, ctx); + if (err) { + eprintk_ctx("cannot get_dir: %d", err); + for (--i; i >= 0; i--) { + if (d[i]) + dput(d[i]); + if (m[i]) + mntput(m[i]); + } + return err; + } + i++; + } + if (d[0]) + __set_fs_root(fs, m[0], d[0]); + if (d[1]) + __set_fs_pwd(fs, m[1], d[1]); + if (d[2]) { + struct dentry *olddentry; + struct vfsmount *oldmnt; + write_lock(&fs->lock); + oldmnt = fs->altrootmnt; + olddentry = fs->altroot; + fs->altrootmnt = m[2]; + fs->altroot = d[2]; + write_unlock(&fs->lock); + + if (olddentry) { + dput(olddentry); + mntput(oldmnt); + } + } + } + return err; +} + +int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, + unsigned long flags, unsigned long mnt_flags, + struct cpt_context *ctx) +{ + int err; + + if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) + mntbind = NULL; + + if (mntbind) + flags |= MS_BIND; + /* Join per-mountpoint flags with global flags */ + if (mnt_flags & MNT_NOSUID) + flags |= MS_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= MS_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= MS_NOEXEC; + + err = sc_mount(mntbind, mntpnt, mnttype, flags); + if (err < 0) { + eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); + return err; + } + return 0; +} + +static int undumptmpfs(void *arg) +{ + int i; + int *pfd = arg; + int fd1, fd2, err; + char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; + + if (pfd[0] != 0) + sc_dup2(pfd[0], 0); + + set_fs(KERNEL_DS); + fd1 = sc_open("/dev/null", O_WRONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); +try: + if (fd1 < 0 || fd2 < 0) { + if (fd1 == -ENOENT && fd2 == -ENOENT) { + err = sc_mknod("/dev/null", S_IFCHR|0666, + 
new_encode_dev((MEM_MAJOR<files->fdt->max_fds; i++) + sc_close(i); + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) +{ + int err; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + int n; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + + err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); + if (err < 0) + goto out; + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = *pos + v.cpt_hdrlen; + end = *pos + v.cpt_next; + *pos += v.cpt_next; + do { + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n = sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + clear_tsk_thread_flag(current,TIF_SIGPENDING); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + return err; +} + +int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx) +{ + struct namespace *n = current->nsproxy->namespace; + struct list_head *p; + struct vfsmount *t; + char *path, *path_buf; + int ret; + + ret = -ENOENT; + path_buf = cpt_get_buf(ctx); + down_read(&namespace_sem); + list_for_each(p, &n->list) { + t = list_entry(p, struct vfsmount, mnt_list); + path = d_path(t->mnt_root, t, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + if (!strcmp(path, mntpnt) && + !strcmp(t->mnt_sb->s_type->name, mnttype)) { + ret = 0; + break; + } + } + up_read(&namespace_sem); + __cpt_release_buf(ctx); + return ret; +} + +int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t endpos; + + endpos = pos + mi->cpt_next; + pos += mi->cpt_hdrlen; + + while (pos < endpos) { + char *mntdev; + char *mntpnt; + char *mnttype; + char *mntbind; + + mntdev = __rst_get_name(&pos, ctx); + mntpnt = __rst_get_name(&pos, ctx); + mnttype = __rst_get_name(&pos, ctx); + mntbind = NULL; + if (mi->cpt_mntflags & CPT_MNT_BIND) + mntbind = __rst_get_name(&pos, ctx); + err = -EINVAL; + if (mnttype && mntpnt) { + err = 0; + if (!(mi->cpt_mntflags & CPT_MNT_EXT) && + strcmp(mntpnt, "/")) { + err = do_one_mount(mntpnt, mnttype, mntbind, + mi->cpt_flags, + mi->cpt_mntflags, ctx); + if (!err && + strcmp(mnttype, "tmpfs") == 0 && + !(mi->cpt_mntflags & (CPT_MNT_BIND))) + err = rst_restore_tmpfs(&pos, ctx); + } else if (mi->cpt_mntflags & CPT_MNT_EXT) { + err = check_ext_mount(mntpnt, mnttype, ctx); + if (err) + eprintk_ctx("mount point is missing: %s\n", mntpnt); + } + } + if (mntdev) + rst_put_name(mntdev, ctx); + if (mntpnt) + rst_put_name(mntpnt, ctx); + if (mnttype) + rst_put_name(mnttype, ctx); + if (mntbind) + rst_put_name(mntbind, ctx); + if (err) + return err; + } + return 0; +} + +int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) 
+{ + int err; + struct cpt_vfsmount_image mi; + + while (pos < endpos) { + err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); + if (err) + return err; + err = restore_one_vfsmount(&mi, pos, ctx); + if (err) + return err; + pos += mi.cpt_next; + } + return 0; +} + +int rst_root_namespace(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr sbuf; + int done = 0; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); + if (err) + return err; + if (done) { + eprintk_ctx("multiple namespaces are not supported\n"); + break; + } + done++; + err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + + return 0; +} + +int rst_stray_files(struct cpt_context *ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_FILES]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_object_hdr sbuf; + cpt_object_t *obj; + + err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); + if (err) + break; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); + if (!obj) { + struct file *file; + + dprintk_ctx("stray file %Ld\n", sec); + + file = rst_sysv_shm(sec, ctx); + + if (IS_ERR(file)) { + eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } else { + fput(file); + } + } + sec += sbuf.cpt_next; + } + + return err; +} diff -uprN linux-2.6.18/kernel/cpt/rst_mm.c linux-2.6.18.ovz/kernel/cpt/rst_mm.c --- linux-2.6.18/kernel/cpt/rst_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_mm.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1097 @@ +/* + * + * kernel/cpt/rst_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
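
Every reader in this restore path scans the dump with the same cursor discipline seen in rst_root_namespace() and rst_stray_files() above: a section header bounds the scan, cpt_hdrlen jumps to the first object, and each object's cpt_next advances to its sibling. A minimal userspace sketch of that walk; the struct here is a deliberately simplified stand-in, not the real cpt_section_hdr/cpt_object_hdr layout:

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in: the real image headers also carry object type,
 * content type and section id. */
struct img_hdr {
	uint64_t cpt_next;	/* bytes from this header to the next sibling */
	uint32_t cpt_hdrlen;	/* bytes from this header to its payload */
};

static void walk_section(const unsigned char *img, uint64_t sec)
{
	const struct img_hdr *h = (const struct img_hdr *)(img + sec);
	uint64_t endsec = sec + h->cpt_next;
	uint64_t pos = sec + h->cpt_hdrlen;

	while (pos < endsec) {
		const struct img_hdr *obj = (const struct img_hdr *)(img + pos);

		printf("object at %llu, %llu bytes\n",
		       (unsigned long long)pos, (unsigned long long)obj->cpt_next);
		pos += obj->cpt_next;	/* same step as "sec += sbuf.cpt_next" */
	}
}

int main(void)
{
	uint64_t buf[6] = {0};	/* one section holding two empty objects */
	struct img_hdr *h = (struct img_hdr *)buf;

	h->cpt_next = 48;
	h->cpt_hdrlen = 16;
	((struct img_hdr *)((unsigned char *)buf + 16))->cpt_next = 16;
	((struct img_hdr *)((unsigned char *)buf + 32))->cpt_next = 16;
	walk_section((unsigned char *)buf, 0);
	return 0;
}
```
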
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#include +#endif +#include +#include +#include + +#ifdef CONFIG_VE +#include +#include +#endif + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_ubc.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif + +#include "cpt_syscalls.h" + +#define __PAGE_NX (1ULL<<63) + +static unsigned long make_prot(struct cpt_vma_image *vmai) +{ + unsigned long prot = 0; + + if (vmai->cpt_flags&VM_READ) + prot |= PROT_READ; + if (vmai->cpt_flags&VM_WRITE) + prot |= PROT_WRITE; + if (vmai->cpt_flags&VM_EXEC) + prot |= PROT_EXEC; + if (vmai->cpt_flags&VM_GROWSDOWN) + prot |= PROT_GROWSDOWN; + if (vmai->cpt_flags&VM_GROWSUP) + prot |= PROT_GROWSUP; + return prot; +} + +static unsigned long make_flags(struct cpt_vma_image *vmai) +{ + unsigned long flags = MAP_FIXED; + + if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) + flags |= MAP_SHARED; + else + flags |= MAP_PRIVATE; + + if (vmai->cpt_file == CPT_NULL) + flags |= MAP_ANONYMOUS; + if (vmai->cpt_flags&VM_GROWSDOWN) + flags |= MAP_GROWSDOWN; +#ifdef MAP_GROWSUP + if (vmai->cpt_flags&VM_GROWSUP) + flags |= MAP_GROWSUP; +#endif + if (vmai->cpt_flags&VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vmai->cpt_flags&VM_EXECUTABLE) + flags |= MAP_EXECUTABLE; + if (!(vmai->cpt_flags&VM_ACCOUNT)) + flags |= MAP_NORESERVE; + return flags; +} + +#ifdef CONFIG_X86 +#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) +static int __alloc_ldt(mm_context_t *pc, int mincount) +{ + int oldsize, newsize, i; + + if (mincount <= pc->size) + return 0; + /* + * LDT got larger - reallocate if necessary. 
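
The make_prot()/make_flags() pair above is a pure bit translation from the saved vm_flags to the prot/flags arguments that do_mmap_pgoff() expects, so it can be exercised outside the kernel. This sketch restates a few VM_* values as local constants (they are kernel-internal; the numbers match 2.6.18 x86) and maps them onto the userspace PROT_* bits:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Kernel-internal bits, restated for the sketch (2.6.18 x86 values). */
#define VM_READ		0x00000001
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_GROWSDOWN	0x00000100

static unsigned long image_prot(unsigned long cpt_flags)
{
	unsigned long prot = 0;

	if (cpt_flags & VM_READ)
		prot |= PROT_READ;
	if (cpt_flags & VM_WRITE)
		prot |= PROT_WRITE;
	if (cpt_flags & VM_EXEC)
		prot |= PROT_EXEC;
	if (cpt_flags & VM_GROWSDOWN)
		prot |= PROT_GROWSDOWN;
	return prot;
}

int main(void)
{
	/* A read-write anonymous region as recorded in a vma image. */
	printf("prot = %#lx\n", image_prot(VM_READ | VM_WRITE));
	return 0;
}
```
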
+ */ + oldsize = pc->size; + mincount = (mincount+511)&(~511); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } + pc->size = mincount; + return 0; +} + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int i; + int err; + int size; + + err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); + if (err) + return err; + + size = mm->context.size*LDT_ENTRY_SIZE; + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); + kunmap(mm->context.ldt_pages[nr]); + if (err) + return err; + } + + load_LDT(&mm->context); + return 0; +} + +#else + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int oldsize = mm->context.size; + void *oldldt; + void *newldt; + int err; + + if (li->cpt_size > PAGE_SIZE) + newldt = vmalloc(li->cpt_size); + else + newldt = kmalloc(li->cpt_size, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); + if (err) + return err; + + oldldt = mm->context.ldt; + mm->context.ldt = newldt; + mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; + + load_LDT(&mm->context); + + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} +#endif +#endif + +static int +restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) +{ + struct aio_ring_info *info = &aio_ctx->ring_info; + unsigned nr_events = aio_ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* We recalculate parameters of the ring exactly like + * fs/aio.c does and then compare calculated values + * with ones, stored in dump. They must be the same. */ + + nr_events += 2; + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages != aimg->cpt_ring_pages) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + if (nr_events != aimg->cpt_nr) + return -EINVAL; + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + + /* This piece of shit is not entirely my fault. Kernel aio.c makes + * something odd mmap()ping some pages and then pinning them. + * I guess it is just some mud remained of failed attempt to show ring + * to user space. The result is odd. :-) Immediately after + * creation of AIO context, kernel shares those pages with user + * and user can read and even write there. But after the first + * fork, pages are marked COW with evident consequences. + * I remember, I did the same mistake in the first version + * of mmapped packet socket, luckily that crap never reached + * mainstream. + * + * So, what are we going to do? 
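
Before the pinning discussed in this comment, restore_aio_ring() re-derives the ring geometry and insists that it match the dump. The arithmetic stands alone; the 32-byte struct sizes below are assumptions for x86-64, not values taken from the patch:

```c
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define AIO_RING_SZ	32UL	/* assumed sizeof(struct aio_ring) */
#define IO_EVENT_SZ	32UL	/* assumed sizeof(struct io_event) */

int main(void)
{
	unsigned nr_events = 128;	/* aio_ctx->max_reqs from the image */
	unsigned long size;
	int nr_pages;

	/* Same recalculation as fs/aio.c and restore_aio_ring(). */
	nr_events += 2;
	size = AIO_RING_SZ + IO_EVENT_SZ * nr_events;
	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Both results must match cpt_ring_pages/cpt_nr in the dump,
	 * otherwise the restore fails with -EINVAL. */
	nr_events = (PAGE_SIZE * nr_pages - AIO_RING_SZ) / IO_EVENT_SZ;
	printf("nr_pages = %d, nr_events = %u\n", nr_pages, nr_events);
	return 0;
}
```
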
I can simulate this odd behaviour
+	 * exactly, but I am not insane yet. For now just take the pages
+	 * from user space. Alternatively, we could keep kernel copy
+	 * in AIO context image, which would be more correct.
+	 *
+	 * What is wrong now? If the pages are COWed, ring is transferred
+	 * incorrectly.
+	 */
+	down_read(&current->mm->mmap_sem);
+	info->mmap_base = aimg->cpt_mmap_base;
+	info->nr_pages = get_user_pages(current, current->mm,
+					info->mmap_base, nr_pages,
+					1, 0, info->ring_pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (unlikely(info->nr_pages != nr_pages)) {
+		int i;
+
+		for (i=0; i<info->nr_pages; i++)
+			put_page(info->ring_pages[i]);
+		if (info->ring_pages && info->ring_pages != info->internal_pages)
+			kfree(info->ring_pages);
+		return -EFAULT;
+	}
+
+	aio_ctx->user_id = info->mmap_base;
+
+	info->nr = nr_events;
+	info->tail = aimg->cpt_tail;
+
+	return 0;
+}
+
+static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
+{
+	int err;
+	struct kioctx *aio_ctx;
+	extern spinlock_t aio_nr_lock;
+
+	aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!aio_ctx)
+		return -ENOMEM;
+
+	memset(aio_ctx, 0, sizeof(*aio_ctx));
+	aio_ctx->max_reqs = aimg->cpt_max_reqs;
+
+	if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
+		kmem_cache_free(kioctx_cachep, aio_ctx);
+		eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
+		return err;
+	}
+
+	aio_ctx->mm = current->mm;
+	atomic_inc(&aio_ctx->mm->mm_count);
+	atomic_set(&aio_ctx->users, 1);
+	spin_lock_init(&aio_ctx->ctx_lock);
+	spin_lock_init(&aio_ctx->ring_info.ring_lock);
+	init_waitqueue_head(&aio_ctx->wait);
+	INIT_LIST_HEAD(&aio_ctx->active_reqs);
+	INIT_LIST_HEAD(&aio_ctx->run_list);
+	INIT_WORK(&aio_ctx->wq, aio_kick_handler, aio_ctx);
+
+	spin_lock(&aio_nr_lock);
+	aio_nr += aio_ctx->max_reqs;
+	spin_unlock(&aio_nr_lock);
+
+	write_lock(&aio_ctx->mm->ioctx_list_lock);
+	aio_ctx->next = aio_ctx->mm->ioctx_list;
+	aio_ctx->mm->ioctx_list = aio_ctx;
+	write_unlock(&aio_ctx->mm->ioctx_list_lock);
+
+	return 0;
+}
+
+struct anonvma_map
+{
+	struct hlist_node list;
+	struct anon_vma *avma;
+	__u64 id;
+};
+
+static int verify_create_anonvma(struct mm_struct *mm,
+				 struct cpt_vma_image *vmai,
+				 cpt_context_t *ctx)
+{
+	struct anon_vma *avma = NULL;
+	struct anon_vma *new_avma;
+	struct vm_area_struct *vma;
+	int h;
+
+	if (!ctx->anonvmas) {
+		if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
+			return -EINVAL;
+		if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
+			return -ENOMEM;
+		for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
+			INIT_HLIST_HEAD(&ctx->anonvmas[h]);
+	} else {
+		struct anonvma_map *map;
+		struct hlist_node *elem;
+
+		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+		hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
+			if (map->id == vmai->cpt_anonvmaid) {
+				avma = map->avma;
+				break;
+			}
+		}
+	}
+
+	down_read(&mm->mmap_sem);
+	if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
+		up_read(&mm->mmap_sem);
+		return -ESRCH;
+	}
+	if (vma->vm_start != vmai->cpt_start) {
+		up_read(&mm->mmap_sem);
+		eprintk_ctx("vma start mismatch\n");
+		return -EINVAL;
+	}
+	if (vma->vm_pgoff != vmai->cpt_pgoff) {
+		dprintk_ctx("vma pgoff mismatch, fixing\n");
+		if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
+			eprintk_ctx("cannot fixup vma pgoff\n");
+			up_read(&mm->mmap_sem);
+			return -EINVAL;
+		}
+		vma->vm_pgoff = vmai->cpt_pgoff;
+	}
+
+	if (!vma->anon_vma) {
+		if (avma) {
+			vma->anon_vma = avma;
+			anon_vma_link(vma);
+		} else {
+			int err;
+
+			err = 
anon_vma_prepare(vma); + + if (err) { + up_read(&mm->mmap_sem); + return err; + } + } + } else { + /* Note, we _can_ arrive to the situation, when two + * different anonvmaid's point to one anon_vma, this happens + * f.e. when mmap() merged new area to previous one and + * they will share one anon_vma even if they did not on + * original host. + * + * IT IS OK. To all that I understand, we may merge all + * the anon_vma's and rmap can scan all the huge list of vmas + * searching for page. It is just "suboptimal". + * + * Real disaster would happen, if vma already got an anon_vma + * with different id. It is very rare case, kernel does the + * best efforts to merge anon_vmas when some attributes are + * different. In this case we will fall to copying memory. + */ + if (avma && vma->anon_vma != avma) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch\n"); + return 0; + } + } + + new_avma = vma->anon_vma; + up_read(&mm->mmap_sem); + + if (!avma) { + struct anonvma_map *map; + + if (!new_avma) + return -EINVAL; + + if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) + return -ENOMEM; + + map->id = vmai->cpt_anonvmaid; + map->avma = new_avma; + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_add_head(&map->list, &ctx->anonvmas[h]); + } + return 0; +} + +static int copy_mm_pages(struct mm_struct *src, unsigned long start, + unsigned long end) +{ + int err; + + for (; start < end; start += PAGE_SIZE) { + struct page *page; + struct page *spage; + void *maddr, *srcaddr; + + err = get_user_pages(current, current->mm, + start, 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) + return err; + + err = get_user_pages(current, src, + start, 1, 0, 1, &spage, NULL); + + if (err == 0) + err = -EFAULT; + if (err < 0) { + page_cache_release(page); + return err; + } + + srcaddr = kmap(spage); + maddr = kmap(page); + memcpy(maddr, srcaddr, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + kunmap(spage); + page_cache_release(page); + page_cache_release(spage); + } + return 0; +} + +static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) +{ + int err = 0; + unsigned long addr; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct file *file = NULL; + unsigned long prot; + int checked = 0; + + prot = make_prot(vmai); + + if (vmai->cpt_file != CPT_NULL) { + if (vmai->cpt_type == CPT_VMA_TYPE_0) { + file = rst_file(vmai->cpt_file, -1, ctx); + if (IS_ERR(file)) { + eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file); + return PTR_ERR(file); + } + } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { + file = rst_sysv_shm(vmai->cpt_file, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + } + } + + down_write(&mm->mmap_sem); + addr = do_mmap_pgoff(file, vmai->cpt_start, + vmai->cpt_end-vmai->cpt_start, + prot, make_flags(vmai), + vmai->cpt_pgoff); + + if (addr != vmai->cpt_start) { + up_write(&mm->mmap_sem); + + err = -EINVAL; + if (IS_ERR((void*)addr)) + err = addr; + goto out; + } + + vma = find_vma(mm, vmai->cpt_start); + if (vma == NULL) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot find mmapped vma\n"); + err = -ESRCH; + goto out; + } + + /* do_mmap_pgoff() can merge new area to previous one (not to the next, + * we mmap in order, the rest of mm is still unmapped). This can happen + * f.e. if flags are to be adjusted later, or if we had different + * anon_vma on two adjacent regions. Split it by brute force. 
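
The contract enforced around do_mmap_pgoff() above is worth seeing in isolation: with MAP_FIXED, the mapping either lands exactly at the requested address or the restore must be abandoned, since every pointer inside the checkpointed process assumes the old layout. A userspace sketch of the same check (the address is an arbitrary example, not one a real image would dictate):

```c
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *want = (void *)0x40000000UL;	/* plays the role of vmai->cpt_start */
	size_t len = 2 * 4096;

	void *got = mmap(want, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (got == MAP_FAILED)
		return 1;

	/* do_rst_vma() treats any other return address as a fatal
	 * mismatch (-EINVAL) rather than accepting a moved mapping. */
	printf("%s\n", got == want ? "restored in place" : "mismatch");
	return munmap(got, len);
}
```
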
*/ + if (vma->vm_start != vmai->cpt_start) { + dprintk_ctx("vma %Ld merged, split\n", vmapos); + err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); + if (err) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot split vma\n"); + goto out; + } + } + up_write(&mm->mmap_sem); + + if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { + err = verify_create_anonvma(mm, vmai, ctx); + if (err) { + eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); + goto out; + } + } + + if (vmai->cpt_next > vmai->cpt_hdrlen) { + loff_t offset = vmapos + vmai->cpt_hdrlen; + + do { + union { + struct cpt_page_block pb; + struct cpt_remappage_block rpb; + struct cpt_copypage_block cpb; + struct cpt_lazypage_block lpb; + struct cpt_iterpage_block ipb; + } u; + loff_t pos; + + err = rst_get_object(-1, offset, &u, ctx); + if (err) { + eprintk_ctx("vma fix object: %d\n", err); + goto out; + } + if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { + err = sc_remap_file_pages(u.rpb.cpt_start, + u.rpb.cpt_end-u.rpb.cpt_start, + 0, u.rpb.cpt_pgoff, 0); + if (err < 0) { + eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, + (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), + (__u32)u.rpb.cpt_pgoff); + goto out; + } + offset += u.rpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + unsigned long ptr = u.lpb.cpt_start; + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE, + ptr, ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { + struct vm_area_struct *vma, *vma1; + struct mm_struct *src; + struct anon_vma *src_anon; + cpt_object_t *mobj; + + if (!vmai->cpt_anonvmaid) { + err = -EINVAL; + eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); + if (!mobj) { + eprintk_ctx("lost mm_struct to clone pages from\n"); + err = -ESRCH; + goto out; + } + src = mobj->o_obj; + + down_read(&src->mmap_sem); + src_anon = NULL; + vma1 = find_vma(src, u.cpb.cpt_start); + if (vma1) + src_anon = vma1->anon_vma; + up_read(&src->mmap_sem); + + if (!vma1) { + eprintk_ctx("lost src vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + if (!src_anon || + !vma->anon_vma || + vma->anon_vma != src_anon || + vma->vm_start - vma1->vm_start != + (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); + err = copy_mm_pages(mobj->o_obj, + u.cpb.cpt_start, + u.cpb.cpt_end); + } else { + err = __copy_page_range(vma, vma1, + u.cpb.cpt_start, + u.cpb.cpt_end-u.cpb.cpt_start); + up_read(&mm->mmap_sem); + } + if (err) { + eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, + (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), + (long)u.cpb.cpt_source); + goto out; + } + + offset += u.cpb.cpt_next; + continue; + } else if 
(u.pb.cpt_object == CPT_OBJ_ITERPAGES || + u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES + ) { +#ifdef CONFIG_VZ_CHECKPOINT_ITER + unsigned long ptr = u.lpb.cpt_start; + u64 page_pos[16]; + pos = offset + sizeof(u.pb); + + err = ctx->pread(&page_pos, + 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE, + ctx, + pos); + if (err) { + eprintk_ctx("Oops\n"); + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_iter(vma, + page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE], + ptr, + ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) { + make_pages_present((unsigned long)u.lpb.cpt_start, + (unsigned long)u.lpb.cpt_end); + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } + if (u.pb.cpt_object != CPT_OBJ_PAGES) { + eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); + err = -EINVAL; + goto out; + } + pos = offset + sizeof(u.pb); + if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { + /* I guess this is get_user_pages() messed things, + * this happens f.e. when gdb inserts breakpoints. + */ + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { + struct page *page; + void *maddr; + err = get_user_pages(current, current->mm, + (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + memset(maddr, 0, PAGE_SIZE); + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + err = ctx->pread(maddr, PAGE_SIZE, + ctx, pos + i*PAGE_SIZE); + if (err) { + kunmap(page); + goto out; + } + } else { + err = -EINVAL; + kunmap(page); + goto out; + } + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + } + } else { + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { + err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); + if (err) { + eprintk_ctx("__put_user 2 %d\n", err); + goto out; + } + } + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + loff_t tpos = pos; + err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), + u.pb.cpt_end-u.pb.cpt_start, + &tpos); + if (err != u.pb.cpt_end-u.pb.cpt_start) { + if (err >= 0) + err = -EIO; + goto out; + } + } else { + err = -EINVAL; + goto out; + } + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + } + err = 0; + offset += u.pb.cpt_next; + } while (offset < vmapos + vmai->cpt_next); + } + +check: + do { + struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (vma) { + if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { + VM_ClearReadHint(vma); + vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; + } + if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { + dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); + up_read(&mm->mmap_sem); + if (vma->vm_flags&VM_LOCKED) + err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + else + err = 
sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + /* When mlock fails with EFAULT, it means + * that it could not bring in pages. + * It can happen after mlock() on unreadable + * VMAs. But VMA is correctly locked, + * so that this error can be ignored. */ + if (err == -EFAULT) + err = 0; + if (err) + goto out; + goto check; + } + if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (unsigned long long)vma->vm_page_prot.pgprot, + (unsigned long long)vmai->cpt_pgprot); +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && + (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); +#endif + if (vma->vm_flags != vmai->cpt_flags) { + unsigned long x = vma->vm_flags ^ vmai->cpt_flags; + if (x & VM_EXEC) { + /* Crap. On i386 this is OK. + * It is impossible to make via mmap/mprotect + * exec.c clears VM_EXEC on stack. */ + vma->vm_flags &= ~VM_EXEC; + } else if ((x & VM_ACCOUNT) && !checked) { + checked = 1; + if (!(prot&PROT_WRITE)) { + up_read(&mm->mmap_sem); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + goto check; + } + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } else { + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } + } + } else { + wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); + } + up_read(&mm->mmap_sem); + } while (0); + +out: + if (file) + fput(file); + return err; +} + +#ifndef CONFIG_IA64 +#define TASK_UNMAP_START 0 +#else +/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping + * used to accelerate speculative dereferences of NULL pointer. */ +#define TASK_UNMAP_START PAGE_SIZE +#endif + +static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) +{ + int err = 0; + unsigned int def_flags; + struct mm_struct *mm = current->mm; +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *bc; +#endif + + down_write(&mm->mmap_sem); + do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START); + +#ifdef CONFIG_USER_RESOURCE + /* + * MM beancounter is usually correct from the fork time, + * but not for init, for example. + * Luckily, mm_ub can be changed for a completely empty MM. 
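
The VM_LOCKED fixup just above tolerates -EFAULT from mlock() for the reason the comment gives: on an unreadable VMA the kernel marks the region locked but cannot fault its pages in. That behaviour is observable from plain userspace:

```c
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_NONE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* The VMA ends up flagged VM_LOCKED even when the fault-in step
	 * fails; 2.6.18-era kernels report that as EFAULT, which is
	 * exactly the error the restore path whitelists. */
	if (mlock(p, len) < 0)
		printf("mlock failed with errno %d\n", errno);

	return munmap(p, len);
}
```
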
+ */ + bc = rst_lookup_ubc(vmi->cpt_mmub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc); + if (err & NOTIFY_FAIL) { + up_write(&mm->mmap_sem); + return -ECHRNG; + } + if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) { + struct user_beancounter *old_bc; + + old_bc = mm->mm_ub; + mm->mm_ub = bc; + bc = old_bc; + } + err = 0; + put_beancounter(bc); +#endif + + mm->start_code = vmi->cpt_start_code; + mm->end_code = vmi->cpt_end_code; + mm->start_data = vmi->cpt_start_data; + mm->end_data = vmi->cpt_end_data; + mm->start_brk = vmi->cpt_start_brk; + mm->brk = vmi->cpt_brk; + mm->start_stack = vmi->cpt_start_stack; + mm->arg_start = vmi->cpt_start_arg; + mm->arg_end = vmi->cpt_end_arg; + mm->env_start = vmi->cpt_start_env; + mm->env_end = vmi->cpt_end_env; + mm->def_flags = 0; + def_flags = vmi->cpt_def_flags; + + mm->dumpable = vmi->cpt_dumpable; + mm->vps_dumpable = vmi->cpt_vps_dumpable; + +#if 0 /* def CONFIG_HUGETLB_PAGE*/ +/* NB: ? */ + int used_hugetlb; +#endif + up_write(&mm->mmap_sem); + + if (vmi->cpt_next > vmi->cpt_hdrlen) { + loff_t offset = pos + vmi->cpt_hdrlen; + do { + union { + struct cpt_vma_image vmai; + struct cpt_aio_ctx_image aioi; + struct cpt_obj_bits bits; + } u; + err = rst_get_object(-1, offset, &u, ctx); + if (err) + goto out; + if (u.vmai.cpt_object == CPT_OBJ_VMA) { +#ifdef CONFIG_IA64 + //// Later... + if (u.vmai.cpt_start) +#endif + err = do_rst_vma(&u.vmai, offset, pos, ctx); + if (err) + goto out; +#ifdef CONFIG_X86 + } else if (u.bits.cpt_object == CPT_OBJ_BITS && + u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { + err = do_rst_ldt(&u.bits, offset, ctx); + if (err) + goto out; +#endif + } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { + err = do_rst_aio(&u.aioi, offset, ctx); + if (err) + goto out; + } else { + eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); + err = -EINVAL; + goto out; + } + offset += u.vmai.cpt_next; + } while (offset < pos + vmi->cpt_next); + } + + down_write(&mm->mmap_sem); + mm->def_flags = def_flags; + up_write(&mm->mmap_sem); + + +out: + return err; +} + +extern void exit_mm(struct task_struct * tsk); + +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err = 0; + cpt_object_t *mobj; + void *tmp = (void*)__get_free_page(GFP_KERNEL); + struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; + + if (!tmp) + return -ENOMEM; + + if (ti->cpt_mm == CPT_NULL) { + if (current->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + current); + exit_mm(current); + } + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + if (current->mm != mobj->o_obj) BUG(); + goto out; + } + + if (current->mm == NULL) { + struct mm_struct *mm = mm_alloc(); + if (mm == NULL) { + err = -ENOMEM; + goto out; + } + err = init_new_context(current, mm); + if (err) { + mmdrop(mm); + goto out; + } + current->mm = mm; + } + + if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) + goto out; + if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) { + eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm); + goto out; + } + err = -ENOMEM; + mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); + if (mobj != NULL) { + err = 0; + cpt_obj_setpos(mobj, ti->cpt_mm, ctx); + } + +out: + if (tmp) + free_page((unsigned long)tmp); + return err; +} + +/* This is part of mm setup, made in parent context. Mostly, it is the place, + * where we graft mm of another process to child. 
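
rst_mm_complete() above leans on the context's object table: the first task to rebuild an mm registers the live object keyed by its image position via cpt_object_add(), and every later task that shared that mm finds it again with lookup_cpt_obj_bypos(). A toy, list-based version of that registry (the real table also records object type and per-object state):

```c
#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj *next;
	long long pos;	/* position of the object in the image */
	void *live;	/* the reconstructed in-kernel object */
};

static struct obj *table;

static void *lookup_bypos(long long pos)
{
	struct obj *o;

	for (o = table; o; o = o->next)
		if (o->pos == pos)
			return o->live;
	return NULL;
}

static int object_add(long long pos, void *live)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return -1;
	o->pos = pos;
	o->live = live;
	o->next = table;
	table = o;
	return 0;
}

int main(void)
{
	static int mm = 42;		/* stands in for a restored mm_struct */

	if (!lookup_bypos(1024))	/* first task: miss, so create */
		object_add(1024, &mm);
	/* later tasks with the same cpt_mm position share the object */
	printf("shared mm: %d\n", *(int *)lookup_bypos(1024));
	return 0;
}
```
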
+ */ + +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + cpt_object_t *mobj; + + /* Task without mm. Just get rid of this. */ + if (ti->cpt_mm == CPT_NULL) { + if (tsk->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + tsk); + mmput(tsk->mm); + tsk->mm = NULL; + } + return 0; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + struct mm_struct *newmm = mobj->o_obj; + /* Good, the MM is already created. */ + if (newmm == tsk->mm) { + /* Already done by clone(). */ + return 0; + } + mmput(tsk->mm); + atomic_inc(&newmm->mm_users); + tsk->mm = newmm; + tsk->active_mm = newmm; + } + return 0; +} + +/* We use CLONE_VM when mm of child is going to be shared with parent. + * Otherwise mm is copied. + */ + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + if (ti->cpt_mm == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) + return CLONE_VM; + return 0; +} diff -uprN linux-2.6.18/kernel/cpt/rst_net.c linux-2.6.18.ovz/kernel/cpt/rst_net.c --- linux-2.6.18/kernel/cpt/rst_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_net.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,501 @@ +/* + * + * kernel/cpt/rst_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_net.h" + +#include "cpt_syscalls.h" + +extern struct in_ifaddr *inet_alloc_ifa(void); +extern int inet_insert_ifa(struct in_ifaddr *ifa); + +int rst_restore_ifaddr(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ifaddr_image di; + struct net_device *dev; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int cindex = -1; + int err; + err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); + if (err) + return err; + cindex = di.cpt_index; + rtnl_lock(); + dev = __dev_get_by_index(cindex); + if (dev && di.cpt_family == AF_INET) { + struct in_device *in_dev; + struct in_ifaddr *ifa; + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + in_dev = inetdev_init(dev); + ifa = inet_alloc_ifa(); + if (ifa) { + ifa->ifa_local = di.cpt_address[0]; + ifa->ifa_address = di.cpt_peer[0]; + ifa->ifa_broadcast = di.cpt_broadcast[0]; + ifa->ifa_prefixlen = di.cpt_masklen; + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + ifa->ifa_flags = di.cpt_flags; + ifa->ifa_scope = di.cpt_scope; + memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + err = inet_insert_ifa(ifa); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + } else if (dev && di.cpt_family == AF_INET6) { + __u32 prefered_lft; + __u32 valid_lft; + prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ? 
+ 0 : di.cpt_prefered_lft; + valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ? + 0xFFFFFFFF : di.cpt_valid_lft; + err = inet6_addr_add(dev->ifindex, + (struct in6_addr *)di.cpt_address, + di.cpt_masklen, + prefered_lft, + valid_lft); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } +#endif + } else { + rtnl_unlock(); + eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); + return -EINVAL; + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +} + +static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) +{ + int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + struct rtmsg *rtm = NLMSG_DATA(nlh); + __u32 prefix0 = 0; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(rta, attrlen)) { + if (rta->rta_type == RTA_DST) { + prefix0 = *(__u32*)RTA_DATA(rta); + } + rta = RTA_NEXT(rta, attrlen); + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (rtm->rtm_family == AF_INET6) { + if (rtm->rtm_type == RTN_LOCAL) + return 2; + if (rtm->rtm_flags & RTM_F_CLONED) + return 2; + if (rtm->rtm_protocol == RTPROT_UNSPEC || + rtm->rtm_protocol == RTPROT_RA || + rtm->rtm_protocol == RTPROT_REDIRECT || + rtm->rtm_protocol == RTPROT_KERNEL) + return 2; + if (rtm->rtm_protocol == RTPROT_BOOT && + ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || + (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) + return 2; + } +#endif + return rtm->rtm_protocol == RTPROT_KERNEL; +} + +int rst_restore_route(struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct sockaddr_nl nladdr; + mm_segment_t oldfs; + loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr v; + char *pg; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen >= h.cpt_next) + return 0; + + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); + if (err < 0) + return err; + + err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + endsec = sec + v.cpt_next; + sec += v.cpt_hdrlen; + + while (sec < endsec) { + struct nlmsghdr *n; + struct nlmsghdr nh; + int kernel_flag; + + if (endsec - sec < sizeof(nh)) + break; + + err = ctx->pread(&nh, sizeof(nh), ctx, sec); + if (err) + goto out_sock_pg; + if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE || + endsec - sec < nh.nlmsg_len) { + err = -EINVAL; + goto out_sock_pg; + } + err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); + if (err) + goto out_sock_pg; + + n = (struct nlmsghdr*)pg; + n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; + + err = rewrite_rtmsg(n, ctx); + if (err < 0) + goto out_sock_pg; + kernel_flag = err; + + if (kernel_flag == 2) + goto do_next; + + iov.iov_base=n; + iov.iov_len=nh.nlmsg_len; + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); 
+	err = sock_sendmsg(sock, &msg, nh.nlmsg_len);
+	set_fs(oldfs);
+
+	if (err < 0)
+		goto out_sock_pg;
+	err = 0;
+
+	iov.iov_base=pg;
+	iov.iov_len=PAGE_SIZE;
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+	set_fs(oldfs);
+	if (err != -EAGAIN) {
+		if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) &&
+		    n->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *e = NLMSG_DATA(n);
+			if (e->error != -EEXIST || !kernel_flag)
+				eprintk_ctx("NLMERR: %d\n", e->error);
+		} else {
+			eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type);
+		}
+	}
+do_next:
+	err = 0;
+	sec += NLMSG_ALIGN(nh.nlmsg_len);
+	}
+
+out_sock_pg:
+	free_page((unsigned long)pg);
+out_sock:
+	sock_release(sock);
+	return err;
+}
+
+int rst_resume_network(struct cpt_context *ctx)
+{
+	struct ve_struct *env;
+
+	env = get_ve_by_id(ctx->ve_id);
+	if (!env)
+		return -ESRCH;
+	env->disable_net = 0;
+	put_ve(env);
+	return 0;
+}
+
+int rst_restore_netdev(struct cpt_context *ctx)
+{
+	int err;
+	loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE];
+	loff_t endsec;
+	struct cpt_section_hdr h;
+	struct cpt_netdev_image di;
+	struct net_device *dev;
+
+	get_exec_env()->disable_net = 1;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	endsec = sec + h.cpt_next;
+	sec += h.cpt_hdrlen;
+	while (sec < endsec) {
+		int err;
+		struct net_device *dev_new;
+		err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx);
+		if (err)
+			return err;
+		rtnl_lock();
+		dev = __dev_get_by_name(di.cpt_name);
+		if (dev) {
+			if (dev->ifindex != di.cpt_index) {
+				dev_new = __dev_get_by_index(di.cpt_index);
+				if (!dev_new) {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+							dev_index_hash(dev->ifindex,
+								get_exec_env()));
+					write_unlock_bh(&dev_base_lock);
+				} else {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					hlist_del(&dev_new->index_hlist);
+					if (dev_new->iflink == dev_new->ifindex)
+						dev_new->iflink = dev->ifindex;
+					dev_new->ifindex = dev->ifindex;
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+							dev_index_hash(dev->ifindex,
+								get_exec_env()));
+					hlist_add_head(&dev_new->index_hlist,
+							dev_index_hash(dev_new->ifindex,
+								get_exec_env()));
+					write_unlock_bh(&dev_base_lock);
+				}
+			}
+			if (di.cpt_flags^dev->flags) {
+				err = dev_change_flags(dev, di.cpt_flags);
+				if (err)
+					eprintk_ctx("dev_change_flags err: %d\n", err);
+			}
+		} else {
+			eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
+		}
+		rtnl_unlock();
+		sec += di.cpt_next;
+	}
+	return 0;
+}
+
+static int dumpfn(void *arg)
+{
+	int i;
+	int *pfd = arg;
+	char *argv[] = { "iptables-restore", "-c", NULL };
+
+	if (pfd[0] != 0)
+		sc_dup2(pfd[0], 0);
+
+	for (i=1; i<current->files->fdt->max_fds; i++)
+		sc_close(i);
+
+	module_put(THIS_MODULE);
+
+	set_fs(KERNEL_DS);
+	i = sc_execve("/sbin/iptables-restore", argv, NULL);
+	if (i == -ENOENT)
+		i = sc_execve("/usr/sbin/iptables-restore", argv, NULL);
+	eprintk("failed to exec iptables-restore: %d\n", i);
+	return 255 << 8;
+}
+
+static int rst_restore_iptables(struct cpt_context * ctx)
+{
+	int err;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	int n;
+	struct cpt_section_hdr h;
+	loff_t sec = 
ctx->sections[CPT_SECT_NET_IPTABLES]; + loff_t end; + int pid; + int status; + mm_segment_t oldfs; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen == h.cpt_next) + return 0; + if (h.cpt_hdrlen > h.cpt_next) + return -EINVAL; + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); + if (err < 0) + return err; + + err = sc_pipe(pfd); + if (err < 0) + return err; + pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); + if (err < 0) + goto out; + f = fget(pfd[1]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + ctx->file->f_pos = sec + v.cpt_hdrlen; + end = sec + v.cpt_next; + do { + char *p; + char buf[16]; + + n = end - ctx->file->f_pos; + if (n > sizeof(buf)) + n = sizeof(buf); + + if (ctx->read(buf, n, ctx)) + break; + if ((p = memchr(buf, 0, n)) != NULL) + n = p - buf; + oldfs = get_fs(); set_fs(KERNEL_DS); + f->f_op->write(f, buf, n, &f->f_pos); + set_fs(oldfs); + } while (ctx->file->f_pos < end); + + fput(f); + + clear_tsk_thread_flag(current,TIF_SIGPENDING); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("iptables-restore exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-restore terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + + return err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + return err; +} + +int rst_restore_net(struct cpt_context *ctx) +{ + int err; + + err = rst_restore_netdev(ctx); + if (!err) + err = rst_restore_ifaddr(ctx); + if (!err) + err = rst_restore_route(ctx); + if (!err) + err = rst_restore_iptables(ctx); + if (!err) + err = rst_restore_ip_conntrack(ctx); + return err; +} diff -uprN linux-2.6.18/kernel/cpt/rst_proc.c linux-2.6.18.ovz/kernel/cpt/rst_proc.c --- linux-2.6.18/kernel/cpt/rst_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_proc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,577 @@ +/* + * + * kernel/cpt/rst_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
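
undumptmpfs() and dumpfn() above are two instances of one pattern: spawn a thread, splice a pipe onto its stdin, close every other descriptor, and exec a userspace helper (tar, iptables-restore) that knows how to parse the byte stream, then reap it and translate the exit status. The ordinary-userspace shape of that pattern, with /bin/cat standing in for the helper:

```c
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int pfd[2], status;
	const char msg[] = "payload streamed from the dump file\n";

	if (pipe(pfd) < 0)
		return 1;

	if (fork() == 0) {			/* the helper side */
		char *argv[] = { "cat", NULL };

		dup2(pfd[0], 0);		/* like sc_dup2(pfd[0], 0) */
		close(pfd[0]);
		close(pfd[1]);			/* close everything else */
		execv("/bin/cat", argv);
		_exit(255);			/* cf. the 255 << 8 failure code */
	}

	close(pfd[0]);
	if (write(pfd[1], msg, sizeof(msg) - 1) < 0)	/* the image-copy loop */
		return 1;
	close(pfd[1]);				/* EOF lets the helper finish */

	wait(&status);				/* like sc_waitx() + status check */
	return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
}
```
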
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + len += pagein_info_printf(buffer+len, ctx); +#endif + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void rst_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + rst_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + + rst_close_dumpfile(ctx); + + if (ctx->anonvmas) { + int h; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { + while (!hlist_empty(&ctx->anonvmas[h])) { + struct hlist_node *elem = ctx->anonvmas[h].first; + hlist_del(elem); + kfree(elem); + } + } + free_page((unsigned long)ctx->anonvmas); + } + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } +#ifdef CONFIG_VZ_CHECKPOINT_ITER + rst_drop_iter_dir(ctx); +#endif +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); +#endif + if (ctx->filejob_queue) + rst_flush_filejobs(ctx); + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + rst_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * rst_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + rst_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +void rst_report_error(int err, cpt_context_t *ctx) +{ + if (ctx->statusfile) { + mm_segment_t oldfs; + int status = 7 /* VZ_ENVCREATE_ERROR */; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) + ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); + set_fs(oldfs); + fput(ctx->statusfile); + 
ctx->statusfile = NULL; + } +} + + +static cpt_context_t * cpt_context_lookup(unsigned int ctxid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == ctxid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + + unlock_kernel(); + + if (cmd == CPT_TEST_CAPS) { + err = test_cpu_caps(); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = rst_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + err = -EINVAL; + if (ctx->contextid && ctx->contextid != contextid) + goto out_nosem; + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->read == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_PAGEIND: + err = rst_pageind(ctx); + break; +#endif +#ifdef CONFIG_VZ_CHECKPOINT_ITER + case CPT_ITER: + err = rst_iteration(ctx); + break; +#endif + case CPT_SET_LOCKFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + 
dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->lockfile) + fput(ctx->lockfile); + ctx->lockfile = dfile; + break; + case CPT_SET_STATUSFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->statusfile) + fput(ctx->statusfile); + ctx->statusfile = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_UNDUMP: + if (ctx->ctx_state > 0) { + err = -ENOENT; + break; + } + ctx->ctx_state = CPT_CTX_UNDUMPING; + err = vps_rst_undump(ctx); + if (err) { + rst_report_error(err, ctx); + if (rst_kill(ctx) == 0) + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_UNDUMPED; + } + break; + case CPT_RESUME: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + return err; +} + +static int rst_open(struct inode * inode, struct file * file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int rst_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + + module_put(THIS_MODULE); + return 0; +} + +static struct file_operations rst_fops = +{ + .owner = THIS_MODULE, + .ioctl = rst_ioctl, + .open = rst_open, + .release = rst_release, +}; + + +static struct proc_dir_entry *proc_ent; +extern void *schedule_tail_p; +extern void schedule_tail_hook(void); + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .ctl_name = 9476, + .procname = "rst", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_rst(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table, 0); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = create_proc_entry_mod("rst", 0600, NULL, THIS_MODULE); + if (!proc_ent) + goto err_out; + + rst_fops.read = proc_ent->proc_fops->read; + rst_fops.write = proc_ent->proc_fops->write; + rst_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &rst_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_rst); + +static void __exit exit_rst(void) +{ + remove_proc_entry("rst", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while 
(!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_rst); diff -uprN linux-2.6.18/kernel/cpt/rst_process.c linux-2.6.18.ovz/kernel/cpt/rst_process.c --- linux-2.6.18/kernel/cpt/rst_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_process.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1594 @@ +/* + * + * kernel/cpt/rst_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + + +#define HOOK_RESERVE 256 + +struct resume_info +{ + void (*hook)(struct resume_info *); + unsigned long hooks; +#define HOOK_TID 0 +#define HOOK_CONT 1 +#define HOOK_LSI 2 +#define HOOK_RESTART 3 + unsigned long tid_ptrs[2]; + siginfo_t last_siginfo; +}; + +#ifdef CONFIG_X86_32 + +#define IN_SYSCALL(regs) ((long)(regs)->orig_eax >= 0) +#define IN_ERROR(regs) ((long)(regs)->eax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->eax)) +#define SYSCALL_RETVAL(regs) ((regs)->eax) +#define SYSCALL_NR(regs) ((regs)->orig_eax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->eax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->eax = (new); \ + (regs)->eip -= 2; } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +/* In new kernels task_pt_regs() is define to something inappropriate */ +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) + +#elif defined(CONFIG_X86_64) + +#define IN_SYSCALL(regs) ((long)(regs)->orig_rax >= 0) +#define IN_ERROR(regs) ((long)(regs)->rax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->rax)) +#define SYSCALL_RETVAL(regs) ((regs)->rax) +#define SYSCALL_NR(regs) ((regs)->orig_rax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->rax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->rax = (new); \ + (regs)->rip -= 2; } while (0) + +#define __NR32_restart_syscall 0 +#define __NR32_rt_sigtimedwait 177 +#define __NR32_pause 29 +#define __NR32_futex 240 + +#define syscall_is(tsk,regs,name) ((!((tsk)->thread_info->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR_##name) || \ + (((tsk)->thread_info->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR32_##name)) + +#elif defined (CONFIG_IA64) + +#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0) +#define IN_ERROR(regs) ((long)(regs)->r10 == -1) +#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0) +#define SYSCALL_RETVAL(regs) ((regs)->r8) +#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? 
(regs)->r15 : -1) + +#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \ + (regs)->r10 = 0; \ + ia64_decrement_ip(regs); } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +#else + +#error This arch is not supported + +#endif + +#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs)) + + +static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) +{ + memset(info, 0, sizeof(*info)); + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + info->si_tid = si->cpt_pid; + info->si_overrun = si->cpt_uid; + info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); + info->si_sys_private = si->cpt_utime; + break; + case __SI_POLL: + info->si_band = si->cpt_pid; + info->si_fd = si->cpt_uid; + break; + case __SI_FAULT: + info->si_addr = cpt_ptr_import(si->cpt_sigval); +#ifdef __ARCH_SI_TRAPNO + info->si_trapno = si->cpt_pid; +#endif + break; + case __SI_CHLD: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_status = si->cpt_sigval; + info->si_stime = si->cpt_stime; + info->si_utime = si->cpt_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_ptr = cpt_ptr_import(si->cpt_sigval); + break; + } + info->si_signo = si->cpt_signo; + info->si_errno = si->cpt_errno; + info->si_code = si->cpt_code; +} + +static int restore_sigqueue(struct task_struct *tsk, + struct sigpending *queue, unsigned long start, + unsigned long end) +{ + while (start < end) { + struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; + if (si->cpt_object == CPT_OBJ_SIGINFO) { + struct sigqueue *q = NULL; + struct user_struct *up; + up = alloc_uid(si->cpt_user); + if (!up) + return -ENOMEM; + q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); + if (!q) { + free_uid(up); + return -ENOMEM; + } + if (ub_siginfo_charge(q, get_exec_ub())) { + kmem_cache_free(sigqueue_cachep, q); + free_uid(up); + return -ENOMEM; + } + + INIT_LIST_HEAD(&q->list); + /* Preallocated elements (posix timers) are not + * supported yet. It is safe to replace them with + * a private one. 
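
decode_siginfo() above undoes the checkpoint-side packing: the siginfo_t union is flattened into a few cpt_* scalars, and the __SI_* class of cpt_code selects the decoding arm. A compressed round trip for the child-exit case, using a local stand-in for cpt_siginfo_image rather than the real structure:

```c
#include <signal.h>
#include <stdio.h>
#include <string.h>

/* Local stand-in for cpt_siginfo_image, trimmed to what SIGCHLD uses. */
struct si_image {
	int cpt_signo, cpt_code, cpt_pid, cpt_uid, cpt_sigval;
};

int main(void)
{
	/* Encode: a terminated child, as the dump side would record it. */
	struct si_image img = {
		.cpt_signo = SIGCHLD, .cpt_code = CLD_EXITED,
		.cpt_pid = 1234, .cpt_uid = 500, .cpt_sigval = 0,
	};
	siginfo_t info;

	/* Decode, mirroring the __SI_CHLD arm of decode_siginfo(). */
	memset(&info, 0, sizeof(info));
	info.si_pid = img.cpt_pid;
	info.si_uid = img.cpt_uid;
	info.si_status = img.cpt_sigval;
	info.si_signo = img.cpt_signo;
	info.si_code = img.cpt_code;

	printf("SIGCHLD from pid %d, status %d\n", info.si_pid, info.si_status);
	return 0;
}
```
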
*/ + q->flags = 0; + q->user = up; + atomic_inc(&q->user->sigpending); + + decode_siginfo(&q->info, si); + list_add_tail(&q->list, &queue->list); + } + start += si->cpt_next; + } + return 0; +} + +int rst_process_linkage(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (tsk == NULL) { + eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EINVAL; + } + + if (virt_pgid(tsk) != ti->cpt_pgrp) { + int pid; + + if ((pid = vpid_to_pid(ti->cpt_pgrp)) < 0) { + eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (tsk->signal->pgrp != pid && find_pid(pid)) { + detach_pid(tsk, PIDTYPE_PGID); + tsk->signal->pgrp = pid; + if (thread_group_leader(tsk)) { + attach_pid(tsk, PIDTYPE_PGID, pid); + set_virt_pgid(tsk, ti->cpt_pgrp); + } + } + write_unlock_irq(&tasklist_lock); + if (tsk->signal->pgrp != pid) { + eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + } + if (virt_sid(tsk) != ti->cpt_session) { + int pid; + + if ((pid = vpid_to_pid(ti->cpt_session)) < 0) { + eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (tsk->signal->session != pid && find_pid(pid)) { + detach_pid(tsk, PIDTYPE_SID); + tsk->signal->session = pid; + if (thread_group_leader(tsk)) { + attach_pid(tsk, PIDTYPE_SID, pid); + set_virt_sid(tsk, ti->cpt_session); + } + } + write_unlock_irq(&tasklist_lock); + if (tsk->signal->session != pid) { + eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + } + if (ti->cpt_old_pgrp > 0 && tsk->signal->tty_old_pgrp == 0) { + int pid; + + if ((pid = vpid_to_pid(ti->cpt_old_pgrp)) < 0) { + eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + tsk->signal->tty_old_pgrp = pid; + } + } + + return 0; +} + +static int +restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx) +{ + int err; + struct cpt_signal_image *si = cpt_get_buf(ctx); + + current->signal->tty = NULL; + + err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (virt_pgid(current) != si->cpt_pgrp) { + int err; + struct pid *pid = NULL, *free = NULL; + + if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { + if (!is_virtual_pid(si->cpt_pgrp)) { + eprintk_ctx("external process group " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } + pid = alloc_pid(); + if (pid == NULL) { + cpt_release_buf(ctx); + return -EINVAL; + } + if ((err = alloc_vpid(pid, si->cpt_pgrp)) < 0) { + free_pid(pid); + pid = NULL; + if (err != -EEXIST) { + cpt_release_buf(ctx); + return err; + } + } + free = pid; + } + + write_lock_irq(&tasklist_lock); + if (pid || (pid = find_vpid(si->cpt_pgrp)) != NULL) { + if (current->signal->pgrp != pid->nr) { + detach_pid(current, PIDTYPE_PGID); + current->signal->pgrp = pid->nr; + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_PGID, pid->nr); + set_virt_pgid(current, si->cpt_pgrp); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + + if (free) + free_pid(free); + } + + current->signal->tty_old_pgrp = 0; + if ((int)si->cpt_old_pgrp > 0) { + if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { + current->signal->tty_old_pgrp = alloc_pidmap(); + if (current->signal->tty_old_pgrp < 0) { + eprintk_ctx("failed to 
allocate stray tty_old_pgrp\n");
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+			free_pidmap(current->signal->tty_old_pgrp);
+		} else {
+			current->signal->tty_old_pgrp = vpid_to_pid(si->cpt_old_pgrp);
+			if (current->signal->tty_old_pgrp < 0) {
+				dprintk_ctx("forward old tty PGID\n");
+				current->signal->tty_old_pgrp = 0;
+			}
+		}
+	}
+
+	if (virt_sid(current) != si->cpt_session) {
+		int err;
+		struct pid *pid = NULL, *free = NULL;
+
+		if (si->cpt_session_type == CPT_PGRP_ORPHAN) {
+			if (!is_virtual_pid(si->cpt_session)) {
+				eprintk_ctx("external process session " CPT_FID, CPT_TID(current));
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+			pid = alloc_pid();
+			if (pid == NULL) {
+				cpt_release_buf(ctx);
+				return -EINVAL;
+			}
+			if ((err = alloc_vpid(pid, si->cpt_session)) < 0) {
+				free_pid(pid);
+				pid = NULL;
+				if (err != -EEXIST) {
+					cpt_release_buf(ctx);
+					return err;
+				}
+			}
+			free = pid;
+		}
+		write_lock_irq(&tasklist_lock);
+		if (pid || (pid = find_vpid(si->cpt_session)) != NULL) {
+			if (current->signal->session != pid->nr) {
+				detach_pid(current, PIDTYPE_SID);
+				current->signal->session = pid->nr;
+				if (thread_group_leader(current)) {
+					attach_pid(current, PIDTYPE_SID, pid->nr);
+					set_virt_sid(current, si->cpt_session);
+					free = NULL;
+				}
+			}
+		}
+		write_unlock_irq(&tasklist_lock);
+
+		if (free)
+			free_pid(free);
+	}
+
+	cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending);
+	current->signal->leader = si->cpt_leader;
+	if (si->cpt_ctty != CPT_NULL) {
+		cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx);
+		if (obj) {
+			struct tty_struct *tty = obj->o_obj;
+			if (tty->session == 0 || tty->session == current->signal->session) {
+				tty->session = current->signal->session;
+				current->signal->tty = tty;
+			} else {
+				wprintk_ctx("tty session mismatch\n");
+			}
+		}
+	}
+
+	if (si->cpt_curr_target)
+		current->signal->curr_target = find_task_by_pid_ve(si->cpt_curr_target);
+	current->signal->flags = 0;
+	*exiting = si->cpt_group_exit;
+	current->signal->group_exit_code = si->cpt_group_exit_code;
+	if (si->cpt_group_exit_task) {
+		current->signal->group_exit_task = find_task_by_pid_ve(si->cpt_group_exit_task);
+		if (current->signal->group_exit_task == NULL) {
+			eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task);
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	current->signal->notify_count = si->cpt_notify_count;
+	current->signal->group_stop_count = si->cpt_group_stop_count;
+
+	if (si->cpt_next > si->cpt_hdrlen) {
+		char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
+		if (buf == NULL) {
+			cpt_release_buf(ctx);
+			return -ENOMEM;
+		}
+		err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
+				ti->cpt_signal + si->cpt_hdrlen);
+		if (err) {
+			kfree(buf);
+			cpt_release_buf(ctx);
+			return err;
+		}
+		restore_sigqueue(current,
+				&current->signal->shared_pending, (unsigned long)buf,
+				(unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
+		kfree(buf);
+	}
+	cpt_release_buf(ctx);
+	return 0;
+}
+
+int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	int err;
+	struct cpt_sighand_image si;
+	int i;
+	loff_t pos, endpos;
+
+	err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
+	if (err)
+		return err;
+
+	for (i=0; i<_NSIG; i++) {
+		current->sighand->action[i].sa.sa_handler = SIG_DFL;
+#ifndef CONFIG_IA64
+		current->sighand->action[i].sa.sa_restorer = 0;
+#endif
+		current->sighand->action[i].sa.sa_flags = 0;
+		memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
+	}
+
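+	/* Every action was reset to SIG_DFL above. The loop below re-reads
+	 * only the handlers present in the image, so any signal that was
+	 * not dumped keeps its default disposition. */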
+	pos = ti->cpt_sighand + si.cpt_hdrlen;
+	endpos = ti->cpt_sighand + si.cpt_next;
+	while (pos < endpos) {
+		struct cpt_sighandler_image shi;
+
+		err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
+		if (err)
+			return err;
+		current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
+#ifndef CONFIG_IA64
+		current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
+#endif
+		current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags;
+		cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask);
+		pos += shi.cpt_next;
+	}
+
+	return 0;
+}
+
+
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+	__u32 flag = 0;
+
+	if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx))
+		flag |= CLONE_THREAD;
+	if (ti->cpt_sighand == CPT_NULL ||
+	    lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx))
+		flag |= CLONE_SIGHAND;
+	return flag;
+}
+
+int
+rst_signal_complete(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx)
+{
+	int err;
+	cpt_object_t *obj;
+
+	if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) {
+		return -EINVAL;
+	}
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx);
+	if (obj) {
+		struct sighand_struct *sig = current->sighand;
+		if (obj->o_obj != sig) {
+			return -EINVAL;
+		}
+	} else {
+		obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+		cpt_obj_setpos(obj, ti->cpt_sighand, ctx);
+		err = restore_one_sighand_struct(ti, ctx);
+		if (err)
+			return err;
+	}
+
+
+	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx);
+	if (obj) {
+		struct signal_struct *sig = current->signal;
+		if (obj->o_obj != sig) {
+			return -EINVAL;
+		}
+		if (current->signal) {
+			set_virt_pgid(current, pid_to_vpid(current->signal->pgrp));
+			set_virt_sid(current, pid_to_vpid(current->signal->session));
+		}
+	} else {
+		obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx);
+		if (obj == NULL)
+			return -ENOMEM;
+		cpt_obj_setpos(obj, ti->cpt_signal, ctx);
+		err = restore_one_signal_struct(ti, exiting, ctx);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_X86
+static u32 decode_segment(u32 segid)
+{
+	if (segid == CPT_SEG_ZERO)
+		return 0;
+
+	/* TLS descriptors */
+	if (segid <= CPT_SEG_TLS3)
+		return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3;
+
+	/* LDT descriptor, it is just an index to LDT array */
+	if (segid >= CPT_SEG_LDT)
+		return ((segid - CPT_SEG_LDT) << 3) | 7;
+
+	/* Check for one of standard descriptors */
+#ifdef CONFIG_X86_64
+	if (segid == CPT_SEG_USER32_DS)
+		return __USER32_DS;
+	if (segid == CPT_SEG_USER32_CS)
+		return __USER32_CS;
+	if (segid == CPT_SEG_USER64_DS)
+		return __USER_DS;
+	if (segid == CPT_SEG_USER64_CS)
+		return __USER_CS;
+#else
+	if (segid == CPT_SEG_USER32_DS)
+		return __USER_DS;
+	if (segid == CPT_SEG_USER32_CS)
+		return __USER_CS;
+#endif
+	wprintk("Invalid segment reg %d\n", segid);
+	return 0;
+}
+#endif
+
+#if defined (CONFIG_IA64)
+void ia64_decrement_ip (struct pt_regs *regs)
+{
+	unsigned long w0, ri = ia64_psr(regs)->ri - 1;
+
+	if (ia64_psr(regs)->ri == 0) {
+		regs->cr_iip -= 16;
+		ri = 2;
+		get_user(w0, (char __user *) regs->cr_iip + 0);
+		if (((w0 >> 1) & 0xf) == 2) {
+			/*
+			 * rfi'ing to slot 2 of an MLX bundle causes
+			 * an illegal operation fault.  We don't want
+			 * that to happen...
+			 */
+			ri = 1;
+		}
+	}
+	ia64_psr(regs)->ri = ri;
+}
+#endif
+
+static void rst_child_tid(unsigned long *child_tids)
+{
+	dprintk("rct: " CPT_FID "\n", CPT_TID(current));
+	current->clear_child_tid = (void*)child_tids[0];
+	current->set_child_tid = (void*)child_tids[1];
+}
+
+static void rst_last_siginfo(void)
+{
+	int signr;
+	siginfo_t *info = current->last_siginfo;
+	struct pt_regs *regs = task_pt_regs(current);
+	struct k_sigaction *ka;
+	int ptrace_id;
+
+	dprintk("rlsi: " CPT_FID "\n", CPT_TID(current));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->last_siginfo = NULL;
+	recalc_sigpending();
+
+	ptrace_id = current->pn_state;
+	clear_pn_state(current);
+
+	switch (ptrace_id) {
+	case PN_STOP_TF:
+	case PN_STOP_TF_RT:
+		/* frame_*signal */
+		dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %lu %lu\n",
+				virt_pid(current), current->pid, current->comm,
+				info->si_signo, info->si_code,
+				current->exit_code, SYSCALL_NR(regs),
+				current->ptrace, current->ptrace_message);
+		goto out;
+	case PN_STOP_ENTRY:
+	case PN_STOP_LEAVE:
+		/* do_syscall_trace */
+		spin_unlock_irq(&current->sighand->siglock);
+		dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code);
+		if (current->exit_code) {
+			send_sig(current->exit_code, current, 1);
+			current->exit_code = 0;
+		}
+		if (IN_SYSCALL(regs)) {
+			if (ptrace_id == PN_STOP_ENTRY
+#ifdef CONFIG_X86
+			    && SYSCALL_ERRNO(regs) == ENOSYS
+#endif
+			    )
+				SYSCALL_RESTART(regs);
+			else if (IN_ERROR(regs) &&
+				 syscall_is(current, regs, rt_sigtimedwait) &&
+				 (SYSCALL_ERRNO(regs) == EAGAIN ||
+				  SYSCALL_ERRNO(regs) == EINTR))
+				SYSCALL_RESTART(regs);
+		}
+		return;
+	case PN_STOP_FORK:
+		/* fork */
+		SYSCALL_SETRET(regs, current->ptrace_message);
+		dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs));
+		goto out;
+	case PN_STOP_VFORK:
+		/* after vfork */
+		SYSCALL_SETRET(regs, current->ptrace_message);
+		dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs));
+		goto out;
+	case PN_STOP_SIGNAL:
+		/* normal case: dequeue signal */
+		break;
+	case PN_STOP_EXIT:
+		dprintk("ptrace exit caught\n");
+		current->ptrace &= ~PT_TRACE_EXIT;
+		spin_unlock_irq(&current->sighand->siglock);
+		module_put(THIS_MODULE);
+		complete_and_exit(NULL, current->ptrace_message);
+		BUG();
+	case PN_STOP_EXEC:
+		eprintk("ptrace after exec caught: must not happen\n");
+		BUG();
+	default:
+		eprintk("ptrace with unknown identity %d\n", ptrace_id);
+		BUG();
+	}
+
+	signr = current->exit_code;
+	if (signr == 0) {
+		dprintk("rlsi: canceled signal %d\n", info->si_signo);
+		goto out;
+	}
+	current->exit_code = 0;
+
+	if (signr != info->si_signo) {
+		info->si_signo = signr;
+		info->si_errno = 0;
+		info->si_code = SI_USER;
+		info->si_pid = virt_pid(current->parent);
+		info->si_uid = current->parent->uid;
+	}
+
+	/* If the (new) signal is now blocked, requeue it. */
+	if (sigismember(&current->blocked, signr)) {
+		dprintk("going to requeue signal %d\n", signr);
+		goto out_resend_sig;
+	}
+
+	ka = &current->sighand->action[signr-1];
+	if (ka->sa.sa_handler == SIG_IGN) {
+		dprintk("going to resend signal %d (ignored)\n", signr);
+		goto out;
+	}
+	if (ka->sa.sa_handler != SIG_DFL) {
+		dprintk("going to resend signal %d (not SIG_DFL)\n", signr);
+		goto out_resend_sig;
+	}
+	if (signr == SIGCONT ||
+	    signr == SIGCHLD ||
+	    signr == SIGWINCH ||
+	    signr == SIGURG ||
+	    current->pid == 1)
+		goto out;
+
+	/* All the rest, which we cannot handle, are requeued. */
+	dprintk("going to resend signal %d (sigh)\n", signr);
+out_resend_sig:
+	spin_unlock_irq(&current->sighand->siglock);
+	send_sig_info(signr, info, current);
+	return;
+
+out:
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void rst_finish_stop(void)
+{
+	/* ...
+	 * do_signal() ->
+	 *   get_signal_to_deliver() ->
+	 *     do_signal_stop() ->
+	 *       finish_stop()
+	 *
+	 * Normally after SIGCONT it will dequeue the next signal. If no signal
+	 * is found, do_signal restarts syscall unconditionally.
+	 * Otherwise signal handler is pushed on user stack.
+	 */
+
+	dprintk("rfs: " CPT_FID "\n", CPT_TID(current));
+
+	clear_stop_state(current);
+	current->exit_code = 0;
+}
+
+static void rst_restart_sys(void)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+
+	/* This hook is supposed to be executed when we have
+	 * to complete some interrupted syscall.
+	 */
+	dprintk("rrs: " CPT_FID "\n", CPT_TID(current));
+
+	if (!IN_SYSCALL(regs) || !IN_ERROR(regs))
+		return;
+
+#ifdef __NR_pause
+	if (syscall_is(current,regs,pause)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			current->state = TASK_INTERRUPTIBLE;
+			schedule();
+		}
+	} else
+#else
+	/* On this arch pause() is simulated with sigsuspend(). */
+	if (syscall_is(current,regs,rt_sigsuspend)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			current->state = TASK_INTERRUPTIBLE;
+			schedule();
+		}
+	} else
+#endif
+	if (syscall_is(current,regs,rt_sigtimedwait)) {
+		if (SYSCALL_ERRNO(regs) == EAGAIN ||
+		    SYSCALL_ERRNO(regs) == EINTR) {
+			SYSCALL_RESTART(regs);
+		}
+	} else if (syscall_is(current,regs,futex)) {
+		if (SYSCALL_ERRNO(regs) == EINTR &&
+		    !signal_pending(current)) {
+			SYSCALL_RESTART(regs);
+		}
+	}
+
+	if (!signal_pending(current) &&
+	    !test_thread_flag(TIF_RESTORE_SIGMASK)) {
+		if (SYSCALL_ERRNO(regs) == ERESTARTSYS ||
+		    SYSCALL_ERRNO(regs) == ERESTARTNOINTR ||
+		    SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+			SYSCALL_RESTART(regs);
+		} else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) {
+			int new = __NR_restart_syscall;
+#ifdef CONFIG_X86_64
+			if (current->thread_info->flags&_TIF_IA32)
+				new = __NR32_restart_syscall;
+#endif
+			SYSCALL_RESTART2(regs, new);
+		}
+	}
+}
+
+#ifdef CONFIG_X86_32
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
+		struct cpt_task_image *ti, struct cpt_x86_regs *b,
+		struct resume_info **rip)
+{
+	extern char i386_ret_from_resume;
+
+	if (b->cpt_object != CPT_OBJ_X86_REGS)
+		return -EINVAL;
+
+	tsk->thread.esp = (unsigned long) regs;
+	tsk->thread.esp0 = (unsigned long) (regs+1);
+	tsk->thread.eip = (unsigned long) &i386_ret_from_resume;
+
+	tsk->thread.fs = decode_segment(b->cpt_fs);
+	tsk->thread.gs = decode_segment(b->cpt_gs);
+	tsk->thread.debugreg[0] = b->cpt_debugreg[0];
+	tsk->thread.debugreg[1] = b->cpt_debugreg[1];
+	tsk->thread.debugreg[2] = b->cpt_debugreg[2];
+	tsk->thread.debugreg[3] = b->cpt_debugreg[3];
+	tsk->thread.debugreg[4] = b->cpt_debugreg[4];
+	tsk->thread.debugreg[5] = b->cpt_debugreg[5];
+	tsk->thread.debugreg[6] = b->cpt_debugreg[6];
+	tsk->thread.debugreg[7] = b->cpt_debugreg[7];
+
+	memcpy(regs, &b->cpt_ebx, sizeof(struct pt_regs));
+
+	regs->xcs = decode_segment(b->cpt_xcs);
+	regs->xss = decode_segment(b->cpt_xss);
+	regs->xds = decode_segment(b->cpt_xds);
+	regs->xes = decode_segment(b->cpt_xes);
+
+	tsk->thread.esp -= HOOK_RESERVE;
+	memset((void*)tsk->thread.esp, 0, HOOK_RESERVE);
+	*rip = (void*)tsk->thread.esp;
+
+	return 0;
+}
+
+#elif defined(CONFIG_X86_64)
+
+static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s)
+{
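+	/* Widen an i386 register image into an x86_64 pt_regs. Only eax
+	 * and orig_eax are sign-extended (via the (s32) casts below):
+	 * they may carry -errno values or a syscall number of -1, while
+	 * the remaining registers are plain 32-bit user values. */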
memset(d, 0, sizeof(struct pt_regs)); + d->rbp = s->cpt_ebp; + d->rbx = s->cpt_ebx; + d->rax = (s32)s->cpt_eax; + d->rcx = s->cpt_ecx; + d->rdx = s->cpt_edx; + d->rsi = s->cpt_esi; + d->rdi = s->cpt_edi; + d->orig_rax = (s32)s->cpt_orig_eax; + d->rip = s->cpt_eip; + d->cs = s->cpt_xcs; + d->eflags = s->cpt_eflags; + d->rsp = s->cpt_esp; + d->ss = s->cpt_xss; +} + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_obj_bits *hdr, + struct resume_info **rip) +{ + if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { + struct cpt_x86_64_regs *b = (void*)hdr; + + tsk->thread.rsp = (unsigned long) regs; + tsk->thread.rsp0 = (unsigned long) (regs+1); + + tsk->thread.fs = b->cpt_fsbase; + tsk->thread.gs = b->cpt_gsbase; + tsk->thread.fsindex = decode_segment(b->cpt_fsindex); + tsk->thread.gsindex = decode_segment(b->cpt_gsindex); + tsk->thread.ds = decode_segment(b->cpt_ds); + tsk->thread.es = decode_segment(b->cpt_es); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); + + tsk->thread.userrsp = regs->rsp; + regs->cs = decode_segment(b->cpt_cs); + regs->ss = decode_segment(b->cpt_ss); + } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { + struct cpt_x86_regs *b = (void*)hdr; + + tsk->thread.rsp = (unsigned long) regs; + tsk->thread.rsp0 = (unsigned long) (regs+1); + + tsk->thread.fs = 0; + tsk->thread.gs = 0; + tsk->thread.fsindex = decode_segment(b->cpt_fs); + tsk->thread.gsindex = decode_segment(b->cpt_gs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + xlate_ptregs_32_to_64(regs, b); + + tsk->thread.userrsp = regs->rsp; + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + tsk->thread.ds = decode_segment(b->cpt_xds); + tsk->thread.es = decode_segment(b->cpt_xes); + } else { + return -EINVAL; + } + + tsk->thread.rsp -= HOOK_RESERVE; + memset((void*)tsk->thread.rsp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.rsp; + return 0; +} + +#elif defined(CONFIG_IA64) + +#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ + +#define PUT_BITS(first, last, nat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotl(nat & mask, dist); \ + }) + +unsigned long +ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. If the register order in + * struct_pt_regs changes, this code MUST be updated. 
+ */ + scratch_unat = PUT_BITS( 1, 1, nat); + scratch_unat |= PUT_BITS( 2, 3, nat); + scratch_unat |= PUT_BITS(12, 13, nat); + scratch_unat |= PUT_BITS(14, 14, nat); + scratch_unat |= PUT_BITS(15, 15, nat); + scratch_unat |= PUT_BITS( 8, 11, nat); + scratch_unat |= PUT_BITS(16, 31, nat); + + return scratch_unat; + +} + +static unsigned long +ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + scratch_unat = PUT_BITS( 4, 7, nat); + + return scratch_unat; + +} + +#undef PUT_BITS + + +static int restore_registers(struct task_struct *tsk, struct pt_regs *pt, + struct cpt_task_image *ti, + struct cpt_ia64_regs *r, + struct resume_info **rip) +{ + extern char ia64_ret_from_resume; + struct switch_stack *sw; + struct resume_info *ri; + struct ia64_psr *psr = ia64_psr(pt); + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (r->cpt_object != CPT_OBJ_IA64_REGS) + return -EINVAL; + + if (r->num_regs > 96) { + eprintk(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + *rip = ri = ((void*)pt) - HOOK_RESERVE; + sw = ((struct switch_stack *) ri) - 1; + + memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack)); + memset(ri, 0, HOOK_RESERVE); + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&pt->r1, &r->gr[1], 8*(2-1)); + memcpy(&pt->r2, &r->gr[2], 8*(4-2)); + memcpy(&pt->r8, &r->gr[8], 8*(12-8)); + memcpy(&pt->r12, &r->gr[12], 8*(14-12)); + memcpy(&pt->r14, &r->gr[14], 8*(15-14)); + memcpy(&pt->r15, &r->gr[15], 8*(16-15)); + memcpy(&pt->r16, &r->gr[16], 8*(32-16)); + + pt->b0 = r->br[0]; + pt->b6 = r->br[6]; + pt->b7 = r->br[7]; + + pt->ar_bspstore = r->ar_bspstore; + pt->ar_unat = r->ar_unat; + pt->ar_pfs = r->ar_pfs; + pt->ar_ccv = r->ar_ccv; + pt->ar_fpsr = r->ar_fpsr; + pt->ar_csd = r->ar_csd; + pt->ar_ssd = r->ar_ssd; + pt->ar_rsc = r->ar_rsc; + + pt->cr_iip = r->cr_iip; + pt->cr_ipsr = r->cr_ipsr; + + pt->pr = r->pr; + + pt->cr_ifs = r->cfm; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&pt->f6, &r->fr[2*6], 16*(10-6)); + memcpy(&pt->f10, &r->fr[2*10], 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&sw->f12, &r->fr[2*12], 16*(16-12)); + /* fpregs 32...127 */ + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32)); + ia64_drop_fpu(tsk); + psr->dfh = 1; + + memcpy(&sw->r4, &r->gr[4], 8*(8-4)); + memcpy(&sw->b1, &r->br[1], 8*(6-1)); + sw->ar_lc = r->ar_lc; + + memcpy(&sw->f2, &r->fr[2*2], 16*(6-2)); + memcpy(&sw->f16, &r->fr[2*16], 16*(32-16)); + + sw->caller_unat = 0; + sw->ar_fpsr = pt->ar_fpsr; + sw->ar_unat = 0; + if (r->nat[0] & 0xFFFFFF0FUL) + sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]); + if (r->nat[0] & 0xF0) + sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]); + + sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs); + memset(krbs, 0, (void*)sw->ar_bspstore - krbs); + sw->ar_rnat = 0; + sw->ar_pfs = 0; + + /* This is tricky. When we are in syscall, we have frame + * of output register (sometimes, plus one input reg sometimes). + * It is not so easy to restore such frame, RSE optimizes + * and does not fetch those regs from backstore. So, we restore + * the whole frame as local registers, and then repartition it + * in ia64_ret_from_resume(). 
+	 */
+	if ((long)pt->cr_ifs >= 0) {
+		unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F);
+		sw->ar_pfs = out | (out<<7);
+	}
+	if (r->ar_ec)
+		sw->ar_pfs |= (r->ar_ec & 0x3F) << 52;
+
+	for (reg = 0; reg < r->num_regs; reg++) {
+		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+		unsigned long *rnatp;
+		unsigned long set_rnat = 0;
+
+		*ptr = r->gr[32+reg];
+
+		if (reg < 32)
+			set_rnat = (r->nat[0] & (1UL<<(reg+32)));
+		else
+			set_rnat = (r->nat[1] & (1UL<<(reg-32)));
+
+		if (set_rnat) {
+			rnatp = ia64_rse_rnat_addr(ptr);
+			if ((unsigned long)rnatp >= sw->ar_bspstore)
+				rnatp = &sw->ar_rnat;
+			*rnatp |= (1UL<<ia64_rse_slot_num(ptr));
+		}
+	}
+
+	pt->b0 = (unsigned long) &ia64_ret_from_resume;
+	tsk->thread.ksp = (unsigned long) sw - 16;
+
+#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
+#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
+#define PRED_USER_STACK		3 /* returning to user-stacks? */
+#define PRED_SYSCALL		4 /* inside a system call? */
+#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
+
+	pt->loadrs = r->loadrs;
+	sw->pr = 0;
+	sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL);
+	sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL));
+	sw->pr &= ~(1UL << PRED_KERNEL_STACK);
+	sw->pr |= (1UL << PRED_USER_STACK);
+	if ((long)pt->cr_ifs < 0) {
+		sw->pr |= (1UL << PRED_NON_SYSCALL);
+	} else {
+		sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
+	}
+
+	return 0;
+}
+#endif
+
+asmlinkage void rst_resume_work(struct resume_info *ri)
+{
+	if (ri->hooks & (1<<HOOK_TID))
+		rst_child_tid(ri->tid_ptrs);
+	if (ri->hooks & (1<<HOOK_CONT))
+		rst_finish_stop();
+	if (ri->hooks & (1<<HOOK_LSI))
+		rst_last_siginfo();
+	if (ri->hooks & (1<<HOOK_RESTART))
+		rst_restart_sys();
+	module_put(THIS_MODULE);
+}
+
+int rst_restore_process(struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	for_each_object(obj, CPT_OBJ_TASK) {
+		struct task_struct *tsk = obj->o_obj;
+		struct cpt_task_image *ti = obj->o_image;
+		struct pt_regs * regs;
+		struct cpt_object_hdr *b;
+		struct cpt_siginfo_image *lsi = NULL;
+		struct group_info *gids, *ogids;
+		struct resume_info *ri = NULL;
+		int i;
+#ifdef CONFIG_USER_RESOURCE
+		int err;
+		struct task_beancounter *tbc;
+		struct user_beancounter *new_bc, *old_bc;
+#endif
+
+		if (tsk == NULL) {
+			eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm);
+			return -EFAULT;
+		}
+
+		wait_task_inactive(tsk);
+#ifdef CONFIG_USER_RESOURCE
+		tbc = &tsk->task_bc;
+		new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx);
+		err = virtinfo_notifier_call(VITYPE_SCP,
+				VIRTINFO_SCP_RSTTSK, new_bc);
+		if (err & NOTIFY_FAIL) {
+			put_beancounter(new_bc);
+			return -ECHRNG;
+		}
+		old_bc = tbc->exec_ub;
+		if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) {
+			dprintk(" *** replacing ub %p by %p for %p (%d %s)\n",
+					old_bc, new_bc, tsk,
+					tsk->pid, tsk->comm);
+			tbc->exec_ub = new_bc;
+			new_bc = old_bc;
+		}
+		put_beancounter(new_bc);
+#endif
+		regs = task_pt_regs(tsk);
+
+		if (!tsk->exit_state) {
+			tsk->lock_depth = -1;
+#ifdef CONFIG_PREEMPT
+			tsk->thread_info->preempt_count--;
+#endif
+		}
+
+		if (tsk->static_prio != ti->cpt_static_prio)
+			set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio));
+
+		cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked);
+		cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked);
+		cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked);
+		cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending);
+
+		tsk->uid = ti->cpt_uid;
+		tsk->euid = ti->cpt_euid;
+		tsk->suid = ti->cpt_suid;
+		tsk->fsuid = ti->cpt_fsuid;
+		tsk->gid = ti->cpt_gid;
+		tsk->egid = ti->cpt_egid;
+		tsk->sgid = ti->cpt_sgid;
+		tsk->fsgid = ti->cpt_fsgid;
+#ifdef CONFIG_IA64
+		SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac);
+		SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu);
+#endif
+		memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective));
+		memcpy(&tsk->cap_inheritable, &ti->cpt_icap,
sizeof(tsk->cap_inheritable));
+		memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted));
+		tsk->keep_capabilities = (ti->cpt_keepcap != 0);
+		tsk->did_exec = (ti->cpt_did_exec != 0);
+		gids = groups_alloc(ti->cpt_ngids);
+		ogids = tsk->group_info;
+		if (gids) {
+			int i;
+			for (i=0; i<32; i++)
+				gids->small_block[i] = ti->cpt_gids[i];
+			tsk->group_info = gids;
+		}
+		if (ogids)
+			put_group_info(ogids);
+		tsk->utime = ti->cpt_utime;
+		tsk->stime = ti->cpt_stime;
+		if (ctx->image_version == CPT_VERSION_8)
+			tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC);
+		else
+			cpt_timespec_import(&tsk->start_time, ti->cpt_starttime);
+		_set_normalized_timespec(&tsk->start_time,
+				tsk->start_time.tv_sec -
+				VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec,
+				tsk->start_time.tv_nsec -
+				VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec);
+
+		tsk->nvcsw = ti->cpt_nvcsw;
+		tsk->nivcsw = ti->cpt_nivcsw;
+		tsk->min_flt = ti->cpt_min_flt;
+		tsk->maj_flt = ti->cpt_maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+		tsk->cutime = ti->cpt_cutime;
+		tsk->cstime = ti->cpt_cstime;
+		tsk->cnvcsw = ti->cpt_cnvcsw;
+		tsk->cnivcsw = ti->cpt_cnivcsw;
+		tsk->cmin_flt = ti->cpt_cmin_flt;
+		tsk->cmaj_flt = ti->cpt_cmaj_flt;
+
+		if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+			__asm__("undefined\n");
+
+		for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+			tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
+			tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i];
+		}
+#else
+		if (thread_group_leader(tsk) && tsk->signal) {
+			tsk->signal->utime = ti->cpt_utime;
+			tsk->signal->stime = ti->cpt_stime;
+			tsk->signal->cutime = ti->cpt_cutime;
+			tsk->signal->cstime = ti->cpt_cstime;
+			tsk->signal->nvcsw = ti->cpt_nvcsw;
+			tsk->signal->nivcsw = ti->cpt_nivcsw;
+			tsk->signal->cnvcsw = ti->cpt_cnvcsw;
+			tsk->signal->cnivcsw = ti->cpt_cnivcsw;
+			tsk->signal->min_flt = ti->cpt_min_flt;
+			tsk->signal->maj_flt = ti->cpt_maj_flt;
+			tsk->signal->cmin_flt = ti->cpt_cmin_flt;
+			tsk->signal->cmaj_flt = ti->cpt_cmaj_flt;
+
+			if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+				__asm__("undefined\n");
+
+			for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+				tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
+				tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i];
+			}
+		}
+#endif
+
+#ifdef CONFIG_X86
+		for (i=0; i<3; i++) {
+			if (i >= GDT_ENTRY_TLS_ENTRIES) {
+				eprintk_ctx("too many tls descs\n");
+			} else {
+#ifndef CONFIG_X86_64
+				tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF;
+				tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32;
+#else
+				tsk->thread.tls_array[i] = ti->cpt_tls[i];
+#endif
+			}
+		}
+#endif
+
+		clear_stopped_child_used_math(tsk);
+
+		b = (void *)(ti+1);
+		while ((void*)b < ((void*)ti) + ti->cpt_next) {
+			/* Siginfo objects are at the end of obj array */
+			if (b->cpt_object == CPT_OBJ_SIGINFO) {
+				struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
+				restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next);
+				set_exec_env(env);
+				break;
+			}
+
+			switch (b->cpt_object) {
+#ifdef CONFIG_X86
+			case CPT_OBJ_BITS:
+				if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE &&
+				    cpu_has_fxsr) {
+					memcpy(&tsk->thread.i387,
+							(void*)b + b->cpt_hdrlen,
+							sizeof(struct i387_fxsave_struct));
+					if (ti->cpt_used_math)
+						set_stopped_child_used_math(tsk);
+				}
+#ifndef CONFIG_X86_64
+				else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
+						!cpu_has_fxsr) {
+					memcpy(&tsk->thread.i387,
+							(void*)b + b->cpt_hdrlen,
+							sizeof(struct i387_fsave_struct));
+					if (ti->cpt_used_math)
+						set_stopped_child_used_math(tsk);
+				}
+#endif
+				break;
+#endif
+			case CPT_OBJ_LASTSIGINFO:
+				lsi = (void*)b;
+				break;
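+			/* All three register-image flavours share one call
+			 * site below: restore_registers() re-checks
+			 * cpt_object itself and fails with -EINVAL on an
+			 * image that does not match this architecture. */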
case CPT_OBJ_X86_REGS: + case CPT_OBJ_X86_64_REGS: + case CPT_OBJ_IA64_REGS: + if (restore_registers(tsk, regs, ti, (void*)b, &ri)) { + eprintk_ctx("cannot restore registers: image is corrupted\n"); + return -EINVAL; + } + break; + case CPT_OBJ_SIGALTSTACK: { + struct cpt_sigaltstack_image *sas; + sas = (struct cpt_sigaltstack_image *)b; + tsk->sas_ss_sp = sas->cpt_stack; + tsk->sas_ss_size = sas->cpt_stacksize; + break; + } + case CPT_OBJ_TASK_AUX: { + struct cpt_task_aux_image *ai; + ai = (struct cpt_task_aux_image *)b; + tsk->robust_list = (void __user*)ai->cpt_robust_list; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (tsk->thread_info->flags&_TIF_IA32) { + tsk->robust_list = (void __user *)NULL; + tsk->compat_robust_list = (void __user*)ai->cpt_robust_list; + } +#endif +#endif + break; + } + } + b = ((void*)b) + b->cpt_next; + } + + if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + eprintk_ctx("missing register info\n"); + return -EINVAL; + } + + if (ti->cpt_ppid != ti->cpt_rppid) { + struct task_struct *parent; + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + write_lock_irq(&tasklist_lock); + parent = find_task_by_pid_ve(ti->cpt_ppid); + if (parent && parent != tsk->parent) { + list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children); + remove_parent(tsk); + tsk->parent = parent; + add_parent(tsk); + } + write_unlock_irq(&tasklist_lock); + set_exec_env(env); + } + + tsk->ptrace_message = ti->cpt_ptrace_message; + tsk->pn_state = ti->cpt_pn_state; + tsk->stopped_state = ti->cpt_stopped_state; + tsk->thread_info->flags = ti->cpt_thrflags; + + /* The image was created with kernel < 2.6.16, while + * task hanged in sigsuspend -> do_signal. + * + * FIXME! This needs more brain efforts... + */ + if (ti->cpt_sigsuspend_state) { + tsk->thread_info->flags |= _TIF_RESTORE_SIGMASK; + } + +#ifdef CONFIG_X86_64 + tsk->thread_info->flags |= _TIF_FORK | _TIF_RESUME; + if (!ti->cpt_64bit) + tsk->thread_info->flags |= _TIF_IA32; +#endif + +#ifdef CONFIG_X86_32 + do { + if (regs->orig_eax == __NR__newselect && regs->edi) { + struct timeval tv; + if (access_process_vm(tsk, regs->edi, &tv, + sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", + virt_pid(tsk), tsk->pid, tsk->comm, + regs->edi); + break; + } + dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", + virt_pid(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", + virt_pid(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, regs->edi, &tv, + sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", + virt_pid(tsk), tsk->pid, tsk->comm, regs->edi); + } + + } else if (regs->orig_eax == __NR_select && regs->edi) { + struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; + } a; + struct timeval tv; + if (access_process_vm(tsk, regs->ebx, &a, + sizeof(a), 0) != sizeof(a)) { + wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); + break; + } + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d: 
Error 3 in access_process_vm\n", tsk->pid);
+					break;
+				}
+				dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n",
+						tsk->pid, tv.tv_sec, tv.tv_usec);
+				tv.tv_sec -= ctx->delta_time.tv_sec;
+				if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
+					tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
+					tv.tv_sec--;
+				} else {
+					tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
+				}
+				if (tv.tv_sec < 0) {
+					tv.tv_sec = 0;
+					tv.tv_usec = 0;
+				}
+				dprintk_ctx("task %d: New timeval in select: %ld.%ld\n",
+						tsk->pid, tv.tv_sec, tv.tv_usec);
+				if (access_process_vm(tsk, (unsigned long)a.tvp,
+						&tv, sizeof(tv), 1) != sizeof(tv)) {
+					wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid);
+				}
+			}
+		} while (0);
+#endif
+
+		if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) {
+			switch (SYSCALL_ERRNO(regs)) {
+			case ERESTARTSYS:
+			case ERESTARTNOINTR:
+			case ERESTARTNOHAND:
+			case ERESTART_RESTARTBLOCK:
+			case EAGAIN:
+			case EINTR:
+				ri->hooks |= (1<<HOOK_RESTART);
+			}
+		}
+
+		if (ri && lsi && (tsk->pn_state)) {
+			/* ... -> ptrace_notify()
+			 * or
+			 * ... -> do_signal() -> get_signal_to_deliver() ->
+			 *        ptrace stop
+			 */
+			tsk->last_siginfo = &ri->last_siginfo;
+			ri->hooks |= (1<<HOOK_LSI);
+			decode_siginfo(&ri->last_siginfo, lsi);
+		}
+
+		tsk->ptrace = ti->cpt_ptrace;
+		tsk->flags = ti->cpt_flags & ~PF_FROZEN;
+		clear_tsk_thread_flag(tsk, TIF_FREEZE);
+		tsk->exit_signal = ti->cpt_exit_signal;
+
+		if (ri && tsk->stopped_state) {
+			dprintk_ctx("finish_stop\n");
+			if (ti->cpt_state != TASK_STOPPED)
+				eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state);
+			ri->hooks |= (1<<HOOK_CONT);
+		}
+
+		if (ri && (ti->cpt_set_tid || ti->cpt_clear_tid)) {
+			ri->hooks |= (1<<HOOK_TID);
+			ri->tid_ptrs[0] = ti->cpt_clear_tid;
+			ri->tid_ptrs[1] = ti->cpt_set_tid;
+			dprintk_ctx("settids\n");
+		}
+
+		if (ri && ri->hooks &&
+		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+			if (try_module_get(THIS_MODULE))
+				ri->hook = rst_resume_work;
+		}
+
+		if (ti->cpt_state == TASK_TRACED)
+			tsk->state = TASK_TRACED;
+		else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) {
+			tsk->signal->it_virt_expires = 0;
+			tsk->signal->it_prof_expires = 0;
+			if (tsk->state != EXIT_DEAD)
+				eprintk_ctx("oops, schedule() did not make us dead\n");
+		}
+
+		if (thread_group_leader(tsk) &&
+		    ti->cpt_it_real_value &&
+		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+			ktime_t val;
+			s64 nsec;
+
+			nsec = ti->cpt_it_real_value;
+			val.tv64 = 0;
+
+			if (ctx->image_version < CPT_VERSION_9)
+				nsec *= TICK_NSEC;
+
+			val = ktime_add_ns(val, nsec - ctx->delta_nsec);
+			if (val.tv64 <= 0)
+				val.tv64 = NSEC_PER_USEC;
+			dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk),
+					(long long)val.tv64,
+					(unsigned long long)ti->cpt_it_real_value);
+
+			spin_lock_irq(&tsk->sighand->siglock);
+			if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) {
+				/* FIXME. Check!!!! */
+				hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_REL);
+			} else {
+				wprintk_ctx("Timer clash. Impossible?\n");
+			}
+			spin_unlock_irq(&tsk->sighand->siglock);
+
+			dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk),
+					(unsigned long long)val.tv64);
+		}
+
+		module_put(THIS_MODULE);
+	}
+	return 0;
+}
diff -uprN linux-2.6.18/kernel/cpt/rst_socket.c linux-2.6.18.ovz/kernel/cpt/rst_socket.c
--- linux-2.6.18/kernel/cpt/rst_socket.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/cpt/rst_socket.c	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,910 @@
+/*
+ *
+ *  kernel/cpt/rst_socket.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +#include "cpt_syscalls.h" + + +static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + if (sk->sk_socket) { + sk->sk_socket->flags = si->cpt_ssflags; + sk->sk_socket->state = si->cpt_sstate; + } + sk->sk_reuse = si->cpt_reuse; + sk->sk_shutdown = si->cpt_shutdown; + sk->sk_userlocks = si->cpt_userlocks; + sk->sk_no_check = si->cpt_no_check; + sock_reset_flag(sk, SOCK_DBG); + if (si->cpt_debug) + sock_set_flag(sk, SOCK_DBG); + sock_reset_flag(sk, SOCK_RCVTSTAMP); + if (si->cpt_rcvtstamp) + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_LOCALROUTE); + if (si->cpt_localroute) + sock_set_flag(sk, SOCK_LOCALROUTE); + sk->sk_protocol = si->cpt_protocol; + sk->sk_err = si->cpt_err; + sk->sk_err_soft = si->cpt_err_soft; + sk->sk_priority = si->cpt_priority; + sk->sk_rcvlowat = si->cpt_rcvlowat; + sk->sk_rcvtimeo = si->cpt_rcvtimeo; + if (si->cpt_rcvtimeo == CPT_NULL) + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = si->cpt_sndtimeo; + if (si->cpt_sndtimeo == CPT_NULL) + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_rcvbuf = si->cpt_rcvbuf; + sk->sk_sndbuf = si->cpt_sndbuf; + sk->sk_bound_dev_if = si->cpt_bound_dev_if; + sk->sk_flags = si->cpt_flags; + sk->sk_lingertime = si->cpt_lingertime; + if (si->cpt_lingertime == CPT_NULL) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + sk->sk_peercred.pid = si->cpt_peer_pid; + sk->sk_peercred.uid = si->cpt_peer_uid; + sk->sk_peercred.gid = si->cpt_peer_gid; + cpt_timeval_import(&sk->sk_stamp, si->cpt_stamp); + return 0; +} + +static struct file *sock_mapfile(struct socket *sock) +{ + int fd = sock_map_fd(sock); + + if (fd >= 0) { + struct file *file = sock->file; + get_file(file); + sc_close(fd); + return file; + } + return ERR_PTR(fd); +} + +/* Assumption is that /tmp exists and writable. + * In previous versions we assumed that listen() will autobind + * the socket. It does not do this for AF_UNIX by evident reason: + * socket in abstract namespace is accessible, unlike socket bound + * to deleted FS object. 
+ */ + +static int +select_deleted_name(char * name, cpt_context_t *ctx) +{ + int i; + + for (i=0; i<100; i++) { + struct nameidata nd; + unsigned int rnd = net_random(); + + sprintf(name, "/tmp/SOCK.%08x", rnd); + + if (path_lookup(name, 0, &nd) != 0) + return 0; + + path_release(&nd); + } + + eprintk_ctx("failed to allocate deleted socket inode\n"); + return -ELOOP; +} + +static int +bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, + cpt_context_t *ctx) +{ + int err; + char *name; + struct sockaddr* addr; + int addrlen; + struct sockaddr_un sun; + struct nameidata nd; + + if ((addrlen = si->cpt_laddrlen) <= 2) + return 0; + + nd.dentry = NULL; + name = ((char*)si->cpt_laddr) + 2; + addr = (struct sockaddr *)si->cpt_laddr; + + if (name[0]) { + if (path_lookup(name, 0, &nd)) + nd.dentry = NULL; + + if (si->cpt_deleted) { + if (nd.dentry == NULL && + sock->ops->bind(sock, addr, addrlen) == 0) { + sc_unlink(name); + return 0; + } + + addr = (struct sockaddr*)&sun; + addr->sa_family = AF_UNIX; + name = ((char*)addr) + 2; + err = select_deleted_name(name, ctx); + if (err) + goto out; + addrlen = 2 + strlen(name); + } else if (nd.dentry) { + if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) { + eprintk_ctx("bind_unix_socket: not a socket dentry\n"); + err = -EINVAL; + goto out; + } + sc_unlink(name); + } + } + + err = sock->ops->bind(sock, addr, addrlen); + + if (!err && name[0]) { + if (nd.dentry) { + sc_chown(name, nd.dentry->d_inode->i_uid, + nd.dentry->d_inode->i_gid); + sc_chmod(name, nd.dentry->d_inode->i_mode); + } + if (si->cpt_deleted) + sc_unlink(name); + } + +out: + if (nd.dentry) + path_release(&nd); + return err; +} + +static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + struct sock *sk = sock->sk; + cpt_object_t *obj; + struct sock *parent; + + if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) + return 0; + + if (si->cpt_parent == -1) + return bind_unix_socket(sock, si, ctx); + + obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (!obj) + return 0; + + parent = obj->o_obj; + if (unix_sk(parent)->addr) { + if (unix_sk(sk)->addr && + atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) + kfree(unix_sk(sk)->addr); + atomic_inc(&unix_sk(parent)->addr->refcnt); + unix_sk(sk)->addr = unix_sk(parent)->addr; + } + return 0; +} + +static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else { + wprintk_ctx("strange socket queue type %u\n", type); + kfree_skb(skb); + } + } + return 0; +} + +static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct socket *sock2 = NULL; + struct file *file; + cpt_object_t *fobj; + cpt_object_t *pobj = NULL; + + err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) + return err; + + if (si->cpt_socketpair) { + err = sock_create_kern(si->cpt_family, si->cpt_type, + si->cpt_protocol, &sock2); + if (err) + goto err_out; + + err = 
sock->ops->socketpair(sock, sock2); + if (err < 0) + goto err_out; + + /* Socketpair with a peer outside our environment. + * So, we create real half-open pipe and do not worry + * about dead end anymore. */ + if (si->cpt_peer == -1) { + sock_release(sock2); + sock2 = NULL; + } + } + + cpt_obj_setobj(obj, sock->sk, ctx); + + if (si->cpt_file != CPT_NULL) { + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto err_out; + + err = -ENOMEM; + + obj->o_parent = file; + + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + } + + if (sock2) { + struct file *file2; + + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); + if (!pobj) BUG(); + if (pobj->o_obj) BUG(); + cpt_obj_setobj(pobj, sock2->sk, ctx); + + if (pobj->o_ppos != CPT_NULL) { + file2 = sock_mapfile(sock2); + err = PTR_ERR(file2); + if (IS_ERR(file2)) + goto err_out; + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, pobj->o_ppos, ctx); + cpt_obj_setindex(fobj, si->cpt_peer, ctx); + + pobj->o_parent = file2; + } + } + + setup_sock_common(sock->sk, si, obj->o_pos, ctx); + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { + int saved_reuse = sock->sk->sk_reuse; + + inet_sk(sock->sk)->freebind = 1; + sock->sk->sk_reuse = 2; + if (si->cpt_laddrlen) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + dprintk_ctx("binding failed: %d, do not worry\n", err); + } + } + sock->sk->sk_reuse = saved_reuse; + rst_socket_in(si, obj->o_pos, sock->sk, ctx); + } else if (sock->sk->sk_family == AF_NETLINK) { + struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr; + if (nl->nl_pid) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + eprintk_ctx("AF_NETLINK binding failed: %d\n", err); + } + } + if (si->cpt_raddrlen && nl->nl_pid) { + err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); + if (err) { + eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } else if (sock->sk->sk_family == PF_PACKET) { + struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr; + if (ll->sll_protocol || ll->sll_ifindex) { + int alen = si->cpt_laddrlen; + if (alen < sizeof(struct sockaddr_ll)) + alen = sizeof(struct sockaddr_ll); + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen); + if (err) { + eprintk_ctx("AF_PACKET binding failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } + fixup_unix_address(sock, si, ctx); + + if (sock2) { + err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); + if (err) + return err; + setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); + fixup_unix_address(sock2, si, ctx); + } + + if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + && (int)si->cpt_parent != -1) { + cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) + sock->sk = NULL; + } + + + if (si->cpt_file == CPT_NULL && sock->sk && + sock->sk->sk_family == AF_INET) { + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("oops, sock is locked by 
user\n"); + + sock_hold(sk); + sock_orphan(sk); + ub_inc_orphan_count(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + dprintk_ctx("orphaning socket %p\n", sk); + } + } + + if (si->cpt_file == CPT_NULL && sock->sk == NULL) + sock_release(sock); + + return 0; + +err_out: + if (sock2) + sock_release(sock2); + sock_release(sock); + return err; +} + +static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct file *file; + cpt_object_t *obj, *fobj; + + err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) { + eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err); + return err; + } + + sock->sk->sk_reuse = 2; + sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; + + if (sock->sk->sk_family == AF_UNIX) { + err = bind_unix_socket(sock, si, ctx); + } else if (si->cpt_laddrlen) { + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + inet_sk(sock->sk)->freebind = 1; + + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + + if (err) { + eprintk_ctx("open_listening_socket: bind: %d\n", err); + goto err_out; + } + } + + err = sock->ops->listen(sock, si->cpt_max_ack_backlog); + if (err) { + eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); + goto err_out; + } + + /* Now we may access socket body directly and fixup all the things. */ + + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) { + eprintk_ctx("open_listening_socket: map: %d\n", err); + goto err_out; + } + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(obj, pos, ctx); + cpt_obj_setindex(obj, si->cpt_index, ctx); + obj->o_parent = file; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + + setup_sock_common(sock->sk, si, pos, ctx); + + if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) + rst_restore_synwait_queue(sock->sk, si, pos, ctx); + + return 0; + +err_out: + sock_release(sock); + return err; +} + +static int +rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + struct cpt_sockmc_image v; + + err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_family == AF_INET) + return rst_sk_mcfilter_in(sk, &v, pos, ctx); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (v.cpt_family == AF_INET6) + return rst_sk_mcfilter_in6(sk, &v, pos, ctx); +#endif + else + return -EAFNOSUPPORT; +} + + +static int +rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + struct sk_filter *fp, *old_fp; + loff_t pos = *pos_p; + struct cpt_obj_bits v; + + err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_size % sizeof(struct sock_filter)) + return -EINVAL; + + fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); + if (fp == NULL) + return -ENOMEM; + atomic_set(&fp->refcnt, 1); + fp->len = v.cpt_size/sizeof(struct sock_filter); + + err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); + if (err) { + sk_filter_release(sk, fp); + return err; + } + + old_fp = sk->sk_filter; + sk->sk_filter = fp; + if (old_fp) + sk_filter_release(sk, old_fp); + return 0; 
+} + + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + + err = rst_sock_attr_skfilter(pos_p, sk, ctx); + if (err && pos == *pos_p) + err = rst_sock_attr_mcfilter(pos_p, sk, ctx); + return err; +} + +struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) +{ + int err; + struct sk_buff *skb; + struct cpt_skb_image v; + loff_t pos = *pos_p; + struct scm_fp_list *fpl = NULL; + struct timeval tmptv; + + err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); + if (err) + return ERR_PTR(err); + *pos_p = pos + v.cpt_next; + + if (owner) + *owner = v.cpt_owner; + if (queue) + *queue = v.cpt_queue; + + skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOMEM); + skb_reserve(skb, v.cpt_hspace); + skb_put(skb, v.cpt_len); + skb->h.raw = skb->head + v.cpt_h; + skb->nh.raw = skb->head + v.cpt_nh; + skb->mac.raw = skb->head + v.cpt_mac; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); + skb->mac_len = v.cpt_mac_len; + + skb->csum = v.cpt_csum; + skb->local_df = v.cpt_local_df; + skb->pkt_type = v.cpt_pkt_type; + skb->ip_summed = v.cpt_ip_summed; + skb->priority = v.cpt_priority; + skb->protocol = v.cpt_protocol; + cpt_timeval_import(&tmptv, v.cpt_stamp); + skb_set_timestamp(skb, &tmptv); + + skb_shinfo(skb)->gso_segs = v.cpt_gso_segs; + skb_shinfo(skb)->gso_size = v.cpt_gso_size; + if (ctx->image_version == 0) { + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + } + + if (v.cpt_next > v.cpt_hdrlen) { + pos = pos + v.cpt_hdrlen; + while (pos < *pos_p) { + union { + struct cpt_obj_bits b; + struct cpt_fd_image f; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + if (u.b.cpt_object == CPT_OBJ_BITS) { + if (u.b.cpt_size != v.cpt_hspace + skb->len) { + eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); + kfree_skb(skb); + return ERR_PTR(-EINVAL); + } + + err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { + if (!fpl) { + fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + fpl->count = 0; + UNIXCB(skb).fp = fpl; + } + fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); + if (!IS_ERR(fpl->fp[fpl->count])) + fpl->count++; + } + pos += u.b.cpt_next; + } + } + + return skb; +} + +static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + struct sock *owner_sk; + __u32 owner; + + skb = rst_skb(&pos, &owner, NULL, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + owner_sk = unix_peer(sk); + if (owner != -1) { + cpt_object_t *pobj; + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); + if (pobj == NULL) { + eprintk_ctx("orphan af_unix skb?\n"); + kfree_skb(skb); + continue; + } + owner_sk = pobj->o_obj; + } + if (owner_sk == NULL) { + dprintk_ctx("orphan af_unix skb 2?\n"); + kfree_skb(skb); + continue; + } + skb_set_owner_w(skb, owner_sk); + if (UNIXCB(skb).fp) + skb->destructor = unix_destruct_fds; 
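+		/* From here the skb owns the passed file references:
+		 * unix_destruct_fds() drops them if the queue is purged
+		 * before the receiver picks the skb up. */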
+ skb_queue_tail(&sk->sk_receive_queue, skb); + if (sk->sk_state == TCP_LISTEN) { + struct socket *sock = skb->sk->sk_socket; + if (sock == NULL) BUG(); + if (sock->file) BUG(); + skb->sk->sk_socket = NULL; + skb->sk->sk_sleep = NULL; + sock->sk = NULL; + sock_release(sock); + } + } + return 0; +} + + +/* All the sockets are created before we start to open files */ + +int rst_sockets(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SOCKET]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) { + eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); + return err; + } + if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { + eprintk_ctx("rst_sockets: hdr err\n"); + return -EINVAL; + } + + /* The first pass: we create socket index and open listening sockets. */ + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) { + err = open_listening_socket(sec, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); + return err; + } + } else { + cpt_release_buf(ctx); + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setindex(obj, sbuf->cpt_index, ctx); + cpt_obj_setpos(obj, sec, ctx); + obj->o_ppos = sbuf->cpt_file; + intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); + } + sec += sbuf->cpt_next; + } + + /* Pass 2: really restore sockets */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + if (obj->o_obj != NULL) + continue; + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) BUG(); + err = open_socket(obj, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_socket: %d\n", err); + return err; + } + } + + return 0; +} + +int rst_orphans(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + obj->o_pos = sec; + obj->o_ppos = sbuf->cpt_file; + err = open_socket(obj, sbuf, ctx); + dprintk_ctx("Restoring orphan: %d\n", err); + free_cpt_object(obj, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += sbuf->cpt_next; + } + + return 0; +} + + +/* Pass 3: I understand, this is not funny already :-), + * but we have to do another pass to establish links between + * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX + * skb queues with proper skb->sk links. 
+ * + * This could be made at the end of rst_sockets(), but we defer + * restoring af_unix queues up to the end of restoring files to + * make restoring passed FDs cleaner. + */ + +int rst_sockets_complete(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + struct sock *sk = obj->o_obj; + struct sock *peer; + + if (!sk) BUG(); + + if (sk->sk_family != AF_UNIX) + continue; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_next > sbuf->cpt_hdrlen) + restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); + + cpt_release_buf(ctx); + + if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { + cpt_object_t *pobj; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_peer != -1) { + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); + if (pobj) { + peer = pobj->o_obj; + sock_hold(peer); + unix_peer(sk) = peer; + } + } + cpt_release_buf(ctx); + } + } + + rst_orphans(ctx); + + return 0; +} + diff -uprN linux-2.6.18/kernel/cpt/rst_socket_in.c linux-2.6.18.ovz/kernel/cpt/rst_socket_in.c --- linux-2.6.18/kernel/cpt/rst_socket_in.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_socket_in.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,494 @@ +/* + * + * kernel/cpt/rst_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline unsigned long jiffies_import(__u32 tmo) +{ + __s32 delta = tmo; + return jiffies + (long)delta; +} + +static inline __u32 tcp_jiffies_import(__u32 tmo) +{ + return ((__u32)jiffies) + tmo; +} + + +static int restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (sk->sk_type == SOCK_STREAM) { + if (type == CPT_SKB_RQ) { + sk_stream_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_OFOQ) { + struct tcp_sock *tp = tcp_sk(sk); + sk_stream_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&tp->out_of_order_queue, skb); + } else if (type == CPT_SKB_WQ) { + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; + ub_tcpsndbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + wprintk_ctx("strange stream queue type %u\n", type); + kfree_skb(skb); + } + } else { + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_WQ) { + struct inet_sock *inet = inet_sk(sk); + if (inet->cork.fragsize) { + 
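+					/*
+					 * Only re-queue a pending datagram
+					 * if the cork state (rebuilt earlier
+					 * in rst_socket_in(), route included)
+					 * survived; otherwise drop it.
+					 */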
skb_set_owner_w(skb, sk); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + eprintk_ctx("cork skb is dropped\n"); + kfree_skb(skb); + } + } else { + wprintk_ctx("strange dgram queue type %u\n", type); + kfree_skb(skb); + } + } + } + return 0; +} + +static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) +{ + cpt_object_t *obj; + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && + sk->sk_state == TCP_LISTEN && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + inet_sk(sk)->sport == sport) + return sk; + } + return NULL; +} + +static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + tp->pred_flags = si->cpt_pred_flags; + tp->rcv_nxt = si->cpt_rcv_nxt; + tp->snd_nxt = si->cpt_snd_nxt; + tp->snd_una = si->cpt_snd_una; + tp->snd_sml = si->cpt_snd_sml; + tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); + tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); + tp->tcp_header_len = si->cpt_tcp_header_len; + inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; + inet_csk(sk)->icsk_ack.quick = si->cpt_quick; + inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; + inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; + inet_csk(sk)->icsk_ack.ato = si->cpt_ato; + inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); + inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; + inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; + tp->snd_wl1 = si->cpt_snd_wl1; + tp->snd_wnd = si->cpt_snd_wnd; + tp->max_window = si->cpt_max_window; + inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; + tp->mss_cache = si->cpt_mss_cache; + tp->rx_opt.mss_clamp = si->cpt_mss_clamp; + inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; + inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; + inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; + tp->reordering = si->cpt_reordering; + tp->frto_counter = si->cpt_frto_counter; + tp->frto_highmark = si->cpt_frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + // // tp->adv_cong = si->cpt_adv_cong; +#endif + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; + inet_csk(sk)->icsk_backoff = si->cpt_backoff; + tp->srtt = si->cpt_srtt; + tp->mdev = si->cpt_mdev; + tp->mdev_max = si->cpt_mdev_max; + tp->rttvar = si->cpt_rttvar; + tp->rtt_seq = si->cpt_rtt_seq; + inet_csk(sk)->icsk_rto = si->cpt_rto; + tp->packets_out = si->cpt_packets_out; + tp->left_out = si->cpt_left_out; + tp->retrans_out = si->cpt_retrans_out; + tp->lost_out = si->cpt_lost_out; + tp->sacked_out = si->cpt_sacked_out; + tp->fackets_out = si->cpt_fackets_out; + tp->snd_ssthresh = si->cpt_snd_ssthresh; + tp->snd_cwnd = si->cpt_snd_cwnd; + tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; + tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; + tp->snd_cwnd_used = si->cpt_snd_cwnd_used; + tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); + inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); + tp->rcv_wnd = si->cpt_rcv_wnd; + tp->rcv_wup = si->cpt_rcv_wup; + tp->write_seq = si->cpt_write_seq; + tp->pushed_seq = si->cpt_pushed_seq; + tp->copied_seq = si->cpt_copied_seq; + tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; + tp->rx_opt.wscale_ok = si->cpt_wscale_ok; + tp->rx_opt.sack_ok = si->cpt_sack_ok; + tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; + tp->rx_opt.snd_wscale = si->cpt_snd_wscale; + 
tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; + tp->nonagle = si->cpt_nonagle; + tp->keepalive_probes = si->cpt_keepalive_probes; + tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; + tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; + tp->rx_opt.ts_recent = si->cpt_ts_recent; + tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; + tp->rx_opt.user_mss = si->cpt_user_mss; + tp->rx_opt.dsack = si->cpt_dsack; + tp->rx_opt.eff_sacks = si->cpt_num_sacks; + tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; + tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; + tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; + tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; + tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; + tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; + tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; + tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; + tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; + tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; + + tp->window_clamp = si->cpt_window_clamp; + tp->rcv_ssthresh = si->cpt_rcv_ssthresh; + inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; + tp->rx_opt.num_sacks = si->cpt_num_sacks; + tp->advmss = si->cpt_advmss; + inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; + tp->ecn_flags = si->cpt_ecn_flags; + tp->prior_ssthresh = si->cpt_prior_ssthresh; + tp->high_seq = si->cpt_high_seq; + tp->retrans_stamp = si->cpt_retrans_stamp; + tp->undo_marker = si->cpt_undo_marker; + tp->undo_retrans = si->cpt_undo_retrans; + tp->urg_seq = si->cpt_urg_seq; + tp->urg_data = si->cpt_urg_data; + inet_csk(sk)->icsk_pending = si->cpt_pending; + tp->urg_mode = si->cpt_urg_mode; + tp->snd_up = si->cpt_snd_up; + tp->keepalive_time = si->cpt_keepalive_time; + tp->keepalive_intvl = si->cpt_keepalive_intvl; + tp->linger2 = si->cpt_linger2; + + sk->sk_send_head = NULL; + for (skb = skb_peek(&sk->sk_write_queue); + skb && skb != (struct sk_buff*)&sk->sk_write_queue; + skb = skb->next) { + if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { + sk->sk_send_head = skb; + break; + } + } + + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { + struct inet_sock *inet = inet_sk(sk); + if (inet->num == 0) { + cpt_object_t *lobj = NULL; + + if ((int)si->cpt_parent != -1) + lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + + if (lobj && lobj->o_obj) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(&tcp_hashinfo, lobj->o_obj, sk); + local_bh_enable(); + dprintk_ctx("port inherited from parent\n"); + } else { + struct sock *lsk = find_parent(inet->sport, ctx); + if (lsk) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(&tcp_hashinfo, lsk, sk); + local_bh_enable(); + dprintk_ctx("port inherited\n"); + } else { + eprintk_ctx("we are kinda lost...\n"); + } + } + } + + sk->sk_prot->hash(sk); + + if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) + sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); + if (inet_csk(sk)->icsk_pending) + sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, + inet_csk(sk)->icsk_timeout); + if (sock_flag(sk, SOCK_KEEPOPEN)) { + unsigned long expires = jiffies_import(si->cpt_ka_timeout); + if (time_after(jiffies, expires)) + expires = jiffies + HZ; + sk_reset_timer(sk, &sk->sk_timer, expires); + } + } + + return 0; +} + + +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + + 
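+	/*
+	 * Restore order matters below: generic inet fields and the cork
+	 * state come first (restore_queues() relies on inet->cork when it
+	 * re-queues CPT_SKB_WQ skbs), then per-protocol fields (UDP, IPv6),
+	 * then the queues, and finally TCP state and timers via
+	 * rst_socket_tcp().
+	 */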
lock_sock(sk); + + sk->sk_state = si->cpt_state; + + inet->daddr = si->cpt_daddr; + inet->dport = si->cpt_dport; + inet->saddr = si->cpt_saddr; + inet->rcv_saddr = si->cpt_rcv_saddr; + inet->sport = si->cpt_sport; + inet->uc_ttl = si->cpt_uc_ttl; + inet->tos = si->cpt_tos; + inet->cmsg_flags = si->cpt_cmsg_flags; + inet->mc_index = si->cpt_mc_index; + inet->mc_addr = si->cpt_mc_addr; + inet->hdrincl = si->cpt_hdrincl; + inet->mc_ttl = si->cpt_mc_ttl; + inet->mc_loop = si->cpt_mc_loop; + inet->pmtudisc = si->cpt_pmtudisc; + inet->recverr = si->cpt_recverr; + inet->freebind = si->cpt_freebind; + inet->id = si->cpt_idcounter; + + inet->cork.flags = si->cpt_cork_flags; + inet->cork.fragsize = si->cpt_cork_fragsize; + inet->cork.length = si->cpt_cork_length; + inet->cork.addr = si->cpt_cork_addr; + inet->cork.fl.fl4_src = si->cpt_cork_saddr; + inet->cork.fl.fl4_dst = si->cpt_cork_daddr; + inet->cork.fl.oif = si->cpt_cork_oif; + if (inet->cork.fragsize) { + if (ip_route_output_key(&inet->cork.rt, &inet->cork.fl)) { + eprintk_ctx("failed to restore cork route\n"); + inet->cork.fragsize = 0; + } + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + up->pending = si->cpt_udp_pending; + up->corkflag = si->cpt_udp_corkflag; + up->encap_type = si->cpt_udp_encap; + up->len = si->cpt_udp_len; + } + + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + memcpy(&np->saddr, si->cpt_saddr6, 16); + memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); + memcpy(&np->daddr, si->cpt_daddr6, 16); + np->flow_label = si->cpt_flow_label6; + np->frag_size = si->cpt_frag_size6; + np->hop_limit = si->cpt_hop_limit6; + np->mcast_hops = si->cpt_mcast_hops6; + np->mcast_oif = si->cpt_mcast_oif6; + np->rxopt.all = si->cpt_rxopt6; + np->mc_loop = si->cpt_mc_loop6; + np->recverr = si->cpt_recverr6; + np->sndflow = si->cpt_sndflow6; + np->pmtudisc = si->cpt_pmtudisc6; + np->ipv6only = si->cpt_ipv6only6; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (si->cpt_mapped) { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) { + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + } + } +#endif + } + + restore_queues(sk, si, pos, ctx); + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + rst_socket_tcp(si, pos, sk, ctx); + + release_sock(sk); + return 0; +} + +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) +{ + struct request_sock *req; + + if (lsk->sk_state != TCP_LISTEN) + return -EINVAL; + + req = reqsk_alloc(&tcp_request_sock_ops); + if (!req) + return -ENOMEM; + + sk->sk_socket = NULL; + sk->sk_sleep = NULL; + inet_csk_reqsk_queue_add(lsk, req, sk); + return 0; +} + +static __inline__ u32 __tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) +{ + return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); +} + +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end = si->cpt_next; + + pos += si->cpt_hdrlen; + while (pos < end) { + struct cpt_openreq_image oi; + + err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); + if (err) { + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + continue; + } + + if (oi.cpt_object == CPT_OBJ_OPENREQ) { + struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); + if (req == NULL) + return -ENOMEM; + + memset(req, 0, sizeof(*req)); + 
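+			/*
+			 * Rebuild the open request from the image and hash
+			 * it back into the listener's SYN queue; the
+			 * queue-add re-arms it with TCP_TIMEOUT_INIT.
+			 */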
tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; + tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; + inet_rsk(req)->rmt_port = oi.cpt_rmt_port; + req->mss = oi.cpt_mss; + req->retrans = oi.cpt_retrans; + inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; + inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; + inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; + inet_rsk(req)->sack_ok = oi.cpt_sack_ok; + inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; + inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; + inet_rsk(req)->acked = oi.cpt_acked; + req->window_clamp = oi.cpt_window_clamp; + req->rcv_wnd = oi.cpt_rcv_wnd; + req->ts_recent = oi.cpt_ts_recent; + req->expires = jiffies_import(oi.cpt_expires); + + if (oi.cpt_family == AF_INET) { + memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); + memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); + memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); + inet6_rsk(req)->iif = oi.cpt_iif; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); +#endif + } + } + pos += oi.cpt_next; + } + return 0; +} + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + struct ip_mreqn imr; + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + memset(&imr, 0, sizeof(imr)); + imr.imr_ifindex = v->cpt_ifindex; + imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; + return ip_mc_join_group(sk, &imr); +} + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + return ipv6_sock_mc_join(sk, v->cpt_ifindex, + (struct in6_addr*)v->cpt_mcaddr); +} +#endif diff -uprN linux-2.6.18/kernel/cpt/rst_sysvipc.c linux-2.6.18.ovz/kernel/cpt/rst_sysvipc.c --- linux-2.6.18/kernel/cpt/rst_sysvipc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_sysvipc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,409 @@ +/* + * + * kernel/cpt/rst_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* FIXME. x86_64 has asm/ipc.h forgotten? */ +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int fixup_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + + if (shp->shm_file != warg->file) + return 0; + if (shp->shm_nattch) + return -EEXIST; + + shp->shm_perm.uid = warg->v->cpt_uid; + shp->shm_perm.gid = warg->v->cpt_gid; + shp->shm_perm.cuid = warg->v->cpt_cuid; + shp->shm_perm.cgid = warg->v->cpt_cgid; + shp->shm_perm.mode = warg->v->cpt_mode; + + shp->shm_atim = warg->v->cpt_atime; + shp->shm_dtim = warg->v->cpt_dtime; + shp->shm_ctim = warg->v->cpt_ctime; + shp->shm_cprid = warg->v->cpt_creator; + shp->shm_lprid = warg->v->cpt_last; + + /* TODO: fix shp->mlock_user? 
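+	 * Returning 1 below makes sysvipc_walk_shm() stop the walk once
+	 * the matching segment has been patched; 0 keeps it scanning.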
*/ + return 1; +} + +static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) +{ + struct _warg warg; + + warg.file = file; + warg.v = v; + + return sysvipc_walk_shm(fixup_one_shm, &warg); +} + +static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_dentry->d_inode->i_fop->write; + if (do_write == NULL) { + eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n"); + return -EINVAL; + } + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + int err; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + return err; + dprintk_ctx("restoring SHM block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + return err; + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + eprintk_ctx("write() failure\n"); + if (err >= 0) + err = -EIO; + return err; + } + count -= copy; + } + pos += pgb.cpt_next; + } + return 0; +} + +struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx) +{ + struct file *file; + int err; + loff_t dpos, epos; + union { + struct cpt_file_image fi; + struct cpt_sysvshm_image shmi; + struct cpt_inode_image ii; + } u; + + err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); + if (err < 0) + goto err_out; + pos = u.fi.cpt_inode; + err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); + if (err < 0) + goto err_out; + dpos = pos + u.ii.cpt_hdrlen; + epos = pos + u.ii.cpt_next; + err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); + if (err < 0) + goto err_out; + dpos += u.shmi.cpt_next; + + file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, + u.shmi.cpt_segsz, u.shmi.cpt_mode); + if (!IS_ERR(file)) { + err = fixup_shm(file, &u.shmi); + if (err != -EEXIST && dpos < epos) + err = fixup_shm_data(file, dpos, epos, ctx); + } + + return file; + +err_out: + return ERR_PTR(err); +} + +static int attach_one_undo(int semid, struct sem_array *sma, void *arg) +{ + struct sem_undo *su = arg; + struct sem_undo_list *undo_list = current->sysvsem.undo_list; + + if (semid != su->semid) + return 0; + + su->proc_next = undo_list->proc_list; + undo_list->proc_list = su; + + su->id_next = sma->undo; + sma->undo = su; + + return 1; +} + +static int attach_undo(struct sem_undo *su) +{ + return sysvipc_walk_sem(attach_one_undo, su); +} + +static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *undo_list; + + if (current->sysvsem.undo_list) { + eprintk_ctx("Funny undo_list\n"); + return 0; + } + + undo_list = ub_kmalloc(sizeof(struct sem_undo_list), GFP_KERNEL); + if (undo_list == NULL) + return -ENOMEM; + memset(undo_list, 0, sizeof(struct sem_undo_list)); + atomic_set(&undo_list->refcnt, 1); + spin_lock_init(&undo_list->lock); + current->sysvsem.undo_list = undo_list; + + if (sui->cpt_next > sui->cpt_hdrlen) { + loff_t offset = pos + sui->cpt_hdrlen; + do { + struct sem_undo *new; + 
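+			/* one undo record: header plus cpt_nsem adjustment shorts */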
struct cpt_sysvsem_undo_image spi; + err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); + if (err) + goto out; + new = ub_kmalloc(sizeof(struct sem_undo) + + sizeof(short)*spi.cpt_nsem, GFP_KERNEL); + if (!new) { + err = -ENOMEM; + goto out; + } + + memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); + new->semadj = (short *) &new[1]; + new->semid = spi.cpt_id; + err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); + if (err) { + kfree(new); + goto out; + } + err = attach_undo(new); + if (err <= 0) { + if (err == 0) + err = -ENOENT; + kfree(new); + goto out; + } + offset += spi.cpt_next; + } while (offset < pos + sui->cpt_next); + } + err = 0; + +out: + return err; +} + +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + +#if 0 + if (ti->cpt_sysvsem_undo == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) + flag |= CLONE_SYSVSEM; +#endif + return flag; +} + +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *f = current->sysvsem.undo_list; + cpt_object_t *obj; + struct cpt_object_hdr sui; + + if (ti->cpt_sysvsem_undo == CPT_NULL) { + exit_sem(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_sem(current); + f = obj->o_obj; + atomic_inc(&f->refcnt); + current->sysvsem.undo_list = f; + } + return 0; + } + + if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0) + goto out; + + if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) + goto out; + + err = -ENOMEM; + obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); + if (obj) { + err = 0; + cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); + } + + return 0; + +out: + return err; +} + +struct _sarg { + int semid; + struct cpt_sysvsem_image *v; + __u32 *arr; +}; + +static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) +{ + struct _sarg *warg = arg; + + if (semid != warg->semid) + return 0; + + sma->sem_perm.uid = warg->v->cpt_uid; + sma->sem_perm.gid = warg->v->cpt_gid; + sma->sem_perm.cuid = warg->v->cpt_cuid; + sma->sem_perm.cgid = warg->v->cpt_cgid; + sma->sem_perm.mode = warg->v->cpt_mode; + sma->sem_perm.seq = warg->v->cpt_seq; + + sma->sem_ctime = warg->v->cpt_ctime; + sma->sem_otime = warg->v->cpt_otime; + memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); + return 1; +} + +static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) +{ + struct _sarg warg; + + warg.semid = semid; + warg.v = v; + warg.arr = arr; + + return sysvipc_walk_sem(fixup_one_sem, &warg); +} + + +static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, + struct cpt_context *ctx) +{ + int err; + __u32 *arr; + int nsems = (si->cpt_next - si->cpt_hdrlen)/8; + + arr = kmalloc(nsems*8, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); + if (err) + goto out; + err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); + if (err < 0) { + eprintk_ctx("SEM 3\n"); + goto out; + } + err = fixup_sem(si->cpt_id, si, arr); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + kfree(arr); + return err; +} + +static int rst_sysv_sem(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvsem_image sbuf; + + if (sec == 
CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); + if (err) + return err; + err = restore_sem(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + +int rst_sysv_ipc(struct cpt_context *ctx) +{ + return rst_sysv_sem(ctx); +} diff -uprN linux-2.6.18/kernel/cpt/rst_tty.c linux-2.6.18.ovz/kernel/cpt/rst_tty.c --- linux-2.6.18/kernel/cpt/rst_tty.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_tty.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,380 @@ +/* + * + * kernel/cpt/rst_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int pty_setup(struct tty_struct *stty, loff_t pos, + struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + unsigned long flags; + + stty->pgrp = -1; + stty->session = 0; + stty->packet = pi->cpt_packet; + stty->stopped = pi->cpt_stopped; + stty->hw_stopped = pi->cpt_hw_stopped; + stty->flow_stopped = pi->cpt_flow_stopped; +#define DONOT_CHANGE ((1<flags & DONOT_CHANGE; + stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); + stty->ctrl_status = pi->cpt_ctrl_status; + stty->winsize.ws_row = pi->cpt_ws_row; + stty->winsize.ws_col = pi->cpt_ws_col; + stty->winsize.ws_ypixel = pi->cpt_ws_prow; + stty->winsize.ws_xpixel = pi->cpt_ws_pcol; + stty->canon_column = pi->cpt_canon_column; + stty->column = pi->cpt_column; + stty->raw = pi->cpt_raw; + stty->real_raw = pi->cpt_real_raw; + stty->erasing = pi->cpt_erasing; + stty->lnext = pi->cpt_lnext; + stty->icanon = pi->cpt_icanon; + stty->closing = pi->cpt_closing; + stty->minimum_to_wake = pi->cpt_minimum_to_wake; + + stty->termios->c_iflag = pi->cpt_c_iflag; + stty->termios->c_oflag = pi->cpt_c_oflag; + stty->termios->c_lflag = pi->cpt_c_lflag; + stty->termios->c_cflag = pi->cpt_c_cflag; + memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); + memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags)); + + if (pi->cpt_next > pi->cpt_hdrlen) { + int err; + struct cpt_obj_bits b; + err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); + if (err) + return err; + if (b.cpt_size == 0) + return 0; + err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); + if (err) + return err; + + spin_lock_irq(&stty->read_lock); + stty->read_tail = 0; + stty->read_cnt = b.cpt_size; + stty->read_head = b.cpt_size; + stty->canon_head = stty->read_tail + pi->cpt_canon_head; + stty->canon_data = pi->cpt_canon_data; + spin_unlock_irq(&stty->read_lock); + } + + return 0; +} + +/* Find slave/master tty in image, when we already know master/slave. + * It might be optimized, of course. 
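+ * The pair is matched by cpt_index plus agreement on
+ * TTY_DRIVER_DEVPTS_MEM, skipping the entry at 'pos' itself.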
*/ +static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_tty_image *pibuf; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return CPT_NULL; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return CPT_NULL; + pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); + if (pibuf == NULL) { + eprintk_ctx("cannot allocate buffer\n"); + return CPT_NULL; + } + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) + return CPT_NULL; + if (pibuf->cpt_index == pi->cpt_index && + !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && + pos != sec) { + pty_setup(stty, sec, pibuf, ctx); + return sec; + } + sec += pibuf->cpt_next; + } + kfree(pibuf); + return CPT_NULL; +} + +static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, + struct cpt_context *ctx) +{ + int err; + struct iattr newattrs; + struct dentry *d = master->f_dentry; + + newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = ii->cpt_mode; + + mutex_lock(&d->d_inode->i_mutex); + err = notify_change(d, &newattrs); + mutex_unlock(&d->d_inode->i_mutex); + + return err; +} + +/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open + * /dev/ptmx until we get pty with desired index. + */ + +struct file *ptmx_open(int index, unsigned int flags) +{ + struct file *file; + struct file **stack = NULL; + int depth = 0; + + for (;;) { + struct tty_struct *tty; + + file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(file)) + break; + tty = file->private_data; + if (tty->index == index) + break; + + if (depth == PAGE_SIZE/sizeof(struct file *)) { + fput(file); + file = ERR_PTR(-EBUSY); + break; + } + if (stack == NULL) { + stack = (struct file **)__get_free_page(GFP_KERNEL); + if (!stack) { + fput(file); + file = ERR_PTR(-ENOMEM); + break; + } + } + stack[depth] = file; + depth++; + } + while (depth > 0) { + depth--; + fput(stack[depth]); + } + if (stack) + free_page((unsigned long)stack); + return file; +} + + +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, + unsigned flags, struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct file *master, *slave; + struct tty_struct *stty; + struct cpt_tty_image *pi; + static char *a = "pqrstuvwxyzabcde"; + static char *b = "0123456789abcdef"; + char pairname[16]; + unsigned master_flags, slave_flags; + + if (fi->cpt_priv == CPT_NULL) + return ERR_PTR(-EINVAL); + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); + if (obj && obj->o_parent) { + dprintk_ctx("obtained pty as pair to existing\n"); + master = obj->o_parent; + stty = master->private_data; + + if (stty->driver->subtype == PTY_TYPE_MASTER && + (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { + wprintk_ctx("cloning ptmx\n"); + get_file(master); + return master; + } + + master = dentry_open(dget(master->f_dentry), + mntget(master->f_vfsmnt), flags); + if (!IS_ERR(master)) { + stty = master->private_data; + if (stty->driver->subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, master, ctx); + } + return master; + } + + pi = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx); + if (err) { + cpt_release_buf(ctx); + return ERR_PTR(err); + } + + master_flags = 
slave_flags = 0; + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) + master_flags = flags; + else + slave_flags = flags; + + /* + * Open pair master/slave. + */ + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { + master = ptmx_open(pi->cpt_index, master_flags); + } else { + sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); + master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + } + if (IS_ERR(master)) { + eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master)); + cpt_release_buf(ctx); + return master; + } + stty = master->private_data; + clear_bit(TTY_PTY_LOCK, &stty->flags); + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) + sprintf(pairname, "/dev/pts/%d", stty->index); + else + sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); + slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(slave)) { + eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); + fput(master); + cpt_release_buf(ctx); + return slave; + } + + if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, slave, ctx); + + cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); + cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); + cpt_object_add(CPT_OBJ_FILE, master, ctx); + cpt_object_add(CPT_OBJ_FILE, slave, ctx); + + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(master); + cpt_release_buf(ctx); + return master; + } else { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty->link, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(slave); + cpt_release_buf(ctx); + return slave; + } +} + +int rst_tty_jobcontrol(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_tty_image *pibuf = cpt_get_buf(ctx); + + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { + cpt_release_buf(ctx); + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); + if (obj) { + struct tty_struct *stty = obj->o_obj; + if ((int)pibuf->cpt_pgrp > 0) { + stty->pgrp = vpid_to_pid(pibuf->cpt_pgrp); + if (stty->pgrp == -1) + dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); + } else if (pibuf->cpt_pgrp) { + stty->pgrp = alloc_pidmap(); + if (stty->pgrp < 0) { + eprintk_ctx("cannot allocate stray tty->pgrp"); + cpt_release_buf(ctx); + return -EINVAL; + } + free_pidmap(stty->pgrp); + } + if ((int)pibuf->cpt_session > 0) { + 
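+				/*
+				 * Restore the session only if the virtual id
+				 * maps to a live task; an already-set
+				 * stty->session is left untouched.
+				 */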
int sess; + sess = vpid_to_pid(pibuf->cpt_session); + if (sess == -1) { + dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); + } else if (stty->session <= 0) { + stty->session = sess; + } else if (stty->session != sess) { + wprintk_ctx("tty session mismatch 2\n"); + } + } + } + sec += pibuf->cpt_next; + cpt_release_buf(ctx); + } + return 0; +} diff -uprN linux-2.6.18/kernel/cpt/rst_ubc.c linux-2.6.18.ovz/kernel/cpt/rst_ubc.c --- linux-2.6.18/kernel/cpt/rst_ubc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_ubc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,131 @@ +/* + * + * kernel/cpt/rst_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); + if (obj == NULL) { + eprintk("RST: unknown ub @%Ld\n", (long long)pos); + return get_beancounter(get_exec_ub()); + } + return get_beancounter(obj->o_obj); +} + +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id) +{ + to[bc_parm_id].barrier = from[bc_parm_id].barrier; + to[bc_parm_id].limit = from[bc_parm_id].limit; +} + +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id) +{ + ubprm[bc_parm_id].barrier = UB_MAXVALUE; + ubprm[bc_parm_id].limit = UB_MAXVALUE; +} + +static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier); + prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit); + if (held) + prm->held = dmp->held; + prm->maxheld = dmp->maxheld; + prm->minheld = dmp->minheld; + prm->failcnt = dmp->failcnt; +} + +static int restore_one_bc(struct cpt_beancounter_image *v, + cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + cpt_object_t *pobj; + int i; + + if (v->cpt_parent != CPT_NULL) { + pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); + if (pobj == NULL) + return -ESRCH; + bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); + } else { + bc = get_exec_ub(); + while (bc->parent) + bc = bc->parent; + get_beancounter(bc); + } + if (bc == NULL) + return -ENOMEM; + obj->o_obj = bc; + + if (ctx->image_version < CPT_VERSION_18 && + CPT_VERSION_MINOR(ctx->image_version) < 1) + goto out; + + for (i = 0; i < UB_RESOURCES; i++) { + restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + restore_one_bc_parm(v->cpt_parms + i * 2 + 1, + bc->ub_store + i, 1); + } + +out: + if (!bc->parent) + for (i = 0; i < UB_RESOURCES; i++) + copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i); + + return 0; +} + +int rst_undump_ubc(struct cpt_context *ctx) +{ + loff_t start, end; + struct cpt_beancounter_image *v; + cpt_object_t *obj; + int err; + + err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + v = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_UBC, obj, ctx); + + restore_one_bc(v, obj, ctx); + + cpt_release_buf(ctx); + start += v->cpt_next; + } + return 0; +} + +void rst_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_UBC) + 
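+		/* drop the reference taken in restore_one_bc() */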
put_beancounter(obj->o_obj); +} diff -uprN linux-2.6.18/kernel/cpt/rst_undump.c linux-2.6.18.ovz/kernel/cpt/rst_undump.c --- linux-2.6.18/kernel/cpt/rst_undump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/cpt/rst_undump.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,881 @@ +/* + * + * kernel/cpt/rst_undump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_socket.h" +#include "cpt_net.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + +static int rst_utsname(cpt_context_t *ctx); + + +struct thr_context { + struct completion init_complete; + struct completion task_done; + int error; + struct cpt_context *ctx; + cpt_object_t *tobj; +}; + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); + +static int vps_rst_veinfo(struct cpt_context *ctx) +{ + int err; + struct cpt_veinfo_image *i; + struct ve_struct *ve; + struct timespec delta; + loff_t start, end; + struct ipc_namespace *ns; + + err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); + if (err) + goto out; + + i = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); + if (err) + goto out_rel; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + ns->shm_ctlall = i->shm_ctl_all; + ns->shm_ctlmax = i->shm_ctl_max; + ns->shm_ctlmni = i->shm_ctl_mni; + + ns->msg_ctlmax = i->msg_ctl_max; + ns->msg_ctlmni = i->msg_ctl_mni; + ns->msg_ctlmnb = i->msg_ctl_mnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + ns->sem_ctls[0] = i->sem_ctl_arr[0]; + ns->sem_ctls[1] = i->sem_ctl_arr[1]; + ns->sem_ctls[2] = i->sem_ctl_arr[2]; + ns->sem_ctls[3] = i->sem_ctl_arr[3]; + + cpt_timespec_import(&delta, i->start_timespec_delta); + _set_normalized_timespec(&ve->start_timespec, + ve->start_timespec.tv_sec - delta.tv_sec, + ve->start_timespec.tv_nsec - delta.tv_nsec); + ve->start_jiffies -= i->start_jiffies_delta; + // // FIXME: what??? 
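+	// (presumably start_cycles needs the same delta scaled by
+	//  cycles_per_jiffy, as the disabled line below suggests)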
+ // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; + + err = 0; +out_rel: + cpt_release_buf(ctx); +out: + return err; +} + +static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err; + struct env_create_param3 param; + + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + do_gettimespec(&ctx->delta_time); + + _set_normalized_timespec(&ctx->delta_time, + ctx->delta_time.tv_sec - ctx->start_time.tv_sec, + ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec); + ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec; + if (ctx->delta_nsec < 0) { + wprintk_ctx("Wall time is behind source by %Ld ns, " + "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec); + } + + _set_normalized_timespec(&ctx->cpt_monotonic_time, + ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec, + ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec); + + memset(¶m, 0, sizeof(param)); + param.iptables_mask = ctx->iptables_mask; + param.feature_mask = ctx->features; + + /* feature_mask is set as required - pretend we know everything */ + param.known_features = (ctx->image_version < CPT_VERSION_18) ? + VE_FEATURES_OLD : ~(__u64)0; + + err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, + ¶m, sizeof(param)); + if (err < 0) + eprintk_ctx("real_env_create: %d\n", err); + + get_exec_env()->jiffies_fixup = + (ctx->delta_time.tv_sec < 0 ? + 0 : timespec_to_jiffies(&ctx->delta_time)) - + (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64); + dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup, + (long long)ctx->delta_nsec); + return err < 0 ? err : 0; +} + +static int hook(void *arg) +{ + struct thr_context *thr_ctx = arg; + struct cpt_context *ctx; + cpt_object_t *tobj; + struct cpt_task_image *ti; + int err = 0; + int exiting = 0; + + current->state = TASK_UNINTERRUPTIBLE; + complete(&thr_ctx->init_complete); + schedule(); + + ctx = thr_ctx->ctx; + tobj = thr_ctx->tobj; + ti = tobj->o_image; + + current->fs->umask = 0; + + if (ti->cpt_pid == 1) { +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *bc; +#endif + + err = vps_rst_reparent_root(tobj, ctx); + + if (err) { + rst_report_error(err, ctx); + goto out; + } + + memcpy(&cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t)); + + if (ctx->statusfile) { + fput(ctx->statusfile); + ctx->statusfile = NULL; + } + + if (ctx->lockfile) { + mm_segment_t oldfs; + ssize_t err = -EINVAL; + char b; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) + err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); + set_fs(oldfs); + fput(ctx->lockfile); + ctx->lockfile = NULL; + } + + err = vps_rst_veinfo(ctx); + if (err) { + eprintk_ctx("rst_veinfo: %d\n", err); + goto out; + } + + err = rst_utsname(ctx); + if (err) { + eprintk_ctx("rst_utsname: %d\n", err); + goto out; + } + + err = rst_root_namespace(ctx); + if (err) { + eprintk_ctx("rst_namespace: %d\n", err); + goto out; + } + + if ((err = rst_restore_net(ctx)) != 0) { + eprintk_ctx("rst_restore_net: %d\n", err); + goto out; + } + + err = rst_sockets(ctx); + if (err) { + eprintk_ctx("rst_sockets: %d\n", err); + goto out; + } + err = rst_sysv_ipc(ctx); + if (err) { + eprintk_ctx("rst_sysv_ipc: %d\n", err); + goto out; + } +#ifdef CONFIG_USER_RESOURCE + bc = get_exec_ub(); + set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE); + 
set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE); +#endif + } + + do { + if (current->user->uid != ti->cpt_user) { + struct user_struct *u = alloc_uid(ti->cpt_user); + if (!u) { + eprintk_ctx("alloc_user\n"); + } else { + switch_uid(u); + } + } + } while (0); + + if ((err = rst_mm_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_mm: %d\n", err); + goto out; + } + + if ((err = rst_files_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_files: %d\n", err); + goto out; + } + + if ((err = rst_fs_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_fs: %d\n", err); + goto out; + } + + if ((err = rst_semundo_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_semundo: %d\n", err); + goto out; + } + + if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) { + eprintk_ctx("rst_signal: %d\n", err); + goto out; + } + + if (ti->cpt_personality != 0) + __set_personality(ti->cpt_personality); + + current->set_child_tid = NULL; + current->clear_child_tid = NULL; + current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); + current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); + current->exit_code = ti->cpt_exit_code; + current->pdeath_signal = ti->cpt_pdeath_signal; + + if (ti->cpt_restart.fn != CPT_RBL_0) { + if (ti->cpt_restart.fn != CPT_RBL_NANOSLEEP + && ti->cpt_restart.fn != CPT_RBL_COMPAT_NANOSLEEP + ) { + eprintk_ctx("unknown restart block\n"); + } else { + ktime_t e; + + e.tv64 = 0; + + current->thread_info->restart_block.fn = nanosleep_restart; +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (!ti->cpt_64bit) + current->thread_info->restart_block.fn = compat_nanosleep_restart; +#endif + if (ctx->image_version >= CPT_VERSION_9) + e = ktime_add_ns(e, ti->cpt_restart.arg0); + else + e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC); + if (e.tv64 < 0) + e.tv64 = TICK_NSEC; + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + current->thread_info->restart_block.arg0 = e.tv64 & 0xFFFFFFFF; + current->thread_info->restart_block.arg1 = e.tv64 >> 32; + if (ctx->image_version >= CPT_VERSION_9) { + current->thread_info->restart_block.arg2 = ti->cpt_restart.arg2; + current->thread_info->restart_block.arg3 = ti->cpt_restart.arg3; + } else { + current->thread_info->restart_block.arg2 = ti->cpt_restart.arg1; + current->thread_info->restart_block.arg3 = CLOCK_MONOTONIC; + } + } + } + + if (thread_group_leader(current)) { + current->signal->it_real_incr.tv64 = 0; + if (ctx->image_version >= CPT_VERSION_9) { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); + } else { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); + } + current->signal->it_prof_incr = ti->cpt_it_prof_incr; + current->signal->it_virt_incr = ti->cpt_it_virt_incr; + current->signal->it_prof_expires = ti->cpt_it_prof_value; + current->signal->it_virt_expires = ti->cpt_it_virt_value; + } + + err = rst_clone_children(tobj, ctx); + if (err) { + eprintk_ctx("rst_clone_children\n"); + goto out; + } + + if (exiting) + current->signal->flags |= SIGNAL_GROUP_EXIT; + + if (ti->cpt_pid == 1) { + if ((err = rst_process_linkage(ctx)) != 0) { + eprintk_ctx("rst_process_linkage: %d\n", err); + goto out; + } + if ((err = rst_do_filejobs(ctx)) != 0) { + eprintk_ctx("rst_do_filejobs: %d\n", err); + goto out; + } + if ((err = rst_eventpoll(ctx)) != 0) { + eprintk_ctx("rst_eventpoll: %d\n", err); + goto out; + } + if ((err = rst_sockets_complete(ctx)) != 0) { + eprintk_ctx("rst_sockets_complete: %d\n", err); + goto out; + } + if ((err = 
rst_stray_files(ctx)) != 0) { + eprintk_ctx("rst_stray_files: %d\n", err); + goto out; + } + if ((err = rst_posix_locks(ctx)) != 0) { + eprintk_ctx("rst_posix_locks: %d\n", err); + goto out; + } + if ((err = rst_tty_jobcontrol(ctx)) != 0) { + eprintk_ctx("rst_tty_jobcontrol: %d\n", err); + goto out; + } + if ((err = rst_restore_fs(ctx)) != 0) { + eprintk_ctx("rst_restore_fs: %d\n", err); + goto out; + } + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) { + err = -ECHRNG; + eprintk_ctx("scp_restore failed\n"); + goto out; + } + } + +out: + thr_ctx->error = err; + complete(&thr_ctx->task_done); + + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + current->flags |= PF_EXIT_RESTART; + do_exit(ti->cpt_exit_code); + } else { + __set_current_state(TASK_UNINTERRUPTIBLE); + } + + schedule(); + + dprintk_ctx("leaked through %d/%d %p\n", current->pid, virt_pid(current), current->mm); + + module_put(THIS_MODULE); + complete_and_exit(NULL, 0); + return 0; +} + +#if 0 +static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = task_bc(current); + + put_beancounter(tbc->fork_sub); + tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); + if (ti->cpt_mm_ub != CPT_NULL) { + put_beancounter(tbc->exec_ub); + tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); + } +} +#endif + +static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, + struct thr_context *thr_ctx) +{ + struct task_struct *tsk; + int pid; + + thr_ctx->ctx = ctx; + thr_ctx->error = 0; + init_completion(&thr_ctx->init_complete); + init_completion(&thr_ctx->task_done); +#if 0 + set_task_ubs(obj->o_image, ctx); +#endif + + pid = local_kernel_thread(hook, thr_ctx, 0, 0); + if (pid < 0) + return pid; + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ve(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(obj, tsk, ctx); + thr_ctx->tobj = obj; + return 0; +} + +static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); + rst_mm_basic(obj, ti, ctx); + return 0; +} + +static int make_baby(cpt_object_t *cobj, + struct cpt_task_image *pi, + struct cpt_context *ctx) +{ + unsigned long flags; + struct cpt_task_image *ci = cobj->o_image; + struct thr_context thr_ctx; + struct task_struct *tsk; + pid_t pid; + struct fs_struct *tfs = NULL; + + flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) + | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); + if (ci->cpt_rppid != pi->cpt_pid) { + flags |= CLONE_THREAD|CLONE_PARENT; + if (ci->cpt_signal != pi->cpt_signal || + !(flags&CLONE_SIGHAND) || + (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { + eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", + (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, + (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags + ); + return -EINVAL; + } + } + + thr_ctx.ctx = ctx; + thr_ctx.error = 0; + init_completion(&thr_ctx.init_complete); + init_completion(&thr_ctx.task_done); + thr_ctx.tobj = cobj; + +#if 0 + set_task_ubs(ci, ctx); +#endif + + if (current->fs == NULL) { + tfs = get_exec_env()->init_entry->fs; + if (tfs == NULL) + return -EINVAL; + atomic_inc(&tfs->count); + current->fs = tfs; + } + pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); + if (tfs) { + current->fs = 
NULL; + atomic_dec(&tfs->count); + } + if (pid < 0) + return pid; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid_ve(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(cobj, tsk, ctx); + thr_ctx.tobj = cobj; + wait_for_completion(&thr_ctx.init_complete); + wait_task_inactive(cobj->o_obj); + rst_basic_init_task(cobj, ctx); + + /* clone() increases group_stop_count if it was not zero and + * CLONE_THREAD was asked. Undo. + */ + if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { + if (tsk->signal != current->signal) BUG(); + current->signal->group_stop_count--; + } + + wake_up_process(tsk); + wait_for_completion(&thr_ctx.task_done); + wait_task_inactive(tsk); + + return thr_ctx.error; +} + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_task_image *ti = obj->o_image; + cpt_object_t *cobj; + + for_each_object(cobj, CPT_OBJ_TASK) { + struct cpt_task_image *ci = cobj->o_image; + if (cobj == obj) + continue; + if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || + (ci->cpt_leader == ti->cpt_pid && + ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { + err = make_baby(cobj, ti, ctx); + if (err) { + eprintk_ctx("make_baby: %d\n", err); + return err; + } + } + } + return 0; +} + +static int read_task_images(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + + err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + cpt_object_t *obj; + struct cpt_task_image *ti = cpt_get_buf(ctx); + + err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { + eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); + cpt_release_buf(ctx); + return -EINVAL; + } + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); + if (obj->o_image == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + memcpy(obj->o_image, ti, sizeof(*ti)); + err = ctx->pread(obj->o_image + sizeof(*ti), + ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); + cpt_release_buf(ctx); + if (err) + return err; + start += ti->cpt_next; + } + return 0; +} + + +static int vps_rst_restore_tree(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct thr_context thr_ctx_root; + + err = read_task_images(ctx); + if (err) + return err; + + err = rst_undump_ubc(ctx); + if (err) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL) + return -ECHRNG; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); + if (err) + return err; +#endif + for_each_object(obj, CPT_OBJ_TASK) { + err = create_root_task(obj, ctx, &thr_ctx_root); + if (err) + return err; + + wait_for_completion(&thr_ctx_root.init_complete); + wait_task_inactive(obj->o_obj); + rst_basic_init_task(obj, ctx); + + wake_up_process(obj->o_obj); + wait_for_completion(&thr_ctx_root.task_done); + wait_task_inactive(obj->o_obj); + err = thr_ctx_root.error; + if (err) + return err; + break; + } + + return err; +} + + +int vps_rst_undump(struct cpt_context *ctx) +{ + int err; + unsigned long umask; + + err = rst_open_dumpfile(ctx); + if (err) + return err; + + if (ctx->tasks64) { +#if defined(CONFIG_IA64) + if (ctx->image_arch != CPT_OS_ARCH_IA64) +#elif defined(CONFIG_X86_64) + 
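+		/* a 64-bit image restores only on a matching 64-bit host */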
if (ctx->image_arch != CPT_OS_ARCH_EMT64) +#else + if (1) +#endif + { + eprintk_ctx("Cannot restore 64 bit VE on this architecture\n"); + return -EINVAL; + } + } + + umask = current->fs->umask; + current->fs->umask = 0; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); +#endif + + if (err == 0) + err = vps_rst_restore_tree(ctx); + + if (err == 0) + err = rst_restore_process(ctx); + + if (err) + virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTFAIL, ctx); + + current->fs->umask = umask; + + return err; +} + +static int rst_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int rst_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *bc; +#endif + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + +#ifdef CONFIG_USER_RESOURCE + bc = get_beancounter_byuid(ctx->ve_id, 0); + BUG_ON(!bc); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE); + put_beancounter(bc); +#endif + + rst_resume_network(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (!tsk) + continue; + + if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { + dprintk_ctx("task %d/%d(%s) is started\n", virt_pid(tsk), tsk->pid, tsk->comm); + + /* Weird... If a signal is sent to stopped task, + * nobody makes recalc_sigpending(). We have to do + * this by hands after wake_up_process(). + * if we did this before a signal could arrive before + * wake_up_process() and stall. 
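+			 * In this code the recalc is in fact done under
+			 * siglock just before wake_up_process() below.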
+ */ + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_pending(tsk)) + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } else { + if (ti->cpt_state == TASK_STOPPED || + ti->cpt_state == TASK_TRACED) { + set_task_state(tsk, ti->cpt_state); + } + } + put_task_struct(tsk); + } + + rst_unlock_ve(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 0); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +int rst_kill(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk == NULL) + continue; + + if (tsk->exit_state == 0) { + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } + + put_task_struct(tsk); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 1); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +static int rst_utsname(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + int i; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + i = 0; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int len; + char *ptr; + err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); + if (err) + return err; + len = o.cpt_next - o.cpt_hdrlen; + if (len > __NEW_UTS_LEN+1) + return -ENAMETOOLONG; + switch (i) { + case 0: + ptr = ns->name.nodename; break; + case 1: + ptr = ns->name.domainname; break; + default: + return -EINVAL; + } + err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); + if (err) + return err; + i++; + sec += o.cpt_next; + } + + return 0; +} diff -uprN linux-2.6.18/kernel/cpu.c linux-2.6.18.ovz/kernel/cpu.c --- linux-2.6.18/kernel/cpu.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/cpu.c 2007-06-13 06:55:07.000000000 -0400 @@ -23,6 +23,10 @@ static __cpuinitdata BLOCKING_NOTIFIER_H #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_SCHED_VCPU +#error "CONFIG_HOTPLUG_CPU isn't supported with CONFIG_SCHED_VCPU" +#endif + /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ static struct task_struct *recursive; static int recursive_depth; @@ -81,8 +85,8 @@ static inline void check_for_tasks(int c struct task_struct *p; write_lock_irq(&tasklist_lock); - for_each_process(p) { - if (task_cpu(p) == cpu && + for_each_process_all(p) { + if (task_pcpu(p) == cpu && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ @@ -92,6 +96,13 @@ static inline void check_for_tasks(int c write_unlock_irq(&tasklist_lock); } +#ifdef CONFIG_SCHED_VCPU +#error VCPU vs. 
HOTPLUG: fix hotplug code below +/* + * What should be fixed: + * - check for if (idle_cpu()) yield() + */ +#endif /* Take this CPU down. */ static int take_cpu_down(void *unused) { diff -uprN linux-2.6.18/kernel/cpuset.c linux-2.6.18.ovz/kernel/cpuset.c --- linux-2.6.18/kernel/cpuset.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/cpuset.c 2007-06-13 06:55:07.000000000 -0400 @@ -965,7 +965,7 @@ static int update_nodemask(struct cpuset n = 0; /* Load up mmarray[] with mm reference for each task in cpuset. */ - do_each_thread(g, p) { + do_each_thread_all(g, p) { struct mm_struct *mm; if (n >= ntasks) { @@ -979,7 +979,7 @@ static int update_nodemask(struct cpuset if (!mm) continue; mmarray[n++] = mm; - } while_each_thread(g, p); + } while_each_thread_all(g, p); write_unlock_irq(&tasklist_lock); /* @@ -1193,7 +1193,7 @@ static int attach_task(struct cpuset *cs if (pid) { read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_pid_all(pid); if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; @@ -1651,13 +1651,13 @@ static int pid_array_load(pid_t *pidarra read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->cpuset == cs) { pidarray[n++] = p->pid; if (unlikely(n == npids)) goto array_full; } - } while_each_thread(g, p); + } while_each_thread_all(g, p); array_full: read_unlock(&tasklist_lock); @@ -1743,12 +1743,7 @@ static ssize_t cpuset_tasks_read(struct { struct ctr_struct *ctr = file->private_data; - if (*ppos + nbytes > ctr->bufsz) - nbytes = ctr->bufsz - *ppos; - if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) - return -EFAULT; - *ppos += nbytes; - return nbytes; + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); } static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) diff -uprN linux-2.6.18/kernel/exit.c linux-2.6.18.ovz/kernel/exit.c --- linux-2.6.18/kernel/exit.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/exit.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,12 +14,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -39,6 +41,8 @@ #include /* for audit_free() */ #include +#include + #include #include #include @@ -47,7 +51,7 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -static void exit_mm(struct task_struct * tsk); +void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) { @@ -58,6 +62,9 @@ static void __unhash_process(struct task detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_VE + list_del_rcu(&p->ve_task_info.vetask_list); +#endif __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); @@ -144,6 +151,8 @@ repeat: ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); + nr_zombie--; + atomic_inc(&nr_dead); /* * If we are the last non-leader member of the thread @@ -170,6 +179,8 @@ repeat: write_unlock_irq(&tasklist_lock); proc_flush_task(p); release_thread(p); + pput_ve(p->ve_task_info.owner_env); + ub_task_uncharge(p); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; @@ -187,14 +198,16 @@ int session_of_pgrp(int pgrp) struct task_struct *p; int sid = -1; + WARN_ON(is_virtual_pid(pgrp)); + read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { if (p->signal->session > 0) { sid = p->signal->session; goto out; } - } 
while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); + } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + p = find_task_by_pid_ve(pgrp); if (p) sid = p->signal->session; out: @@ -216,17 +229,19 @@ static int will_become_orphaned_pgrp(int struct task_struct *p; int ret = 1; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + WARN_ON(is_virtual_pid(pgrp)); + + do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || p->real_parent->pid == 1) + || virt_pid(p->real_parent) == 1) continue; if (process_group(p->real_parent) != pgrp && p->real_parent->signal->session == p->signal->session) { ret = 0; break; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); return ret; /* (sighing) "Often!" */ } @@ -234,6 +249,8 @@ int is_orphaned_pgrp(int pgrp) { int retval; + WARN_ON(is_virtual_pid(pgrp)); + read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(pgrp, NULL); read_unlock(&tasklist_lock); @@ -246,7 +263,7 @@ static int has_stopped_jobs(int pgrp) int retval = 0; struct task_struct *p; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; @@ -262,7 +279,7 @@ static int has_stopped_jobs(int pgrp) retval = 1; break; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); return retval; } @@ -311,6 +328,9 @@ void __set_special_pids(pid_t session, p { struct task_struct *curr = current->group_leader; + WARN_ON(__is_virtual_pid(pgrp)); + WARN_ON(__is_virtual_pid(session)); + if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); curr->signal->session = session; @@ -329,6 +349,7 @@ void set_special_pids(pid_t session, pid __set_special_pids(session, pgrp); write_unlock_irq(&tasklist_lock); } +EXPORT_SYMBOL(set_special_pids); /* * Let kernel threads use this to say that they @@ -408,9 +429,11 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); - current->namespace = init_task.namespace; - get_namespace(current->namespace); + + exit_task_namespaces(current); + current->nsproxy = init_task.nsproxy; + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -487,6 +510,18 @@ void fastcall put_files_struct(struct fi EXPORT_SYMBOL(put_files_struct); +void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +{ + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} +EXPORT_SYMBOL(reset_files_struct); + static inline void __exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -548,13 +583,17 @@ EXPORT_SYMBOL_GPL(exit_fs); * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; mm_release(tsk, mm); if (!mm) return; + + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + mm->oom_killed = 1; + /* * Serialize with any possible pending coredump. 
* We must hold mmap_sem around checking core_waiters @@ -583,6 +622,7 @@ static void exit_mm(struct task_struct * task_unlock(tsk); mmput(mm); } +EXPORT_SYMBOL_GPL(exit_mm); static inline void choose_new_parent(struct task_struct *p, struct task_struct *reaper) @@ -663,13 +703,12 @@ reparent_thread(struct task_struct *p, s static void forget_original_parent(struct task_struct *father, struct list_head *to_release) { - struct task_struct *p, *reaper = father; + struct task_struct *p, *tsk_reaper, *reaper = father; struct list_head *_p, *_n; do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; break; } } while (reaper->exit_state); @@ -691,9 +730,16 @@ forget_original_parent(struct task_struc /* if father isn't the real parent, then ptrace must be enabled */ BUG_ON(father != p->real_parent && !ptrace); + tsk_reaper = reaper; + if (tsk_reaper == father) +#ifdef CONFIG_VE + tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; + if (tsk_reaper == p) +#endif + tsk_reaper = child_reaper; if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - choose_new_parent(p, reaper); + /* reparent to tsk_reaper; the real father is us */ + choose_new_parent(p, tsk_reaper); reparent_thread(p, father, 0); } else { /* reparent ptraced task to its real parent */ @@ -714,7 +760,16 @@ forget_original_parent(struct task_struc } list_for_each_safe(_p, _n, &father->ptrace_children) { p = list_entry(_p, struct task_struct, ptrace_list); - choose_new_parent(p, reaper); + + tsk_reaper = reaper; + if (tsk_reaper == father) +#ifdef CONFIG_VE + tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; + if (tsk_reaper == p || + p->group_leader == VE_TASK_INFO(p)->owner_env->init_entry) +#endif + tsk_reaper = child_reaper; + choose_new_parent(p, tsk_reaper); reparent_thread(p, father, 1); } } @@ -810,6 +865,9 @@ static void exit_notify(struct task_stru && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; + if (tsk->exit_signal != -1 && t == child_reaper) + /* We don't want people slaying init. */ + tsk->exit_signal = SIGCHLD; /* If something other than our normal parent is ptracing us, then * send it a SIGCHLD instead of honoring exit_signal. exit_signal @@ -828,6 +886,7 @@ static void exit_notify(struct task_stru unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) state = EXIT_DEAD; tsk->exit_state = state; + nr_zombie++; write_unlock_irq(&tasklist_lock); @@ -842,6 +901,82 @@ static void exit_notify(struct task_stru release_task(tsk); } +#ifdef CONFIG_VE +/* + * Handle exiting of the init process; it is a special case for VE. + */ +static void do_initproc_exit(void) +{ + struct task_struct *tsk; + struct ve_struct *env; + struct siginfo info; + struct task_struct *g, *p; + long delay = 1L; + + tsk = current; + env = VE_TASK_INFO(current)->owner_env; + if (env->init_entry != tsk) + return; + + if (ve_is_super(env) && tsk->pid == 1) + panic("Attempted to kill init!"); + + memset(&info, 0, sizeof(info)); + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = virt_pid(tsk); + info.si_uid = current->uid; + info.si_signo = SIGKILL; + + /* + * Here the VE changes its state to "not running". + * op_sem taken for write is a barrier to all VE manipulations from + * ioctl: it waits for operations currently in progress and blocks all + * subsequent operations until is_running is set to 0 and op_sem is + * released.
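 *
 * A hypothetical ioctl handler would thus follow the pattern below
 * (a sketch only; the real handlers live outside this file):
 *
 *	down_read(&env->op_sem);
 *	if (env->is_running)
 *		... perform the operation ...
 *	up_read(&env->op_sem);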
+ */ + down_write(&env->op_sem); + env->is_running = 0; + up_write(&env->op_sem); + + /* send SIGKILL to all processes of the VE */ + read_lock(&tasklist_lock); + do_each_thread_ve(g, p) { + force_sig_info(SIGKILL, &info, p); + } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + + /* wait for all of init's children to exit */ + while (atomic_read(&env->pcounter) > 1) { + if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) + continue; + /* it was ECHILD, or somehow there are no more children */ + if (atomic_read(&env->pcounter) == 1) + break; + + /* clear all signals to avoid wakeups */ + if (signal_pending(tsk)) + flush_signals(tsk); + /* we have a child that has not been signalled yet */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + delay = (delay < HZ) ? (delay << 1) : HZ; + read_lock(&tasklist_lock); + do_each_thread_ve(g, p) { + if (p != tsk) + force_sig_info(SIGKILL, &info, p); + } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } + env->init_entry = child_reaper; + write_lock_irq(&tasklist_lock); + remove_parent(tsk); + tsk->parent = tsk->real_parent = child_reaper; + add_parent(tsk); + write_unlock_irq(&tasklist_lock); +} +#endif + fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -857,12 +992,20 @@ fastcall NORET_TYPE void do_exit(long co panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); +#ifdef CONFIG_VE + do_initproc_exit(); +#else if (unlikely(tsk == child_reaper)) panic("Attempted to kill init!"); +#endif + + (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; + set_pn_state(current, PN_STOP_EXIT); ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); + clear_pn_state(current); } /* @@ -898,12 +1041,14 @@ fastcall NORET_TYPE void do_exit(long co exit_itimers(tsk->signal); } acct_collect(code, group_dead); - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); + if (!(tsk->flags & PF_EXIT_RESTART)) { + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); #endif + } if (unlikely(tsk->audit_context)) audit_free(tsk); taskstats_exit_send(tsk, tidstats, group_dead, mycpu); @@ -916,7 +1061,7 @@ fastcall NORET_TYPE void do_exit(long co exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); + exit_task_namespaces(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -929,8 +1074,15 @@ fastcall NORET_TYPE void do_exit(long co module_put(tsk->binfmt->module); tsk->exit_code = code; - proc_exit_connector(tsk); - exit_notify(tsk); + if (!(tsk->flags & PF_EXIT_RESTART)) { + proc_exit_connector(tsk); + exit_notify(tsk); + } else { + write_lock_irq(&tasklist_lock); + tsk->exit_state = EXIT_ZOMBIE; + nr_zombie++; + write_unlock_irq(&tasklist_lock); + } #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; @@ -1024,14 +1176,19 @@ asmlinkage void sys_exit_group(int error static int eligible_child(pid_t pid, int options, struct task_struct *p) { if (pid > 0) { - if (p->pid != pid) + if ((is_virtual_pid(pid) ?
virt_pid(p) : p->pid) != pid) return 0; } else if (!pid) { if (process_group(p) != process_group(current)) return 0; } else if (pid != -1) { - if (process_group(p) != -pid) - return 0; + if (__is_virtual_pid(-pid)) { + if (virt_pgid(p) != -pid) + return 0; + } else { + if (process_group(p) != -pid) + return 0; + } } /* @@ -1102,7 +1259,7 @@ static int wait_task_zombie(struct task_ int status; if (unlikely(noreap)) { - pid_t pid = p->pid; + pid_t pid = get_task_pid(p); uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; @@ -1214,7 +1371,7 @@ static int wait_task_zombie(struct task_ retval = put_user(status, &infop->si_status); } if (!retval && infop) - retval = put_user(p->pid, &infop->si_pid); + retval = put_user(get_task_pid(p), &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (retval) { @@ -1222,7 +1379,7 @@ static int wait_task_zombie(struct task_ p->exit_state = EXIT_ZOMBIE; return retval; } - retval = p->pid; + retval = get_task_pid(p); if (p->real_parent != p->parent) { write_lock_irq(&tasklist_lock); /* Double-check with lock held. */ @@ -1282,7 +1439,7 @@ static int wait_task_stopped(struct task read_unlock(&tasklist_lock); if (unlikely(noreap)) { - pid_t pid = p->pid; + pid_t pid = get_task_pid(p); uid_t uid = p->uid; int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; @@ -1353,11 +1510,11 @@ bail_ref: if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) - retval = put_user(p->pid, &infop->si_pid); + retval = put_user(get_task_pid(p), &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (!retval) - retval = p->pid; + retval = get_task_pid(p); put_task_struct(p); BUG_ON(!retval); @@ -1394,7 +1551,7 @@ static int wait_task_continued(struct ta p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); - pid = p->pid; + pid = get_task_pid(p); uid = p->uid; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1405,7 +1562,7 @@ static int wait_task_continued(struct ta if (!retval && stat_addr) retval = put_user(0xffff, stat_addr); if (!retval) - retval = p->pid; + retval = get_task_pid(p); } else { retval = wait_noreap_copyout(p, pid, uid, CLD_CONTINUED, SIGCONT, @@ -1638,6 +1795,7 @@ asmlinkage long sys_wait4(pid_t pid, int prevent_tail_call(ret); return ret; } +EXPORT_SYMBOL_GPL(sys_wait4); #ifdef __ARCH_WANT_SYS_WAITPID diff -uprN linux-2.6.18/kernel/fairsched.c linux-2.6.18.ovz/kernel/fairsched.c --- linux-2.6.18/kernel/fairsched.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/fairsched.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1383 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Start-tag scheduling follows the theory presented in + * http://www.cs.utexas.edu/users/dmcl/papers/ps/SIGCOMM96.ps + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* we need it for vsched routines in sched.c */ +spinlock_t fairsched_lock = SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_FAIRSCHED + +#define FAIRSHED_DEBUG " debug" + + +/*********************************************************************/ +/* + * Special arithmetic + */ +/*********************************************************************/ + +#define CYCLES_SHIFT (8) +#define SCYCLES_TIME(time) \ + ((scycles_t) {((time) + (1 << CYCLES_SHIFT) - 1) >> CYCLES_SHIFT}) + +#define CYCLES_ZERO (0) +static inline int CYCLES_BEFORE(cycles_t x, cycles_t y) +{ + return (__s64)(x-y) < 0; +} +static inline int CYCLES_AFTER(cycles_t x, cycles_t y) +{ + return (__s64)(y-x) < 0; +} +static inline void CYCLES_DADD(cycles_t *x, fschdur_t y) {*x+=y.d;} + +/* + * fairsched_schedule() can be called less often than on each timer tick + * due to main scheduler optimizations, so a new abstract timeslice must + * be introduced. It can be an arbitrary number of cycles, but the main + * scheduler must not exceed this value, and must call the fairsched + * scheduler before this timeslice expires on a node. + */ +static cycles_t cycles_per_timeslice; +#define FSCHDUR_ZERO (0) +#define TICK_DUR ((fschdur_t){cycles_per_timeslice}) +static inline fschdur_t FSCHDURATION(cycles_t x, cycles_t y) +{ + return (fschdur_t){x - y}; +} +static inline int FSCHDUR_CMP(fschdur_t x, fschdur_t y) +{ + if (x.d < y.d) return -1; + if (x.d > y.d) return 1; + return 0; +} +static inline fschdur_t FSCHDUR_SUB(fschdur_t x, fschdur_t y) +{ + return (fschdur_t){x.d - y.d}; +} + +#define FSCHTAG_ZERO ((fschtag_t){0}) +static inline int FSCHTAG_CMP(fschtag_t x, fschtag_t y) +{ + if (x.t < y.t) return -1; + if (x.t > y.t) return 1; + return 0; +} +static inline fschtag_t FSCHTAG_MAX(fschtag_t x, fschtag_t y) +{ + return x.t >= y.t ? x : y; +} +static inline int FSCHTAG_DADD(fschtag_t *tag, fschdur_t dur, unsigned w) +{ + cycles_t new_tag; + new_tag = tag->t + (cycles_t)dur.d * w; + if (new_tag < tag->t) + return -1; + /* DEBUG */ + if (new_tag >= (1ULL << 48)) + return -1; + tag->t = new_tag; + return 0; +} +static inline int FSCHTAG_ADD(fschtag_t *tag, fschtag_t y) +{ + cycles_t new_tag; + new_tag = tag->t + y.t; + if (new_tag < tag->t) + return -1; + tag->t = new_tag; + return 0; +} +static inline fschtag_t FSCHTAG_SUB(fschtag_t x, fschtag_t y) +{ + return (fschtag_t){x.t - y.t}; +} + +#define FSCHVALUE_FMT "%Lu" +#define FSCHVALUE_PRINT(x) ((x).v) +#define FSCHVALUE_ZERO ((fschvalue_t){0}) +#define TICK_VALUE ((fschvalue_t) \ + {(cycles_t)cycles_per_timeslice << FSCHRATE_SHIFT}) +static inline fschvalue_t FSCHVALUE(unsigned long t) +{ + return (fschvalue_t){(cycles_t)t << FSCHRATE_SHIFT}; +} +static inline int FSCHVALUE_CMP(fschvalue_t x, fschvalue_t y) +{ + if (x.v < y.v) return -1; + if (x.v > y.v) return 1; + return 0; +} +static inline void FSCHVALUE_DADD(fschvalue_t *val, fschdur_t dur, + unsigned rate) +{ + val->v += (cycles_t)dur.d * rate; +} +static inline fschvalue_t FSCHVALUE_SUB(fschvalue_t x, fschvalue_t y) +{ + return (fschvalue_t){x.v - y.v}; +} +static inline cycles_t FSCHVALUE_TO_DELAY(fschvalue_t val, unsigned rate) +{ + unsigned long t; + /* + * Here we lose precision to make the division 32-bit on IA-32. + * The value is not greater than TICK_VALUE. + * (TICK_VALUE >> FSCHRATE_SHIFT) fits unsigned long. + */ + t = (val.v + (1 << FSCHRATE_SHIFT) - 1) >> FSCHRATE_SHIFT; + return (cycles_t)((t + rate - 1) / rate) << FSCHRATE_SHIFT; +}
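+
+/*
+ * A worked example of FSCHVALUE_TO_DELAY() above (illustrative only;
+ * it assumes FSCHRATE_SHIFT == 8 purely to make the arithmetic
+ * concrete):
+ *
+ *	val.v = 0x30000 (a deficit of 0x300 cycles in fixed point),
+ *	rate  = 0x180   (1.5 CPUs in the same fixed point):
+ *
+ *	t     = (0x30000 + 0xff) >> 8          = 0x300
+ *	delay = ((0x300 + 0x17f) / 0x180) << 8 = 0x200 cycles
+ *
+ * which matches the exact answer 0x300 / 1.5 = 0x200; the rounding
+ * can only err towards a slightly longer delay.
+ */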
+ + +/*********************************************************************/ +/* + * Global data + */ +/*********************************************************************/ + +/* + * Assertions. + * Called with preemption disabled. + */ + +#define fsch_assert(x) \ + do { \ + static int count; \ + if (x) \ + break; \ + if (count++ > 10) \ + break; \ + __printk_no_wake++; \ + printk("fsch_assert " #x " failed\n"); \ + __printk_no_wake--; \ + } while (0) + +#define fsch_validate(x, fmt...) \ + do { \ + static int count; \ + if (x) \ + break; \ + if (count++ > 10) \ + break; \ + __printk_no_wake++; \ + printk("fsch_assert " #x " failed\n"); \ + printk("fsch_assert: " fmt); \ + __printk_no_wake--; \ + } while (0) + +/* + * Configurable parameters + */ +unsigned fairsched_max_latency = 25; /* jiffies */ + +/* + * Parameters initialized at startup + */ +/* Number of online CPUs */ +unsigned fairsched_nr_cpus; +/* Token Bucket depth (burst size) */ +static fschvalue_t max_value; + +struct fairsched_node fairsched_init_node = { + .id = INT_MAX, +#ifdef CONFIG_VE + .owner_env = get_ve0(), +#endif + .weight = 1, +}; +EXPORT_SYMBOL(fairsched_init_node); + +struct fairsched_node fairsched_idle_node = { + .id = -1, +}; + +static int fairsched_nr_nodes; +static LIST_HEAD(fairsched_node_head); +static LIST_HEAD(fairsched_running_head); +static LIST_HEAD(fairsched_delayed_head); + +DEFINE_PER_CPU(cycles_t, prev_schedule); +static fschtag_t max_latency; + +static DEFINE_MUTEX(fairsched_mutex); + +/*********************************************************************/ +/* + * Small helper routines + */ +/*********************************************************************/ + +/* this didn't prove to be very valuable statistics... */ +#define fairsched_inc_ve_strv(node, cycles) do {} while(0) +#define fairsched_dec_ve_strv(node, cycles) do {} while(0) + +/*********************************************************************/ +/* + * Runlist management + */ +/*********************************************************************/ + +/* + * Returns the start_tag of the first runnable node, or 0.
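+ * (i.e. the current virtual time of the scheduler).
+ *
+ * To illustrate the ordering this induces (the numbers are made up):
+ * take two runnable nodes A (weight 1) and B (weight 3) with equal
+ * start tags. Every tick a node runs advances its tag by
+ * TICK_DUR * weight, so A's tag grows three times slower than B's
+ * and A ends up running roughly three ticks out of every four:
+ * a smaller weight means a larger CPU share.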
+ */ +static inline fschtag_t virtual_time(void) +{ + struct fairsched_node *p; + + if (!list_empty(&fairsched_running_head)) { + p = list_first_entry(&fairsched_running_head, + struct fairsched_node, runlist); + return p->start_tag; + } + return FSCHTAG_ZERO; +} + +static void fairsched_recompute_max_latency(void) +{ + struct fairsched_node *p; + unsigned w; + fschtag_t tag; + + w = FSCHWEIGHT_MAX; + for_each_fairsched_node(p) { + if (p->weight < w) + w = p->weight; + } + tag = FSCHTAG_ZERO; + (void) FSCHTAG_DADD(&tag, TICK_DUR, + fairsched_nr_cpus * fairsched_max_latency * w); + max_latency = tag; +} + +static void fairsched_reset_start_tags(void) +{ + struct fairsched_node *cnode; + fschtag_t min_tag; + + min_tag = virtual_time(); + for_each_fairsched_node(cnode) { + if (FSCHTAG_CMP(cnode->start_tag, min_tag) > 0) + cnode->start_tag = FSCHTAG_SUB(cnode->start_tag, + min_tag); + else + cnode->start_tag = FSCHTAG_ZERO; + } +} + +static void fairsched_running_insert(struct fairsched_node *node) +{ + struct list_head *tmp; + struct fairsched_node *p; + fschtag_t start_tag_max; + + if (!list_empty(&fairsched_running_head)) { + start_tag_max = virtual_time(); + if (!FSCHTAG_ADD(&start_tag_max, max_latency) && + FSCHTAG_CMP(start_tag_max, node->start_tag) < 0) + node->start_tag = start_tag_max; + } + + list_for_each(tmp, &fairsched_running_head) { + p = list_entry(tmp, struct fairsched_node, runlist); + if (FSCHTAG_CMP(node->start_tag, p->start_tag) <= 0) + break; + } + /* insert node just before tmp */ + list_add_tail(&node->runlist, tmp); +} + +static inline void fairsched_running_insert_fromsleep( + struct fairsched_node *node) +{ + node->start_tag = FSCHTAG_MAX(node->start_tag, virtual_time()); + fairsched_running_insert(node); +} + + +/*********************************************************************/ +/* + * CPU limiting helper functions + * + * These functions compute rates and delays, and manipulate the + * sleep lists and so on. + */ +/*********************************************************************/ + +/* + * Insert a node into the list of nodes removed from scheduling, + * sorted by the time at which the node is allowed to run, + * historically called `delay'. + */ +static void fairsched_delayed_insert(struct fairsched_node *node) +{ + struct fairsched_node *p; + struct list_head *tmp; + + list_for_each(tmp, &fairsched_delayed_head) { + p = list_entry(tmp, struct fairsched_node, + runlist); + if (CYCLES_AFTER(p->delay, node->delay)) + break; + } + /* insert node just before tmp */ + list_add_tail(&node->runlist, tmp); +} + +static inline void nodevalue_add(struct fairsched_node *node, + fschdur_t duration, unsigned rate) +{ + FSCHVALUE_DADD(&node->value, duration, rate); + if (FSCHVALUE_CMP(node->value, max_value) > 0) + node->value = max_value; +} + +/* + * The node has been selected to run. + * This function accounts in advance for the time that the node will run. + * The advance not used by the node will be credited back. + */ +static void fairsched_ratelimit_charge_advance( + struct fairsched_node *node, + cycles_t time) +{ + fsch_assert(!node->delayed); + fsch_validate(FSCHVALUE_CMP(node->value, TICK_VALUE) >= 0, + "charge, value " FSCHVALUE_FMT + ", tick " FSCHVALUE_FMT + ", delay %Lu, time %Lu" + ", lastupd %Lu, rate %u\n", + FSCHVALUE_PRINT(node->value), + FSCHVALUE_PRINT(TICK_VALUE), + node->delay, time, + node->last_updated_at, node->rate); + + /* + * Account for the time passed since last update.
+ * It might be needed if the node has become runnable because of + * a wakeup, but hasn't gone through other functions updating + * the bucket value. + */ + if (CYCLES_AFTER(time, node->last_updated_at)) { + nodevalue_add(node, FSCHDURATION(time, node->last_updated_at), + node->rate); + node->last_updated_at = time; + } + + /* charge for the full tick the node might be running */ + node->value = FSCHVALUE_SUB(node->value, TICK_VALUE); + if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) { + list_del(&node->runlist); + node->delayed = 1; + node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY( + FSCHVALUE_SUB(TICK_VALUE, node->value), + node->rate); + node->nr_ready = 0; + fairsched_delayed_insert(node); + } +} + +static void fairsched_ratelimit_credit_unused( + struct fairsched_node *node, + cycles_t time, fschdur_t duration) +{ + /* account for the time passed since last update */ + if (CYCLES_AFTER(time, node->last_updated_at)) { + nodevalue_add(node, FSCHDURATION(time, node->last_updated_at), + node->rate); + node->last_updated_at = time; + } + + /* + * When the node was given this CPU, it was charged for 1 tick. + * Credit back the unused time. + */ + if (FSCHDUR_CMP(duration, TICK_DUR) < 0) + nodevalue_add(node, FSCHDUR_SUB(TICK_DUR, duration), + 1 << FSCHRATE_SHIFT); + + /* check if the node is allowed to run */ + if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) { + /* + * The node was delayed and remains so. + * But since the bucket value has been updated, + * update the delay time and move the node in the list. + */ + fsch_assert(node->delayed); + node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY( + FSCHVALUE_SUB(TICK_VALUE, node->value), + node->rate); + } else if (node->delayed) { + /* + * The node was delayed, but now it is allowed to run. + * We do not manipulate the lists here; that is done by + * the caller. + */ + node->nr_ready = node->nr_runnable; + node->delayed = 0; + } +} + +static void fairsched_delayed_wake(cycles_t time) +{ + struct fairsched_node *p; + + while (!list_empty(&fairsched_delayed_head)) { + p = list_entry(fairsched_delayed_head.next, + struct fairsched_node, + runlist); + if (CYCLES_AFTER(p->delay, time)) + break; + + /* ok, the delay period is completed */ + /* account for the time passed since last update */ + if (CYCLES_AFTER(time, p->last_updated_at)) { + nodevalue_add(p, FSCHDURATION(time, p->last_updated_at), + p->rate); + p->last_updated_at = time; + } + + fsch_validate(FSCHVALUE_CMP(p->value, TICK_VALUE) >= 0, + "wake, value " FSCHVALUE_FMT + ", tick " FSCHVALUE_FMT + ", delay %Lu, time %Lu" + ", lastupd %Lu, rate %u\n", + FSCHVALUE_PRINT(p->value), + FSCHVALUE_PRINT(TICK_VALUE), + p->delay, time, + p->last_updated_at, p->rate); + p->nr_ready = p->nr_runnable; + p->delayed = 0; + list_del_init(&p->runlist); + if (p->nr_ready) + fairsched_running_insert_fromsleep(p); + } +} + +static struct fairsched_node *fairsched_find(unsigned int id); + +void fairsched_cpu_online_map(int id, cpumask_t *mask) +{ + struct fairsched_node *node; + + mutex_lock(&fairsched_mutex); + node = fairsched_find(id); + if (node == NULL) + *mask = CPU_MASK_NONE; + else + vsched_cpu_online_map(node->vsched, mask); + mutex_unlock(&fairsched_mutex); +} + +/*********************************************************************/ +/* + * The heart of the algorithm: + * fairsched_incrun, fairsched_decrun, fairsched_schedule + * + * Note: old property nr_ready >= nr_pcpu doesn't hold anymore. + * However, nr_runnable, nr_ready and delayed are maintained in sync.
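+ *
+ * A sketch of the intended invariants (illustrative; they are not
+ * asserted anywhere in this file):
+ *
+ *	!delayed: nr_ready == nr_runnable, and the node is on the
+ *		  running list iff nr_ready > 0;
+ *	delayed:  nr_ready == 0 while nr_runnable still counts the
+ *		  node's runnable tasks; the node sits on the delayed
+ *		  list until fairsched_delayed_wake() restores
+ *		  nr_ready = nr_runnable.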
+ */ +/*********************************************************************/ + +/* + * Called on a wakeup inside the node. + */ +void fairsched_incrun(struct fairsched_node *node) +{ + if (!node->delayed && !node->nr_ready++) + /* the node wasn't on the running list, insert */ + fairsched_running_insert_fromsleep(node); + node->nr_runnable++; +} + +/* + * Called from inside schedule() when a sleeping state is entered. + */ +void fairsched_decrun(struct fairsched_node *node) +{ + if (!node->delayed && !--node->nr_ready) + /* nr_ready changed 1->0, remove from the running list */ + list_del_init(&node->runlist); + --node->nr_runnable; +} + +void fairsched_inccpu(struct fairsched_node *node) +{ + node->nr_pcpu++; + fairsched_dec_ve_strv(node, cycles); +} + +static inline void __fairsched_deccpu(struct fairsched_node *node) +{ + node->nr_pcpu--; + fairsched_inc_ve_strv(node, cycles); +} + +void fairsched_deccpu(struct fairsched_node *node) +{ + if (node == &fairsched_idle_node) + return; + + __fairsched_deccpu(node); +} + +static void fairsched_account(struct fairsched_node *node, + cycles_t time) +{ + fschdur_t duration; + + duration = FSCHDURATION(time, __get_cpu_var(prev_schedule)); +#ifdef CONFIG_VE + CYCLES_DADD(&node->owner_env->cpu_used_ve, duration); +#endif + + /* + * The duration is not greater than TICK_DUR since + * task->need_resched is always 1. + */ + if (FSCHTAG_DADD(&node->start_tag, duration, node->weight)) { + fairsched_reset_start_tags(); + (void) FSCHTAG_DADD(&node->start_tag, duration, + node->weight); + } + + list_del_init(&node->runlist); + if (node->rate_limited) + fairsched_ratelimit_credit_unused(node, time, duration); + if (!node->delayed) { + if (node->nr_ready) + fairsched_running_insert(node); + } else + fairsched_delayed_insert(node); +} + +/* + * Scheduling decision + * + * Updates CPU usage for the node releasing the CPU and selects a new node. + */ +struct fairsched_node *fairsched_schedule( + struct fairsched_node *prev_node, + struct fairsched_node *cur_node, + int cur_node_active, + cycles_t time) +{ + struct fairsched_node *p; + + if (prev_node != &fairsched_idle_node) + fairsched_account(prev_node, time); + __get_cpu_var(prev_schedule) = time; + + fairsched_delayed_wake(time); + + list_for_each_entry(p, &fairsched_running_head, runlist) { + if (p->nr_pcpu < p->nr_ready || + (cur_node_active && p == cur_node)) { + if (p->rate_limited) + fairsched_ratelimit_charge_advance(p, time); + return p; + } + } + return NULL; +} + + +/*********************************************************************/ +/* + * System calls + * + * All do_xxx functions are called under fairsched semaphore and after + * capability check. + * + * The binary interfaces follow some other Fair Scheduler implementations + * (although some system call arguments are not needed for our implementation). 
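+ *
+ * A typical calling sequence from a management tool might look as
+ * follows (a sketch; the veid, weight and rate values are made up):
+ *
+ *	sys_fairsched_mknod(0, 500, veid);	create a node for a VE
+ *	sys_fairsched_chwt(veid, 250);		adjust its weight
+ *	sys_fairsched_rate(veid, FAIRSCHED_SET_RATE,
+ *			   2 << FSCHRATE_SHIFT);  limit it to ~2 CPUs
+ *	sys_fairsched_mvpr(pid, veid);		move a process into it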
+ */ +/*********************************************************************/ + +static struct fairsched_node *fairsched_find(unsigned int id) +{ + struct fairsched_node *p; + + for_each_fairsched_node(p) { + if (p->id == id) + return p; + } + return NULL; +} + +static int do_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + goto out; + if (newid < 0 || newid > INT_MAX) + goto out; + + retval = -EBUSY; + if (fairsched_find(newid) != NULL) + goto out; + + retval = -ENOMEM; + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (node == NULL) + goto out; + + memset(node, 0, sizeof(*node)); + node->weight = weight; + INIT_LIST_HEAD(&node->runlist); + node->id = newid; + node->vcpus = 0; +#ifdef CONFIG_VE + node->owner_env = get_exec_env(); +#endif + + spin_lock_irq(&fairsched_lock); + list_add(&node->nodelist, &fairsched_node_head); + fairsched_nr_nodes++; + fairsched_recompute_max_latency(); + spin_unlock_irq(&fairsched_lock); + + retval = newid; +out: + return retval; +} + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mknod(parent, weight, newid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mknod); + +static int do_fairsched_rmnod(unsigned int id) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + node = fairsched_find(id); + if (node == NULL) + goto out; + if (node == &fairsched_init_node) + goto out; + + retval = vsched_destroy(node->vsched); + if (retval) + goto out; + + spin_lock_irq(&fairsched_lock); + list_del(&node->runlist); /* required for delayed nodes */ + list_del(&node->nodelist); + fairsched_nr_nodes--; + fairsched_recompute_max_latency(); + spin_unlock_irq(&fairsched_lock); + + kfree(node); + retval = 0; +out: + return retval; +} + +asmlinkage int sys_fairsched_rmnod(unsigned int id) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rmnod(id); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_rmnod); + +int do_fairsched_chwt(unsigned int id, unsigned weight) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + spin_lock_irq(&fairsched_lock); + node->weight = weight; + fairsched_recompute_max_latency(); + spin_unlock_irq(&fairsched_lock); + + return 0; +} + +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_chwt(id, weight); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +int do_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + struct fairsched_node *node; + int ret = 0; + + if (id == 0) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + if (vcpus < 1 || vcpus > num_online_cpus()) + vcpus = num_online_cpus(); + + node->vcpus = vcpus; + if (node->vsched != NULL) { + ret = vsched_set_vcpus(node->vsched, vcpus); + /* FIXME: adjust rate ... 
*/ + } + + return ret; +} + +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_vcpus(id, vcpus); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_vcpus); + +int do_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + struct fairsched_node *node; + cycles_t time; + int retval; + + if (id == 0) + return -EINVAL; + if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31))) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + retval = -EINVAL; + spin_lock_irq(&fairsched_lock); + time = get_cycles(); + switch (op) { + case FAIRSCHED_SET_RATE: + node->rate = rate; + if (node->rate > (fairsched_nr_cpus << FSCHRATE_SHIFT)) + node->rate = + fairsched_nr_cpus << FSCHRATE_SHIFT; + node->rate_limited = 1; + node->value = max_value; + if (node->delayed) { + list_del(&node->runlist); + node->delay = time; + fairsched_delayed_insert(node); + node->last_updated_at = time; + fairsched_delayed_wake(time); + } + retval = node->rate; + break; + case FAIRSCHED_DROP_RATE: + node->rate = 0; /* This assignment is not needed + for the kernel code, and it should + not rely on rate being 0 when it's + unset. This is a band-aid for some + existing tools (don't know which one + exactly). --SAW */ + node->rate_limited = 0; + node->value = max_value; + if (node->delayed) { + list_del(&node->runlist); + node->delay = time; + fairsched_delayed_insert(node); + node->last_updated_at = time; + fairsched_delayed_wake(time); + } + retval = 0; + break; + case FAIRSCHED_GET_RATE: + if (node->rate_limited) + retval = node->rate; + else + retval = -ENODATA; + break; + } + spin_unlock_irq(&fairsched_lock); + + return retval; +} + +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rate(id, op, rate); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +/* + * Called under fairsched_mutex. 
+ */ +static int __do_fairsched_mvpr(struct task_struct *p, + struct fairsched_node *node) +{ + int retval; + + if (node->vsched == NULL) { + retval = vsched_create(node->id, node); + if (retval < 0) + return retval; + } + + /* no need to destroy vsched in case of mvpr failure */ + return vsched_mvpr(p, node->vsched); +} + +int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + struct task_struct *p; + struct fairsched_node *node; + int retval; + + retval = -ENOENT; + node = fairsched_find(nodeid); + if (node == NULL) + goto out; + + read_lock(&tasklist_lock); + retval = -ESRCH; + p = find_task_by_pid_all(pid); + if (p == NULL) + goto out_unlock; + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = __do_fairsched_mvpr(p, node); + put_task_struct(p); + return retval; + +out_unlock: + read_unlock(&tasklist_lock); +out: + return retval; +} + +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mvpr(pid, nodeid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mvpr); + + +/*********************************************************************/ +/* + * proc interface + */ +/*********************************************************************/ + +struct fairsched_node_dump { +#ifdef CONFIG_VE + envid_t veid; +#endif + int id; + unsigned weight; + unsigned rate; + unsigned rate_limited : 1, + delayed : 1; + fschtag_t start_tag; + fschvalue_t value; + cycles_t delay; + int nr_ready; + int nr_runnable; + int nr_pcpu; + int nr_tasks, nr_runtasks; +}; + +struct fairsched_dump { + int len, compat; + struct fairsched_node_dump nodes[0]; +}; + +static struct fairsched_dump *fairsched_do_dump(int compat) +{ + int nr_nodes; + int len, i; + struct fairsched_dump *dump; + struct fairsched_node *node; + struct fairsched_node_dump *p; + unsigned long flags; + +start: + nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1); + len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); + dump = ub_vmalloc(len); + if (dump == NULL) + goto out; + + spin_lock_irqsave(&fairsched_lock, flags); + if (ve_is_super(get_exec_env()) && nr_nodes < fairsched_nr_nodes) + goto repeat; + p = dump->nodes; + list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { + if ((char *)p - (char *)dump >= len) + break; + p->nr_tasks = 0; + p->nr_runtasks = 0; +#ifdef CONFIG_VE + if (!ve_accessible(node->owner_env, get_exec_env())) + continue; + p->veid = node->owner_env->veid; + if (compat) { + p->nr_tasks = atomic_read(&node->owner_env->pcounter); + for_each_online_cpu(i) + p->nr_runtasks += + VE_CPU_STATS(node->owner_env, i) + ->nr_running; + if (p->nr_runtasks < 0) + p->nr_runtasks = 0; + } +#endif + p->id = node->id; + p->weight = node->weight; + p->rate = node->rate; + p->rate_limited = node->rate_limited; + p->delayed = node->delayed; + p->start_tag = node->start_tag; + p->value = node->value; + p->delay = node->delay; + p->nr_ready = node->nr_ready; + p->nr_runnable = node->nr_runnable; + p->nr_pcpu = node->nr_pcpu; + p++; + } + dump->len = p - dump->nodes; + dump->compat = compat; + spin_unlock_irqrestore(&fairsched_lock, flags); + +out: + return dump; + +repeat: + spin_unlock_irqrestore(&fairsched_lock, flags); + vfree(dump); + goto start; +} + +#define FAIRSCHED_PROC_HEADLINES 2 + +#if defined(CONFIG_VE) +/* + * File format is dictated by compatibility reasons. 
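+ *
+ * For orientation, each data row printed below carries (in fixed
+ * column widths): veid, id, parent id, weight, rate, the task
+ * counts, the 'L' (rate-limited) and 'D' (delayed) flags, and the
+ * raw start_tag/value/delay numbers; see the format string in
+ * fairsched_seq_show() for the exact layout.
+ */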
+ */ +static int fairsched_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + unsigned vid, nid, pid, r; + + dump = m->private; + p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.6 debug\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " veid " + " id " + " parent " + "weight " + " rate " + "tasks " + " run " + "cpus" + " " + "flg " + "ready " + " start_tag " + " value " + " delay" + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + vid = nid = pid = 0; + r = (unsigned long)v & 3; + if (p == dump->nodes) { + if (r == 2) + nid = p->id; + } else { + if (!r) + nid = p->id; + else if (r == 1) + vid = pid = p->id; + else + vid = p->id, nid = 1; + } + seq_printf(m, + "%10u " + "%10u %10u %6u %5u %5u %5u %4u" + " " + " %c%c %5u %20Lu %20Lu %20Lu" + "\n", + vid, + nid, + pid, + p->weight, + p->rate, + p->nr_tasks, + p->nr_runtasks, + p->nr_pcpu, + p->rate_limited ? 'L' : '.', + p->delayed ? 'D' : '.', + p->nr_ready, + (unsigned long long)p->start_tag.t, + (unsigned long long)p->value.v, + (unsigned long long)p->delay + ); + } + + return 0; +} + +static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + unsigned long l; + + dump = m->private; + if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) + return NULL; + if (*pos < FAIRSCHED_PROC_HEADLINES) + return dump->nodes + *pos; + /* guess why... */ + l = (unsigned long)(dump->nodes + + ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); + l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; + return (void *)l; +} +static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched_seq_start(m, pos); +} +#endif + +static int fairsched2_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + + dump = m->private; + p = v; + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " id " + "weight " + " rate " + " run " + "cpus" +#ifdef FAIRSHED_DEBUG + " " + "flg " + "ready " + " start_tag " + " value " + " delay" +#endif + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + seq_printf(m, + "%10u %6u %5u %5u %4u" +#ifdef FAIRSHED_DEBUG + " " + " %c%c %5u %20Lu %20Lu %20Lu" +#endif + "\n", + p->id, + p->weight, + p->rate, + p->nr_runnable, + p->nr_pcpu +#ifdef FAIRSHED_DEBUG + , + p->rate_limited ? 'L' : '.', + p->delayed ? 
'D' : '.', + p->nr_ready, + (unsigned long long)p->start_tag.t, + (unsigned long long)p->value.v, + (unsigned long long)p->delay +#endif + ); + } + + return 0; +} + +static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + + dump = m->private; + if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) + return NULL; + return dump->nodes + *pos; +} +static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched2_seq_start(m, pos); +} +static void fairsched2_seq_stop(struct seq_file *m, void *v) +{ +} + +#ifdef CONFIG_VE +static struct seq_operations fairsched_seq_op = { + .start = fairsched_seq_start, + .next = fairsched_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched_seq_show +}; +#endif +static struct seq_operations fairsched2_seq_op = { + .start = fairsched2_seq_start, + .next = fairsched2_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched2_seq_show +}; +static int fairsched_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + int compat; + +#ifdef CONFIG_VE + compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); + ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op); +#else + compat = 0; + ret = seq_open(file, &fairsched2_seq_op); +#endif + if (ret) + return ret; + m = file->private_data; + m->private = fairsched_do_dump(compat); + if (m->private == NULL) { + seq_release(inode, file); + ret = -ENOMEM; + } + return ret; +} +static int fairsched_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct fairsched_dump *dump; + + m = file->private_data; + dump = m->private; + m->private = NULL; + vfree(dump); + seq_release(inode, file); + return 0; +} +static struct file_operations proc_fairsched_operations = { + .open = fairsched_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fairsched_seq_release +}; + + +/*********************************************************************/ +/* + * Fairsched initialization + */ +/*********************************************************************/ + +int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (!write || *valp == val) + return ret; + + spin_lock_irq(&fairsched_lock); + fairsched_recompute_max_latency(); + spin_unlock_irq(&fairsched_lock); + return ret; +} + +static void fairsched_calibrate(void) +{ + fairsched_nr_cpus = num_online_cpus(); + cycles_per_timeslice = msecs_to_jiffies(FSCH_TIMESLICE) + * cycles_per_jiffy; + max_value = FSCHVALUE(cycles_per_timeslice * (fairsched_nr_cpus + 1)); +} + +void __init fairsched_init_early(void) +{ + fairsched_init_node.vcpus = num_online_cpus(); + list_add(&fairsched_init_node.nodelist, &fairsched_node_head); + fairsched_nr_nodes++; +} + +/* + * Note: this function is executed late in the initialization sequence. + * We ourselves need calibrated cycles and initialized procfs... + * The consequence of this late initialization is that start tags are + * effectively ignored and each node preempts others on insertion. + * But it isn't a problem (only init node can be runnable).
+ */ +void __init fairsched_init_late(void) +{ + struct proc_dir_entry *entry; + + if (get_cycles() == 0) + panic("FAIRSCHED: no TSC!\n"); + fairsched_calibrate(); + fairsched_recompute_max_latency(); + + entry = create_proc_glob_entry("fairsched", S_IRUGO, NULL); + if (entry) + entry->proc_fops = &proc_fairsched_operations; + entry = create_proc_glob_entry("fairsched2", S_IRUGO, NULL); + if (entry) + entry->proc_fops = &proc_fairsched_operations; +} + + +#else /* CONFIG_FAIRSCHED */ + + +/*********************************************************************/ +/* + * No Fairsched + */ +/*********************************************************************/ + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + return -ENOSYS; +} + +asmlinkage int sys_fairsched_rmnod(unsigned int id) +{ + return -ENOSYS; +} + +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight) +{ + return -ENOSYS; +} + +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + return -ENOSYS; +} + +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + return -ENOSYS; +} + +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + return -ENOSYS; +} + +void __init fairsched_init_late(void) +{ +} + +#endif /* CONFIG_FAIRSCHED */ diff -uprN linux-2.6.18/kernel/fork.c linux-2.6.18.ovz/kernel/fork.c --- linux-2.6.18/kernel/fork.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/fork.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,7 +27,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -35,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -53,17 +57,23 @@ #include #include +#include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. 
*/ +EXPORT_SYMBOL(nr_threads); int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); int nr_processes(void) { @@ -114,14 +124,20 @@ void __put_task_struct(struct task_struc WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + ub_task_put(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); delayacct_tsk_free(tsk); +#ifdef CONFIG_VE + put_ve(VE_TASK_INFO(tsk)->owner_env); + atomic_dec(&nr_dead); +#endif if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); void __init fork_init(unsigned long mempages) { @@ -132,7 +148,7 @@ void __init fork_init(unsigned long memp /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_UBC, NULL, NULL); #endif /* @@ -221,7 +237,12 @@ static inline int dup_mmap(struct mm_str -pages); continue; } + charge = 0; + if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, + mpnt->vm_file, UB_HARD)) + goto fail_noch; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) @@ -268,7 +289,7 @@ static inline int dup_mmap(struct mm_str rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -285,6 +306,9 @@ out: fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: + ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); +fail_noch: retval = -ENOMEM; vm_unacct_memory(charge); goto out; @@ -315,7 +339,8 @@ static inline void mm_free_pgd(struct mm #include -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, + struct task_struct *tsk) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -330,11 +355,14 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + set_mm_ub(mm, tsk); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } + + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -349,10 +377,11 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm); + mm = mm_init(mm, NULL); } return mm; } +EXPORT_SYMBOL_GPL(mm_alloc); /* * Called when the last reference to the mm @@ -364,8 +393,10 @@ void fastcall __mmdrop(struct mm_struct BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + put_mm_ub(mm); free_mm(mm); } +EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. 
@@ -383,6 +414,9 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm); + if (mm->oom_killed) + ub_oom_task_dead(current); mmdrop(mm); } } @@ -470,7 +504,7 @@ static struct mm_struct *dup_mm(struct t memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm)) + if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) @@ -497,6 +531,7 @@ fail_nocontext: * because it calls destroy_context() */ mm_free_pgd(mm); + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -917,7 +952,7 @@ asmlinkage long sys_set_tid_address(int { current->clear_child_tid = tidptr; - return current->pid; + return virt_pid(current); } static inline void rt_mutex_init_task(struct task_struct *p) @@ -943,13 +978,19 @@ static struct task_struct *copy_process( unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr, - int pid) + struct pid *pidp, int pid0) { - int retval; + int retval, vpid, vtgid; struct task_struct *p = NULL; + pid_t pid = pidp ? pidp->nr : 0; +#ifdef CONFIG_VE + if (clone_flags & CLONE_NAMESPACES_MASK) + return ERR_PTR(-EINVAL); +#else if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); +#endif /* * Thread groups must share signals as well, and detached threads @@ -975,6 +1016,9 @@ static struct task_struct *copy_process( if (!p) goto fork_out; + if (ub_task_charge(current, p)) + goto bad_fork_charge; + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -983,7 +1027,7 @@ static struct task_struct *copy_process( if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != &root_user) + p->user->uid != 0) goto bad_fork_free; } @@ -1009,9 +1053,18 @@ static struct task_struct *copy_process( delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; +#ifdef CONFIG_VE + vpid = pid; + if (pidp) { + retval = alloc_vpid(pidp, pid0 ? : -1); + if (retval < 0) + goto bad_fork_cleanup_delays_binfmt; + vpid = retval; + } +#endif retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) - if (put_user(p->pid, parent_tidptr)) + if (put_user(vpid, parent_tidptr)) goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); @@ -1029,6 +1082,7 @@ static struct task_struct *copy_process( p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1082,8 +1136,11 @@ static struct task_struct *copy_process( #endif p->tgid = p->pid; - if (clone_flags & CLONE_THREAD) + vtgid = vpid; + if (clone_flags & CLONE_THREAD) { p->tgid = current->tgid; + vtgid = virt_tgid(current); + } if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; @@ -1104,11 +1161,11 @@ static struct task_struct *copy_process( goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? 
child_tidptr : NULL; /* @@ -1173,8 +1230,8 @@ static struct task_struct *copy_process( */ p->cpus_allowed = current->cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); + !vcpu_online(task_cpu(p)))) + set_task_cpu(p, task_cpu(current)); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) @@ -1194,11 +1251,11 @@ static struct task_struct *copy_process( * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); - if (signal_pending(current)) { + if (signal_pending(current) && !pid0) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1239,20 +1296,33 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_SID, p->signal->session); list_add_tail_rcu(&p->tasks, &init_task.tasks); +#ifdef CONFIG_VE + list_add_tail_rcu(&p->ve_task_info.vetask_list, + &p->ve_task_info.owner_env->vetask_lh); +#endif __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, p->pid); + set_virt_pid(p, vpid); + set_virt_tgid(p, vtgid); + set_virt_pgid(p, virt_pgid(current)); + set_virt_sid(p, virt_sid(current)); nr_threads++; } + get_ve(p->ve_task_info.owner_env); + pget_ve(p->ve_task_info.owner_env); +#ifdef CONFIG_VE + seqcount_init(&p->ve_task_info.wakeup_lock); +#endif total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1289,6 +1359,9 @@ bad_fork_cleanup_count: atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: + ub_task_uncharge(p); + ub_task_put(p); +bad_fork_charge: free_task(p); fork_out: return ERR_PTR(retval); @@ -1305,7 +1378,7 @@ struct task_struct * __devinit fork_idle struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, NULL, 0); if (!task) return ERR_PTR(-ENOMEM); init_idle(task, cpu); @@ -1335,20 +1408,27 @@ static inline int fork_traceflag (unsign * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long do_fork_pid(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + long pid0) { struct task_struct *p; int trace = 0; - struct pid *pid = alloc_pid(); + struct pid *pid; long nr; + nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags); + if (nr) + return nr; + + pid = alloc_pid(); if (!pid) return -EAGAIN; + nr = pid->nr; if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); @@ -1356,7 +1436,8 @@ long do_fork(unsigned long clone_flags, clone_flags |= CLONE_PTRACE; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); + p = copy_process(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr, pid, pid0); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. 
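Throughout copy_process() above, every task now carries two identities: the global pid (pid->nr) that kernel-internal structures continue to key on, and a per-VE virtual pid, the only number a container ever observes (reported via CLONE_PARENT_SETTID, sys_set_tid_address(), and so on). Schematically, with locking and error handling trimmed; alloc_vpid() and the set_virt_*() helpers are the ones this patch introduces:

	struct pid *pid = alloc_pid();		/* global id, kernel-internal   */
	pid_t vpid = alloc_vpid(pid, -1);	/* VE-visible id; -1 = pick any */

	p->pid = pid->nr;			/* internal bookkeeping         */
	set_virt_pid(p, vpid);			/* what the container sees      */

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(vpid, parent_tidptr);	/* never leak the global id     */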
@@ -1364,6 +1445,7 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + nr = virt_pid(p); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); @@ -1377,6 +1459,7 @@ long do_fork(unsigned long clone_flags, set_tsk_thread_flag(p, TIF_SIGPENDING); } + (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p); if (!(clone_flags & CLONE_STOPPED)) wake_up_new_task(p, clone_flags); else @@ -1384,20 +1467,25 @@ long do_fork(unsigned long clone_flags, if (unlikely (trace)) { current->ptrace_message = nr; + set_pn_state(current, PN_STOP_FORK); ptrace_notify ((trace << 8) | SIGTRAP); + clear_pn_state(current); } if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { current->ptrace_message = nr; + set_pn_state(current, PN_STOP_VFORK); ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + clear_pn_state(current); } } } else { free_pid(pid); nr = PTR_ERR(p); } + (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)pid); return nr; } @@ -1414,27 +1502,41 @@ static void sighand_ctor(void *data, kme spin_lock_init(&sighand->siglock); } +EXPORT_SYMBOL(do_fork_pid); + +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return do_fork_pid(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr, 0); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + SLAB_DESTROY_BY_RCU|SLAB_UBC, sighand_ctor, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL, NULL); + SLAB_PANIC|SLAB_UBC, NULL, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); } @@ -1505,10 +1607,9 @@ static int unshare_fs(unsigned long unsh */ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct namespace *ns = current->nsproxy->namespace; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1580,6 +1681,16 @@ static int unshare_semundo(unsigned long return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. 
copy_* @@ -1597,14 +1708,22 @@ asmlinkage long sys_unshare(unsigned lon struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) + goto bad_unshare_out; +#ifdef CONFIG_VE + if (unshare_flags & CLONE_NAMESPACES_MASK) goto bad_unshare_out; +#endif if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; @@ -1620,11 +1739,32 @@ asmlinkage long sys_unshare(unsigned lon goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; +#ifndef CONFIG_VE + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; +#endif - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } + + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1632,8 +1772,8 @@ asmlinkage long sys_unshare(unsigned lon } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->namespace; + current->nsproxy->namespace = new_ns; new_ns = ns; } @@ -1658,9 +1798,35 @@ asmlinkage long sys_unshare(unsigned lon new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: +#ifndef CONFIG_VE + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: +#endif bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff -uprN linux-2.6.18/kernel/futex.c linux-2.6.18.ovz/kernel/futex.c --- linux-2.6.18/kernel/futex.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/futex.c 2007-06-13 06:55:07.000000000 -0400 @@ -390,7 +390,7 @@ static struct task_struct * futex_find_g struct task_struct *p; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid)) { @@ -505,7 +505,23 @@ lookup_pi_state(u32 uval, struct futex_h p = futex_find_get_task(pid); if (!p) return -ESRCH; + if (unlikely(p == current)) { + put_task_struct(p); + return -EDEADLK; + } + read_lock(&tasklist_lock); + /* To this moment p can go through do_exit and + * clean its pi_state_list. We are going to recreate it + * and it wil leak. The most obvious solution is to take + * tasklist_lock. Probably, we can use pi_lock for the + * same purpose. 
_ANK_ + */ + if (p->exit_state) { + read_unlock(&tasklist_lock); + put_task_struct(p); + return -ESRCH; + } pi_state = alloc_pi_state(); /* @@ -526,6 +542,7 @@ lookup_pi_state(u32 uval, struct futex_h put_task_struct(p); me->pi_state = pi_state; + read_unlock(&tasklist_lock); return 0; } @@ -566,6 +583,7 @@ static int wake_futex_pi(u32 __user *uad if (!pi_state) return -EINVAL; + spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -583,7 +601,7 @@ static int wake_futex_pi(u32 __user *uad * preserve the owner died bit.) */ if (!(uval & FUTEX_OWNER_DIED)) { - newval = FUTEX_WAITERS | new_owner->pid; + newval = FUTEX_WAITERS | virt_pid(new_owner); inc_preempt_count(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -605,6 +623,7 @@ static int wake_futex_pi(u32 __user *uad pi_state->owner = new_owner; spin_unlock_irq(&new_owner->pi_lock); + spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1156,7 +1175,7 @@ static int futex_lock_pi(u32 __user *uad * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = current->pid; + newval = virt_pid(current); inc_preempt_count(); curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); @@ -1166,7 +1185,7 @@ static int futex_lock_pi(u32 __user *uad goto uaddr_faulted; /* We own the lock already */ - if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (unlikely((curval & FUTEX_TID_MASK) == virt_pid(current))) { if (!detect && 0) force_sig(SIGKILL, current); ret = -EDEADLK; @@ -1212,7 +1231,7 @@ static int futex_lock_pi(u32 __user *uad */ if (curval & FUTEX_OWNER_DIED) { uval = newval; - newval = current->pid | + newval = virt_pid(current) | FUTEX_OWNER_DIED | FUTEX_WAITERS; inc_preempt_count(); @@ -1260,7 +1279,7 @@ static int futex_lock_pi(u32 __user *uad * did a lock-steal - fix up the PI-state in that case. */ if (!ret && q.pi_state->owner != curr) { - u32 newtid = current->pid | FUTEX_WAITERS; + u32 newtid = virt_pid(current) | FUTEX_WAITERS; /* Owner died? */ if (q.pi_state->owner != NULL) { @@ -1369,7 +1388,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != current->pid) + if ((uval & FUTEX_TID_MASK) != virt_pid(current)) return -EPERM; /* * First take all the futex related locks: @@ -1391,7 +1410,7 @@ retry_locked: */ if (!(uval & FUTEX_OWNER_DIED)) { inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + uval = futex_atomic_cmpxchg_inatomic(uaddr, virt_pid(current), 0); dec_preempt_count(); } @@ -1401,7 +1420,7 @@ retry_locked: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == current->pid)) + if (unlikely(uval == virt_pid(current))) goto out_unlock; /* @@ -1625,7 +1644,7 @@ sys_get_robust_list(int pid, struct robu ret = -ESRCH; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p) goto err_unlock; ret = -EPERM; @@ -1658,7 +1677,7 @@ retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == curr->pid) { + if ((uval & FUTEX_TID_MASK) == virt_pid(curr)) { /* * Ok, this dying thread is truly holding a futex * of interest. 
Set the OWNER_DIED bit atomically diff -uprN linux-2.6.18/kernel/futex_compat.c linux-2.6.18.ovz/kernel/futex_compat.c --- linux-2.6.18/kernel/futex_compat.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/futex_compat.c 2007-06-13 06:55:07.000000000 -0400 @@ -116,7 +116,7 @@ compat_sys_get_robust_list(int pid, comp ret = -ESRCH; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p) goto err_unlock; ret = -EPERM; diff -uprN linux-2.6.18/kernel/hrtimer.c linux-2.6.18.ovz/kernel/hrtimer.c --- linux-2.6.18/kernel/hrtimer.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/hrtimer.c 2007-06-13 06:55:07.000000000 -0400 @@ -454,6 +454,21 @@ hrtimer_start(struct hrtimer *timer, kti } EXPORT_SYMBOL_GPL(hrtimer_start); +/* + * schedule_hrtimer taken from 2.6.16 kernel + * needed by CPT + */ +ktime_t __sched +schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) +{ + hrtimer_start(timer, timer->expires, mode); + schedule(); + hrtimer_cancel(timer); + + return hrtimer_get_remaining(timer); +} + + /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop @@ -693,7 +708,7 @@ static int __sched do_nanosleep(struct h return t->task == NULL; } -static long __sched nanosleep_restart(struct restart_block *restart) +long __sched nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; struct timespec __user *rmtp; @@ -723,6 +738,7 @@ static long __sched nanosleep_restart(st /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; } +EXPORT_SYMBOL_GPL(nanosleep_restart); long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) diff -uprN linux-2.6.18/kernel/irq/handle.c linux-2.6.18.ovz/kernel/irq/handle.c --- linux-2.6.18/kernel/irq/handle.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/irq/handle.c 2007-06-13 06:55:07.000000000 -0400 @@ -233,10 +233,10 @@ fastcall unsigned int __do_IRQ(unsigned spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, regs, action); - - spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff -uprN linux-2.6.18/kernel/kmod.c linux-2.6.18.ovz/kernel/kmod.c --- linux-2.6.18/kernel/kmod.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/kmod.c 2007-06-13 06:55:07.000000000 -0400 @@ -77,6 +77,10 @@ int request_module(const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* Don't allow request_module() inside VE. 
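+	 * Modules are global objects: code loaded by a container's root
+	 * would run in the kernel shared by every VE on the box, so
+	 * module loading is reserved for the host system (VE0).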
*/ + if (!ve_is_super(get_exec_env())) + return -EPERM; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -246,6 +250,9 @@ int call_usermodehelper_keys(char *path, }; DECLARE_WORK(work, __call_usermodehelper, &sub_info); + if (!ve_is_super(get_exec_env())) + return -EPERM; + if (!khelper_wq) return -EBUSY; diff -uprN linux-2.6.18/kernel/kthread.c linux-2.6.18.ovz/kernel/kthread.c --- linux-2.6.18/kernel/kthread.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/kthread.c 2007-06-13 06:55:07.000000000 -0400 @@ -123,7 +123,7 @@ static void keventd_create_kthread(void } else { wait_for_completion(&create->started); read_lock(&tasklist_lock); - create->result = find_task_by_pid(pid); + create->result = find_task_by_pid_all(pid); read_unlock(&tasklist_lock); } complete(&create->done); diff -uprN linux-2.6.18/kernel/lockdep.c linux-2.6.18.ovz/kernel/lockdep.c --- linux-2.6.18/kernel/lockdep.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/lockdep.c 2007-06-13 06:55:07.000000000 -0400 @@ -2678,13 +2678,13 @@ retry: if (count != 10) printk(" locked it.\n"); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->lockdep_depth) lockdep_print_held_locks(p); if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; - } while_each_thread(g, p); + } while_each_thread_all(g, p); printk("\n"); printk("=============================================\n\n"); diff -uprN linux-2.6.18/kernel/module.c linux-2.6.18.ovz/kernel/module.c --- linux-2.6.18/kernel/module.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/module.c 2007-06-13 06:55:07.000000000 -0400 @@ -1054,6 +1054,12 @@ static int mod_sysfs_setup(struct module { int err; + if (!module_subsys.kset.subsys) { + printk(KERN_ERR "%s: module_subsys not initialized\n", + mod->name); + err = -EINVAL; + goto out; + } memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); if (err) @@ -2080,6 +2086,8 @@ static void *m_start(struct seq_file *m, loff_t n = 0; mutex_lock(&module_mutex); + if (!ve_is_super(get_exec_env())) + return NULL; list_for_each(i, &modules) { if (n++ == *pos) break; diff -uprN linux-2.6.18/kernel/nsproxy.c linux-2.6.18.ovz/kernel/nsproxy.c --- linux-2.6.18/kernel/nsproxy.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/nsproxy.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov + */ + +#include +#include +#include +#include +#include + +void exit_task_namespaces(struct task_struct *p) +{ + struct nsproxy *ns = p->nsproxy; + if (ns) { + task_lock(p); + p->nsproxy = NULL; + task_unlock(p); + put_nsproxy(ns); + } +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. 
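+ * In other words, clone_namespaces() is only a shallow copy of the
+ * pointers: a caller that publishes the copy must take the
+ * per-namespace references itself, as dup_namespaces() does.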
+ */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); + atomic_set(&ns->count, 1); + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + if (ns) { + if (ns->namespace) + get_namespace(ns->namespace); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); + } + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_namespace(flags, tsk); + if (err) + goto out_ns; + + err = copy_utsname(flags, tsk); + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; + +out: + put_nsproxy(old_ns); + return err; + +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->namespace) + put_namespace(new_ns->namespace); +out_ns: + tsk->nsproxy = old_ns; + kfree(new_ns); + goto out; +} +EXPORT_SYMBOL(copy_namespaces); + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->namespace) + put_namespace(ns->namespace); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + kfree(ns); +} +EXPORT_SYMBOL(free_nsproxy); + +struct namespace * get_task_mnt_ns(struct task_struct *tsk) +{ + struct namespace *mnt_ns = NULL; + + task_lock(tsk); + if (tsk->nsproxy) + mnt_ns = tsk->nsproxy->namespace; + if (mnt_ns) + get_namespace(mnt_ns); + task_unlock(tsk); + + return mnt_ns; +} +EXPORT_SYMBOL(get_task_mnt_ns); diff -uprN linux-2.6.18/kernel/panic.c linux-2.6.18.ovz/kernel/panic.c --- linux-2.6.18/kernel/panic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/panic.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,6 +27,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +int kernel_text_csum_broken; +EXPORT_SYMBOL(kernel_text_csum_broken); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -158,7 +160,8 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", + kernel_text_csum_broken ? 'B' : ' ', tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 
'S' : ' ', diff -uprN linux-2.6.18/kernel/pid.c linux-2.6.18.ovz/kernel/pid.c --- linux-2.6.18/kernel/pid.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/pid.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,6 +27,11 @@ #include #include +#ifdef CONFIG_VE +int glob_virt_pids = 1; +EXPORT_SYMBOL(glob_virt_pids); +#endif + #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; @@ -58,8 +63,14 @@ typedef struct pidmap { void *page; } pidmap_t; +#ifdef CONFIG_VE +#define PIDMAP_NRFREE (BITS_PER_PAGE/2) +#else +#define PIDMAP_NRFREE BITS_PER_PAGE +#endif + static pidmap_t pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; + { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } }; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -76,21 +87,27 @@ static pidmap_t pidmap_array[PIDMAP_ENTR */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +fastcall void free_pidmap(int pid) { pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; - clear_bit(offset, map->page); + BUG_ON(__is_virtual_pid(pid) || pid == 1); + + if (test_and_clear_bit(offset, map->page) == 0) + BUG(); atomic_inc(&map->nr_free); } +EXPORT_SYMBOL_GPL(free_pidmap); -static int alloc_pidmap(void) +int alloc_pidmap(void) { int i, offset, max_scan, pid, last = last_pid; pidmap_t *map; pid = last + 1; + if (__is_virtual_pid(pid)) + pid += VPID_DIV; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; @@ -120,6 +137,8 @@ static int alloc_pidmap(void) return pid; } offset = find_next_offset(map, offset); + if (__is_virtual_pid(offset)) + offset += VPID_DIV; pid = mk_pid(map, offset); /* * find_next_offset() found a bit, the pid from it @@ -144,6 +163,7 @@ static int alloc_pidmap(void) } return -1; } +EXPORT_SYMBOL_GPL(alloc_pidmap); fastcall void put_pid(struct pid *pid) { @@ -167,11 +187,16 @@ fastcall void free_pid(struct pid *pid) spin_lock_irqsave(&pidmap_lock, flags); hlist_del_rcu(&pid->pid_chain); +#ifdef CONFIG_VE + if (pid->veid) + hlist_del_rcu(&pid->vpid_chain); +#endif spin_unlock_irqrestore(&pidmap_lock, flags); free_pidmap(pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } +EXPORT_SYMBOL_GPL(free_pid); struct pid *alloc_pid(void) { @@ -188,9 +213,14 @@ struct pid *alloc_pid(void) goto out_free; atomic_set(&pid->count, 1); - pid->nr = nr; + pid->nr = pid->vnr = nr; for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); +#ifdef CONFIG_VE + pid->vnr = nr; + pid->veid = 0; + INIT_HLIST_NODE(&pid->vpid_chain); +#endif spin_lock_irq(&pidmap_lock); hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); @@ -204,6 +234,7 @@ out_free: pid = NULL; goto out; } +EXPORT_SYMBOL_GPL(alloc_pid); struct pid * fastcall find_pid(int nr) { @@ -217,6 +248,17 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL(find_pid); + +static struct pid *__lookup_vpid_mapping(int vnr, int veid); + +struct pid * fastcall find_vpid(int nr) +{ + return (!is_virtual_pid(nr) ? 
find_pid(nr) : + __lookup_vpid_mapping(nr, VEID(get_exec_env()))); +} + +EXPORT_SYMBOL(find_vpid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { @@ -232,6 +274,7 @@ int fastcall attach_pid(struct task_stru return 0; } +EXPORT_SYMBOL_GPL(attach_pid); void fastcall detach_pid(struct task_struct *task, enum pid_type type) { @@ -251,6 +294,7 @@ void fastcall detach_pid(struct task_str free_pid(pid); } +EXPORT_SYMBOL_GPL(detach_pid); struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) { @@ -263,17 +307,47 @@ struct task_struct * fastcall pid_task(s } return result; } +EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ struct task_struct *find_task_by_pid_type(int type, int nr) { - return pid_task(find_pid(nr), type); + BUG(); + return NULL; } EXPORT_SYMBOL(find_task_by_pid_type); +struct task_struct *find_task_by_pid_type_all(int type, int nr) +{ + BUG_ON(nr != -1 && is_virtual_pid(nr)); + return pid_task(find_pid(nr), type); +} + +EXPORT_SYMBOL(find_task_by_pid_type_all); + +#ifdef CONFIG_VE + +struct task_struct *find_task_by_pid_type_ve(int type, int nr) +{ + struct task_struct *tsk; + struct pid *pid; + + pid = find_vpid(nr); + if (!pid) + return NULL; + + tsk = pid_task(pid, type); + return (tsk != NULL && ve_accessible(VE_TASK_INFO(tsk)->owner_env, + get_exec_env()) ? tsk : NULL); +} + +EXPORT_SYMBOL(find_task_by_pid_type_ve); + +#endif + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; @@ -296,6 +370,204 @@ struct pid *find_get_pid(pid_t nr) return pid; } +#ifdef CONFIG_VE + +/* Virtual PID bits. + * + * At the moment all internal structures in kernel store real global pid. + * The only place, where virtual PID is used, is at user frontend. We + * remap virtual pids obtained from user to global ones (vpid_to_pid) and + * map globals to virtuals before showing them to user (virt_pid_type). + * + * We hold virtual PIDs inside struct pid, so map global -> virtual is easy. + */ + +pid_t _pid_to_vpid(pid_t pid) +{ + struct pid * p; + + if (unlikely(is_virtual_pid(pid))) + return -1; + + rcu_read_lock(); + p = find_pid(pid); + pid = (p != NULL ? p->vnr : -1); + rcu_read_unlock(); + return pid; +} +EXPORT_SYMBOL_GPL(_pid_to_vpid); + +pid_t pid_to_vpid(pid_t pid) +{ + int vpid; + + if (unlikely(pid <= 0)) + return pid; + + BUG_ON(is_virtual_pid(pid)); + + if (ve_is_super(get_exec_env())) + return pid; + + vpid = _pid_to_vpid(pid); + if (unlikely(vpid == -1)) + /* It is allowed: global pid can be used everywhere. + * This can happen, when kernel remembers stray pids: + * signal queues, locks etc. + */ + vpid = pid; + + return vpid; +} +EXPORT_SYMBOL_GPL(pid_to_vpid); + +/* To map virtual pids to global we maintain special hash table. + * + * Mapping entries are allocated when a process with non-trivial + * mapping is forked, which is possible only after VE migrated. + * Mappings are destroyed, when a global pid is removed from global + * pidmap, which means we do not need to refcount mappings. 
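+ * A mapping entry therefore lives exactly as long as its struct pid:
+ * free_pid() unhashes the vpid_chain under pidmap_lock and the pid is
+ * freed only after an RCU grace period, which is what keeps the
+ * lockless __lookup_vpid_mapping() below safe.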
+ */
+static struct hlist_head *vpid_hash;
+
+static inline int vpid_hashfn(int vnr, int veid)
+{
+	return hash_long((unsigned long)(vnr+(veid<<16)), pidhash_shift);
+}
+
+static struct pid *__lookup_vpid_mapping(int vnr, int veid)
+{
+	struct hlist_node *elem;
+	struct pid *map;
+
+	hlist_for_each_entry_rcu(map, elem,
+			&vpid_hash[vpid_hashfn(vnr, veid)], vpid_chain) {
+		if (map->vnr == vnr && map->veid == veid)
+			return map;
+	}
+	return NULL;
+}
+
+/* __vpid_to_pid() is the raw version of vpid_to_pid().  It must be used
+ * only under tasklist_lock; in some places only this version may be
+ * used (e.g. __kill_pg_info() is called under the write lock!).
+ *
+ * The caller should pass a virtual pid; this function returns an error
+ * when it sees a global pid.
+ */
+pid_t __vpid_to_pid(pid_t pid)
+{
+	struct pid *map;
+
+	if (unlikely(!is_virtual_pid(pid) || ve_is_super(get_exec_env())))
+		return -1;
+
+	if (!get_exec_env()->sparse_vpid) {
+		int init_pid;
+
+		init_pid = get_exec_env()->init_entry->pid;
+		if (pid == 1)
+			return init_pid;
+		if (pid == init_pid + VPID_DIV)
+			return -1; /* vpid of init is 1 */
+		return pid - VPID_DIV;
+	}
+
+	rcu_read_lock();
+	map = __lookup_vpid_mapping(pid, VEID(get_exec_env()));
+	pid = (map != NULL ? map->nr : -1);
+	rcu_read_unlock();
+	return pid;
+}
+EXPORT_SYMBOL_GPL(__vpid_to_pid);
+
+pid_t vpid_to_pid(pid_t pid)
+{
+	/* The user gave a bad pid; that is their problem. */
+	if (unlikely(pid <= 0))
+		return pid;
+
+	if (!is_virtual_pid(pid))
+		return pid;
+
+	return __vpid_to_pid(pid);
+}
+EXPORT_SYMBOL_GPL(vpid_to_pid);
+
+pid_t alloc_vpid(struct pid *pid, pid_t virt_pid)
+{
+	int result;
+	struct ve_struct *env = get_exec_env();
+
+	if (ve_is_super(env) || !env->virt_pids)
+		return pid->vnr;
+
+	BUG_ON(pid->veid != 0);
+
+	spin_lock_irq(&pidmap_lock);
+	if (!env->sparse_vpid) {
+		result = pid->nr + VPID_DIV;
+		if (virt_pid == -1)
+			goto out_success;
+
+		result = virt_pid;
+		if (virt_pid == 1 || virt_pid == pid->nr + VPID_DIV)
+			goto out_success;
+
+		env->sparse_vpid = 1;
+	}
+
+	result = (virt_pid == -1) ? pid->nr + VPID_DIV : virt_pid;
+
+	if (unlikely(__lookup_vpid_mapping(result, VEID(env)))) {
+		if (virt_pid > 0) {
+			result = -EEXIST;
+			goto out;
+		}
+
+		/* No luck.  Search for an unused vpid.  This is a weak
+		 * spot: it is a plain linear search. */
+		do {
+			result++;
+			if (!__is_virtual_pid(result))
+				result += VPID_DIV;
+			if (result >= pid_max)
+				result = RESERVED_PIDS + VPID_DIV;
+		} while (__lookup_vpid_mapping(result, VEID(env)) != NULL);
+
+		/* Also nudge last_pid so that future alloc_pidmap()
+		 * calls are less likely to collide. */
+		last_pid = result - VPID_DIV;
+	}
+	if (result > 0) {
+out_success:
+		pid->veid = VEID(env);
+		pid->vnr = result;
+		hlist_add_head_rcu(&pid->vpid_chain,
+				&vpid_hash[vpid_hashfn(result, pid->veid)]);
+	}
+out:
+	spin_unlock_irq(&pidmap_lock);
+	return result;
+}
+EXPORT_SYMBOL(alloc_vpid);
+
+void free_vpid(struct pid *pid)
+{
+	if (pid->veid == 0)
+		return;
+
+	spin_lock_irq(&pidmap_lock);
+	hlist_del_rcu(&pid->vpid_chain);
+	spin_unlock_irq(&pidmap_lock);
+
+	pid->veid = 0;
+	pid->vnr = pid->nr;
+}
+EXPORT_SYMBOL(free_vpid);
+#endif
+
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.
From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -319,6 +591,14 @@ void __init pidhash_init(void) panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); + +#ifdef CONFIG_VE + vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head)); + if (!vpid_hash) + panic("Could not alloc vpid_hash!\n"); + for (i = 0; i < pidhash_size; i++) + INIT_HLIST_HEAD(&vpid_hash[i]); +#endif } void __init pidmap_init(void) @@ -330,5 +610,5 @@ void __init pidmap_init(void) pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), - SLAB_PANIC, NULL, NULL); + SLAB_PANIC|SLAB_UBC, NULL, NULL); } diff -uprN linux-2.6.18/kernel/posix-cpu-timers.c linux-2.6.18.ovz/kernel/posix-cpu-timers.c --- linux-2.6.18/kernel/posix-cpu-timers.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/posix-cpu-timers.c 2007-06-13 06:55:07.000000000 -0400 @@ -20,7 +20,7 @@ static int check_clock(const clockid_t w return 0; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? p->tgid != current->tgid : p->tgid != pid)) { error = -EINVAL; @@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_t } /* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + +/* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ @@ -292,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t */ struct task_struct *p; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { if (p->tgid == current->tgid) { @@ -336,7 +349,7 @@ int posix_cpu_timer_create(struct k_itim if (pid == 0) { p = current; } else { - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (p && p->tgid != current->tgid) p = NULL; } @@ -344,7 +357,7 @@ int posix_cpu_timer_create(struct k_itim if (pid == 0) { p = current->group_leader; } else { - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (p && p->tgid != pid) p = NULL; } @@ -483,8 +496,8 @@ static void process_timer_rebalance(stru BUG(); break; case CPUCLOCK_PROF: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); @@ -498,8 +511,8 @@ static void process_timer_rebalance(stru } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); @@ -515,6 +528,7 @@ static void process_timer_rebalance(stru case CPUCLOCK_SCHED: nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); + nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; @@ -1159,12 +1173,13 @@ static void check_process_timers(struct prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div(prof_left, nthreads); + prof_left = 
cputime_div_non_zero(prof_left, nthreads); virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div(virt_left, nthreads); + virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); + sched_left = max_t(unsigned long long, sched_left, 1); } else { sched_left = 0; } diff -uprN linux-2.6.18/kernel/posix-timers.c linux-2.6.18.ovz/kernel/posix-timers.c --- linux-2.6.18/kernel/posix-timers.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/posix-timers.c 2007-06-13 06:55:07.000000000 -0400 @@ -31,6 +31,7 @@ * POSIX clocks & timers */ #include +#include #include #include #include @@ -49,6 +50,8 @@ #include #include +#include + /* * Management arrays for POSIX timers. Timers are kept in slab memory * Timer ids are allocated by an external routine that keeps track of the @@ -242,7 +245,8 @@ static __init int init_posix_timers(void register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, 0, NULL, NULL); + sizeof (struct k_itimer), 0, + SLAB_UBC, NULL, NULL); idr_init(&posix_timers_id); return 0; } @@ -298,6 +302,13 @@ void do_schedule_next_timer(struct sigin int posix_timer_event(struct k_itimer *timr,int si_private) { + int ret; + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(timr->it_process->ve_task_info.owner_env); + ub = set_exec_ub(timr->it_process->task_bc.task_ub); + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); timr->sigq->info.si_sys_private = si_private; /* Send signal to the process that owns this timer.*/ @@ -310,11 +321,11 @@ int posix_timer_event(struct k_itimer *t if (timr->it_sigev_notify & SIGEV_THREAD_ID) { struct task_struct *leader; - int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, timr->it_process); if (likely(ret >= 0)) - return ret; + goto out; timr->it_sigev_notify = SIGEV_SIGNAL; leader = timr->it_process->group_leader; @@ -322,8 +333,12 @@ int posix_timer_event(struct k_itimer *t timr->it_process = leader; } - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, timr->it_process); +out: + (void)set_exec_ub(ub); + (void)set_exec_env(ve); + return ret; } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -372,7 +387,7 @@ static struct task_struct * good_sigeven struct task_struct *rtn = current->group_leader; if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || + (!(rtn = find_task_by_pid_ve(event->sigev_notify_thread_id)) || rtn->tgid != current->tgid || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) return NULL; diff -uprN linux-2.6.18/kernel/power/Kconfig linux-2.6.18.ovz/kernel/power/Kconfig --- linux-2.6.18/kernel/power/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/power/Kconfig 2007-06-13 06:55:07.000000000 -0400 @@ -56,7 +56,7 @@ config PM_TRACE config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) + depends on PM && SWAP && X86 || ((FRV || PPC32) && !SMP) ---help--- Enable the possibility of suspending the machine. It doesn't need ACPI or APM. 
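The cputime_div_non_zero() conversions in the posix-cpu-timers.c hunks above close a starvation corner case: when the remaining expiry slack is smaller than the thread count, a plain integer divide yields 0, the per-thread expiry times stop advancing, and the process-wide timer can never fire. A standalone illustration of the failure and the clamp (ordinary userspace C, values invented for the example):

#include <stdio.h>

int main(void)
{
	unsigned long slack = 3, nthreads = 8;

	/* plain divide: 3 / 8 == 0, so each thread gets "now + 0" as
	 * its share of the expiry and signal delivery starves */
	unsigned long left = slack / nthreads;

	/* the patch clamps the quotient to at least one tick */
	unsigned long fixed = left > 1 ? left : 1;

	printf("plain %lu, clamped %lu\n", left, fixed);
	return 0;
}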
diff -uprN linux-2.6.18/kernel/power/process.c linux-2.6.18.ovz/kernel/power/process.c --- linux-2.6.18/kernel/power/process.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/power/process.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,7 @@ */ #define TIMEOUT (20 * HZ) +extern atomic_t global_suspend; static inline int freezeable(struct task_struct * p) { @@ -26,34 +27,12 @@ static inline int freezeable(struct task (p->flags & PF_NOFREEZE) || (p->exit_state == EXIT_ZOMBIE) || (p->exit_state == EXIT_DEAD) || - (p->state == TASK_STOPPED)) + (p->state == TASK_STOPPED) || + (p->state == TASK_TRACED)) return 0; return 1; } -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - printk("="); - - frozen_process(current); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - current->state = save; -} - static inline void freeze_process(struct task_struct *p) { unsigned long flags; @@ -86,13 +65,14 @@ int freeze_processes(void) unsigned long start_time; struct task_struct *g, *p; + atomic_inc(&global_suspend); printk( "Stopping tasks: " ); start_time = jiffies; user_frozen = 0; do { nr_user = todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!freezeable(p)) continue; if (frozen(p)) @@ -115,7 +95,7 @@ int freeze_processes(void) freeze_process(p); todo++; } - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); todo += nr_user; if (!user_frozen && !nr_user) { @@ -128,6 +108,8 @@ int freeze_processes(void) break; } while(todo); + atomic_dec(&global_suspend); + /* This does not unfreeze processes that are already frozen * (we have slightly ugly calling convention in that respect, * and caller must call thaw_processes() if something fails), @@ -139,16 +121,16 @@ int freeze_processes(void) "after %d seconds (%d tasks remaining):\n", TIMEOUT / HZ, todo); read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (freezeable(p) && !frozen(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); return todo; } - printk( "|\n" ); + /* printk( "|\n" ); */ BUG_ON(in_atomic()); return 0; } @@ -159,16 +141,14 @@ void thaw_processes(void) printk( "Restarting tasks..." 
); read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!freezeable(p)) continue; if (!thaw_process(p)) printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); schedule(); printk( " done\n" ); } - -EXPORT_SYMBOL(refrigerator); diff -uprN linux-2.6.18/kernel/power/snapshot.c linux-2.6.18.ovz/kernel/power/snapshot.c --- linux-2.6.18/kernel/power/snapshot.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/power/snapshot.c 2007-06-13 06:55:07.000000000 -0400 @@ -502,7 +502,7 @@ static void init_header(struct swsusp_in memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -643,13 +643,13 @@ static int check_header(struct swsusp_in reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); diff -uprN linux-2.6.18/kernel/printk.c linux-2.6.18.ovz/kernel/printk.c --- linux-2.6.18/kernel/printk.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/printk.c 2007-06-13 06:55:07.000000000 -0400 @@ -30,7 +30,9 @@ #include #include #include +#include #include +#include #include @@ -54,6 +56,9 @@ int console_printk[4] = { EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ +struct printk_aligned printk_no_wake_var[NR_CPUS]; +EXPORT_SYMBOL(printk_no_wake_var); + /* * Low lever drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -84,7 +89,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +DEFINE_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -115,6 +120,7 @@ static int preferred_console = -1; /* Flag: console code may call schedule() */ static int console_may_schedule; +int console_silence_loglevel; #ifdef CONFIG_PRINTK @@ -123,6 +129,19 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ +static int __init setup_console_silencelevel(char *str) +{ + int level; + + if (get_option(&str, &level) != 1) + return 0; + + console_silence_loglevel = level; + return 1; +} + +__setup("silencelevel=", setup_console_silencelevel); + static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -186,6 +205,9 @@ int do_syslog(int type, char __user *buf char c; int error = 0; + if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7)) + goto out; + error = security_syslog(type); if (error) return error; @@ -206,15 +228,15 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); + error = wait_event_interruptible(ve_log_wait, + (ve_log_start - ve_log_end)); if (error) goto out; i = 0; spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; + while (!error && (ve_log_start != ve_log_end) && i < len) { + c = VE_LOG_BUF(ve_log_start); + ve_log_start++; spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; @@ -240,15 +262,17 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } + if (ve_log_buf == NULL) + goto out; count = len; - if (count > log_buf_len) - count = log_buf_len; + if (count > ve_log_buf_len) + count = ve_log_buf_len; spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; + if (count > ve_logged_chars) + count = ve_logged_chars; if (do_clear) - logged_chars = 0; - limit = log_end; + ve_logged_chars = 0; + limit = ve_log_end; /* * __put_user() could sleep, and while we sleep * printk() could overwrite the messages @@ -257,9 +281,9 @@ int do_syslog(int type, char __user *buf */ for (i = 0; i < count && !error; i++) { j = limit-1-i; - if (j + log_buf_len < log_end) + if (j + ve_log_buf_len < ve_log_end) break; - c = LOG_BUF(j); + c = VE_LOG_BUF(j); spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); @@ -283,7 +307,7 @@ int do_syslog(int type, char __user *buf } break; case 5: /* Clear ring buffer */ - logged_chars = 0; + ve_logged_chars = 0; break; case 6: /* Disable logging to console */ console_loglevel = minimum_console_loglevel; @@ -295,16 +319,19 @@ int do_syslog(int type, char __user *buf error = -EINVAL; if (len < 1 || len > 8) goto out; + error = 0; + /* VE has no console, so return success */ + if (!ve_is_super(get_exec_env())) + goto out; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; - error = 0; break; case 9: /* Number of chars in the log buffer */ - error = log_end - log_start; + error = ve_log_end - ve_log_start; break; case 10: /* Size of the log buffer */ - error = log_buf_len; + error = ve_log_buf_len; break; default: error = -EINVAL; @@ -403,16 +430,18 @@ static void call_console_drivers(unsigne static void emit_log_char(char c) { - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - 
log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + VE_LOG_BUF(ve_log_end) = c; + ve_log_end++; + if (ve_log_end - ve_log_start > ve_log_buf_len) + ve_log_start = ve_log_end - ve_log_buf_len; + if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) + con_start = ve_log_end - ve_log_buf_len; + if (ve_logged_chars < ve_log_buf_len) + ve_logged_chars++; } +static unsigned long do_release_console_sem(unsigned long *flags); + /* * Zap console related locks when oopsing. Only zap at most once * every 10 seconds, to leave time for slow consoles to print a @@ -488,6 +517,30 @@ static int have_callable_console(void) * printf(3) */ +static inline int ve_log_init(void) +{ +#ifdef CONFIG_VE + if (ve_log_buf != NULL) + return 0; + + if (ve_is_super(get_exec_env())) { + ve0._log_wait = &log_wait; + ve0._log_start = &log_start; + ve0._log_end = &log_end; + ve0._logged_chars = &logged_chars; + ve0.log_buf = log_buf; + return 0; + } + + ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); + if (!ve_log_buf) + return -ENOMEM; + + memset(ve_log_buf, 0, ve_log_buf_len); +#endif + return 0; +} + asmlinkage int printk(const char *fmt, ...) { va_list args; @@ -503,13 +556,14 @@ asmlinkage int printk(const char *fmt, . /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int __vprintk(const char *fmt, va_list args) { unsigned long flags; int printed_len; char *p; static char printk_buf[1024]; static int log_level_unknown = 1; + int err, need_wake; preempt_disable(); if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) @@ -523,6 +577,12 @@ asmlinkage int vprintk(const char *fmt, spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); + err = ve_log_init(); + if (err) { + spin_unlock_irqrestore(&logbuf_lock, flags); + return err; + } + /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -583,7 +643,26 @@ asmlinkage int vprintk(const char *fmt, log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { + if (!ve_is_super(get_exec_env())) { + need_wake = (ve_log_start != ve_log_end); + spin_unlock_irqrestore(&logbuf_lock, flags); + if (!oops_in_progress && need_wake) + wake_up_interruptible(&ve_log_wait); + } else if (__printk_no_wake) { + /* + * A difficult case, created by the console semaphore mess... + * All wakeups are omitted. + */ + if (!atomic_add_negative(-1, &console_sem.count)) { + console_locked = 1; + console_may_schedule = 0; + do_release_console_sem(&flags); + console_locked = 0; + console_may_schedule = 0; + } + atomic_inc(&console_sem.count); + spin_unlock_irqrestore(&logbuf_lock, flags); + } else if (!down_trylock(&console_sem)) { /* * We own the drivers. We can drop the spinlock and * let release_console_sem() print the text, maybe ... 
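emit_log_char() above preserves the original ring-buffer semantics, now per VE: ve_log_start and ve_log_end are free-running counters, and VE_LOG_BUF() masks them into a power-of-two buffer, so a full ring silently overwrites the oldest data. The arithmetic, reduced to a runnable toy where LEN stands in for ve_log_buf_len:

#include <stdio.h>

#define LEN 16				/* must be a power of two */
static char buf[LEN];
static unsigned long start, end;	/* free-running counters  */

static void emit(char c)
{
	buf[end & (LEN - 1)] = c;	/* the VE_LOG_BUF()-style mask */
	end++;
	if (end - start > LEN)		/* full: drop the oldest char  */
		start = end - LEN;
}

int main(void)
{
	const char *msg = "0123456789abcdefghij";	/* 20 chars > LEN */
	unsigned long i;

	while (*msg)
		emit(*msg++);
	for (i = start; i != end; i++)	/* prints "456789abcdefghij" */
		putchar(buf[i & (LEN - 1)]);
	putchar('\n');
	return 0;
}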
@@ -626,6 +705,63 @@ asmlinkage int vprintk(const char *fmt, EXPORT_SYMBOL(printk); EXPORT_SYMBOL(vprintk); +static struct timer_list conswakeup_timer; +static void conswakeup_timer_call(unsigned long dumy) +{ + if (!down_trylock(&console_sem)) { + console_locked = 1; + console_may_schedule = 0; + release_console_sem(); + } + mod_timer(&conswakeup_timer, jiffies + 5 * HZ); +} + +static int __init conswakeup_init(void) +{ + init_timer(&conswakeup_timer); + conswakeup_timer.function = &conswakeup_timer_call; + conswakeup_timer.expires = jiffies + 5 * HZ; + add_timer(&conswakeup_timer); + return 0; +} +console_initcall(conswakeup_init); + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + int i; + struct ve_struct *env; + + env = set_exec_env(get_ve0()); + i = __vprintk(fmt, args); + set_exec_env(env); + return i; +} + +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) +{ + int printed_len; + + printed_len = 0; + if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) + printed_len = vprintk(fmt, args); + if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) + printed_len = __vprintk(fmt, args); + return printed_len; +} + +asmlinkage int ve_printk(int dst, const char *fmt, ...) +{ + va_list args; + int printed_len; + + va_start(args, fmt); + printed_len = ve_vprintk(dst, fmt, args); + va_end(args); + return printed_len; +} +EXPORT_SYMBOL(ve_printk); + + #else asmlinkage long sys_syslog(int type, char __user *buf, int len) @@ -738,6 +874,18 @@ void resume_console(void) release_console_sem(); } +void wake_up_klogd(void) +{ + if (!oops_in_progress && waitqueue_active(&log_wait)) + /* + * If we printk from within the lock dependency code, + * from within the scheduler code, then do not lock + * up due to self-recursion: + */ + if (!lockdep_internal()) + wake_up_interruptible(&log_wait); +} + /** * acquire_console_sem - lock the console system for exclusive use. * @@ -789,43 +937,45 @@ EXPORT_UNUSED_SYMBOL(is_console_locked); * * release_console_sem() may be called from any context. */ -void release_console_sem(void) +static unsigned long do_release_console_sem(unsigned long *flags) { - unsigned long flags; unsigned long _con_start, _log_end; unsigned long wake_klogd = 0; if (console_suspended) { up(&secondary_console_sem); - return; + goto out; } console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + spin_unlock_irqrestore(&logbuf_lock, *flags); call_console_drivers(_con_start, _log_end); - local_irq_restore(flags); + spin_lock_irqsave(&logbuf_lock, *flags); } +out: + return wake_klogd; +} + +void release_console_sem(void) +{ + unsigned long flags; + unsigned long wake_klogd; + + spin_lock_irqsave(&logbuf_lock, flags); + wake_klogd = do_release_console_sem(&flags); console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { - /* - * If we printk from within the lock dependency code, - * from within the scheduler code, then do not lock - * up due to self-recursion: - */ - if (!lockdep_internal()) - wake_up_interruptible(&log_wait); - } + if (wake_klogd) + wake_up_klogd(); } EXPORT_SYMBOL(release_console_sem); @@ -1105,3 +1255,33 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/* + * Rate limiting stuff. 
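+ * Token-bucket accounting: p->bucket counts recently allowed events.
+ * Each allowed call adds one token; a quiet gap of djif jiffies
+ * drains djif / interval tokens back out.  Once p->bucket reaches
+ * p->burst within a single interval, further events are refused.
+ * Returns 1 when the event may proceed, 0 when it must be dropped.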
+ */ +int vz_ratelimit(struct vz_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} diff -uprN linux-2.6.18/kernel/ptrace.c linux-2.6.18.ovz/kernel/ptrace.c --- linux-2.6.18/kernel/ptrace.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/ptrace.c 2007-06-13 06:55:07.000000000 -0400 @@ -129,6 +129,8 @@ static int may_attach(struct task_struct * or halting the specified task is impossible. */ int dumpable = 0; + int vps_dumpable = 0; + /* Don't let security modules deny introspection */ if (task == current) return 0; @@ -140,11 +142,17 @@ static int may_attach(struct task_struct (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (task->mm) + if (task->mm) { dumpable = task->mm->dumpable; + vps_dumpable = (task->mm->vps_dumpable == 1); + } + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - + if (!vps_dumpable && !ve_is_super(get_exec_env())) + return -EPERM; + if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) + return -EPERM; return security_ptrace(current, task); } @@ -190,6 +198,8 @@ repeat: if (!task->mm) goto bad; + if (task->mm->vps_dumpable == 2) + goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; @@ -294,6 +304,7 @@ int access_process_vm(struct task_struct return buf - old_buf; } +EXPORT_SYMBOL_GPL(access_process_vm); int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { @@ -491,7 +502,7 @@ struct task_struct *ptrace_get_task_stru return ERR_PTR(-EPERM); read_lock(&tasklist_lock); - child = find_task_by_pid(pid); + child = find_task_by_pid_ve(pid); if (child) get_task_struct(child); read_unlock(&tasklist_lock); diff -uprN linux-2.6.18/kernel/rtmutex-debug.c linux-2.6.18.ovz/kernel/rtmutex-debug.c --- linux-2.6.18/kernel/rtmutex-debug.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/rtmutex-debug.c 2007-06-13 06:55:07.000000000 -0400 @@ -154,7 +154,7 @@ void debug_rt_mutex_print_deadlock(struc if (!waiter->deadlock_lock || !rt_trace_on) return; - task = find_task_by_pid(waiter->deadlock_task_pid); + task = find_task_by_pid_all(waiter->deadlock_task_pid); if (!task) return; diff -uprN linux-2.6.18/kernel/sched.c linux-2.6.18.ovz/kernel/sched.c --- linux-2.6.18/kernel/sched.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/sched.c 2007-06-13 06:55:07.000000000 -0400 @@ -52,6 +52,8 @@ #include #include #include +#include +#include #include #include @@ -137,7 +139,7 @@ #ifdef CONFIG_SMP #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) + vsched_num_online_vcpus(task_vsched(p))) #else #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) @@ -202,6 +204,7 @@ struct prio_array { * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue. 
*/ +typedef struct vcpu_struct *vcpu_t; struct rq { spinlock_t lock; @@ -224,9 +227,12 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long nr_sleeping; + unsigned long nr_stopped; + unsigned long expired_timestamp; unsigned long long timestamp_last_tick; - struct task_struct *curr, *idle; + struct task_struct *curr; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -237,11 +243,12 @@ struct rq { /* For active balancing */ int active_balance; - int push_cpu; +#endif + vcpu_t push_cpu; struct task_struct *migration_thread; + int migration_thread_init; struct list_head migration_queue; -#endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ @@ -262,10 +269,66 @@ struct rq { unsigned long ttwu_cnt; unsigned long ttwu_local; #endif +#ifndef CONFIG_SCHED_VCPU + /* + * with VCPU scheduler each rq is dynamic object + * so assign a common static class to them and + * use lock nesting rules in double_rq_lock etc + */ struct lock_class_key rq_lock_key; +#endif }; -static DEFINE_PER_CPU(struct rq, runqueues); +/* VCPU scheduler state description */ +struct vcpu_struct; +struct vcpu_scheduler { + struct list_head idle_list; + struct list_head active_list; + struct list_head running_list; +#ifdef CONFIG_FAIRSCHED + struct fairsched_node *node; +#endif + struct list_head list; + struct vcpu_struct *vcpu[NR_CPUS]; + int id; + cpumask_t vcpu_online_map, vcpu_running_map; + cpumask_t pcpu_running_map; + int num_online_vcpus; +} ____cacheline_internodealigned_in_smp; + +/* virtual CPU description */ +struct vcpu_struct { + struct rq rq; +#ifdef CONFIG_SCHED_VCPU + unsigned active : 1, + running : 1; + struct list_head list; + struct vcpu_scheduler *vsched; + int last_pcpu; + unsigned long start_time; + unsigned long stop_time; +#endif + int id; +} ____cacheline_internodealigned_in_smp; + +/* physical CPU description */ +struct pcpu_info { + struct vcpu_scheduler *vsched; + struct vcpu_struct *vcpu; + struct task_struct *idle; +#ifdef CONFIG_SMP + struct sched_domain *sd; +#endif + int id; +} ____cacheline_internodealigned_in_smp; + +struct pcpu_info pcpu_info[NR_CPUS]; + +static LIST_HEAD(vsched_list); +static DEFINE_SPINLOCK(vsched_list_lock); + +#define pcpu(nr) (&pcpu_info[nr]) +#define this_pcpu() (pcpu(smp_processor_id())) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -274,13 +337,600 @@ static DEFINE_PER_CPU(struct rq, runqueu * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. */ +#define for_each_pdomain(sd, domain) \ +for (domain = rcu_dereference(sd); domain; domain = domain->parent) + #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + for_each_pdomain(vcpu_rq(cpu)->sd, __sd) + +#ifdef CONFIG_SCHED_VCPU + +/* Used in find_idle_vsched() */ +static DEFINE_PER_CPU(int, find_busvs_last_pcpu); + +/* + * vcpu_timeslice - how many msec's runnable VCPU will stay on the same + * physical CPU. If vcpu_timeslice < 0, actual vcpu timeslice value will + * be calculated according to number of 'ready to run' vcpu's: + * + * vcpu_timeslice_actual = VCPU_TIMESLICE_MAX >> + * ((nr_runnable_vcpus - 1) / nr_pcpus) + */ +#define VCPU_TIMESLICE_MAX 8 +int vcpu_timeslice_actual; +unsigned int nr_online_pcpus = 1; /* mustn't be 0, cause it's divisor */ +/* + * Set initial value to -1, to not subtract '-1' each time. 
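+ * In other words, nr_runnable_vcpus always holds (real count - 1),
+ * which is exactly the numerator recalc_vcpu_timeslice() needs.
+ * A worked example with hypothetical numbers: with 4 online PCPUs and
+ * 5 runnable VCPUs, (5 - 1) / 4 = 1, so vcpu_timeslice_actual becomes
+ * VCPU_TIMESLICE_MAX >> 1 = 4 msec.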
+ */ +unsigned int nr_runnable_vcpus = -1; + +u32 vcpu_sched_timeslice = 5; +int vcpu_timeslice = -1; +u32 vcpu_hot_timeslice = 4; /* < 4 won't work for HZ=250 */ +EXPORT_SYMBOL(vcpu_sched_timeslice); +EXPORT_SYMBOL(vcpu_timeslice); +EXPORT_SYMBOL(vcpu_hot_timeslice); + +extern spinlock_t fairsched_lock; +static struct vcpu_scheduler default_vsched, idle_vsched; +static struct vcpu_struct boot_vcpu, boot_idle_vcpu; + +#define vsched_default_vsched() (&default_vsched) +#define vsched_default_vcpu(id) (default_vsched.vcpu[id]) + +/* + * All macroses below could be used without locks, if there is no + * strict ordering requirements, because we assume, that: + * + * 1. VCPU could not disappear "on the fly" (FIXME) + * + * 2. p->vsched access is atomic. + */ + +#define task_vsched(tsk) ((tsk)->vsched) +#define this_vsched() (task_vsched(current)) + +#define vsched_vcpu(vsched, id) ((vsched)->vcpu[id]) +#define this_vcpu() (task_vcpu(current)) +#define task_vcpu(p) ((p)->vcpu) + +#define vsched_id(vsched) ((vsched)->id) +#define vsched_vcpu_online_map(vsched) ((vsched)->vcpu_online_map) +#define vsched_num_online_vcpus(vsched) ((vsched)->num_online_vcpus) +#define vsched_pcpu_running_map(vsched) ((vsched)->pcpu_running_map) + +#define vcpu_vsched(vcpu) ((vcpu)->vsched) +#define vcpu_last_pcpu(vcpu) ((vcpu)->last_pcpu) +#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask)) +#define vcpu_is_offline(vcpu) (!vcpu_isset(vcpu, \ + vcpu_vsched(vcpu)->vcpu_online_map)) + +static int __add_vcpu(struct vcpu_scheduler *vsched, int id); + +#define vcpu_is_hot(vcpu) (jiffies - (vcpu)->start_time \ + < msecs_to_jiffies(vcpu_timeslice_actual)) +#else /* CONFIG_SCHED_VCPU */ + +static DEFINE_PER_CPU(struct vcpu_struct, vcpu_struct); + +#define task_vsched(p) NULL +#define this_vcpu() (task_vcpu(current)) +#define task_vcpu(p) (vcpu(task_cpu(p))) + +#define vsched_vcpu(sched, id) (vcpu(id)) +#define vsched_id(vsched) 0 +#define vsched_default_vsched() NULL +#define vsched_default_vcpu(id) (vcpu(id)) + +#define vsched_vcpu_online_map(vsched) (cpu_online_map) +#define vsched_num_online_vcpus(vsched) (num_online_cpus()) +#define vsched_pcpu_running_map(vsched) (cpu_online_map) + +#define vcpu(id) (&per_cpu(vcpu_struct, id)) + +#define vcpu_vsched(vcpu) NULL +#define vcpu_last_pcpu(vcpu) ((vcpu)->id) +#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask)) +#define vcpu_is_offline(vcpu) (cpu_is_offline((vcpu)->id)) + +#define vcpu_is_hot(vcpu) (1) +#endif /* CONFIG_SCHED_VCPU */ + +#define this_rq() (vcpu_rq(this_vcpu())) +#define task_rq(p) (vcpu_rq(task_vcpu(p))) +#define vcpu_rq(vcpu) (&(vcpu)->rq) +#define get_vcpu() ({ preempt_disable(); this_vcpu(); }) +#define put_vcpu() ({ put_cpu(); }) +#define rq_vcpu(__rq) (container_of((__rq), struct vcpu_struct, rq)) + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return vcpu_last_pcpu(rq_vcpu(rq)); +#else + return 0; +#endif +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. 
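+ *
+ * With CONFIG_SCHED_VCPU the idle task is a property of the physical
+ * CPU (pcpu_info[cpu].idle) rather than of a runqueue, since
+ * runqueues are now per-VCPU objects.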
+ */ +struct task_struct *idle_task(int cpu) +{ + return pcpu(cpu)->idle; +} + +#ifdef CONFIG_SMP +static inline void update_rq_cpu_load(struct rq *this_rq) +{ + unsigned long this_load; + int i, scale; + + if (unlikely(this_rq->nr_running == 0)) { + for (i = 0; i < 3; i++) + this_rq->cpu_load[i] = 0; + return; + } + + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { + unsigned long old_load, new_load; + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } +} +#else /* CONFIG_SMP */ +static inline void update_rq_cpu_load(struct rq *this_rq) +{ +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SCHED_VCPU +static inline void recalc_vcpu_timeslice(void) +{ + int val; + + if (vcpu_timeslice < 0) { + val = nr_runnable_vcpus / nr_online_pcpus; + val = val > 31 ? 31 : val; + val = VCPU_TIMESLICE_MAX >> val; + } else + val = vcpu_timeslice; + + /* + * Optimization (?) - don't invalidate other CPU's cacheline + * if vcpu_timeslice_actual is not changed. + */ + if (vcpu_timeslice_actual != val) + vcpu_timeslice_actual = val; +} + +void fastcall vsched_cpu_online_map(struct vcpu_scheduler *vsched, + cpumask_t *mask) +{ + unsigned long flags; + + spin_lock_irqsave(&fairsched_lock, flags); + *mask = vsched->vcpu_online_map; + spin_unlock_irqrestore(&fairsched_lock, flags); +} + +static inline void set_task_vsched(struct task_struct *p, + struct vcpu_scheduler *vsched) +{ + /* NOTE: set_task_cpu() is required after every set_task_vsched()! */ + p->vsched = vsched; + p->vsched_id = vsched_id(vsched); +} + +inline void set_task_cpu(struct task_struct *p, unsigned int vcpu_id) +{ + p->vcpu = vsched_vcpu(task_vsched(p), vcpu_id); + p->vcpu_id = vcpu_id; +} + +static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu) +{ + p->vcpu = vcpu; + p->vcpu_id = vcpu->id; +} + +/* this is called when rq->nr_running changes from 0 to 1 */ +static void vcpu_attach(struct rq *rq) +{ + struct vcpu_scheduler *vsched; + vcpu_t vcpu; + + vcpu = rq_vcpu(rq); + vsched = vcpu_vsched(vcpu); + + BUG_ON(vcpu->active); + spin_lock(&fairsched_lock); + vcpu->active = 1; + if (!vcpu->running) + list_move_tail(&vcpu->list, &vsched->active_list); + + fairsched_incrun(vsched->node); + nr_runnable_vcpus++; + spin_unlock(&fairsched_lock); + + recalc_vcpu_timeslice(); +} + +/* this is called when rq->nr_running changes from 1 to 0 */ +static void vcpu_detach(struct rq *rq) +{ + struct vcpu_scheduler *vsched; + vcpu_t vcpu; + + vcpu = rq_vcpu(rq); + vsched = vcpu_vsched(vcpu); + BUG_ON(!vcpu->active); + + spin_lock(&fairsched_lock); + fairsched_decrun(vsched->node); + + vcpu->active = 0; + if (!vcpu->running) + list_move_tail(&vcpu->list, &vsched->idle_list); + nr_runnable_vcpus--; + spin_unlock(&fairsched_lock); + + recalc_vcpu_timeslice(); +} + +static inline void __vcpu_get(vcpu_t vcpu) +{ + struct pcpu_info *pcpu; + struct vcpu_scheduler *vsched; + + BUG_ON(!this_vcpu()->running); + + pcpu = this_pcpu(); + vsched = vcpu_vsched(vcpu); + + pcpu->vcpu = vcpu; + pcpu->vsched = vsched; + + fairsched_inccpu(vsched->node); + + list_move_tail(&vcpu->list, &vsched->running_list); + vcpu->start_time = jiffies; + vcpu->last_pcpu = pcpu->id; + vcpu->running = 1; + __set_bit(vcpu->id, vsched->vcpu_running_map.bits); + 
__set_bit(pcpu->id, vsched->pcpu_running_map.bits);
+#ifdef CONFIG_SMP
+	vcpu_rq(vcpu)->sd = pcpu->sd;
+#endif
+}
+
+static void vcpu_put(vcpu_t vcpu)
+{
+	struct vcpu_scheduler *vsched;
+	struct pcpu_info *cur_pcpu;
+	struct rq *rq;
+
+	vsched = vcpu_vsched(vcpu);
+	rq = vcpu_rq(vcpu);
+	cur_pcpu = this_pcpu();
+
+	BUG_ON(!vcpu->running);
+
+	spin_lock(&fairsched_lock);
+	vcpu->running = 0;
+	list_move_tail(&vcpu->list,
+		vcpu->active ? &vsched->active_list : &vsched->idle_list);
+	fairsched_deccpu(vsched->node);
+	__clear_bit(vcpu->id, vsched->vcpu_running_map.bits);
+	if (vsched != this_vsched())
+		__clear_bit(cur_pcpu->id, vsched->pcpu_running_map.bits);
+
+	vcpu->stop_time = jiffies;
+	if (!rq->nr_running)
+		rq->expired_timestamp = 0;
+	/* from this point task_running(prev_rq, prev) will be 0 */
+	rq->curr = cur_pcpu->idle;
+	update_rq_cpu_load(rq);
+	spin_unlock(&fairsched_lock);
+}
+
+/*
+ * Find an idle VCPU in the given vsched.  A VCPU that last ran on this
+ * pcpu is preferable.  The idle VCPU must also be present in the *cpus
+ * mask.
+ */
+static vcpu_t find_idle_vcpu(struct vcpu_scheduler *vsched, cpumask_t *cpus)
+{
+	vcpu_t vcpu;
+	vcpu_t best_vcpu;
+	int this_pcpu = smp_processor_id();
+
+	best_vcpu = NULL;
+
+	spin_lock(&fairsched_lock);
+	if (!list_empty(&vsched->idle_list)) {
+		list_for_each_entry(vcpu, &vsched->idle_list, list) {
+			if (unlikely(vcpu_is_offline(vcpu)))
+				continue;
+			if (!cpu_isset(vcpu_last_pcpu(vcpu), *cpus))
+				continue;
+			best_vcpu = vcpu;
+			if (vcpu_last_pcpu(vcpu) == this_pcpu)
+				break;
+		}
+	}
+	spin_unlock(&fairsched_lock);
+	return best_vcpu;
+}
+
+/*
+ * find_busiest_vsched - find the busiest vsched among running vscheds.
+ * An active vsched will be balanced when it becomes running.
+ *
+ * This routine must be simple and fast.
+ */
+static inline struct vcpu_scheduler *find_busiest_vsched(cpumask_t *cpus)
+{
+	vcpu_t vcpu;
+	int i, n;
+	cpumask_t mask, tmp_mask;
+	int step;
+
+	step = 0;
+
+	cpus_and(mask, *cpus, cpu_online_map);
+
+	/*
+	 * We implement a simple round-robin strategy to pick the
+	 * PCPU id to start from.  The last PCPU number is saved in
+	 * per_cpu(find_busvs_last_pcpu).
+	 *
+	 * Assume the mask is 0x6789abcd and it's time to start
+	 * from PCPU #13:
+	 *
+	 * 1) In the first pass we must use mask 0x6789a000:
+	 *
+	 *    ((0x6789abcd >> 13) << 13) => 0x6789a000
+	 *
+	 * 2) In the second pass we must use mask 0x00000bcd:
+	 *
+	 *    0x6789abcd ^ 0x6789a000 => 0x00000bcd
+	 */
+	n = per_cpu(find_busvs_last_pcpu, raw_smp_processor_id());
+
+	cpus_shift_right(tmp_mask, mask, n);
+	cpus_shift_left(tmp_mask, tmp_mask, n);
+restart:
+	for_each_cpu_mask(i, tmp_mask) {
+		vcpu = pcpu(i)->vcpu;
+		if (vcpu_is_offline(vcpu))
+			continue;
+		if (vcpu->vsched == &idle_vsched)
+			continue;
+		if (vcpu == this_vcpu())
+			continue;
+
+		/*
+		 * 'Busiest' means there are at least 2 tasks on this vsched.
+		 */
+		if (vcpu->rq.nr_running > 1) {
+			per_cpu(find_busvs_last_pcpu, raw_smp_processor_id())
+					= ++n % NR_CPUS;
+			return vcpu->vsched;
+		}
+	}
+	if (!step++) {
+		/* Second pass */
+		cpus_xor(tmp_mask, mask, tmp_mask);
+		goto restart;
+	}
+	return NULL;
+}
+
+/*
+ * Find an idle VCPU in a vsched that can be balanced to.
+ */
+static inline vcpu_t find_idle_target(cpumask_t *cpus)
+{
+	vcpu_t vcpu;
+	struct vcpu_scheduler *vsched;
+
+	/*
+	 * First of all we have to find the busiest vsched.
+	 */
+	vsched = find_busiest_vsched(cpus);
+	if (vsched == NULL)
+		return NULL;
+
+	/*
+	 * Try to find an idle VCPU in the target vsched.
+	 * A VCPU that was last running on this PCPU is preferred.
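+	 * (A cache-warmth heuristic: the scan keeps going until it meets
+	 * a VCPU whose last_pcpu equals the current PCPU, otherwise it
+	 * settles for the last eligible idle VCPU it saw.)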
+ */ + vcpu = find_idle_vcpu(vsched, cpus); + if (!vcpu) + return NULL; + return vcpu; +} + +static int idle_balance(vcpu_t this_cpu, struct rq *this_rq); + +static vcpu_t schedule_vcpu(vcpu_t cur_vcpu, cycles_t cycles) +{ + struct vcpu_scheduler *vsched; + vcpu_t vcpu, best_vcpu; + unsigned long time; + struct rq *rq; +#ifdef CONFIG_FAIRSCHED + struct fairsched_node *node, *nodec; + + nodec = vcpu_vsched(cur_vcpu)->node; + node = nodec; +#endif + + BUG_ON(!cur_vcpu->running); +restart: + if (unlikely(system_state == SYSTEM_BOOTING)) + goto affine; + + spin_lock(&fairsched_lock); +#ifdef CONFIG_FAIRSCHED + node = fairsched_schedule(node, nodec, + cur_vcpu->active, + cycles); + if (unlikely(node == NULL)) + goto idle; + + vsched = node->vsched; +#else + vsched = &default_vsched; +#endif + /* FIXME: optimize vcpu switching, maybe we do not need to call + fairsched_schedule() at all if vcpu is still active and too + little time have passed so far */ + if (cur_vcpu->vsched == vsched && cur_vcpu->active && + jiffies - cur_vcpu->start_time < msecs_to_jiffies(vcpu_sched_timeslice)) { + vcpu = cur_vcpu; + goto done; + } + + if (list_empty(&vsched->active_list)) { + /* nothing except for this cpu can be scheduled */ + if (likely(cur_vcpu->vsched == vsched && cur_vcpu->active)) { + /* + * Current vcpu is the one we need. We have not + * put it yet, so it's not on the active_list. + */ + vcpu = cur_vcpu; + vcpu->start_time = jiffies; + goto done; + } else + goto none; + } + + /* + * Ok, we are going to choose new VCPU now. + */ + time = jiffies - msecs_to_jiffies(vcpu_hot_timeslice); + /* + * First vcpu in the list is more preferable, because it has waited + * for CPU longer than others. If all vcpu's are hot, use the oldest + * one. + */ + best_vcpu = list_entry(vsched->active_list.next, + struct vcpu_struct, list); + list_for_each_entry(vcpu, &vsched->active_list, list) { + /* Skip hot VCPU's that were running on another CPU's */ + if (vcpu->stop_time > time && + vcpu_last_pcpu(vcpu) != raw_smp_processor_id()) + continue; + + best_vcpu = vcpu; + break; + } + vcpu = best_vcpu; + + /* add it to running list */ + __vcpu_get(vcpu); +done: + spin_unlock(&fairsched_lock); + + rq = vcpu_rq(vcpu); + if (unlikely(vcpu != cur_vcpu)) { + spin_unlock(&vcpu_rq(cur_vcpu)->lock); + spin_lock(&rq->lock); + if (unlikely(!rq->nr_running)) { + /* race with balancing? */ + spin_unlock(&rq->lock); + vcpu_put(vcpu); + spin_lock(&vcpu_rq(cur_vcpu)->lock); + goto restart; + } + } + BUG_ON(!rq->nr_running); + return vcpu; + +none: +#ifdef CONFIG_FAIRSCHED + spin_unlock(&fairsched_lock); + + /* fairsched doesn't schedule more CPUs than we have active */ + BUG_ON(1); +#else + goto idle; +#endif + +idle: + vcpu = task_vcpu(this_pcpu()->idle); + __vcpu_get(vcpu); + spin_unlock(&fairsched_lock); + spin_unlock(&vcpu_rq(cur_vcpu)->lock); + + spin_lock(&vcpu_rq(vcpu)->lock); + return vcpu; + +affine: + vcpu = vsched_vcpu(&default_vsched, raw_smp_processor_id()); + /* current VCPU busy, continue */ + if (cur_vcpu == vcpu && vcpu->active) + return cur_vcpu; + /* current is idle and nothing to run, keep idle */ + if (vcpu_vsched(cur_vcpu) == &idle_vsched && !vcpu->active) + return cur_vcpu; + + /* need to switch to idle... */ + if (cur_vcpu == vcpu) { + spin_lock(&fairsched_lock); + goto idle; + } + + /* ... 
and from idle */ + spin_lock(&fairsched_lock); + __vcpu_get(vcpu); + goto done; +} + +int vcpu_online(int cpu) +{ + return cpu_isset(cpu, vsched_vcpu_online_map(this_vsched())); +} +#else /* CONFIG_SCHED_VCPU */ + +#define set_task_vsched(task, vsched) do { } while (0) + +static inline void vcpu_attach(struct rq *rq) +{ +} + +static inline void vcpu_detach(struct rq *rq) +{ +} + +static inline void vcpu_put(vcpu_t vcpu) +{ +} + +static inline vcpu_t schedule_vcpu(vcpu_t prev_vcpu, cycles_t cycles) +{ + return prev_vcpu; +} + +static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu) +{ + set_task_pcpu(p, vcpu->id); +} + +#endif /* CONFIG_SCHED_VCPU */ -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -289,6 +939,27 @@ static DEFINE_PER_CPU(struct rq, runqueu # define finish_arch_switch(prev) do { } while (0) #endif +struct kernel_stat_glob kstat_glob; +spinlock_t kstat_glb_lock = SPIN_LOCK_UNLOCKED; +EXPORT_SYMBOL(kstat_glob); +EXPORT_SYMBOL(kstat_glb_lock); + +static inline void finish_vsched_switch(struct rq *rq, vcpu_t prev_vcpu) +{ + vcpu_t vcpu; + + vcpu = rq_vcpu(rq); + if (prev_vcpu != vcpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); + vcpu_put(prev_vcpu); + local_irq_enable(); +#else + vcpu_put(prev_vcpu); +#endif + } +} + #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { @@ -301,6 +972,7 @@ static inline void prepare_lock_switch(s static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { + vcpu_t prev_vcpu; #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -312,7 +984,10 @@ static inline void finish_lock_switch(st */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + prev_vcpu = task_vcpu(prev); + spin_unlock(&rq->lock); + finish_vsched_switch(rq, prev_vcpu); + local_irq_enable(); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -344,6 +1019,8 @@ static inline void prepare_lock_switch(s static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { + /* vcpu_put() should be done before setting prev->oncpu = 0 */ + finish_vsched_switch(rq, task_vcpu(prev)); #ifdef CONFIG_SMP /* * After ->oncpu is cleared, the task can be moved to a different CPU. 
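 * (In the VCPU case this is also why finish_vsched_switch() runs
 * first: the previous VCPU must be detached from this physical CPU
 * before the task becomes visible as not running.)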
@@ -411,26 +1088,235 @@ static inline void task_rq_unlock(struct spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_VE +#define ve_nr_unint_inc(env, cpu) \ + do { \ + VE_CPU_STATS((env), (cpu))->nr_unint++; \ + } while(0) +#define ve_nr_unint_dec(env, cpu) \ + do { \ + VE_CPU_STATS((env), (cpu))->nr_unint--; \ + } while(0) + +#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) + +cycles_t __ve_sched_get_idle_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->idle_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_uninterruptible_ve(ve) == 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(__ve_sched_get_idle_time); + +cycles_t ve_sched_get_iowait_time(int cpu) +{ + struct ve_struct *ve; + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + vcpu_t vcpu; + + preempt_disable(); + ret = 0; + vcpu = vsched_vcpu(this_vsched(), cpu); + if (!vcpu) + goto done; + + ve = get_exec_env(); + ve_stat = VE_CPU_STATS(ve, cpu); + do { + struct rq *rq; + rq = vcpu_rq(vcpu); + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->iowait_time; + strt = ve_stat->strt_idle_time; + if (strt && atomic_read(&rq->nr_iowait) > 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); +done: + preempt_enable(); + return ret; +} + +EXPORT_SYMBOL(ve_sched_get_iowait_time); + +static inline void ve_stop_idle(struct ve_struct *ve, + vcpu_t vcpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, vcpu->id); + + write_seqcount_begin(&ve_stat->stat_lock); + if (ve_stat->strt_idle_time) { + if (cycles_after(cycles, ve_stat->strt_idle_time)) { + if (atomic_read(&vcpu_rq(vcpu)->nr_iowait) == 0) + ve_stat->idle_time += cycles - + ve_stat->strt_idle_time; + else + ve_stat->iowait_time += cycles - + ve_stat->strt_idle_time; + } + ve_stat->strt_idle_time = 0; + } + write_seqcount_end(&ve_stat->stat_lock); +} + +static inline void ve_strt_idle(struct ve_struct *ve, + unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + ve_stat->strt_idle_time = cycles; + write_seqcount_end(&ve_stat->stat_lock); +} + +#define ve_nr_running_inc(env, cpu) do { \ + VE_CPU_STATS((env), (cpu))->nr_running++; \ + } while (0) +#define ve_nr_running_dec(env, cpu) do { \ + VE_CPU_STATS((env), (cpu))->nr_running--; \ + } while (0) + +void ve_sched_attach(struct ve_struct *envid) +{ + struct task_struct *tsk; + unsigned int cpu; + + tsk = current; + preempt_disable(); + cpu = task_cpu(tsk); + ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu); + ve_nr_running_inc(envid, cpu); + preempt_enable(); +} +EXPORT_SYMBOL(ve_sched_attach); + +static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) +{ + struct ve_task_info *ti; + + ti = VE_TASK_INFO(p); + write_seqcount_begin(&ti->wakeup_lock); + ti->wakeup_stamp = cyc; + write_seqcount_end(&ti->wakeup_lock); +} + +static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) +{ + int cpu; + cycles_t ve_wstamp; + + /* safe due to runqueue lock */ + cpu = smp_processor_id(); + ve_wstamp = 
t->ve_task_info.wakeup_stamp; + + if (ve_wstamp && cycles > ve_wstamp) { + KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, + cpu, cycles - ve_wstamp); + KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, + cpu, cycles - ve_wstamp); + } +} + +static inline void update_ve_task_info(struct task_struct *prev, + cycles_t cycles) +{ + if (prev != this_pcpu()->idle) { + VE_CPU_STATS(prev->ve_task_info.owner_env, + smp_processor_id())->used_time += + cycles - prev->ve_task_info.sched_time; + + prev->ve_task_info.sched_time = cycles; + } +} +#else /* CONFIG_VE */ +#define ve_nr_running_inc(env, cpu) do { } while(0) +#define ve_nr_running_dec(env, cpu) do { } while(0) +#define ve_nr_unint_inc(env, cpu) do { } while(0) +#define ve_nr_unint_dec(env, cpu) do { } while(0) +#define update_ve_task_info(prev, cycles) do { } while (0) +#define ve_stop_idle(ve, vcpu, cycles) do { } while (0) +#define ve_strt_idle(ve, cpu, cycles) do { } while (0) +#endif /* CONFIG_VE */ + +struct task_nrs_struct { + long nr_running; + long nr_unint; + long nr_stopped; + long nr_sleeping; + atomic_t nr_iowait; + long long nr_switches; +} ____cacheline_aligned_in_smp; + +static struct task_nrs_struct glob_task_nrs[NR_CPUS]; +#define nr_running_inc(cpu) do { glob_task_nrs[cpu].nr_running++; } while (0) +#define nr_running_dec(cpu) do { glob_task_nrs[cpu].nr_running--; } while (0) +#define nr_unint_inc(cpu) do { glob_task_nrs[cpu].nr_unint++; } while (0) +#define nr_unint_dec(cpu) do { glob_task_nrs[cpu].nr_unint--; } while (0) +#define nr_stopped_inc(cpu) do { glob_task_nrs[cpu].nr_stopped++; } while (0) +#define nr_stopped_dec(cpu) do { glob_task_nrs[cpu].nr_stopped--; } while (0) +#define nr_sleeping_inc(cpu) do { glob_task_nrs[cpu].nr_sleeping++; } while (0) +#define nr_sleeping_dec(cpu) do { glob_task_nrs[cpu].nr_sleeping--; } while (0) +#define nr_iowait_inc(cpu) do { \ + atomic_inc(&glob_task_nrs[cpu].nr_iowait); \ + } while (0) +#define nr_iowait_dec(cpu) do { \ + atomic_dec(&glob_task_nrs[cpu].nr_iowait); \ + } while (0) + + +unsigned long nr_zombie = 0; /* protected by tasklist_lock */ +EXPORT_SYMBOL(nr_zombie); + +atomic_t nr_dead = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_dead); + #ifdef CONFIG_SCHEDSTATS + /* * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 12 -static int show_schedstat(struct seq_file *seq, void *v) +static int show_schedstat_vsched(struct seq_file *seq, + struct vcpu_scheduler *vsched) { int cpu; - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + seq_printf(seq, "vsched%d\n", vsched->id); + + for_each_cpu_mask (cpu, vsched_vcpu_online_map(vsched)) { + vcpu_t vcpu; + struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd; int dcnt = 0; #endif + vcpu = vsched_vcpu(vsched, cpu); + rq = vcpu_rq(vcpu); + /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", @@ -446,7 +1332,7 @@ static int show_schedstat(struct seq_fil #ifdef CONFIG_SMP /* domain-specific stats */ preempt_disable(); - for_each_domain(cpu, sd) { + for_each_domain(vcpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -476,6 +1362,20 @@ static int show_schedstat(struct seq_fil return 0; } +static int show_schedstat(struct seq_file *seq, void *v) +{ + struct vcpu_scheduler *vsched; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", 
jiffies); + + spin_lock(&vsched_list_lock); + list_for_each_entry (vsched, &vsched_list, list) + show_schedstat_vsched(seq, vsched); + spin_unlock(&vsched_list_lock); + return 0; +} + static int schedstat_open(struct inode *inode, struct file *file) { unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); @@ -633,17 +1533,18 @@ static inline void sched_info_depart(str static inline void __sched_info_switch(struct task_struct *prev, struct task_struct *next) { - struct rq *rq = task_rq(prev); + int cpu; + cpu = smp_processor_id(); /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. */ - if (prev != rq->idle) + if (prev != idle_task(cpu)) sched_info_depart(prev); - if (next != rq->idle) + if (next != idle_task(cpu)) sched_info_arrive(next); } static inline void @@ -832,11 +1733,25 @@ static int effective_prio(struct task_st static void __activate_task(struct task_struct *p, struct rq *rq) { struct prio_array *target = rq->active; - + cycles_t cycles; +#ifdef CONFIG_VE + struct ve_struct *ve; + + cycles = get_cycles(); + write_wakeup_stamp(p, cycles); + p->ve_task_info.sleep_time += cycles; + ve = VE_TASK_INFO(p)->owner_env; +#endif if (batch_task(p)) target = rq->expired; enqueue_task(p, target); inc_nr_running(p, rq); + ve_nr_running_inc(ve, task_cpu(p)); + nr_running_inc(smp_processor_id()); + if (rq->nr_running == 1) { + ve_stop_idle(ve, task_vcpu(p), cycles); + vcpu_attach(rq); + } } /* @@ -973,9 +1888,40 @@ static void activate_task(struct task_st */ static void deactivate_task(struct task_struct *p, struct rq *rq) { + cycles_t cycles; +#ifdef CONFIG_VE + unsigned int cpu, pcpu; + struct ve_struct *ve; + + cycles = get_cycles(); + cpu = task_cpu(p); + pcpu = smp_processor_id(); + ve = p->ve_task_info.owner_env; + + p->ve_task_info.sleep_time -= cycles; +#endif + if (p->state == TASK_UNINTERRUPTIBLE) { + ve_nr_unint_inc(ve, cpu); + nr_unint_inc(pcpu); + } + if (p->state == TASK_INTERRUPTIBLE) { + rq->nr_sleeping++; + nr_sleeping_inc(pcpu); + } + if (p->state == TASK_STOPPED) { + rq->nr_stopped++; + nr_stopped_inc(pcpu); + } + + ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu); + nr_running_dec(pcpu); dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; + if (rq->nr_running == 0) { + ve_strt_idle(ve, cpu, cycles); + vcpu_detach(rq); + } } /* @@ -991,18 +1937,22 @@ static void deactivate_task(struct task_ #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif +/* FIXME: need to add vsched arg */ static void resched_task(struct task_struct *p) { int cpu; +#if 0 + /* FIXME: this fails due to idle rq->curre == idle */ assert_spin_locked(&task_rq(p)->lock); +#endif if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; set_tsk_thread_flag(p, TIF_NEED_RESCHED); - cpu = task_cpu(p); + cpu = task_pcpu(p); if (cpu == smp_processor_id()) return; @@ -1014,7 +1964,10 @@ static void resched_task(struct task_str #else static inline void resched_task(struct task_struct *p) { +#if 0 + /* FIXME: this fails due to idle rq->curre == idle */ assert_spin_locked(&task_rq(p)->lock); +#endif set_tsk_need_resched(p); } #endif @@ -1025,21 +1978,39 @@ static inline void resched_task(struct t */ inline int task_curr(const struct task_struct *p) { - return cpu_curr(task_cpu(p)) == p; + return task_rq(p)->curr == p; } -/* Used instead of source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) +/** + * idle_cpu - is a given cpu idle currently? 
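+ * (Under CONFIG_SCHED_VCPU a physical CPU counts as idle when the
+ * dedicated idle vsched is the one currently scheduled on it.)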
+ * @cpu: the processor in question. + */ +inline int idle_cpu(int cpu) { - return cpu_rq(cpu)->raw_weighted_load; +#ifdef CONFIG_SCHED_VCPU + return pcpu(cpu)->vsched == &idle_vsched; +#else + return vcpu_rq(pcpu(cpu)->vcpu)->curr == pcpu(cpu)->idle; +#endif } -#ifdef CONFIG_SMP +EXPORT_SYMBOL_GPL(idle_cpu); + +static inline int idle_vcpu(vcpu_t cpu) +{ +#ifdef CONFIG_SCHED_VCPU + return !cpu->active; +#else + return idle_cpu(cpu->id); +#endif +} + +#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) struct migration_req { struct list_head list; struct task_struct *task; - int dest_cpu; + vcpu_t dest_cpu; struct completion done; }; @@ -1049,7 +2020,7 @@ struct migration_req { * Returns true if you have to wait for migration thread. */ static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +migrate_task(struct task_struct *p, vcpu_t dest_cpu, struct migration_req *req) { struct rq *rq = task_rq(p); @@ -1057,8 +2028,13 @@ migrate_task(struct task_struct *p, int * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ +#ifdef CONFIG_SCHED_VCPU + BUG_ON(task_vsched(p) == &idle_vsched); + BUG_ON(vcpu_vsched(dest_cpu) == &idle_vsched); +#endif if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); + set_task_vsched(p, vcpu_vsched(dest_cpu)); + set_task_vcpu(p, dest_cpu); return 0; } @@ -1099,6 +2075,7 @@ repeat: } task_rq_unlock(rq, &flags); } +EXPORT_SYMBOL_GPL(wait_task_inactive); /*** * kick_process - kick a running thread to enter/exit the kernel @@ -1118,12 +2095,18 @@ void kick_process(struct task_struct *p) int cpu; preempt_disable(); - cpu = task_cpu(p); + cpu = task_pcpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) + /* FIXME: ??? think over */ + /* should add something like get_pcpu(cpu)->vcpu->id == task_cpu(p), + but with serialization of vcpu access... */ smp_send_reschedule(cpu); preempt_enable(); } +#endif + +#ifdef CONFIG_SMP /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -1131,9 +2114,9 @@ void kick_process(struct task_struct *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static inline unsigned long source_load(vcpu_t cpu, int type) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = vcpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1145,9 +2128,9 @@ static inline unsigned long source_load( * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. */ -static inline unsigned long target_load(int cpu, int type) +static inline unsigned long target_load(vcpu_t cpu, int type) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = vcpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1158,9 +2141,9 @@ static inline unsigned long target_load( /* * Return the average load per task on the cpu's run queue */ -static inline unsigned long cpu_avg_load_per_task(int cpu) +static inline unsigned long cpu_avg_load_per_task(vcpu_t vcpu) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = vcpu_rq(vcpu); unsigned long n = rq->nr_running; return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; @@ -1171,33 +2154,35 @@ static inline unsigned long cpu_avg_load * domain. 
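 * (The group walk below still iterates physical CPUs, but each load
 * sample is taken through the VCPU currently bound to that PCPU,
 * via pcpu(i)->vcpu.)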
*/ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, vcpu_t this_cpu) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + struct vcpu_scheduler *vsched; + vcpu_t vcpu; + int this_pcpu; + vsched = vcpu_vsched(this_cpu); + this_pcpu = vcpu_last_pcpu(this_cpu); do { unsigned long load, avg_load; int local_group; int i; - /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - goto nextgroup; - - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpu_isset(this_pcpu, group->cpumask); /* Tally up the load of all CPUs in the group */ avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + vcpu = pcpu(i)->vcpu; /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(vcpu, load_idx); else - load = target_load(i, load_idx); + load = target_load(vcpu, load_idx); avg_load += load; } @@ -1212,7 +2197,6 @@ find_idlest_group(struct sched_domain *s min_load = avg_load; idlest = group; } -nextgroup: group = group->next; } while (group != sd->groups); @@ -1221,26 +2205,42 @@ nextgroup: return idlest; } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(vcpu_t vcpu) +{ + return vcpu_rq(vcpu)->raw_weighted_load; +} + /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +static vcpu_t +find_idlest_cpu(struct sched_group *group, struct task_struct *p, vcpu_t this_cpu) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + cpumask_t vmask; + struct vcpu_scheduler *vsched; + vcpu_t idlest = (vcpu_t)-1; + vcpu_t vcpu; int i; - /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + vsched = vcpu_vsched(this_cpu); + BUG_ON(vsched != task_vsched(p)); - for_each_cpu_mask(i, tmp) { - load = weighted_cpuload(i); + cpus_and(vmask, vsched_vcpu_online_map(vsched), p->cpus_allowed); + for_each_cpu_mask(i, vmask) { + vcpu = vsched_vcpu(vsched, i); + + if (!cpu_isset(vcpu_last_pcpu(vcpu), group->cpumask)) + continue; + if (vcpu_is_offline(vcpu)) + continue; - if (load < min_load || (load == min_load && i == this_cpu)) { + load = weighted_cpuload(vcpu); + + if (load < min_load || (load == min_load && vcpu == this_cpu)) { min_load = load; - idlest = i; + idlest = vcpu; } } @@ -1258,7 +2258,7 @@ find_idlest_cpu(struct sched_group *grou * * preempt must be disabled. */ -static int sched_balance_self(int cpu, int flag) +static vcpu_t sched_balance_self(vcpu_t cpu, int flag) { struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; @@ -1276,7 +2276,7 @@ static int sched_balance_self(int cpu, i while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; + vcpu_t new_cpu; int weight; span = sd->span; @@ -1285,7 +2285,7 @@ static int sched_balance_self(int cpu, i goto nextlevel; new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) + if (new_cpu == (vcpu_t)(-1) || new_cpu == cpu) goto nextlevel; /* Now try balancing at a lower domain level */ @@ -1316,21 +2316,27 @@ nextlevel: * Returns the CPU we should wake onto. 
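 * (Here the returned "CPU" is a VCPU: the scan covers the vsched's
 * online VCPU map, and a VCPU only qualifies when its last physical
 * CPU lies within the domain's span.)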
*/ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) +static vcpu_t wake_idle(vcpu_t cpu, struct task_struct *p) { - cpumask_t tmp; + cpumask_t vtmp; struct sched_domain *sd; + struct vcpu_scheduler *vsched; int i; - if (idle_cpu(cpu)) + if (idle_vcpu(cpu)) return cpu; + vsched = vcpu_vsched(cpu); + cpus_and(vtmp, vsched_vcpu_online_map(vsched), p->cpus_allowed); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) - return i; + for_each_cpu_mask(i, vtmp) { + vcpu_t vcpu; + vcpu = vsched_vcpu(vsched, i); + if (!cpu_isset(vcpu_last_pcpu(vcpu), sd->span)) + continue; + if (idle_vcpu(vcpu)) + return vcpu; } } else @@ -1339,7 +2345,7 @@ static int wake_idle(int cpu, struct tas return cpu; } #else -static inline int wake_idle(int cpu, struct task_struct *p) +static inline vcpu_t wake_idle(vcpu_t cpu, struct task_struct *p) { return cpu; } @@ -1361,15 +2367,17 @@ static inline int wake_idle(int cpu, str */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { - int cpu, this_cpu, success = 0; + vcpu_t cpu, this_cpu; + int success = 0; unsigned long flags; long old_state; struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd, *this_sd = NULL; unsigned long load, this_load; - int new_cpu; + vcpu_t new_cpu; #endif + cpu = NULL; rq = task_rq_lock(p, &flags); old_state = p->state; @@ -1379,8 +2387,8 @@ static int try_to_wake_up(struct task_st if (p->array) goto out_running; - cpu = task_cpu(p); - this_cpu = smp_processor_id(); + cpu = task_vcpu(p); + this_cpu = this_vcpu(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) @@ -1389,20 +2397,25 @@ static int try_to_wake_up(struct task_st new_cpu = cpu; schedstat_inc(rq, ttwu_cnt); + /* FIXME: add vsched->last_vcpu array to optimize wakeups in different vsched */ + if (vcpu_vsched(cpu) != vcpu_vsched(this_cpu)) + goto out_set_cpu; if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); goto out_set_cpu; } for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(vcpu_last_pcpu(cpu), sd->span)) { schedstat_inc(sd, ttwu_wake_remote); this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!vcpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + if (vcpu_is_offline(this_cpu)) goto out_set_cpu; /* @@ -1460,7 +2473,7 @@ static int try_to_wake_up(struct task_st out_set_cpu: new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu) { - set_task_cpu(p, new_cpu); + set_task_vcpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); @@ -1470,13 +2483,21 @@ out_set_cpu: if (p->array) goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); + this_cpu = this_vcpu(); + cpu = task_vcpu(p); } out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_INTERRUPTIBLE) { + nr_sleeping_dec(smp_processor_id()); + rq->nr_sleeping--; + } else if (old_state == TASK_STOPPED) { + nr_stopped_dec(smp_processor_id()); + rq->nr_stopped--; + } else if (old_state == TASK_UNINTERRUPTIBLE) { + nr_unint_dec(smp_processor_id()); + ve_nr_unint_dec(p->ve_task_info.owner_env, task_cpu(p)); rq->nr_uninterruptible--; /* * Tasks on involuntary sleep don't earn @@ -1530,17 +2551,45 @@ int fastcall wake_up_state(struct task_s } /* + * init is special, it is forked from swapper (idle_vsched) and should + * belong to default_vsched, so we have to change 
it's vsched/fairsched manually + */ +static void wake_up_init(struct task_struct *p) +{ + struct rq *rq; + unsigned long flags; + + /* we should change both fairsched node and vsched here */ + set_task_vsched(p, &default_vsched); + set_task_cpu(p, raw_smp_processor_id()); + + /* + * can't call wake_up_new_task() directly here, + * since it assumes that a child belongs to the same vsched + */ + p->state = TASK_RUNNING; + p->sleep_avg = 0; + p->prio = effective_prio(p); + + rq = task_rq_lock(p, &flags); + __activate_task(p, rq); + task_rq_unlock(rq, &flags); +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ void fastcall sched_fork(struct task_struct *p, int clone_flags) { - int cpu = get_cpu(); - + vcpu_t cpu; + + preempt_disable(); + cpu = this_vcpu(); #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif - set_task_cpu(p, cpu); + set_task_vcpu(p, cpu); /* * We mark the process as running here, but have not actually @@ -1582,6 +2631,10 @@ void fastcall sched_fork(struct task_str p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); +#ifdef CONFIG_VE + /*cosmetic: sleep till wakeup below*/ + p->ve_task_info.sleep_time -= get_cycles(); +#endif if (unlikely(!current->time_slice)) { /* * This case is rare, it happens when the parent has only @@ -1592,7 +2645,7 @@ void fastcall sched_fork(struct task_str scheduler_tick(); } local_irq_enable(); - put_cpu(); + preempt_enable(); } /* @@ -1606,12 +2659,19 @@ void fastcall wake_up_new_task(struct ta { struct rq *rq, *this_rq; unsigned long flags; - int this_cpu, cpu; + vcpu_t this_cpu, cpu; + + if (unlikely(p->pid == 1)) { + /* FIXME - fastpath */ + wake_up_init(p); + return; + } rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); - cpu = task_cpu(p); + BUG_ON(task_vsched(current) != task_vsched(p)); + this_cpu = this_vcpu(); + cpu = task_vcpu(p); /* * We decrease the sleep average of forking parents @@ -1640,6 +2700,9 @@ void fastcall wake_up_new_task(struct ta p->array = current->array; p->array->nr_active++; inc_nr_running(p, rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, + task_cpu(p)); + nr_running_inc(smp_processor_id()); } set_need_resched(); } else @@ -1653,7 +2716,7 @@ void fastcall wake_up_new_task(struct ta */ this_rq = rq; } else { - this_rq = cpu_rq(this_cpu); + this_rq = vcpu_rq(this_cpu); /* * Not the local CPU - must adjust timestamp. This should @@ -1696,7 +2759,7 @@ void fastcall sched_exit(struct task_str * the sleep_avg of the parent as well. 
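 * (With VCPUs the leftover timeslice is handed back only when child
 * and parent sit on the same VCPU, not merely the same physical CPU.)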
*/ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + if (p->first_time_slice && task_vcpu(p) == task_vcpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -1763,6 +2826,7 @@ static inline void finish_task_switch(st prev_task_flags = prev->flags; finish_arch_switch(prev); finish_lock_switch(rq, prev); + if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) { @@ -1790,8 +2854,9 @@ asmlinkage void schedule_tail(struct tas preempt_enable(); #endif if (current->set_child_tid) - put_user(current->pid, current->set_child_tid); + put_user(virt_pid(current), current->set_child_tid); } +EXPORT_SYMBOL_GPL(schedule_tail); /* * context_switch - switch to the new MM and the new @@ -1841,78 +2906,175 @@ context_switch(struct rq *rq, struct tas */ unsigned long nr_running(void) { - unsigned long i, sum = 0; + unsigned long i, sum; + + sum = 0; + for_each_online_cpu(i) + sum += glob_task_nrs[i].nr_running; + + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} +EXPORT_SYMBOL(nr_running); + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum; + + sum = 0; + for_each_online_cpu(i) + sum += glob_task_nrs[i].nr_unint; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +EXPORT_SYMBOL(nr_uninterruptible); + +unsigned long long nr_context_switches(void) +{ + int i; + unsigned long long sum; + + sum = 0; + for_each_online_cpu(i) + sum += glob_task_nrs[i].nr_switches; + + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} + +EXPORT_SYMBOL(nr_context_switches); + +unsigned long nr_iowait(void) +{ + unsigned long i, sum; + + sum = 0; + for_each_online_cpu(i) + sum += atomic_read(&glob_task_nrs[i].nr_iowait); + + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} + +unsigned long nr_active(void) +{ + unsigned long i, running = 0, uninterruptible = 0; + + for_each_online_cpu(i) { + running += glob_task_nrs[i].nr_running; + uninterruptible += glob_task_nrs[i].nr_unint; + } + + if (unlikely((long)uninterruptible < 0)) + uninterruptible = 0; + if (unlikely((long)running < 0)) + running = 0; + + return running + uninterruptible; +} + +EXPORT_SYMBOL(nr_iowait); + +unsigned long nr_stopped(void) +{ + unsigned long i, sum; + sum = 0; for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; + sum += glob_task_nrs[i].nr_stopped; + + if (unlikely((long)sum < 0)) + sum = 0; return sum; } -unsigned long nr_uninterruptible(void) +EXPORT_SYMBOL(nr_stopped); + +unsigned long nr_sleeping(void) { - unsigned long i, sum = 0; + unsigned long i, sum; - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; + sum = 0; + for_each_online_cpu(i) + sum += glob_task_nrs[i].nr_sleeping; - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ if (unlikely((long)sum < 0)) sum = 0; return sum; } -unsigned long long nr_context_switches(void) +EXPORT_SYMBOL(nr_sleeping); + +#ifdef CONFIG_VE +unsigned long nr_running_ve(struct ve_struct *ve) { int i; - unsigned long long sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; + long sum; - return sum; + sum = 0; + for_each_online_cpu(i) + sum += VE_CPU_STATS(ve, i)->nr_running; + return (unsigned long)(sum < 0 ? 
0 : sum); } -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; +EXPORT_SYMBOL(nr_running_ve); - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); +unsigned long nr_uninterruptible_ve(struct ve_struct *ve) +{ + int i; + long sum; - return sum; + sum = 0; + for_each_online_cpu(i) + sum += VE_CPU_STATS(ve, i)->nr_unint; + return (unsigned long)(sum < 0 ? 0 : sum); } -unsigned long nr_active(void) -{ - unsigned long i, running = 0, uninterruptible = 0; +EXPORT_SYMBOL(nr_uninterruptible_ve); - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +unsigned long nr_iowait_ve(void) +{ + long sum = 0; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; +#ifdef CONFIG_SCHED_VCPU + int i; + struct vcpu_scheduler *vsched; + vsched = this_vsched(); + for_each_cpu_mask(i, vsched_vcpu_online_map(vsched)) { + struct rq *rq; - return running + uninterruptible; + rq = vcpu_rq(vsched_vcpu(vsched, i)); + sum += atomic_read(&rq->nr_iowait); + } +#endif + return (unsigned long)(sum < 0 ? 0 : sum); } -#ifdef CONFIG_SMP +EXPORT_SYMBOL(nr_iowait_ve); +#endif +#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) /* - * Is this task likely cache-hot: + * This has calready hanged two times since 2.6.16 started, so + * let's keep generic rq_compare() to handle it next time + * SCHED_VCPU has many rq-s so somparing of their ->cpu-s + * doesn't work as expected. */ -static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) -{ - return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; -} - +#define rq_compare(rq1, rq2) (rq1 < rq2) /* * double_rq_lock - safely lock two runqueues * @@ -1923,16 +3085,25 @@ static void double_rq_lock(struct rq *rq __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1 < rq2) { + if (rq_compare(rq1, rq2)) { spin_lock(&rq1->lock); +#ifdef CONFIG_SCHED_VCPU + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); +#else spin_lock(&rq2->lock); +#endif } else { spin_lock(&rq2->lock); +#ifdef CONFIG_SCHED_VCPU + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); +#else spin_lock(&rq1->lock); +#endif } } } @@ -1955,38 +3126,20 @@ static void double_rq_unlock(struct rq * } /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static void double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); - } else - spin_lock(&busiest->lock); - } -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then * the cpu_allowed mask is restored. 
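 * (dest_cpu is a vcpu_t here; the request is abandoned early when the
 * destination VCPU is offline or absent from the task's cpus_allowed
 * mask.)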
*/ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +static void sched_migrate_task(struct task_struct *p, vcpu_t dest_cpu) { struct migration_req req; unsigned long flags; struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!vcpu_isset(dest_cpu, p->cpus_allowed) + || vcpu_is_offline(dest_cpu))) goto out; /* force the process onto the specified CPU */ @@ -2005,6 +3158,49 @@ static void sched_migrate_task(struct ta out: task_rq_unlock(rq, &flags); } +#endif + +#ifdef CONFIG_SMP + +/* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +{ + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; +} + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static void double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (rq_compare(busiest, this_rq)) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); +#ifdef CONFIG_SCHED_VCPU + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); +#else + spin_lock(&this_rq->lock); +#endif + } else +#ifdef CONFIG_SCHED_VCPU + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); +#else + spin_lock(&busiest->lock); +#endif + } +} /* * sched_exec - execve() is a valuable balancing opportunity, because at @@ -2012,9 +3208,12 @@ out: */ void sched_exec(void) { - int new_cpu, this_cpu = get_cpu(); + vcpu_t new_cpu, this_cpu; + + preempt_disable(); + this_cpu = this_vcpu(); new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); - put_cpu(); + preempt_enable(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); } @@ -2025,11 +3224,31 @@ void sched_exec(void) */ static void pull_task(struct rq *src_rq, struct prio_array *src_array, struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) + struct prio_array *this_array, vcpu_t this_cpu) { + cycles_t cycles; + int cpu; +#ifdef CONFIG_VE + struct ve_struct *ve; + + ve = VE_TASK_INFO(p)->owner_env; +#endif + cycles = get_cycles(); + dequeue_task(p, src_array); dec_nr_running(p, src_rq); - set_task_cpu(p, this_cpu); + cpu = task_cpu(p); + ve_nr_running_dec(ve, cpu); + if (src_rq->nr_running == 0) { + ve_strt_idle(ve, cpu, cycles); + vcpu_detach(src_rq); + } + set_task_vcpu(p, this_cpu); + if (this_rq->nr_running == 0) { + ve_stop_idle(ve, this_cpu, cycles); + vcpu_attach(this_rq); + } + ve_nr_running_inc(ve, task_cpu(p)); inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) @@ -2046,7 +3265,7 @@ static void pull_task(struct rq *src_rq, * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, +int can_migrate_task(struct task_struct *p, struct rq *rq, vcpu_t this_cpu, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { @@ -2056,7 +3275,7 @@ int can_migrate_task(struct task_struct * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
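 * (Under SCHED_VCPU the affinity check in (2) is made against the
 * destination VCPU id via vcpu_isset(), and move_tasks() additionally
 * refuses to pull anything onto an offline VCPU.)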
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) + if (!vcpu_isset(this_cpu, p->cpus_allowed)) return 0; *all_pinned = 0; @@ -2086,7 +3305,7 @@ int can_migrate_task(struct task_struct * * Called with both runqueues locked. */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +static int move_tasks(struct rq *this_rq, vcpu_t this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum idle_type idle, int *all_pinned) @@ -2098,6 +3317,8 @@ static int move_tasks(struct rq *this_rq struct task_struct *tmp; long rem_load_move; + if (vcpu_is_offline(this_cpu)) + goto out; if (max_nr_move == 0 || max_load_move == 0) goto out; @@ -2210,8 +3431,9 @@ out: * should be moved to restore balance via the imbalance parameter. */ static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle) +find_busiest_group(struct sched_domain *sd, vcpu_t this_cpu, + unsigned long *imbalance, enum idle_type idle, int *sd_idle, + cpumask_t *cpus) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2225,6 +3447,11 @@ find_busiest_group(struct sched_domain * unsigned long min_nr_running = ULONG_MAX; struct sched_group *group_min = NULL, *group_leader = NULL; #endif + struct vcpu_scheduler *vsched; + int this_pcpu; + + vsched = vcpu_vsched(this_cpu); + this_pcpu = vcpu_last_pcpu(this_cpu); max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; @@ -2237,27 +3464,31 @@ find_busiest_group(struct sched_domain * load_idx = sd->idle_idx; do { + cpumask_t tmp; unsigned long load, group_capacity; int local_group; int i; unsigned long sum_nr_running, sum_weighted_load; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpu_isset(this_pcpu, group->cpumask); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + cpus_and(tmp, group->cpumask, vsched_pcpu_running_map(vsched)); + cpus_and(tmp, tmp, *cpus); - for_each_cpu_mask(i, group->cpumask) { - struct rq *rq = cpu_rq(i); + for_each_cpu_mask(i, tmp) { + vcpu_t vcpu = pcpu(i)->vcpu; + struct rq *rq = vcpu_rq(vcpu); if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(vcpu, load_idx); else - load = source_load(i, load_idx); + load = source_load(vcpu, load_idx); avg_load += load; sum_nr_running += rq->nr_running; @@ -2345,6 +3576,8 @@ group_next: if (!busiest || this_load >= max_load || busiest_nr_running == 0) goto out_balanced; + if (!this) + this = busiest; /* this->cpu_power is needed below */ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -2464,26 +3697,33 @@ ret: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. 
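 * (The VCPU variant returns a vcpu_t instead of a runqueue pointer:
 * it walks the vsched's online VCPU map and filters candidates by the
 * group's physical-CPU mask.)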
*/ -static struct rq * -find_busiest_queue(struct sched_group *group, enum idle_type idle, - unsigned long imbalance) +static vcpu_t find_busiest_queue(vcpu_t this_vcpu, struct sched_group *group, + enum idle_type idle, unsigned long imbalance, cpumask_t *cpus) { - struct rq *busiest = NULL, *rq; + struct vcpu_scheduler *vsched; + vcpu_t vcpu, busiest = NULL; + struct rq *rq; + cpumask_t tmp; unsigned long max_load = 0; int i; - for_each_cpu_mask(i, group->cpumask) { - rq = cpu_rq(i); + vsched = vcpu_vsched(this_vcpu); + cpus_and(tmp, group->cpumask, *cpus); + + for_each_cpu_mask(i, vsched_vcpu_online_map(vsched)) { + vcpu = vsched_vcpu(vsched, i); + if (!cpu_isset(vcpu_last_pcpu(vcpu), tmp)) + continue; + rq = vcpu_rq(vcpu); if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) continue; if (rq->raw_weighted_load > max_load) { max_load = rq->raw_weighted_load; - busiest = rq; + busiest = vcpu; } } - return busiest; } @@ -2504,13 +3744,15 @@ static inline unsigned long minus_1_or_z * * Called with this_rq unlocked. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int load_balance(vcpu_t this_cpu, struct rq *this_rq, struct sched_domain *sd, enum idle_type idle) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; + vcpu_t busiest_vcpu; unsigned long imbalance; struct rq *busiest; + cpumask_t cpus = CPU_MASK_ALL; if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) @@ -2518,19 +3760,36 @@ static int load_balance(int this_cpu, st schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); +redo: +#ifdef CONFIG_SCHED_VCPU + if (likely(vcpu_vsched(this_cpu) == &idle_vsched)) { + /* + * Find idle vcpu to balance to + */ + this_cpu = find_idle_target(&cpus); + if (!this_cpu) + goto out_balanced; + this_rq = vcpu_rq(this_cpu); + } +#endif + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance); - if (!busiest) { + busiest_vcpu = find_busiest_queue(this_cpu, group, idle, + imbalance, &cpus); + if (!busiest_vcpu) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; } - BUG_ON(busiest == this_rq); + busiest = vcpu_rq(busiest_vcpu); + + if (unlikely(busiest == this_rq)) + goto out_balanced; schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -2549,8 +3808,12 @@ static int load_balance(int this_cpu, st double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) + if (unlikely(all_pinned)) { + cpu_clear(cpu_of(busiest), cpus); + if (!cpus_empty(cpus)) + goto redo; goto out_balanced; + } } if (!nr_moved) { @@ -2564,7 +3827,7 @@ static int load_balance(int this_cpu, st /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!vcpu_isset(this_cpu, busiest->curr->cpus_allowed)) { spin_unlock(&busiest->lock); all_pinned = 1; goto out_one_pinned; @@ -2632,31 +3895,35 @@ out_one_pinned: * this_rq is locked. 
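 * (As in load_balance(), when nothing could be moved the busiest CPU
 * is cleared from the candidate mask and the search is retried until
 * the mask is empty.)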
*/ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(vcpu_t this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; - struct rq *busiest = NULL; + struct rq *busiest; + vcpu_t busiest_vcpu; unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; + cpumask_t cpus = CPU_MASK_ALL; if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + &sd_idle, &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); - if (!busiest) { + busiest_vcpu = find_busiest_queue(this_cpu, group, NEWLY_IDLE, + imbalance, &cpus); + if (!busiest_vcpu || busiest_vcpu == this_cpu) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; } - - BUG_ON(busiest == this_rq); + busiest = vcpu_rq(busiest_vcpu); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); @@ -2668,6 +3935,12 @@ load_balance_newidle(int this_cpu, struc minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); + + if (!nr_moved) { + cpu_clear(cpu_of(busiest), cpus); + if (!cpus_empty(cpus)) + goto redo; + } } if (!nr_moved) { @@ -2692,8 +3965,11 @@ out_balanced: /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. + * + * Returns whether to continue with another runqueue + * instead of switching to idle. */ -static void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(vcpu_t this_cpu, struct rq *this_rq) { struct sched_domain *sd; @@ -2701,9 +3977,10 @@ static void idle_balance(int this_cpu, s if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ if (load_balance_newidle(this_cpu, this_rq, sd)) - break; + return 1; } } + return 0; } /* @@ -2713,10 +3990,18 @@ static void idle_balance(int this_cpu, s * logical imbalances. * * Called with busiest_rq locked. + * + * In human terms: balancing of CPU load by moving tasks between CPUs is + * performed by 2 methods, push and pull. + * In certain places when CPU is found to be idle, it performs pull from busy + * CPU to current (idle) CPU. + * active_load_balance implements push method, with migration thread getting + * scheduled on a busy CPU (hence, making all running processes on this CPU sit + * in the queue) and selecting where to push and which task. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static void active_load_balance(struct rq *busiest_rq, vcpu_t busiest_cpu) { - int target_cpu = busiest_rq->push_cpu; + vcpu_t target_cpu = busiest_rq->push_cpu; struct sched_domain *sd; struct rq *target_rq; @@ -2724,7 +4009,7 @@ static void active_load_balance(struct r if (busiest_rq->nr_running <= 1) return; - target_rq = cpu_rq(target_cpu); + target_rq = vcpu_rq(target_cpu); /* * This condition is "impossible", if it occurs @@ -2736,10 +4021,17 @@ static void active_load_balance(struct r /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + /* + * Our main candidate where to push our tasks is busiest->push_cpu. + * First, find the domain that spans over both that candidate CPU and + * the current one. + * + * FIXME: make sure that push_cpu doesn't disappear before we get here. 
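+ *
+ * (Editorial sketch, not part of the original patch.) Pull vs. push in
+ * miniature, with names from this file:
+ *
+ *	pull: the idle cpu itself runs
+ *		idle_balance() -> load_balance_newidle()
+ *	      and moves tasks *to* its own runqueue;
+ *	push: the busy cpu's migration thread runs
+ *		active_load_balance(busiest_rq, busiest_cpu)
+ *	      and moves one task *away*, toward busiest_rq->push_cpu.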
+ */ /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpu_isset(vcpu_last_pcpu(busiest_cpu), sd->span)) break; } @@ -2772,31 +4064,19 @@ static inline unsigned long cpu_offset(i } static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +rebalance_tick(vcpu_t this_cpu, struct rq *this_rq, enum idle_type idle) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); + unsigned long j; struct sched_domain *sd; - int i, scale; - this_load = this_rq->raw_weighted_load; /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { - unsigned long old_load, new_load; - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; - } + update_rq_cpu_load(this_rq); + j = jiffies + cpu_offset(smp_processor_id()); for_each_domain(this_cpu, sd) { + unsigned long interval; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2826,18 +4106,20 @@ rebalance_tick(int this_cpu, struct rq * /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) +static inline void rebalance_tick(vcpu_t cpu, struct rq *rq, enum idle_type idle) { } -static inline void idle_balance(int cpu, struct rq *rq) +static inline int idle_balance(vcpu_t cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline int wake_priority_sleeper(struct rq *rq, struct task_struct *idle) { int ret = 0; +#ifndef CONFIG_SCHED_VCPU + /* FIXME: can we implement SMT priority sleeping for this? */ #ifdef CONFIG_SCHED_SMT spin_lock(&rq->lock); /* @@ -2845,11 +4127,13 @@ static inline int wake_priority_sleeper( * reasons reschedule the idle task to see if it can now run. */ if (rq->nr_running) { - resched_task(rq->idle); + /* FIXME */ + resched_task(idle); ret = 1; } spin_unlock(&rq->lock); #endif +#endif return ret; } @@ -2905,6 +4189,15 @@ static inline int expired_starving(struc return 0; } +#ifdef CONFIG_VE +#define update_ve_cpu_time(p, time, tick) do { \ + VE_CPU_STATS((p)->ve_task_info.owner_env, \ + task_cpu(p))->time += tick; \ + } while (0) +#else +#define update_ve_cpu_time(p, time, tick) do { } while (0) +#endif + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -2920,10 +4213,13 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0) { cpustat->nice = cputime64_add(cpustat->nice, tmp); - else + update_ve_cpu_time(p, nice, tmp); + } else { cpustat->user = cputime64_add(cpustat->user, tmp); + update_ve_cpu_time(p, user, tmp); + } } /* @@ -2936,20 +4232,22 @@ void account_system_time(struct task_str cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); + int this_pcpu = raw_smp_processor_id(); cputime64_t tmp; p->stime = cputime_add(p->stime, cputime); + tmp = cputime_to_cputime64(cputime); + + update_ve_cpu_time(p, system, tmp); /* Add system time to cpustat. 
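 *
 * (Editorial note, not part of the original patch.) Under CONFIG_VE each
 * tick is charged twice: to the global cpustat below and to the owning VE
 * via update_ve_cpu_time(). Expanded, the macro call above amounts to
 * roughly:
 *
 *	VE_CPU_STATS(p->ve_task_info.owner_env,
 *			task_cpu(p))->system += tmp;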
*/ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) + else if (p != this_pcpu()->idle) cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) + else if ((atomic_read(&glob_task_nrs[this_pcpu].nr_iowait) > 0)) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else cpustat->idle = cputime64_add(cpustat->idle, tmp); @@ -2968,7 +4266,7 @@ void account_steal_time(struct task_stru cputime64_t tmp = cputime_to_cputime64(steal); struct rq *rq = this_rq(); - if (p == rq->idle) { + if (p == this_pcpu()->idle) { p->stime = cputime_add(p->stime, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); @@ -2990,16 +4288,21 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); + vcpu_t vcpu; + struct rq *rq; + vcpu = this_vcpu(); + rq = vcpu_rq(vcpu); update_cpu_clock(p, rq, now); rq->timestamp_last_tick = now; - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) + set_tsk_need_resched(p); //FIXME + + if (p == pcpu(cpu)->idle) { + if (wake_priority_sleeper(rq, pcpu(cpu)->idle)) goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); + rebalance_tick(vcpu, rq, SCHED_IDLE); return; } @@ -3075,10 +4378,14 @@ void scheduler_tick(void) out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(cpu, rq, NOT_IDLE); + rebalance_tick(vcpu, rq, NOT_IDLE); } -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_SMT) && !defined(CONFIG_SCHED_VCPU) +/* FIXME: SMT scheduling + * rq->cpu is initialized with rq address if FAIRSCED is on + * this is not correct for SMT case + */ static inline void wakeup_busy_runqueue(struct rq *rq) { /* If an SMT runqueue is sleeping due to priority reasons wake it up */ @@ -3089,7 +4396,7 @@ static inline void wakeup_busy_runqueue( /* * Called with interrupt disabled and this_rq's runqueue locked. */ -static void wake_sleeping_dependent(int this_cpu) +static void wake_sleeping_dependent(vcpu_t this_cpu) { struct sched_domain *tmp, *sd = NULL; int i; @@ -3135,7 +4442,7 @@ smt_slice(struct task_struct *p, struct * need to be obeyed. */ static int -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) +dependent_sleeper(vcpu_t this_cpu, struct task_struct *p) { struct sched_domain *tmp, *sd = NULL; int ret = 0, i; @@ -3198,11 +4505,11 @@ unlock: return ret; } #else -static inline void wake_sleeping_dependent(int this_cpu) +static inline void wake_sleeping_dependent(vcpu_t this_cpu) { } static inline int -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) +dependent_sleeper(vcpu_t this_cpu, struct task_struct *p) { return 0; } @@ -3261,7 +4568,9 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx, new_prio; + int idx, new_prio; + vcpu_t vcpu; + cycles_t cycles; long *switch_count; struct rq *rq; @@ -3283,13 +4592,14 @@ need_resched: prev = current; release_kernel_lock(prev); need_resched_nonpreemptible: + cycles = get_cycles(); rq = this_rq(); /* * The idle thread is not allowed to schedule! * Remove this check after it has been exercised a bit. 
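 *
 * (Editorial note, not part of the original patch.) With VCPUs there is
 * one idle task per *physical* cpu, kept in pcpu(cpu)->idle, while
 * runqueues belong to vcpus; hence checks like the one below become:
 *
 *	if (prev == this_pcpu()->idle)	/* was: prev == rq->idle */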
*/ - if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + if (unlikely(prev == this_pcpu()->idle) && prev->state != TASK_RUNNING) { printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); } @@ -3327,17 +4637,32 @@ need_resched_nonpreemptible: } } - cpu = smp_processor_id(); + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) + prev->sleep_avg = 0; + + vcpu = rq_vcpu(rq); + if (rq->nr_running && vcpu_is_hot(vcpu)) + goto same_vcpu; + + if (unlikely(!rq->nr_running)) + idle_balance(vcpu, rq); + vcpu = schedule_vcpu(vcpu, cycles); + rq = vcpu_rq(vcpu); + if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu); - goto switch_tasks; - } + next = this_pcpu()->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(vcpu); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + goto switch_tasks; } +same_vcpu: array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -3373,30 +4698,52 @@ need_resched_nonpreemptible: } } next->sleep_type = SLEEP_NORMAL; - if (dependent_sleeper(cpu, rq, next)) - next = rq->idle; + if (dependent_sleeper(vcpu, next)) + next = this_pcpu()->idle; + switch_tasks: - if (next == rq->idle) + if (next == this_pcpu()->idle) schedstat_inc(rq, sched_goidle); prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); + rcu_qsctr_inc(task_pcpu(prev)); update_cpu_clock(prev, rq, now); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; + /* updated w/o rq->lock, which is ok due to after-read-checks */ prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { + cycles_t cycles; + + /* current physical CPU id should be valid after switch */ + set_task_vcpu(next, vcpu); + set_task_pcpu(next, task_pcpu(prev)); + cycles = get_cycles(); next->timestamp = now; rq->nr_switches++; + glob_task_nrs[smp_processor_id()].nr_switches++; rq->curr = next; ++*switch_count; +#ifdef CONFIG_VE + prev->ve_task_info.sleep_stamp = cycles; + if (prev->state == TASK_RUNNING && prev != this_pcpu()->idle) + write_wakeup_stamp(prev, cycles); + update_sched_lat(next, cycles); + + /* because next & prev are protected with + * runqueue lock we may not worry about + * wakeup_stamp and sched_time protection + * (same thing in 'else' branch below) + */ + update_ve_task_info(prev, cycles); + next->ve_task_info.sched_time = cycles; + write_wakeup_stamp(next, 0); +#endif + prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); @@ -3406,8 +4753,10 @@ switch_tasks: * frame will be invalid. */ finish_task_switch(this_rq(), prev); - } else + } else { + update_ve_task_info(prev, get_cycles()); spin_unlock_irq(&rq->lock); + } prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) @@ -3993,30 +5342,12 @@ int task_nice(const struct task_struct * EXPORT_SYMBOL_GPL(task_nice); /** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. 
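 *
 * (Editorial note, not part of the original patch.) Lookups are now
 * VE-aware: find_task_by_pid_ve() resolves @pid inside the caller's VE,
 * so a container only ever reaches its own tasks:
 *
 *	p = find_task_by_pid_ve(pid);	/* was: find_task_by_pid(pid) */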
*/ static inline struct task_struct *find_process_by_pid(pid_t pid) { - return pid ? find_task_by_pid(pid) : current; + return pid ? find_task_by_pid_ve(pid) : current; } /* Actually do priority change: must hold rq lock. */ @@ -4077,7 +5408,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (!capable(CAP_SYS_ADMIN)) { /* * can't change policy, except between SCHED_NORMAL * and SCHED_BATCH: @@ -4558,11 +5889,20 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = this_rq(); + int cpu; +#ifdef CONFIG_VE + struct ve_struct *ve; + ve = current->ve_task_info.owner_env; +#endif + + cpu = raw_smp_processor_id(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + nr_iowait_inc(cpu); schedule(); + nr_iowait_dec(cpu); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -4570,12 +5910,21 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = this_rq(); long ret; + int cpu; +#ifdef CONFIG_VE + struct ve_struct *ve; + ve = current->ve_task_info.owner_env; +#endif + + cpu = raw_smp_processor_id(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + nr_iowait_inc(cpu); ret = schedule_timeout(timeout); + nr_iowait_dec(cpu); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -4700,15 +6049,9 @@ static void show_task(struct task_struct printk("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) - printk(" running "); - else - printk(" %08lX ", thread_saved_pc(p)); + printk(" %08lX ", (unsigned long)p); #else - if (state == TASK_RUNNING) - printk(" running task "); - else - printk(" %016lx ", thread_saved_pc(p)); + printk(" %016lx ", (unsigned long)p); #endif #ifdef CONFIG_DEBUG_STACK_USAGE { @@ -4747,26 +6090,43 @@ void show_state(void) #if (BITS_PER_LONG == 32) printk("\n" " sibling\n"); - printk(" task PC pid father child younger older\n"); + printk(" task taskaddr pid father child younger older\n"); #else printk("\n" " sibling\n"); - printk(" task PC pid father child younger older\n"); + printk(" task taskaddr pid father child younger older\n"); #endif read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: */ touch_nmi_watchdog(); show_task(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); debug_show_all_locks(); } +#ifdef CONFIG_SCHED_VCPU +static void init_boot_vcpus(long cpu) +{ + if (vsched_vcpu(&idle_vsched, cpu) != NULL) + return; + + if (__add_vcpu(&idle_vsched, cpu) != 0) + panic("Can't create idle vcpu %ld\n", cpu); + + /* Also create vcpu for default_vsched */ + if (__add_vcpu(&default_vsched, cpu) != 0) + panic("Can't create default vcpu %ld\n", cpu); + + cpu_set(cpu, idle_vsched.pcpu_running_map); +} +#endif + /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -4777,22 +6137,51 @@ void show_state(void) */ void __devinit init_idle(struct task_struct *idle, int cpu) { - struct rq *rq = cpu_rq(cpu); + struct vcpu_scheduler *vsched; + vcpu_t vcpu; + struct rq *rq; unsigned long flags; +#ifdef CONFIG_SCHED_VCPU + init_boot_vcpus(cpu); + vsched = &idle_vsched; +#else + vsched = NULL; +#endif + vcpu = vsched_vcpu(vsched, cpu); + rq = vcpu_rq(vcpu); + idle->timestamp = 
sched_clock(); idle->sleep_avg = 0; idle->array = NULL; idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_vsched(idle, &idle_vsched); set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); - rq->curr = rq->idle = idle; + pcpu(cpu)->idle = idle; + rq->curr = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; #endif + set_task_pcpu(idle, cpu); + set_task_vsched(idle, vsched); + set_task_vcpu(idle, vcpu); +#ifdef CONFIG_SCHED_VCPU + /* the following code is very close to vcpu_get */ + spin_lock(&fairsched_lock); + pcpu(cpu)->vcpu = vcpu; + pcpu(cpu)->vsched = vcpu->vsched; + list_move_tail(&vcpu->list, &vsched->running_list); + __set_bit(cpu, vsched->vcpu_running_map.bits); + __set_bit(cpu, vsched->pcpu_running_map.bits); + vcpu->running = 1; + spin_unlock(&fairsched_lock); +#else + pcpu(cpu)->vcpu = vcpu; +#endif spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4812,7 +6201,6 @@ void __devinit init_idle(struct task_str */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -#ifdef CONFIG_SMP /* * This is how migration works: * @@ -4829,6 +6217,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * 7) we wake up and the migration is done. */ +#ifdef CONFIG_SMP /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -4844,9 +6233,11 @@ int set_cpus_allowed(struct task_struct unsigned long flags; struct rq *rq; int ret = 0; + struct vcpu_scheduler *vsched; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + vsched = task_vsched(p); + if (!cpus_intersects(new_mask, vsched_vcpu_online_map(vsched))) { ret = -EINVAL; goto out; } @@ -4856,7 +6247,8 @@ int set_cpus_allowed(struct task_struct if (cpu_isset(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, vsched_vcpu(vsched, any_online_cpu(new_mask)), + &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -4870,6 +6262,7 @@ out: return ret; } EXPORT_SYMBOL_GPL(set_cpus_allowed); +#endif /* * Move (not current) task off this cpu, onto dest cpu. We're doing @@ -4882,26 +6275,31 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * * Returns non-zero if task was successfully migrated. */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, vcpu_t src_cpu, vcpu_t dest_cpu) { struct rq *rq_dest, *rq_src; int ret = 0; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(vcpu_is_offline(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); +#ifdef CONFIG_SCHED_VCPU + BUG_ON(vcpu_vsched(src_cpu) == &idle_vsched); +#endif + rq_src = vcpu_rq(src_cpu); + rq_dest = vcpu_rq(dest_cpu); double_rq_lock(rq_src, rq_dest); /* Already moved. */ - if (task_cpu(p) != src_cpu) + if (task_vcpu(p) != src_cpu) goto out; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!vcpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); + BUG_ON(task_running(rq_src, p)); + set_task_vsched(p, vcpu_vsched(dest_cpu)); + set_task_vcpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4927,13 +6325,21 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. 
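 *
 * (Editorial sketch, not part of the original patch; the
 * wait_for_completion() step is assumed from the unchanged remainder of
 * set_cpus_allowed().) The caller side of this machinery, as in the
 * vcpu-aware set_cpus_allowed() above: queue a request against the source
 * runqueue, kick its migration thread, and wait:
 *
 *	struct migration_req req;
 *
 *	if (migrate_task(p, vsched_vcpu(vsched, any_online_cpu(new_mask)),
 *			&req)) {
 *		task_rq_unlock(rq, &flags);
 *		wake_up_process(rq->migration_thread);
 *		wait_for_completion(&req.done);
 *	}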
*/ +#if defined (CONFIG_HOTPLUG_CPU) || defined (CONFIG_SCHED_VCPU) +static void migrate_live_tasks(vcpu_t src_cpu); +static void migrate_dead_tasks(vcpu_t dead_cpu); +#endif static int migration_thread(void *data) { - int cpu = (long)data; struct rq *rq; + vcpu_t cpu = (vcpu_t)data; - rq = cpu_rq(cpu); + rq = vcpu_rq(cpu); BUG_ON(rq->migration_thread != current); + BUG_ON(!rq->migration_thread_init); + + /* migration thread startup has complete */ + rq->migration_thread_init = 0; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -4944,15 +6350,17 @@ static int migration_thread(void *data) spin_lock_irq(&rq->lock); - if (cpu_is_offline(cpu)) { + if (vcpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); goto wait_to_die; } +#ifdef CONFIG_SMP if (rq->active_balance) { active_load_balance(rq, cpu); rq->active_balance = 0; } +#endif head = &rq->migration_queue; @@ -4971,8 +6379,7 @@ static int migration_thread(void *data) complete(&req->done); } - __set_current_state(TASK_RUNNING); - return 0; + goto die; wait_to_die: /* Wait for kthread_stop */ @@ -4981,20 +6388,33 @@ wait_to_die: schedule(); set_current_state(TASK_INTERRUPTIBLE); } +die: __set_current_state(TASK_RUNNING); +#if defined (CONFIG_HOTPLUG_CPU) || defined (CONFIG_SCHED_VCPU) + migrate_live_tasks(cpu); + spin_lock_irq(&rq->lock); + migrate_dead_tasks(cpu); + spin_unlock_irq(&rq->lock); +#endif return 0; } -#ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_SCHED_VCPU) +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ +static void move_task_off_dead_cpu(vcpu_t dead_cpu, struct task_struct *p) { unsigned long flags; - cpumask_t mask; struct rq *rq; + struct vcpu_scheduler *vsched; + cpumask_t mask; int dest_cpu; restart: +#ifndef CONFIG_SCHED_VCPU +#error "FIXME: wrong code" /* On same node? */ mask = node_to_cpumask(cpu_to_node(dead_cpu)); cpus_and(mask, mask, p->cpus_allowed); @@ -5021,10 +6441,27 @@ restart: "longer affine to cpu%d\n", p->pid, p->comm, dead_cpu); } - if (!__migrate_task(p, dead_cpu, dest_cpu)) +#else + vsched = vcpu_vsched(dead_cpu); + cpus_and(mask, vsched_vcpu_online_map(vsched), p->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, &flags); + cpus_setall(p->cpus_allowed); + task_rq_unlock(rq, &flags); + dest_cpu = any_online_cpu(vsched_vcpu_online_map(vsched)); + } + /* this can happen only when non-empty node is removed... */ + if (dest_cpu == NR_CPUS) + printk("BUG: no where to move task %s(%d)\n", p->comm, p->pid); +#endif + if (!__migrate_task(p, dead_cpu, vsched_vcpu(vsched, dest_cpu))) goto restart; } +#ifdef CONFIG_HOTPLUG_CPU /* * While a dead CPU has no uninterruptible tasks queued at this point, * it might still have a nonzero ->nr_uninterruptible counter, because @@ -5044,25 +6481,30 @@ static void migrate_nr_uninterruptible(s double_rq_unlock(rq_src, rq_dest); local_irq_restore(flags); } +#endif /* Run through task list and migrate tasks from the dead cpu. 
*/ -static void migrate_live_tasks(int src_cpu) +static void migrate_live_tasks(vcpu_t src_cpu) { struct task_struct *p, *t; + BUG_ON(vcpu_isset(src_cpu, vsched_vcpu_online_map(vcpu_vsched(src_cpu)))); write_lock_irq(&tasklist_lock); - do_each_thread(t, p) { + do_each_thread_all(t, p) { if (p == current) continue; + if (p == vcpu_rq(src_cpu)->migration_thread) + continue; - if (task_cpu(p) == src_cpu) + if (task_vcpu(p) == src_cpu) move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); + } while_each_thread_all(t, p); write_unlock_irq(&tasklist_lock); } +#ifdef CONFIG_HOTPLUG_CPU /* Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. @@ -5086,6 +6528,9 @@ void sched_idle_next(void) __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to the _front_ of its priority queue: */ +#ifdef CONFIG_SCHED_VCPU +#error "FIXME: VCPU vs. HOTPLUG: fix the code below" +#endif __activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); @@ -5105,10 +6550,12 @@ void idle_task_exit(void) switch_mm(mm, &init_mm, current); mmdrop(mm); } +#endif /* CONFIG_HOTPLUG_CPU */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +/* called under rq->lock with disabled interrupts */ +static void migrate_dead(vcpu_t dead_cpu, struct task_struct *p) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = vcpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); @@ -5122,88 +6569,148 @@ static void migrate_dead(unsigned int de * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. + * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } /* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) +static void migrate_dead_tasks(vcpu_t dead_cpu) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = vcpu_rq(dead_cpu); unsigned int arr, i; for (arr = 0; arr < 2; arr++) { for (i = 0; i < MAX_PRIO; i++) { struct list_head *list = &rq->arrays[arr].queue[i]; - - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, - struct task_struct, run_list)); + struct task_struct *tsk; +restart: + list_for_each_entry(tsk, list, run_list) { + if (tsk == rq->migration_thread) + continue; + migrate_dead(dead_cpu, tsk); + goto restart; + } } } } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_SCHED_VCPU */ + +static void migration_thread_bind(struct task_struct *k, vcpu_t cpu) +{ + BUG_ON(k->state != TASK_INTERRUPTIBLE); + /* Must have done schedule() in kthread() before we set_task_cpu */ + wait_task_inactive(k); + + set_task_vsched(k, vcpu_vsched(cpu)); + set_task_vcpu(k, cpu); + k->cpus_allowed = cpumask_of_cpu(cpu->id); +} + +static void migration_thread_stop(struct rq *rq) +{ + struct task_struct *thread; + + thread = rq->migration_thread; + if (thread == NULL) + return; + + /* + * Wait until migration thread has really started, i.e. + * migration_thread() function has been called. 
It's important,
+ * because the migration thread can still be sleeping after creation, but
+ * its vcpu is already marked as online, and tasks can migrate to this
+ * cpu. If we kill a non-started migration thread now, the
+ * migration_thread() function will not be called at all (see how
+ * kthread() works). And if migration_thread() is not called, there is no
+ * way to move tasks away from the thread's vcpu. So, rq->nr_running will
+ * be != 0 even after the migration thread is dead.
+ */
+	while (rq->migration_thread_init)
+		yield();
+
+	get_task_struct(thread);
+	if (kthread_stop(thread) == -EINTR)
+		/*
+		 * Somebody else has called kthread_stop() without
+		 * waiting for the migration thread init to complete.
+		 */
+		BUG_ON(1);
+
+	/* We MUST ensure that the do_exit of the migration thread has
+	 * completed and that it will never be scheduled again before
+	 * vsched_destroy. A task with the PF_DEAD flag, once unscheduled,
+	 * will never receive a CPU again. */
+	while (!(thread->flags & PF_DEAD) || task_running(rq, thread))
+		yield();
+	put_task_struct(thread);
+
+	rq->migration_thread = NULL;
+}

 /*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int __cpuinit
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int vmigration_call(struct notifier_block *nfb, unsigned long action,
+		void *hcpu)
 {
 	struct task_struct *p;
-	int cpu = (long)hcpu;
+	vcpu_t cpu = (vcpu_t)hcpu;
 	unsigned long flags;
 	struct rq *rq;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
+		p = kthread_create(migration_thread, hcpu, "migration/%d/%d",
+				vsched_id(vcpu_vsched(cpu)), cpu->id);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
 		p->flags |= PF_NOFREEZE;
-		kthread_bind(p, cpu);
-		/* Must be high prio: stop_machine expects to yield to it. */
+
+		migration_thread_bind(p, cpu);
 		rq = task_rq_lock(p, &flags);
+		/* Must be high prio: stop_machine expects to yield to it. */
 		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
 		task_rq_unlock(rq, &flags);
-		cpu_rq(cpu)->migration_thread = p;
+		vcpu_rq(cpu)->migration_thread = p;
+		vcpu_rq(cpu)->migration_thread_init = 1;
+		cpu_set(cpu->id, vsched_vcpu_online_map(vcpu_vsched(cpu)));
 		break;
 	case CPU_ONLINE:
 		/* Strictly unneccessary, as first user will wake it. */
-		wake_up_process(cpu_rq(cpu)->migration_thread);
+		wake_up_process(vcpu_rq(cpu)->migration_thread);
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_VCPU)
+#error "FIXME: CPU down code doesn't work yet with VCPUs"
+#endif
 	case CPU_UP_CANCELED:
-		if (!cpu_rq(cpu)->migration_thread)
+		if (!vcpu_rq(cpu)->migration_thread)
 			break;
 		/* Unbind it from offline cpu so it can run. Fall thru.
*/ - kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); - kthread_stop(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; + migration_thread_bind(vcpu_rq(cpu)->migration_thread, this_vcpu()); + migration_thread_stop(vcpu_rq(cpu)); break; case CPU_DEAD: - migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - rq->migration_thread = NULL; + rq = vcpu_rq(cpu); + migration_thread_stop(rq); +#ifdef CONFIG_HOTPLUG_CPU /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); deactivate_task(rq->idle, rq); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq->idle, SCHED_NORMAL, 0); - migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); +#endif /* No need to migrate the tasks: it was best-effort if * they didn't do lock_cpu_hotplug(). Just wake up @@ -5219,11 +6726,21 @@ migration_call(struct notifier_block *nf } spin_unlock_irq(&rq->lock); break; -#endif } return NOTIFY_OK; } +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ +#ifdef CONFIG_SCHED_VCPU + if (action == CPU_UP_PREPARE) + init_boot_vcpus((long)hcpu); +#endif + /* we need to translate pcpu to vcpu */ + return vmigration_call(nfb, action, vsched_default_vcpu((long)hcpu)); +} + /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. */ @@ -5243,7 +6760,6 @@ int __init migration_init(void) return 0; } -#endif #ifdef CONFIG_SMP #undef SCHED_DOMAIN_DEBUG @@ -5271,7 +6787,7 @@ static void sched_domain_debug(struct sc printk(KERN_DEBUG); for (i = 0; i < level + 1; i++) printk(" "); - printk("domain %d: ", level); + printk("domain %d, flags %x: ", level, sd->flags); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -5396,7 +6912,7 @@ sd_parent_degenerate(struct sched_domain */ static void cpu_attach_domain(struct sched_domain *sd, int cpu) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = vcpu_rq(vsched_default_vcpu(cpu)); struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. 
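 *
 * (Editorial note, not part of the original patch.) With vcpus the domain
 * is attached twice: to the physical cpu (pcpu(cpu)->sd, used by the
 * for_each_pdomain() walkers elsewhere in this file) and to the default
 * vsched's vcpu runqueue:
 *
 *	rcu_assign_pointer(pcpu(cpu)->sd, sd);
 *	rcu_assign_pointer(rq->sd, sd);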
*/ @@ -5413,6 +6929,7 @@ static void cpu_attach_domain(struct sch sched_domain_debug(sd, cpu); + rcu_assign_pointer(pcpu(cpu)->sd, sd); rcu_assign_pointer(rq->sd, sd); } @@ -5591,7 +7108,7 @@ static unsigned long domain_distance(int unsigned long distance = 0; struct sched_domain *sd; - for_each_domain(cpu1, sd) { + for_each_pdomain(pcpu(cpu1)->sd, sd) { WARN_ON(!cpu_isset(cpu1, sd->span)); if (cpu_isset(cpu2, sd->span)) return distance; @@ -5913,7 +7430,7 @@ static void calibrate_migration_costs(co */ for_each_cpu_mask(cpu, *cpu_map) { distance = 0; - for_each_domain(cpu, sd) { + for_each_pdomain(pcpu(cpu)->sd, sd) { sd->cache_hot_time = migration_cost[distance]; distance++; } @@ -6540,6 +8057,7 @@ static int arch_init_sched_domains(const err = build_sched_domains(&cpu_default_map); + nr_online_pcpus = num_online_cpus(); return err; } @@ -6725,44 +8243,457 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -void __init sched_init(void) +static void init_rq(struct rq *rq, int cpu) { - int i, j, k; - - for_each_possible_cpu(i) { - struct prio_array *array; - struct rq *rq; + int j, k; + struct prio_array *array; - rq = cpu_rq(i); - spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); - rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + spin_lock_init(&rq->lock); + rq->nr_running = 0; +#ifndef CONFIG_SCHED_VCPU + lockdep_set_class(&rq->lock, &rq->rq_lock_key); +#endif + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; - rq->active_balance = 0; - rq->push_cpu = 0; - rq->migration_thread = NULL; - INIT_LIST_HEAD(&rq->migration_queue); -#endif - atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + rq->sd = NULL; + for (j = 0; j < 3; j++) + rq->cpu_load[j] = 0; + rq->active_balance = 0; +#endif + rq->push_cpu = NULL; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } +} + +static void init_vcpu(vcpu_t vcpu, int id) +{ + memset(vcpu, 0, sizeof(struct vcpu_struct)); + vcpu->id = id; + vcpu_last_pcpu(vcpu) = id; + init_rq(vcpu_rq(vcpu), id); +} + +#if defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED) +/* both rq and vsched lock should be taken */ +static void __install_vcpu(struct vcpu_scheduler *vsched, vcpu_t vcpu) +{ + int id; + + id = vcpu->id; + vcpu->vsched = vsched; + vsched->vcpu[id] = vcpu; + vcpu_last_pcpu(vcpu) = id; + wmb(); + /* FIXME: probably locking should be reworked, e.g. 
+	   we don't have a corresponding rmb(), so we need to update the
+	   mask only after a quiescent state */
+	/* init_boot_vcpu() should be remade if RCU is used here */
+	list_add(&vcpu->list, &vsched->idle_list);
+	vsched->num_online_vcpus++;
+}
+
+static int install_vcpu(vcpu_t vcpu, struct vcpu_scheduler *vsched)
+{
+	struct rq *rq;
+	unsigned long flags;
+	int res = 0;
+
+	rq = vcpu_rq(vcpu);
+	spin_lock_irqsave(&rq->lock, flags);
+	spin_lock(&fairsched_lock);
+
+	if (vsched->vcpu[vcpu->id] != NULL)
+		res = -EBUSY;
+	else
+		__install_vcpu(vsched, vcpu);
+
+	spin_unlock(&fairsched_lock);
+	spin_unlock_irqrestore(&rq->lock, flags);
+	return res;
+}
+
+static int __add_vcpu(struct vcpu_scheduler *vsched, int id)
+{
+	vcpu_t vcpu;
+	int res;
+
+	res = -ENOMEM;
+	vcpu = kmalloc(sizeof(struct vcpu_struct), GFP_KERNEL);
+	if (vcpu == NULL)
+		goto out;
+
+	init_vcpu(vcpu, id);
+	vcpu_rq(vcpu)->curr = this_pcpu()->idle;
+	res = install_vcpu(vcpu, vsched);
+	if (res < 0)
+		goto out_free;
+	return 0;
+
+out_free:
+	kfree(vcpu);
+out:
+	return res;
+}
+
+void vsched_init(struct vcpu_scheduler *vsched, int id)
+{
+	memset(vsched, 0, sizeof(*vsched));
+
+	INIT_LIST_HEAD(&vsched->idle_list);
+	INIT_LIST_HEAD(&vsched->active_list);
+	INIT_LIST_HEAD(&vsched->running_list);
+	vsched->num_online_vcpus = 0;
+	vsched->vcpu_online_map = CPU_MASK_NONE;
+	vsched->vcpu_running_map = CPU_MASK_NONE;
+	vsched->pcpu_running_map = CPU_MASK_NONE;
+	vsched->id = id;
+
+	spin_lock(&vsched_list_lock);
+	list_add(&vsched->list, &vsched_list);
+	spin_unlock(&vsched_list_lock);
+}
+
+#ifdef CONFIG_FAIRSCHED
+int scale_vcpu_frequency = 1;
+EXPORT_SYMBOL(scale_vcpu_frequency);
+
+unsigned long ve_scale_khz(unsigned long khz)
+{
+	struct fairsched_node *node;
+	int cpus;
+	unsigned long rate;
+
+	if (!scale_vcpu_frequency)
+		return khz;
+
+	rate = fairsched_nr_cpus << FSCHRATE_SHIFT;
+
+	/*
+	 * Ideally, the fairsched node should be taken from the current
+	 * ve_struct. However, to simplify the code and locking, it is taken
+	 * from current (currently, fairsched_node can be changed only for a
+	 * sleeping task). That means that VE0 processes moved to some
+	 * special node will get a fake CPU speed, but that shouldn't be a
+	 * big problem.
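+ *
+ * (Editorial example, not part of the original patch.) Numerically: a
+ * node granted half of one cpu on a 2 GHz machine reports about 1 GHz,
+ * since
+ *
+ *	rate = 1 << (FSCHRATE_SHIFT - 1);		/* 0.5 cpu */
+ *	cpus = 1;
+ *	(khz * (rate / cpus)) >> FSCHRATE_SHIFT == khz / 2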
+ */ + preempt_disable(); + node = current->vsched->node; + cpus = node->vcpus; + if (node->rate_limited) + rate = node->rate; + preempt_enable(); + + return min((unsigned long long)khz, + ((unsigned long long)khz * (rate / cpus)) >> FSCHRATE_SHIFT); +} + +/* No locks supposed to be held */ +static void vsched_del_vcpu(vcpu_t vcpu, int empty); +static int vsched_add_vcpu(struct vcpu_scheduler *vsched) +{ + int res, err; + vcpu_t vcpu; + int id; + static DECLARE_MUTEX(id_mutex); + + down(&id_mutex); + id = find_first_zero_bit(vsched->vcpu_online_map.bits, NR_CPUS); + if (id >= NR_CPUS) { + err = -EBUSY; + goto out_up; + } + + err = __add_vcpu(vsched, id); + if (err < 0) + goto out_up; + memset(VE_CPU_STATS(vsched->node->owner_env, id), 0, + sizeof(struct ve_cpu_stats)); + /* Kick idle time collecting logic */ + ve_strt_idle(vsched->node->owner_env, id, get_cycles()); + + vcpu = vsched_vcpu(vsched, id); + err = -ENOMEM; + + res = vmigration_call(&migration_notifier, CPU_UP_PREPARE, vcpu); + if (res != NOTIFY_OK) + goto out_del_up; + + res = vmigration_call(&migration_notifier, CPU_ONLINE, vcpu); + if (res != NOTIFY_OK) + goto out_cancel_del_up; + + err = 0; + +out_up: + up(&id_mutex); + return err; + +out_cancel_del_up: + vmigration_call(&migration_notifier, CPU_UP_CANCELED, vcpu); +out_del_up: + vsched_del_vcpu(vcpu, 0); + goto out_up; +} + +static void vsched_del_vcpu(vcpu_t vcpu, int empty) +{ + struct vcpu_scheduler *vsched; + struct rq *rq; + + vsched = vcpu_vsched(vcpu); + rq = vcpu_rq(vcpu); + + spin_lock_irq(&rq->lock); + spin_lock(&fairsched_lock); + cpu_clear(vcpu->id, vsched->vcpu_online_map); + vsched->num_online_vcpus--; + spin_unlock(&fairsched_lock); + spin_unlock_irq(&rq->lock); + + /* no need to syncronize, if no tasks at all */ + if (!empty) + synchronize_sched(); + + /* + * FIXME: ideas for VCPU hotplug: + * + * - push_cpu should be checked/cleanuped + * - serialization + */ + + /* + * all tasks should migrate from this VCPU somewhere, + * also, since this moment VCPU is offline, so migration_thread + * won't accept any new tasks... + */ + vmigration_call(&migration_notifier, CPU_DEAD, vcpu); + BUG_ON(rq->nr_running != 0); + + /* vcpu_put() is called after deactivate_task. 
This loop makes sure + * that vcpu_put() was finished and vcpu can be freed */ + while ((volatile int)vcpu->running) + yield(); + + BUG_ON(vcpu->active); /* should be in idle_list */ + BUG_ON(vcpu_rq(vcpu)->prev_mm != NULL); + + spin_lock_irq(&fairsched_lock); + list_del(&vcpu->list); + vsched_vcpu(vsched, vcpu->id) = NULL; + spin_unlock_irq(&fairsched_lock); + + kfree(vcpu); +} + +int vsched_set_vcpus(struct vcpu_scheduler *vsched, unsigned int vcpus) +{ + int i, ret = 0; + vcpu_t vcpu; + + if (vsched->num_online_vcpus < vcpus) { + /* need to add more VCPUs */ + for (i = vcpus - vsched->num_online_vcpus; i > 0; i--) { + ret = vsched_add_vcpu(vsched); + if (ret < 0) + break; } + } else if (vsched->num_online_vcpus > vcpus) { + /* remove some VCPUs */ + while (vcpus != vsched->num_online_vcpus) { + vcpu = vsched_vcpu(vsched, vsched->num_online_vcpus - 1); + BUG_ON(!vcpu); + vsched_del_vcpu(vcpu, 0); + } + } +#ifdef CONFIG_FAIRSCHED + vsched->node->vcpus = vsched->num_online_vcpus; +#endif + return ret; +} + +int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched) +{ + vcpu_t dest_vcpu; + int id; + + id = first_cpu(vsched->vcpu_online_map); + if (id >= NR_CPUS) + goto err; + + dest_vcpu = vsched_vcpu(vsched, id); + set_cpus_allowed(p, CPU_MASK_ALL); + sched_migrate_task(p, dest_vcpu); + + if (task_vsched_id(p) != vsched_id(vsched)) { + /* race: probably someone changed cpus_allowed? */ + printk("vsched_mvpr: failed to move task\n"); + goto err; } + return 0; + +err: + return -EINVAL; +} + +void vsched_fairsched_link(struct vcpu_scheduler *vsched, + struct fairsched_node *node) +{ + vsched->node = node; + node->vsched = vsched; +} + +void vsched_fairsched_unlink(struct vcpu_scheduler *vsched, + struct fairsched_node *node) +{ + vsched->node = NULL; + node->vsched = NULL; +} + +int vsched_create(int id, struct fairsched_node *node) +{ + struct vcpu_scheduler *vsched; + int res, cpus; + + vsched = kmalloc(sizeof(*vsched), GFP_KERNEL); + if (vsched == NULL) + return -ENOMEM; + + vsched_init(vsched, node->id); + vsched_fairsched_link(vsched, node); + + cpus = node->vcpus ? : num_online_cpus(); + res = vsched_set_vcpus(vsched, cpus); + if (res < 0) + goto err_add; + + return 0; + +err_add: + vsched_destroy(vsched); + return res; +} + +int vsched_destroy(struct vcpu_scheduler *vsched) +{ + if (vsched == NULL) + return 0; + + vsched_set_vcpus(vsched, 0); + + spin_lock_irq(&fairsched_lock); + if (vsched->num_online_vcpus || + !list_empty(&vsched->running_list) || + !list_empty(&vsched->active_list) || + !list_empty(&vsched->idle_list)) + goto err_busy; + + vsched_fairsched_unlink(vsched, vsched->node); + spin_unlock_irq(&fairsched_lock); + + spin_lock(&vsched_list_lock); + list_del(&vsched->list); + spin_unlock(&vsched_list_lock); + + kfree(vsched); + return 0; + +err_busy: + oops_in_progress = 1; + printk(KERN_ERR "BUG in vsched_destroy, id %d: n%d r%d a%d i%d\n", + vsched->id, + vsched->num_online_vcpus, + !list_empty(&vsched->running_list), + !list_empty(&vsched->active_list), + !list_empty(&vsched->idle_list)); + spin_unlock_irq(&fairsched_lock); + oops_in_progress = 0; + return -EBUSY; + +} +#endif /* defined(CONFIG_FAIRSCHED) */ + +static void init_boot_vcpu(void) +{ + int res; + + /* + * We setup boot_vcpu and it's runqueue until init_idle() happens + * on cpu0. This is required since timer interrupts can happen + * between sched_init() and init_idle(). 
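+ *
+ * (Editorial example, not part of the original patch.) Resizing a
+ * container's scheduler reduces to a target count; vsched_set_vcpus()
+ * above adds or removes one vcpu at a time until it matches:
+ *
+ *	vsched_set_vcpus(vsched, 4);	/* grow to 4 vcpus  */
+ *	vsched_set_vcpus(vsched, 1);	/* shrink back to 1 */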
+ */ + init_vcpu(&boot_idle_vcpu, raw_smp_processor_id()); + vcpu_rq(&boot_idle_vcpu)->curr = current; + res = install_vcpu(&boot_idle_vcpu, &idle_vsched); + if (res < 0) + panic("Can't install boot idle vcpu"); + + init_vcpu(&boot_vcpu, raw_smp_processor_id()); + vcpu_rq(&boot_vcpu)->curr = current; + res = install_vcpu(&boot_vcpu, &default_vsched); + if (res < 0) + panic("Can't install boot vcpu"); + + cpu_set(boot_vcpu.id, default_vsched.vcpu_online_map); + + this_pcpu()->vcpu = &boot_idle_vcpu; + this_pcpu()->vsched = &idle_vsched; +} +#endif /* defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED) */ + +static void init_pcpu(int id) +{ + struct pcpu_info *pcpu; + + pcpu = pcpu(id); + pcpu->id = id; +#ifdef CONFIG_SMP + pcpu->sd = NULL; +#endif + +#ifndef CONFIG_SCHED_VCPU + init_vcpu(vcpu(id), id); +#endif +} + +static void init_pcpus(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) + init_pcpu(i); +} + +void __init sched_init(void) +{ + init_pcpus(); +#if defined(CONFIG_SCHED_VCPU) + vsched_init(&idle_vsched, -1); + vsched_init(&default_vsched, 0); +#if defined(CONFIG_FAIRSCHED) + fairsched_init_early(); + vsched_fairsched_link(&idle_vsched, &fairsched_idle_node); + vsched_fairsched_link(&default_vsched, &fairsched_init_node); +#endif + init_boot_vcpu(); +#else +#if defined(CONFIG_FAIRSCHED) + fairsched_init_early(); +#endif +#endif + set_load_weight(&init_task); #ifdef CONFIG_RT_MUTEXES @@ -6784,6 +8715,155 @@ void __init sched_init(void) init_idle(current, smp_processor_id()); } +#ifdef CONFIG_SCHED_VCPU +static void show_vcpu_list(struct vcpu_scheduler *vsched, struct list_head *lh) +{ + cpumask_t m; + vcpu_t vcpu; + int i; + + cpus_clear(m); + list_for_each_entry(vcpu, lh, list) + cpu_set(vcpu->id, m); + + for (i = 0; i < NR_CPUS; i++) + if (cpu_isset(i, m)) + printk("%d ", i); +} + +#define PRINT(s, sz, fmt...) \ + do { \ + int __out; \ + __out = scnprintf(*s, *sz, fmt); \ + *s += __out; \ + *sz -= __out; \ + } while(0) + +static void show_rq_array(struct prio_array *array, char *header, char **s, int *sz) +{ + struct list_head *list; + struct task_struct *p; + int k, h; + + h = 0; + for (k = 0; k < MAX_PRIO; k++) { + list = array->queue + k; + if (list_empty(list)) + continue; + + if (!h) { + PRINT(s, sz, header); + h = 1; + } + + PRINT(s, sz, " prio %d (", k); + list_for_each_entry(p, list, run_list) + PRINT(s, sz, "%s[%d] ", p->comm, p->pid); + PRINT(s, sz, ")"); + } + if (h) + PRINT(s, sz, "\n"); +} + +static void show_vcpu(vcpu_t vcpu) +{ + struct rq *rq; + char buf[1024], *s; + unsigned long flags; + int sz; + unsigned long nr_running, cpu_load[3]; + unsigned long long nr_switches; + struct sched_domain *sd; + struct task_struct *curr; + + if (vcpu == NULL) + return; + + printk(" vcpu %d: last_pcpu %d, state %s%s\n", + vcpu->id, vcpu->last_pcpu, + vcpu->active ? "A" : "", + vcpu->running ? 
"R" : ""); + s = buf; + sz = sizeof(buf) - 1; + + rq = vcpu_rq(vcpu); + spin_lock_irqsave(&rq->lock, flags); + nr_running = rq->nr_running; +#ifdef CONFIG_SMP + cpu_load[0] = rq->cpu_load[0]; + cpu_load[1] = rq->cpu_load[1]; + cpu_load[2] = rq->cpu_load[2]; + sd = rq->sd; +#else + cpu_load[0] = cpu_load[1] = cpu_load[2] = 0; + sd = NULL; +#endif + nr_switches = rq->nr_switches; + curr = rq->curr; + + show_rq_array(rq->active, " active:", &s, &sz); + show_rq_array(rq->expired, " expired:", &s, &sz); + spin_unlock_irqrestore(&rq->lock, flags); + *s = 0; + + printk(" rq: running %lu, load {%lu,%lu,%lu}, sw %Lu, sd %p, curr %p\n", + nr_running, cpu_load[0], cpu_load[1], cpu_load[2], nr_switches, + sd, curr); + + printk("%s", buf); +} + +static inline void fairsched_show_node(struct vcpu_scheduler *vsched) +{ +#ifdef CONFIG_FAIRSCHED + struct fairsched_node *node; + + node = vsched->node; + printk("fsnode: ready %d run %d cpu %d vsched %p, pcpu %d\n", + node->nr_ready, node->nr_runnable, node->nr_pcpu, + node->vsched, smp_processor_id()); +#endif +} + +static void __show_vsched(struct vcpu_scheduler *vsched) +{ + char mask[NR_CPUS + 1]; + int i; + + spin_lock(&fairsched_lock); + printk("vsched id=%d\n", vsched_id(vsched)); + fairsched_show_node(vsched); + + printk(" idle cpus "); + show_vcpu_list(vsched, &vsched->idle_list); + printk("; active cpus "); + show_vcpu_list(vsched, &vsched->active_list); + printk("; running cpus "); + show_vcpu_list(vsched, &vsched->running_list); + printk("\n"); + + cpumask_scnprintf(mask, NR_CPUS, vsched->vcpu_online_map); + printk(" num_online_cpus=%d, mask=%s (w=%d)\n", + vsched->num_online_vcpus, mask, + cpus_weight(vsched->vcpu_online_map)); + spin_unlock(&fairsched_lock); + + for (i = 0; i < NR_CPUS; i++) + show_vcpu(vsched->vcpu[i]); +} + +void show_vsched(void) +{ + struct vcpu_scheduler *vsched; + unsigned long flags; + + spin_lock_irqsave(&vsched_list_lock, flags); + list_for_each_entry (vsched, &vsched_list, list) + __show_vsched(vsched); + spin_unlock_irqrestore(&vsched_list_lock, flags); +} +#endif /* CONFIG_SCHED_VCPU */ + #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP void __might_sleep(char *file, int line) { @@ -6815,7 +8895,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irq(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (!rt_task(p)) continue; @@ -6858,7 +8938,7 @@ void normalize_rt_tasks(void) */ struct task_struct *curr_task(int cpu) { - return cpu_curr(cpu); + return vcpu_rq(pcpu(cpu)->vcpu)->curr; } /** @@ -6878,7 +8958,7 @@ struct task_struct *curr_task(int cpu) */ void set_curr_task(int cpu, struct task_struct *p) { - cpu_curr(cpu) = p; + vcpu_rq(pcpu(cpu)->vcpu)->curr = p; } #endif diff -uprN linux-2.6.18/kernel/signal.c linux-2.6.18.ovz/kernel/signal.c --- linux-2.6.18/kernel/signal.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/signal.c 2007-06-13 06:55:07.000000000 -0400 @@ -22,18 +22,21 @@ #include #include #include +#include #include #include #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. 
*/ -static kmem_cache_t *sigqueue_cachep; +kmem_cache_t *sigqueue_cachep; +EXPORT_SYMBOL_GPL(sigqueue_cachep); /* * In POSIX a signal is sent either to a specific thread (Linux task) @@ -155,6 +158,23 @@ static kmem_cache_t *sigqueue_cachep; (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) +static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t) +{ + struct ve_struct *ve; + + /* always allow signals from the kernel */ + if (info == SEND_SIG_FORCED || + (!is_si_special(info) && SI_FROMKERNEL(info))) + return 0; + + ve = current->ve_task_info.owner_env; + if (ve->init_entry != t) + return 0; + if (ve_is_super(get_exec_env())) + return 0; + return !sig_user_defined(t, sig) || sig_kernel_only(sig); +} + static int sig_ignored(struct task_struct *t, int sig) { void __user * handler; @@ -221,6 +241,7 @@ fastcall void recalc_sigpending_tsk(stru else clear_tsk_thread_flag(t, TIF_SIGPENDING); } +EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); void recalc_sigpending(void) { @@ -271,8 +292,13 @@ static struct sigqueue *__sigqueue_alloc atomic_inc(&t->user->sigpending); if (override_rlimit || atomic_read(&t->user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + if (q && ub_siginfo_charge(q, get_task_ub(t))) { + kmem_cache_free(sigqueue_cachep, q); + q = NULL; + } + } if (unlikely(q == NULL)) { atomic_dec(&t->user->sigpending); } else { @@ -289,6 +315,7 @@ static void __sigqueue_free(struct sigqu return; atomic_dec(&q->user->sigpending); free_uid(q->user); + ub_siginfo_uncharge(q); kmem_cache_free(sigqueue_cachep, q); } @@ -419,7 +446,16 @@ static int __dequeue_signal(struct sigpe { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -513,6 +549,7 @@ void signal_wake_up(struct task_struct * if (!wake_up_state(t, mask)) kick_process(t); } +EXPORT_SYMBOL_GPL(signal_wake_up); /* * Remove signals in mask from the pending set and queue. @@ -731,7 +768,7 @@ static int send_signal(int sig, struct s q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = current->pid; + q->info.si_pid = virt_pid(current); q->info.si_uid = current->uid; break; case (unsigned long) SEND_SIG_PRIV: @@ -1048,7 +1085,8 @@ int group_send_sig_info(int sig, struct if (!ret && sig) { ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); + ret = sig_ve_ignored(sig, info, p) ? 0 : + __group_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -1069,13 +1107,18 @@ int __kill_pg_info(int sig, struct sigin if (pgrp <= 0) return -EINVAL; + /* Use __vpid_to_pid(). This function is used under write_lock + * tasklist_lock. 
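+ *
+ * (Editorial note, not part of the original patch.) Related hardening in
+ * __dequeue_signal() above: a pending SIGKILL is delivered before any
+ * lower signal, so a task cannot make itself unkillable by flooding its
+ * own queue:
+ *
+ *	if (sigismember(&pending->signal, SIGKILL) &&
+ *			!sigismember(mask, SIGKILL))
+ *		sig = SIGKILL;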
*/ + if (is_virtual_pid(pgrp)) + pgrp = __vpid_to_pid(pgrp); + success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } @@ -1103,7 +1146,7 @@ kill_proc_info(int sig, struct siginfo * read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1124,7 +1167,7 @@ int kill_proc_info_as_uid(int sig, struc return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1166,8 +1209,8 @@ static int kill_something_info(int sig, struct task_struct * p; read_lock(&tasklist_lock); - for_each_process(p) { - if (p->pid > 1 && p->tgid != current->tgid) { + for_each_process_ve(p) { + if (virt_pid(p) > 1 && p->tgid != current->tgid) { int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) @@ -1441,9 +1484,17 @@ void do_notify_parent(struct task_struct BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); +#ifdef CONFIG_VE + /* Allow to send only SIGCHLD from VE */ + if (sig != SIGCHLD && + tsk->ve_task_info.owner_env != + tsk->parent->ve_task_info.owner_env) + sig = SIGCHLD; +#endif + info.si_signo = sig; info.si_errno = 0; - info.si_pid = tsk->pid; + info.si_pid = get_task_pid_ve(tsk, tsk->parent->ve_task_info.owner_env); info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ @@ -1508,7 +1559,7 @@ static void do_notify_parent_cldstop(str info.si_signo = SIGCHLD; info.si_errno = 0; - info.si_pid = tsk->pid; + info.si_pid = get_task_pid_ve(tsk, VE_TASK_INFO(parent)->owner_env); info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ @@ -1595,9 +1646,9 @@ static void ptrace_stop(int exit_code, i current->exit_code = exit_code; /* Let the debugger run. */ + set_pn_state(current, PN_STOP_SIGNAL); set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); - try_to_freeze(); read_lock(&tasklist_lock); if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); @@ -1613,6 +1664,7 @@ static void ptrace_stop(int exit_code, i current->exit_code = nostop_code; } + clear_pn_state(current); /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with @@ -1660,7 +1712,9 @@ finish_stop(int stop_count) read_unlock(&tasklist_lock); } + set_stop_state(current); schedule(); + clear_stop_state(current); /* * Now we don't run again until continued. */ @@ -1762,14 +1816,46 @@ static int handle_group_stop(void) return 1; } +atomic_t global_suspend = ATOMIC_INIT(0); + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? 
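+ *
+ * (Editorial sketch, not part of the original patch.) The freeze
+ * handshake in miniature: the freezer sets TIF_FREEZE (plus a fake
+ * signal), and the target parks here until PF_FROZEN is cleared:
+ *
+ *	if (test_and_clear_thread_flag(TIF_FREEZE))
+ *		current->flags |= PF_FROZEN;
+ *	while (current->flags & PF_FROZEN)
+ *		schedule();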
*/ + long save; + save = current->state; + current->state = TASK_UNINTERRUPTIBLE; + /* printk("="); */ + + spin_lock_irq(¤t->sighand->siglock); + if (test_and_clear_thread_flag(TIF_FREEZE)) { + recalc_sigpending(); /* We sent fake signal, clean it up */ + if (atomic_read(&global_suspend) || + atomic_read(&get_exec_env()->suspend)) { + current->flags |= PF_FROZEN; + } else { + current->state = save; + } + } else { + /* Freeze request could be canceled before we entered + * refrigerator(). In this case we do nothing. */ + current->state = save; + } + spin_unlock_irq(¤t->sighand->siglock); + + while (current->flags & PF_FROZEN) + schedule(); + current->state = save; +} +EXPORT_SYMBOL(refrigerator); + int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { sigset_t *mask = ¤t->blocked; int signr = 0; - try_to_freeze(); - relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { @@ -1805,7 +1891,7 @@ relock: info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; - info->si_pid = current->parent->pid; + info->si_pid = virt_pid(current->parent); info->si_uid = current->parent->uid; } @@ -2187,7 +2273,7 @@ sys_kill(int pid, int sig) info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; - info.si_pid = current->tgid; + info.si_pid = virt_tgid(current); info.si_uid = current->uid; return kill_something_info(sig, &info, pid); @@ -2203,12 +2289,12 @@ static int do_tkill(int tgid, int pid, i info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; - info.si_pid = current->tgid; + info.si_pid = virt_tgid(current); info.si_uid = current->uid; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (p && (tgid <= 0 || p->tgid == tgid)) { + p = find_task_by_pid_ve(pid); + if (p && (tgid <= 0 || virt_tgid(p) == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2216,8 +2302,10 @@ static int do_tkill(int tgid, int pid, i */ if (!error && sig && p->sighand) { spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); + if (!sig_ve_ignored(sig, &info, p)) { + handle_stop_signal(sig, p); + error = specific_send_sig_info(sig, &info, p); + } spin_unlock_irq(&p->sighand->siglock); } } @@ -2583,5 +2671,5 @@ void __init signals_init(void) kmem_cache_create("sigqueue", sizeof(struct sigqueue), __alignof__(struct sigqueue), - SLAB_PANIC, NULL, NULL); + SLAB_PANIC|SLAB_UBC, NULL, NULL); } diff -uprN linux-2.6.18/kernel/softirq.c linux-2.6.18.ovz/kernel/softirq.c --- linux-2.6.18/kernel/softirq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/softirq.c 2007-06-13 06:55:07.000000000 -0400 @@ -13,11 +13,14 @@ #include #include #include +#include #include #include #include #include +#include + #include /* - No shared variables, all the data are CPU local. 
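 *
 * (Editorial note on the signal.c hunks above; not part of the original
 * patch.) Every pid crossing the user/kernel boundary is translated:
 * outgoing si_pid fields use virt_pid()/virt_tgid(), and incoming pids are
 * resolved in the caller's VE, e.g. in do_tkill():
 *
 *	p = find_task_by_pid_ve(pid);
 *	if (p && (tgid <= 0 || virt_tgid(p) == tgid))
 *		error = check_kill_permission(sig, &info, p);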
diff -uprN linux-2.6.18/kernel/softirq.c linux-2.6.18.ovz/kernel/softirq.c
--- linux-2.6.18/kernel/softirq.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/kernel/softirq.c	2007-06-13 06:55:07.000000000 -0400
@@ -13,11 +13,14 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
+
 #include
 /* - No shared variables, all the data are CPU local.
@@ -45,6 +48,8 @@ EXPORT_SYMBOL(irq_stat);
 static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+static DEFINE_PER_CPU(struct task_struct *, ksoftirqd_wakeup);
+static int ksoftirqd_stat[NR_CPUS];
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
@@ -55,7 +60,7 @@ static DEFINE_PER_CPU(struct task_struct
 static inline void wakeup_softirqd(void)
 {
 	/* Interrupts are disabled: no need to stop preemption */
-	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+	struct task_struct *tsk = __get_cpu_var(ksoftirqd_wakeup);
 	if (tsk && tsk->state != TASK_RUNNING)
 		wake_up_process(tsk);
@@ -205,10 +210,14 @@ EXPORT_SYMBOL(local_bh_enable_ip);
 asmlinkage void __do_softirq(void)
 {
+	struct user_beancounter *ub;
 	struct softirq_action *h;
 	__u32 pending;
 	int max_restart = MAX_SOFTIRQ_RESTART;
 	int cpu;
+	struct ve_struct *envid;
+
+	envid = set_exec_env(get_ve0());
 	pending = local_softirq_pending();
 	account_system_vtime(current);
@@ -225,6 +234,7 @@ restart:
 	h = softirq_vec;
+	ub = set_exec_ub(get_ub0());
 	do {
 		if (pending & 1) {
 			h->action(h);
@@ -233,6 +243,7 @@ restart:
 		h++;
 		pending >>= 1;
 	} while (pending);
+	(void)set_exec_ub(ub);
 	local_irq_disable();
@@ -246,6 +257,7 @@ restart:
 	trace_softirq_exit();
 	account_system_vtime(current);
+	(void)set_exec_env(envid);
 	_local_bh_enable();
 }
@@ -286,6 +298,7 @@ void irq_exit(void)
 {
 	account_system_vtime(current);
 	trace_hardirq_exit();
+	restore_context();
 	sub_preempt_count(IRQ_EXIT_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
@@ -574,8 +587,6 @@ static int __cpuinit cpu_callback(struct
 	switch (action) {
 	case CPU_UP_PREPARE:
-		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
-		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
 		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
@@ -605,6 +616,52 @@ static int __cpuinit cpu_callback(struct
 	return NOTIFY_OK;
 }
+static int proc_ksoftirqd(ctl_table *ctl, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, cpu;
+
+	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+	if (!write)
+		return ret;
+
+	for_each_online_cpu(cpu) {
+		per_cpu(ksoftirqd_wakeup, cpu) =
+			ksoftirqd_stat[cpu] ?
per_cpu(ksoftirqd, cpu) : NULL; + } + return ret; +} + +static int sysctl_ksoftirqd(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -EINVAL; +} + +static ctl_table debug_table[] = { + { + .ctl_name = 1246, + .procname = "ksoftirqd", + .data = ksoftirqd_stat, + .maxlen = sizeof(ksoftirqd_stat), + .mode = 0644, + .proc_handler = &proc_ksoftirqd, + .strategy = &sysctl_ksoftirqd + }, + {0} +}; + +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table + }, + {0} +}; + static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; @@ -615,6 +672,7 @@ __init int spawn_ksoftirqd(void) cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); + register_sysctl_table(root_table, 0); return 0; } diff -uprN linux-2.6.18/kernel/stop_machine.c linux-2.6.18.ovz/kernel/stop_machine.c --- linux-2.6.18/kernel/stop_machine.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/stop_machine.c 2007-06-13 06:55:07.000000000 -0400 @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,7 @@ static int stopmachine(void *cpu) /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + msleep(10); else cpu_relax(); } @@ -96,7 +97,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) + if (i == task_cpu(current)) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -106,7 +107,7 @@ static int stop_machine(void) /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + msleep(10); /* If some failed, kill them all. */ if (ret < 0) { @@ -177,7 +178,7 @@ struct task_struct *__stop_machine_run(i /* If they don't care which CPU fn runs on, bind to any online one. 
*/ if (cpu == NR_CPUS) - cpu = raw_smp_processor_id(); + cpu = task_cpu(current); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff -uprN linux-2.6.18/kernel/sys.c linux-2.6.18.ovz/kernel/sys.c --- linux-2.6.18/kernel/sys.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/sys.c 2007-06-13 06:55:07.000000000 -0400 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -434,6 +435,102 @@ int unregister_reboot_notifier(struct no EXPORT_SYMBOL(unregister_reboot_notifier); +DECLARE_MUTEX(virtinfo_sem); +EXPORT_SYMBOL(virtinfo_sem); +static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; + +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + + for (p = &virtinfo_chain[type]; + *p != NULL && nb->priority < (*p)->priority; + p = &(*p)->next); + nb->next = *p; + smp_wmb(); + *p = nb; +} + +EXPORT_SYMBOL(__virtinfo_notifier_register); + +void virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + down(&virtinfo_sem); + __virtinfo_notifier_register(type, nb); + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_register); + +struct virtinfo_cnt_struct { + volatile unsigned long exit[NR_CPUS]; + volatile unsigned long entry; +}; +static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); + +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + int entry_cpu, exit_cpu; + unsigned long cnt, ent; + + down(&virtinfo_sem); + for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); + *p = nb->next; + smp_mb(); + + for_each_cpu_mask(entry_cpu, cpu_possible_map) { + while (1) { + cnt = 0; + for_each_cpu_mask(exit_cpu, cpu_possible_map) + cnt += + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; + smp_rmb(); + ent = per_cpu(virtcnt, entry_cpu).entry; + if (cnt == ent) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 100); + } + } + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_unregister); + +int virtinfo_notifier_call(int type, unsigned long n, void *data) +{ + int ret; + int entry_cpu, exit_cpu; + struct vnotifier_block *nb; + + entry_cpu = get_cpu(); + per_cpu(virtcnt, entry_cpu).entry++; + smp_wmb(); + put_cpu(); + + nb = virtinfo_chain[type]; + ret = NOTIFY_DONE; + while (nb) + { + ret = nb->notifier_call(nb, n, data, ret); + if(ret & NOTIFY_STOP_MASK) { + ret &= ~NOTIFY_STOP_MASK; + break; + } + nb = nb->next; + } + + exit_cpu = get_cpu(); + smp_wmb(); + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; + put_cpu(); + + return ret; +} + +EXPORT_SYMBOL(virtinfo_notifier_call); + static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; @@ -479,17 +576,19 @@ asmlinkage long sys_setpriority(int whic switch (which) { case PRIO_PROCESS: if (!who) - who = current->pid; - p = find_task_by_pid(who); + who = virt_pid(current); + p = find_task_by_pid_ve(who); if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (!who) who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + else + who = vpid_to_pid(who); + do_each_task_pid_ve(who, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_task_pid_ve(who, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -499,10 +598,10 @@ asmlinkage long sys_setpriority(int whic if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, 
p) + do_each_thread_ve(g, p) if (p->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* For find_user() */ break; @@ -532,8 +631,8 @@ asmlinkage long sys_getpriority(int whic switch (which) { case PRIO_PROCESS: if (!who) - who = current->pid; - p = find_task_by_pid(who); + who = virt_pid(current); + p = find_task_by_pid_ve(who); if (p) { niceval = 20 - task_nice(p); if (niceval > retval) @@ -543,11 +642,13 @@ asmlinkage long sys_getpriority(int whic case PRIO_PGRP: if (!who) who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + else + who = vpid_to_pid(who); + do_each_task_pid_ve(who, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_task_pid_ve(who, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -557,13 +658,13 @@ asmlinkage long sys_getpriority(int whic if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) if (p->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* for find_user() */ break; @@ -694,6 +795,24 @@ asmlinkage long sys_reboot(int magic1, i magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_HALT: + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_RESTART2: + force_sig(SIGKILL, get_exec_env()->init_entry); + + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + return 0; + + default: + return -EINVAL; + } +#endif + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ @@ -883,7 +1002,7 @@ asmlinkage long sys_setgid(gid_t gid) return 0; } -static int set_user(uid_t new_ruid, int dumpclear) +int set_user(uid_t new_ruid, int dumpclear) { struct user_struct *new_user; @@ -893,7 +1012,7 @@ static int set_user(uid_t new_ruid, int if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != &root_user) { + new_ruid != 0) { free_uid(new_user); return -EAGAIN; } @@ -908,6 +1027,7 @@ static int set_user(uid_t new_ruid, int current->uid = new_ruid; return 0; } +EXPORT_SYMBOL(set_user); /* * Unprivileged users may change the real uid to the effective uid @@ -1196,8 +1316,27 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +#ifdef CONFIG_VE +unsigned long long ve_relative_clock(struct timespec * ts) +{ + unsigned long long offset = 0; + + if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec || + (ts->tv_sec == get_exec_env()->start_timespec.tv_sec && + ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec)) + offset = (unsigned long long)(ts->tv_sec - + get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC + + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec; + return nsec_to_clock_t(offset); +} +#endif + asmlinkage long sys_times(struct tms __user * tbuf) { +#ifdef CONFIG_VE + struct timespec now; +#endif + /* * In the SMP world we might just be unlucky and have one of * the times increment as we use it. 
Since the value is an @@ -1231,7 +1370,13 @@ asmlinkage long sys_times(struct tms __u if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } +#ifndef CONFIG_VE return (long) jiffies_64_to_clock_t(get_jiffies_64()); +#else + /* Compare to calculation in fs/proc/array.c */ + do_posix_clock_monotonic_gettime(&now); + return ve_relative_clock(&now); +#endif } /* @@ -1252,21 +1397,24 @@ asmlinkage long sys_setpgid(pid_t pid, p struct task_struct *p; struct task_struct *group_leader = current->group_leader; int err = -EINVAL; + int _pgid; if (!pid) - pid = group_leader->pid; + pid = virt_pid(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; + _pgid = vpid_to_pid(pgid); + /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); if (!p) goto out; @@ -1291,25 +1439,29 @@ asmlinkage long sys_setpgid(pid_t pid, p if (p->signal->leader) goto out; - if (pgid != pid) { + pgid = virt_pid(p); + if (_pgid != p->pid) { struct task_struct *p; - do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) + do_each_task_pid_ve(_pgid, PIDTYPE_PGID, p) { + if (p->signal->session == group_leader->signal->session) { + pgid = virt_pgid(p); goto ok_pgid; - } while_each_task_pid(pgid, PIDTYPE_PGID, p); + } + } while_each_task_pid_ve(_pgid, PIDTYPE_PGID, p); goto out; } ok_pgid: - err = security_task_setpgid(p, pgid); + err = security_task_setpgid(p, _pgid); if (err) goto out; - if (process_group(p) != pgid) { + if (process_group(p) != _pgid) { detach_pid(p, PIDTYPE_PGID); - p->signal->pgrp = pgid; - attach_pid(p, PIDTYPE_PGID, pgid); + p->signal->pgrp = _pgid; + attach_pid(p, PIDTYPE_PGID, _pgid); + set_virt_pgid(p, pgid); } err = 0; @@ -1322,19 +1474,19 @@ out: asmlinkage long sys_getpgid(pid_t pid) { if (!pid) { - return process_group(current); + return virt_pgid(current); } else { int retval; struct task_struct *p; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); retval = -ESRCH; if (p) { retval = security_task_getpgid(p); if (!retval) - retval = process_group(p); + retval = virt_pgid(p); } read_unlock(&tasklist_lock); return retval; @@ -1346,7 +1498,7 @@ asmlinkage long sys_getpgid(pid_t pid) asmlinkage long sys_getpgrp(void) { /* SMP - assuming writes are word atomic this is fine */ - return process_group(current); + return virt_pgid(current); } #endif @@ -1354,19 +1506,19 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) { - return current->signal->session; + return virt_sid(current); } else { int retval; struct task_struct *p; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_pid_ve(pid); retval = -ESRCH; if(p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = virt_sid(p); } read_unlock(&tasklist_lock); return retval; @@ -1394,14 +1546,17 @@ asmlinkage long sys_setsid(void) * session id and so the check will always fail and make it so * init cannot successfully call setsid. 
*/ - if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) + if (session > 1 && find_task_by_pid_type_ve(PIDTYPE_PGID, session)) goto out; group_leader->signal->leader = 1; __set_special_pids(session, session); + set_virt_pgid(group_leader, virt_pid(group_leader)); + set_virt_sid(group_leader, virt_pid(group_leader)); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; - err = process_group(group_leader); + + err = virt_pgid(group_leader); out: write_unlock_irq(&tasklist_lock); mutex_unlock(&tty_mutex); @@ -1675,7 +1830,7 @@ asmlinkage long sys_newuname(struct new_ int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1686,15 +1841,15 @@ asmlinkage long sys_sethostname(char __u int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1710,11 +1865,11 @@ asmlinkage long sys_gethostname(char __u if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1731,7 +1886,7 @@ asmlinkage long sys_setdomainname(char _ int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1739,8 +1894,8 @@ asmlinkage long sys_setdomainname(char _ down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); diff -uprN linux-2.6.18/kernel/sys_ni.c linux-2.6.18.ovz/kernel/sys_ni.c --- linux-2.6.18/kernel/sys_ni.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/sys_ni.c 2007-06-13 06:55:07.000000000 -0400 @@ -134,3 +134,8 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); + +cond_syscall(sys_getluid); +cond_syscall(sys_setluid); +cond_syscall(sys_setublimit); +cond_syscall(sys_ubstat); diff -uprN linux-2.6.18/kernel/sysctl.c linux-2.6.18.ovz/kernel/sysctl.c --- linux-2.6.18/kernel/sysctl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/sysctl.c 2007-06-13 06:55:07.000000000 -0400 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include #include #include @@ -63,6 +65,7 @@ extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; +extern int sysctl_at_vsyscall; extern char core_pattern[]; extern int cad_pid; extern int pid_max; @@ -73,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int ve_area_access_check; /* fs/namei.c */ #if defined(CONFIG_X86_LOCAL_APIC) && 
defined(CONFIG_X86) int unknown_nmi_panic; @@ -87,6 +91,8 @@ static int min_percpu_pagelist_fract = 8 static int ngroups_max = NGROUPS_MAX; +int ve_allow_kthreads; +EXPORT_SYMBOL(ve_allow_kthreads); #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif @@ -94,13 +100,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -108,6 +109,13 @@ extern char reboot_command []; extern int stop_a_enabled; extern int scons_pwroff; #endif +#ifdef CONFIG_SCHED_VCPU +extern u32 vcpu_sched_timeslice; +extern int vcpu_timeslice; +extern u32 vcpu_hot_timeslice; +#endif + +extern int alloc_fail_warn; #ifdef __hppa__ extern int pwrsw_enabled; @@ -123,6 +131,7 @@ extern int spin_retry; #endif extern int sysctl_hz_timer; +int decode_call_traces = 1; #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; @@ -135,11 +144,25 @@ extern int no_unaligned_warning; #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; #endif +#ifdef CONFIG_VE +int glob_ve_meminfo = 0; +EXPORT_SYMBOL(glob_ve_meminfo); +#endif + +#ifdef CONFIG_FAIRSCHED +extern int fairsched_max_latency; +extern int scale_vcpu_frequency; +int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context); static ctl_table root_table[]; static struct ctl_table_header root_table_header = @@ -182,6 +205,8 @@ static void register_proc_table(ctl_tabl static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); #endif +extern struct new_utsname virt_utsname; + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -228,50 +253,110 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { +#ifndef CONFIG_UTS_NS { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = sizeof(system_utsname.sysname), + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = 
&sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + }, +#else /* !CONFIG_UTS_NS */ + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = NULL, + /* could maybe use __NEW_UTS_LEN here? */ + .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, release), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, version), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + .virt_handler = 1, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, + .virt_handler = 1, + }, +#endif /* !CONFIG_UTS_NS */ + { + .ctl_name = KERN_VIRT_OSRELEASE, + .procname = "virt_osrelease", + .data = virt_utsname.release, + .maxlen = sizeof(virt_utsname.release), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_PANIC, @@ -309,10 +394,11 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_CAP_BSET, .procname = "cap-bound", - .data = &cap_bset, + .data = NULL, .maxlen = sizeof(kernel_cap_t), .mode = 0600, .proc_handler = &proc_dointvec_bset, + .strategy = &sysctl_strategy_bset, }, #ifdef CONFIG_BLK_DEV_INITRD { @@ -351,6 +437,22 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = KERN_SILENCE_LEVEL, + .procname = "silence-level", + .data = &console_silence_loglevel, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = KERN_ALLOC_FAIL_WARN, + .procname = "alloc_fail_warn", + .data = &alloc_fail_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, @@ -431,58 +533,65 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = 
&shm_ctlall, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = NULL, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, + .virt_handler = 1, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -577,6 +686,32 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_SCHED_VCPU + { + .ctl_name = KERN_VCPU_SCHED_TIMESLICE, + .procname = "vcpu_sched_timeslice", + .data = &vcpu_sched_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_VCPU_TIMESLICE, + .procname = "vcpu_timeslice", + .data = &vcpu_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_VCPU_HOT_TIMESLICE, + .procname = "vcpu_hot_timeslice", + .data = &vcpu_hot_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PIDMAX, .procname = "pid_max", @@ -588,6 +723,24 @@ static ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_VE + { + .ctl_name = KERN_VIRT_PIDS, + .procname = "virt_pids", + .data = &glob_virt_pids, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_VE_MEMINFO, + .procname = "ve_meminfo", + .data = &glob_ve_meminfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", @@ -943,6 +1096,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_MIN_SLAB, + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { @@ -956,6 +1120,24 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif +#ifdef CONFIG_FAIRSCHED + { + .ctl_name = KERN_FAIRSCHED_MAX_LATENCY, + .procname = "fairsched-max-latency", + .data = &fairsched_max_latency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &fsch_sysctl_latency + }, + { + .ctl_name = KERN_SCALE_VCPU_FREQUENCY, + .procname = "scale_vcpu_frequency", + .data = &scale_vcpu_frequency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; 
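The IPC and UTS entries above deliberately leave .data NULL and set .virt_handler: the backing storage is per-namespace, so it can only be resolved at access time from the calling task. The idiom is restated in condensed form below; proc_do_ipc_string() later in this patch is the real implementation, and my_per_ns_handler() is purely illustrative:

	/* Sketch of the dispatch-by-ctl_name idiom used for NULL-.data entries. */
	static int my_per_ns_handler(ctl_table *table, int write, struct file *filp,
			void __user *buffer, size_t *lenp, loff_t *ppos)
	{
		void *data;

		switch (table->ctl_name) {
		case KERN_SHMMNI:	/* resolve per-namespace storage now */
			data = &current->nsproxy->ipc_ns->shm_ctlmni;
			break;
		default:
			return -EINVAL;
		}
		/* hand the resolved pointer to the generic parser */
		return __do_proc_dointvec(data, table, write, filp, buffer,
				lenp, ppos, NULL, NULL);
	}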
@@ -1082,10 +1264,26 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = FS_AT_VSYSCALL, + .procname = "vsyscall", + .data = &sysctl_at_vsyscall, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; static ctl_table debug_table[] = { + { + .ctl_name = DBG_DECODE_CALLTRACES, + .procname = "decode_call_traces", + .data = &decode_call_traces, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; @@ -1149,6 +1347,7 @@ int do_sysctl(int __user *name, int nlen { struct list_head *tmp; int error = -ENOTDIR; + struct ve_struct *ve; if (nlen <= 0 || nlen >= CTL_MAXNAME) return -ENOTDIR; @@ -1157,13 +1356,24 @@ int do_sysctl(int __user *name, int nlen if (!oldlenp || get_user(old_len, oldlenp)) return -EFAULT; } + ve = get_exec_env(); spin_lock(&sysctl_lock); +#ifdef CONFIG_VE + tmp = ve->sysctl_lh.next; +#else tmp = &root_table_header.ctl_entry; +#endif do { - struct ctl_table_header *head = - list_entry(tmp, struct ctl_table_header, ctl_entry); + struct ctl_table_header *head; void *context = NULL; +#ifdef CONFIG_VE + if (tmp == &ve->sysctl_lh) + /* second pass over global variables */ + tmp = &root_table_header.ctl_entry; +#endif + + head = list_entry(tmp, struct ctl_table_header, ctl_entry); if (!use_table(head)) continue; @@ -1217,10 +1427,15 @@ static int test_perm(int mode, int op) static inline int ctl_perm(ctl_table *table, int op) { int error; + int mode = table->mode; + error = security_sysctl(table, op); if (error) return error; - return test_perm(table->mode, op); + if (!ve_accessible(table->owner_env, get_exec_env()) && + !table->virt_handler) + mode &= ~0222; /* disable write access */ + return test_perm(mode, op); } static int parse_table(int __user *name, int nlen, @@ -1262,6 +1477,36 @@ repeat: return -ENOTDIR; } +int __do_sysctl_strategy (void *data, ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) { + size_t len; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if (copy_from_user(data, newval, len)) + return -EFAULT; + } + + return 0; +} + /* Perform the actual read/write of a sysctl table entry. 
 */
 int do_sysctl_strategy (ctl_table *table,
 			int __user *name, int nlen,
@@ -1269,7 +1514,6 @@ int do_sysctl_strategy (ctl_table *table
 			void __user *newval, size_t newlen, void **context)
 {
 	int op = 0, rc;
-	size_t len;
 	if (oldval)
 		op |= 004;
@@ -1289,27 +1533,10 @@ int do_sysctl_strategy (ctl_table *table
 	/* If there is no strategy routine, or if the strategy returns
 	 * zero, proceed with automatic r/w */
-	if (table->data && table->maxlen) {
-		if (oldval && oldlenp) {
-			if (get_user(len, oldlenp))
-				return -EFAULT;
-			if (len) {
-				if (len > table->maxlen)
-					len = table->maxlen;
-				if(copy_to_user(oldval, table->data, len))
-					return -EFAULT;
-				if(put_user(len, oldlenp))
-					return -EFAULT;
-			}
-		}
-		if (newval && newlen) {
-			len = newlen;
-			if (len > table->maxlen)
-				len = table->maxlen;
-			if(copy_from_user(table->data, newval, len))
-				return -EFAULT;
-		}
-	}
+	if (table->data && table->maxlen)
+		return __do_sysctl_strategy(table->data, table, name, nlen,
+				oldval, oldlenp, newval, newlen, context);
+
 	return 0;
 }
@@ -1386,6 +1613,8 @@ struct ctl_table_header *register_sysctl
 				       int insert_at_head)
 {
 	struct ctl_table_header *tmp;
+	struct list_head *lh;
+
 	tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
 	if (!tmp)
 		return NULL;
@@ -1394,17 +1623,73 @@ struct ctl_table_header *register_sysctl
 	tmp->used = 0;
 	tmp->unregistering = NULL;
 	spin_lock(&sysctl_lock);
+#ifdef CONFIG_VE
+	lh = &get_exec_env()->sysctl_lh;
+#else
+	lh = &root_table_header.ctl_entry;
+#endif
 	if (insert_at_head)
-		list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
+		list_add(&tmp->ctl_entry, lh);
 	else
-		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+		list_add_tail(&tmp->ctl_entry, lh);
 	spin_unlock(&sysctl_lock);
 #ifdef CONFIG_PROC_FS
+#ifdef CONFIG_VE
+	register_proc_table(table, get_exec_env()->proc_sys_root, tmp);
+#else
 	register_proc_table(table, proc_sys_root, tmp);
 #endif
+#endif
 	return tmp;
 }
+void free_sysctl_clone(ctl_table *clone)
+{
+	int i;
+
+	for (i = 0; clone[i].ctl_name != 0; i++)
+		if (clone[i].child != NULL)
+			free_sysctl_clone(clone[i].child);
+
+	kfree(clone);
+}
+
+ctl_table *clone_sysctl_template(ctl_table *tmpl)
+{
+	int i, nr;
+	ctl_table *clone;
+
+	nr = 0;
+	while (tmpl[nr].ctl_name != 0)
+		nr++;
+	nr++;
+
+	clone = kmalloc(nr * sizeof(ctl_table), GFP_KERNEL);
+	if (clone == NULL)
+		return NULL;
+
+	memcpy(clone, tmpl, nr * sizeof(ctl_table));
+	for (i = 0; i < nr; i++) {
+		clone[i].owner_env = get_exec_env();
+		clone[i].de = NULL;
+		if (tmpl[i].child == NULL)
+			continue;
+
+		clone[i].child = clone_sysctl_template(tmpl[i].child);
+		if (clone[i].child == NULL)
+			goto unroll;
+	}
+	return clone;
+
+unroll:
+	for (i--; i >= 0; i--)
+		if (clone[i].child != NULL)
+			free_sysctl_clone(clone[i].child);
+
+	kfree(clone);
+	return NULL;
+}
+
 /**
 * unregister_sysctl_table - unregister a sysctl table hierarchy
 * @header: the header returned from register_sysctl_table
@@ -1418,8 +1703,12 @@ void unregister_sysctl_table(struct ctl_
 	spin_lock(&sysctl_lock);
 	start_unregistering(header);
 #ifdef CONFIG_PROC_FS
+#ifdef CONFIG_VE
+	unregister_proc_table(header->ctl_table, get_exec_env()->proc_sys_root);
+#else
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
+#endif
 	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
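clone_sysctl_template() above deep-copies a template ctl_table tree and stamps every entry with the calling VE (owner_env), so each container can register a private copy under its own proc_sys_root. A hypothetical sketch of the intended call pattern at VE start; my_ve_tmpl_table and my_ve_register_sysctls() are illustrative names, not from the patch:

	/* Sketch only: give a new VE a private copy of a sysctl template. */
	static ctl_table my_ve_tmpl_table[] = {	/* illustrative template */
		/* ... per-VE tunables ... */
		{ .ctl_name = 0 }
	};

	static int my_ve_register_sysctls(void)
	{
		ctl_table *clone;
		struct ctl_table_header *hdr;

		clone = clone_sysctl_template(my_ve_tmpl_table);
		if (clone == NULL)
			return -ENOMEM;

		/* lands on get_exec_env()->sysctl_lh, see register_sysctl above */
		hdr = register_sysctl_table(clone, 0);
		if (hdr == NULL) {
			free_sysctl_clone(clone);
			return -ENOMEM;
		}
		/* keep hdr and clone so VE stop can unregister and free them */
		return 0;
	}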
@@ -1505,11 +1794,6 @@ static void unregister_proc_table(ctl_ta
 		 * its fields.  We are under sysctl_lock here.
 		 */
 		de->data = NULL;
-
-		/* Don't unregister proc entries that are still being used.. */
-		if (atomic_read(&de->count))
-			continue;
-
 		table->de = NULL;
 		remove_proc_entry(table->procname, root);
 	}
@@ -1597,14 +1881,14 @@ static ssize_t proc_writesys(struct file
 *
 * Returns 0 on success.
 */
-int proc_dostring(ctl_table *table, int write, struct file *filp,
+static int _proc_dostring(void *data, int maxlen, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	size_t len;
 	char __user *p;
 	char c;
-	if (!table->data || !table->maxlen || !*lenp ||
+	if (!data || !maxlen || !*lenp ||
 	    (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
@@ -1620,20 +1904,20 @@ int proc_dostring(ctl_table *table, int
 			break;
 		len++;
 	}
-	if (len >= table->maxlen)
-		len = table->maxlen-1;
-	if(copy_from_user(table->data, buffer, len))
+	if (len >= maxlen)
+		len = maxlen-1;
+	if(copy_from_user(data, buffer, len))
 		return -EFAULT;
-	((char *) table->data)[len] = 0;
+	((char *) data)[len] = 0;
 	*ppos += *lenp;
 } else {
-	len = strlen(table->data);
-	if (len > table->maxlen)
-		len = table->maxlen;
+	len = strlen(data);
+	if (len > maxlen)
+		len = maxlen;
 	if (len > *lenp)
 		len = *lenp;
 	if (len)
-		if(copy_to_user(buffer, table->data, len))
+		if(copy_to_user(buffer, data, len))
 			return -EFAULT;
 	if (len < *lenp) {
 		if(put_user('\n', ((char __user *) buffer) + len))
@@ -1646,12 +1930,20 @@ int proc_dostring(ctl_table *table, int
 	return 0;
 }
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return _proc_dostring(table->data, table->maxlen, write,
+			filp, buffer, lenp, ppos);
+}
+
 /*
 * Special case of dostring for the UTS structure. This has locks
 * to observe. Should this be in kernel/sys.c ????
 */
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+#ifndef CONFIG_UTS_NS
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int r;
@@ -1668,6 +1960,98 @@ static int proc_doutsstring(ctl_table *t
 	return r;
 }
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	int r;
+
+	if (newval && newlen) {
+		down_write(&uts_sem);
+		r = sysctl_string(table, name, nlen,
+			oldval, oldlenp, newval, newlen, context);
+		up_write(&uts_sem);
+	} else {
+		down_read(&uts_sem);
+		r = sysctl_string(table, name, nlen,
+			oldval, oldlenp, newval, newlen, context);
+		up_read(&uts_sem);
+	}
+	return r;
+}
+#else /* !CONFIG_UTS_NS */
+static char *choose_uts_string(int ctl_name)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+
+	switch (ctl_name) {
+	case KERN_OSTYPE:
+		return uts_ns->name.sysname;
+	case KERN_NODENAME:
+		return uts_ns->name.nodename;
+	case KERN_OSRELEASE:
+		return uts_ns->name.release;
+	case KERN_VERSION:
+		return uts_ns->name.version;
+	case KERN_DOMAINNAME:
+		return uts_ns->name.domainname;
+	case KERN_VIRT_OSRELEASE:
+		return virt_utsname.release;
+	default:
+		return NULL;
+	}
+}
+
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int r;
+	char *which;
+
+	which = choose_uts_string(table->ctl_name);
+	if (!which)
+		return -EINVAL;
+
+	if (!write) {
+		down_read(&uts_sem);
+		r = _proc_dostring(which, table->maxlen, 0, filp, buffer, lenp, ppos);
+		up_read(&uts_sem);
+	} else {
+		down_write(&uts_sem);
+		r = _proc_dostring(which, table->maxlen, 1, filp, buffer, lenp, ppos);
+		up_write(&uts_sem);
+	}
+	return r;
+}
+
+static int 
sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + ctl_table tmp_table; + char *which; + int r; + + which = choose_uts_string(table->ctl_name); + if (!which) + return -EINVAL; + + tmp_table = *table; + tmp_table.data = which; + if (newval && newlen) { + down_write(&uts_sem); + r = sysctl_string(&tmp_table, name, nlen, + oldval, oldlenp, newval, newlen, context); + up_write(&uts_sem); + } else { + down_read(&uts_sem); + r = sysctl_string(&tmp_table, name, nlen, + oldval, oldlenp, newval, newlen, context); + up_read(&uts_sem); + } + return r; +} +#endif /* !CONFIG_UTS_NS */ + static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -1687,8 +2071,9 @@ static int do_proc_dointvec_conv(int *ne return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1701,13 +2086,13 @@ static int do_proc_dointvec(ctl_table *t char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1796,6 +2181,16 @@ static int do_proc_dointvec(ctl_table *t #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1862,14 +2257,29 @@ int proc_dointvec_bset(ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { int op; + struct ve_struct *ve; + + ve = get_exec_env(); - if (!capable(CAP_SYS_MODULE)) { + /* For VE's root writing to VE's cap-bound is prohibited */ + if ((ve_is_super(ve) && !capable(CAP_SYS_MODULE)) || + (!ve_is_super(ve) && (!capable(CAP_VE_ADMIN) || write))) { return -EPERM; } op = (current->pid == 1) ? 
OP_SET : OP_AND;
-	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
-				do_proc_dointvec_bset_conv,&op);
+	return __do_proc_dointvec(&cap_bset, table, write, filp,
+			buffer, lenp, ppos, do_proc_dointvec_bset_conv, &op);
+}
+
+int sysctl_strategy_bset(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return __do_sysctl_strategy(&cap_bset, table, name, nlen,
+			oldval, oldlenp, newval, newlen, context);
 }
 struct do_proc_dointvec_minmax_conv_param {
@@ -1929,7 +2339,7 @@ int proc_dointvec_minmax(ctl_tabl
 			do_proc_dointvec_minmax_conv, &param);
 }
-static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
 				     struct file *filp,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
@@ -1943,13 +2353,13 @@ static int do_proc_doulongvec_minmax(ctl
 	char buf[TMPBUFLEN], *p;
 	char __user *s = buffer;
-	if (!table->data || !table->maxlen || !*lenp ||
+	if (!data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
-	i = (unsigned long *) table->data;
+	i = (unsigned long *) data;
 	min = (unsigned long *) table->extra1;
 	max = (unsigned long *) table->extra2;
 	vleft = table->maxlen / sizeof(unsigned long);
@@ -2034,6 +2444,17 @@ static int do_proc_doulongvec_minmax(ctl
 #undef TMPBUFLEN
 }
+static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+				     struct file *filp,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+	return __do_proc_doulongvec_minmax(table->data, table, write,
+			filp, buffer, lenp, ppos, convmul, convdiv);
+}
+
 /**
 * proc_doulongvec_minmax - read a vector of long integers with min/max values
 * @table: the sysctl table
@@ -2222,6 +2643,49 @@ int proc_dointvec_ms_jiffies(ctl_table *
 			do_proc_dointvec_ms_jiffies_conv, NULL);
 }
+#ifdef CONFIG_SYSVIPC
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	void *data;
+	struct ipc_namespace *ns;
+
+	ns = current->nsproxy->ipc_ns;
+
+	switch (table->ctl_name) {
+	case KERN_SHMMAX:
+		data = &ns->shm_ctlmax;
+		goto proc_minmax;
+	case KERN_SHMALL:
+		data = &ns->shm_ctlall;
+		goto proc_minmax;
+	case KERN_SHMMNI:
+		data = &ns->shm_ctlmni;
+		break;
+	case KERN_MSGMAX:
+		data = &ns->msg_ctlmax;
+		break;
+	case KERN_MSGMNI:
+		data = &ns->msg_ctlmni;
+		break;
+	case KERN_MSGMNB:
+		data = &ns->msg_ctlmnb;
+		break;
+	case KERN_SEM:
+		data = &ns->sem_ctls;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __do_proc_dointvec(data, table, write, filp, buffer,
+			lenp, ppos, NULL, NULL);
+proc_minmax:
+	return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+			lenp, ppos, 1l, 1l);
+}
+#endif
+
 #else /* CONFIG_PROC_FS */
 int proc_dostring(ctl_table *table, int write, struct file *filp,
@@ -2230,12 +2694,27 @@ int proc_dostring(ctl_table *table, int
 	return -ENOSYS;
 }
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
-		void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return -ENOSYS;
 }
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
+
+#ifdef CONFIG_SYSVIPC
+static int 
proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2534,6 +3013,14 @@ void unregister_sysctl_table(struct ctl_ { } +ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr) +{ + return NULL; +} + +void free_sysctl_clone(ctl_table *tmpl) +{ +} #endif /* CONFIG_SYSCTL */ /* @@ -2554,3 +3041,5 @@ EXPORT_SYMBOL(sysctl_jiffies); EXPORT_SYMBOL(sysctl_ms_jiffies); EXPORT_SYMBOL(sysctl_string); EXPORT_SYMBOL(unregister_sysctl_table); +EXPORT_SYMBOL(clone_sysctl_template); +EXPORT_SYMBOL(free_sysctl_clone); diff -uprN linux-2.6.18/kernel/taskstats.c linux-2.6.18.ovz/kernel/taskstats.c --- linux-2.6.18/kernel/taskstats.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/taskstats.c 2007-06-13 06:55:07.000000000 -0400 @@ -180,7 +180,7 @@ static int fill_pid(pid_t pid, struct ta if (!pidtsk) { read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_pid_all(pid); if (!tsk) { read_unlock(&tasklist_lock); return -ESRCH; @@ -219,7 +219,7 @@ static int fill_tgid(pid_t tgid, struct first = tgidtsk; if (!first) { read_lock(&tasklist_lock); - first = find_task_by_pid(tgid); + first = find_task_by_pid_all(tgid); if (!first) { read_unlock(&tasklist_lock); return -ESRCH; @@ -229,14 +229,17 @@ static int fill_tgid(pid_t tgid, struct } else get_task_struct(first); - /* Start with stats from dead tasks */ - spin_lock_irqsave(&first->signal->stats_lock, flags); - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); tsk = first; read_lock(&tasklist_lock); + /* Start with stats from dead tasks */ + if (first->signal) { + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + } + do { if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) continue; @@ -248,7 +251,7 @@ static int fill_tgid(pid_t tgid, struct */ delayacct_add_tsk(stats, tsk); - } while_each_thread(first, tsk); + } while_each_thread_all(first, tsk); read_unlock(&tasklist_lock); stats->version = TASKSTATS_VERSION; @@ -256,7 +259,7 @@ static int fill_tgid(pid_t tgid, struct * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - + put_task_struct(first); return 0; } diff -uprN linux-2.6.18/kernel/timer.c linux-2.6.18.ovz/kernel/timer.c --- linux-2.6.18/kernel/timer.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/timer.c 2007-06-13 06:55:07.000000000 -0400 @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include @@ -443,7 +445,11 @@ static inline void __run_timers(tvec_bas spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + struct ve_struct *ve; + + ve = set_exec_env(get_ve0()); fn(data); + (void)set_exec_env(ve); if (preempt_count != preempt_count()) { printk(KERN_WARNING "huh, entered %p " "with preempt_count %08x, exited" @@ -1212,6 +1218,37 @@ EXPORT_SYMBOL(avenrun); * calc_load - given tick count, update the avenrun load estimates. * This is called while holding a write_lock on xtime_lock. 
 */
+
+
+#ifdef CONFIG_VE
+static void calc_load_ve(void)
+{
+	unsigned long flags, nr_unint, nr_active;
+	struct ve_struct *ve;
+
+	read_lock(&ve_list_lock);
+	for_each_ve(ve) {
+		nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve);
+		nr_active *= FIXED_1;
+
+		CALC_LOAD(ve->avenrun[0], EXP_1, nr_active);
+		CALC_LOAD(ve->avenrun[1], EXP_5, nr_active);
+		CALC_LOAD(ve->avenrun[2], EXP_15, nr_active);
+	}
+	read_unlock(&ve_list_lock);
+
+	nr_unint = nr_uninterruptible() * FIXED_1;
+	spin_lock_irqsave(&kstat_glb_lock, flags);
+	CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+	spin_unlock_irqrestore(&kstat_glb_lock, flags);
+
+}
+#else
+#define calc_load_ve() do { } while (0)
+#endif
+
 static inline void calc_load(unsigned long ticks)
 {
 	unsigned long active_tasks; /* fixed-point */
@@ -1224,6 +1261,7 @@ static inline void calc_load(unsigned lo
 		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+		calc_load_ve();
 	}
 }
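calc_load_ve() above reuses the kernel's fixed-point exponential decay: CALC_LOAD(load, exp, n) computes load = (load*exp + n*(FIXED_1-exp)) >> FSHIFT, with FIXED_1 = 1<<11 representing 1.0. A standalone, runnable illustration of the arithmetic with hypothetical values, showing how a VE with two runnable tasks pulls its 1-minute average toward 2.0 over successive 5-second ticks:

	/* Standalone sketch of the fixed-point load-average update (userspace). */
	#include <stdio.h>

	#define FSHIFT   11
	#define FIXED_1  (1UL << FSHIFT)        /* 1.0 in fixed point */
	#define EXP_1    1884                   /* 1/exp(5sec/1min) in fixed point */
	#define CALC_LOAD(load, exp, n) \
		load *= exp; \
		load += n * (FIXED_1 - exp); \
		load >>= FSHIFT;

	int main(void)
	{
		unsigned long ve_avenrun = 0;           /* VE starts idle */
		unsigned long nr_active = 2 * FIXED_1;  /* 2 runnable tasks */
		int tick;

		for (tick = 0; tick < 24; tick++) {     /* ~2 minutes of 5s ticks */
			CALC_LOAD(ve_avenrun, EXP_1, nr_active);
		}
		printf("loadavg ~ %lu.%02lu\n", ve_avenrun >> FSHIFT,
		       ((ve_avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}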
@@ -1320,7 +1358,7 @@ asmlinkage unsigned long sys_alarm(unsig
 */
 asmlinkage long sys_getpid(void)
 {
-	return current->tgid;
+	return virt_tgid(current);
 }
 /*
@@ -1334,7 +1372,7 @@ asmlinkage long sys_getppid(void)
 	int pid;
 	rcu_read_lock();
-	pid = rcu_dereference(current->real_parent)->tgid;
+	pid = virt_tgid(rcu_dereference(current->real_parent));
 	rcu_read_unlock();
 	return pid;
@@ -1467,7 +1505,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterru
 /* Thread ID - the internal kernel "pid" */
 asmlinkage long sys_gettid(void)
 {
-	return current->pid;
+	return virt_pid(current);
 }
 /*
@@ -1479,11 +1517,12 @@ asmlinkage long sys_sysinfo(struct sysin
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
 	unsigned long seq;
+	unsigned long *__avenrun;
+	struct timespec tp;
 	memset((char *)&val, 0, sizeof(struct sysinfo));
 	do {
-		struct timespec tp;
 		seq = read_seqbegin(&xtime_lock);
 		/*
@@ -1500,18 +1539,34 @@ asmlinkage long sys_sysinfo(struct sysin
 			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
 			tp.tv_sec++;
 		}
-		val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
-
-		val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	} while (read_seqretry(&xtime_lock, seq));
+	if (ve_is_super(get_exec_env())) {
+		val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+		__avenrun = &avenrun[0];
 		val.procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	}
+#ifdef CONFIG_VE
+	else {
+		struct ve_struct *ve;
+		ve = get_exec_env();
+		__avenrun = &ve->avenrun[0];
+		val.procs = atomic_read(&ve->pcounter);
+		val.uptime = tp.tv_sec - ve->start_timespec.tv_sec;
+	}
+#endif
+	val.loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+	val.loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+	val.loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
 	si_meminfo(&val);
 	si_swapinfo(&val);
+#ifdef CONFIG_USER_RESOURCE
+	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, &val)
+			& NOTIFY_FAIL)
+		return -ENOMSG;
+#endif
 	/*
 	 * If the sum of all the available memory (i.e. ram + swap)
 	 * is less than can be stored in a 32 bit unsigned long then
diff -uprN linux-2.6.18/kernel/ub/Kconfig linux-2.6.18.ovz/kernel/ub/Kconfig
--- linux-2.6.18/kernel/ub/Kconfig	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/ub/Kconfig	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,111 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "User resources"
+
+config USER_RESOURCE
+	bool "Enable user resource accounting"
+	default y
+	help
+	  This patch provides accounting and allows configuring
+	  limits for users' consumption of exhaustible system resources.
+	  The most important resource controlled by this patch is unswappable
+	  memory (either mlock'ed or used by internal kernel structures and
+	  buffers). The main goal of this patch is to protect processes
+	  from running short of important resources because of accidental
+	  misbehavior of processes or malicious activity aiming to ``kill''
+	  the system. It's worth mentioning that resource limits configured
+	  by setrlimit(2) do not give an acceptable level of protection
+	  because they cover only a small fraction of resources and work on a
+	  per-process basis. Per-process accounting doesn't prevent malicious
+	  users from spawning a lot of resource-consuming processes.
+
+config USER_RSS_ACCOUNTING
+	bool "Account physical memory usage"
+	default y
+	depends on USER_RESOURCE
+	help
+	  This allows estimating per-beancounter physical memory usage.
+	  The implemented algorithm accounts shared pages of memory as well,
+	  dividing them by the number of beancounters which use each page.
+
+config UBC_IO_ACCT
+	bool "Account disk IO"
+	default y
+	depends on USER_RSS_ACCOUNTING
+	help
+	  When on, this option allows seeing disk IO activity caused by
+	  tasks from each UB.
+
+config USER_SWAP_ACCOUNTING
+	bool "Account swap usage"
+	default y
+	depends on USER_RESOURCE
+	help
+	  This allows accounting of swap usage.
+
+config USER_RESOURCE_PROC
+	bool "Report resource usage in /proc"
+	default y
+	depends on USER_RESOURCE
+	help
+	  Allows a system administrator to inspect resource accounts and limits.
+
+config UBC_DEBUG
+	bool "User resources debug features"
+	default n
+	depends on USER_RESOURCE
+	help
+	  Enables debug features for user resource accounting.
+
+config UBC_DEBUG_IO
+	bool "Debug IO accounting"
+	default y
+	depends on UBC_DEBUG && UBC_IO_ACCT
+	help
+	  Debugging for IO accounting.
+
+config UBC_DEBUG_KMEM
+	bool "Debug kmemsize with cache counters"
+	default n
+	depends on UBC_DEBUG
+	help
+	  Adds /proc/user_beancounters_debug entry to get statistics
+	  about cache usage of each beancounter.
+
+config UBC_KEEP_UNUSED
+	bool "Keep unused beancounter alive"
+	default y
+	depends on UBC_DEBUG
+	help
+	  If on, unused beancounters are kept on the hash so that their
+	  maxheld values can still be inspected.
+
+config UBC_DEBUG_ITEMS
+	bool "Account resources in items rather than in bytes"
+	default y
+	depends on UBC_DEBUG
+	help
+	  When true, some of the resources (e.g. kmemsize) are accounted
+	  in items instead of bytes.
+
+config UBC_UNLIMITED
+	bool "Use unlimited ubc settings"
+	default y
+	depends on UBC_DEBUG
+	help
+	  When ON, all limits and barriers are set to max values.
+
+config UBC_IO_PRIO
+	bool "UBC I/O priority"
+	default y
+	depends on UBC_IO_ACCT && IOSCHED_CFQ
+	help
+	  This option controls whether to build the CFQ I/O scheduler
+	  with support for UBC I/O priority.
+endmenu diff -uprN linux-2.6.18/kernel/ub/Makefile linux-2.6.18.ovz/kernel/ub/Makefile --- linux-2.6.18/kernel/ub/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/Makefile 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,16 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-y := ub_sys.o beancounter.o ub_dcache.o ub_mem.o ub_misc.o \ + ub_pages.o ub_stat.o ub_oom.o + +obj-$(CONFIG_NET) += ub_net.o +obj-$(CONFIG_USER_RSS_ACCOUNTING) += ub_page_bc.o +obj-$(CONFIG_USER_RESOURCE_PROC) += ub_proc.o +obj-$(CONFIG_UBC_IO_ACCT) += io_acct.o +obj-$(CONFIG_UBC_IO_PRIO) += io_prio.o diff -uprN linux-2.6.18/kernel/ub/beancounter.c linux-2.6.18.ovz/kernel/ub/beancounter.c --- linux-2.6.18/kernel/ub/beancounter.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/beancounter.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,670 @@ +/* + * linux/kernel/ub/beancounter.c + * + * Copyright (C) 1998 Alan Cox + * 1998-2000 Andrey V. Savochkin + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - more intelligent limit check in mremap(): currently the new size is + * charged and _then_ old size is uncharged + * (almost done: !move_vma case is completely done, + * move_vma in its current implementation requires too many conditions to + * do things right, because it may be not only expansion, but shrinking + * also, plus do_munmap will require an additional parameter...) + * - problem: bad pmd page handling + * - consider /proc redesign + * - TCP/UDP ports + * + consider whether __charge_beancounter_locked should be inline + * + * Changes: + * 1999/08/17 Marcelo Tosatti + * - Set "barrier" and "limit" parts of limits atomically. + * 1999/10/06 Marcelo Tosatti + * - setublimit system call. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +static kmem_cache_t *ub_cachep; +static struct user_beancounter default_beancounter; +struct user_beancounter ub0; + +const char *ub_rnames[] = { + "kmemsize", /* 0 */ + "lockedpages", + "privvmpages", + "shmpages", + "dummy", + "numproc", /* 5 */ + "physpages", + "vmguarpages", + "oomguarpages", + "numtcpsock", + "numflock", /* 10 */ + "numpty", + "numsiginfo", + "tcpsndbuf", + "tcprcvbuf", + "othersockbuf", /* 15 */ + "dgramrcvbuf", + "numothersock", + "dcachesize", + "numfile", + "dummy", /* 20 */ + "dummy", + "dummy", + "numiptent", + "unused_privvmpages", /* UB_RESOURCES */ + "tmpfs_respages", + "swap_pages", + "held_pages", +}; + +static void init_beancounter_struct(struct user_beancounter *ub); +static void init_beancounter_store(struct user_beancounter *ub); +static void init_beancounter_nolimits(struct user_beancounter *ub); + +int print_ub_uid(struct user_beancounter *ub, char *buf, int size) +{ + if (ub->parent != NULL) + return snprintf(buf, size, "%u.%u", + ub->parent->ub_uid, ub->ub_uid); + else + return snprintf(buf, size, "%u", ub->ub_uid); +} +EXPORT_SYMBOL(print_ub_uid); + +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) +struct hlist_head ub_hash[UB_HASH_SIZE]; +DEFINE_SPINLOCK(ub_hash_lock); +LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */ +EXPORT_SYMBOL(ub_hash); +EXPORT_SYMBOL(ub_hash_lock); +EXPORT_SYMBOL(ub_list_head); + +/* + * Per user resource beancounting. Resources are tied to their luid. 
+ * The resource structure itself is tagged both to the process and + * the charging resources (a socket doesn't want to have to search for + * things at irq time for example). Reference counters keep things in + * hand. + * + * The case where a user creates resource, kills all his processes and + * then starts new ones is correctly handled this way. The refcounters + * will mean the old entry is still around with resource tied to it. + */ + +static inline void free_ub(struct user_beancounter *ub) +{ + if (ub == NULL) + return; + free_percpu(ub->ub_percpu); + kmem_cache_free(ub_cachep, ub); +} + +static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash, + uid_t uid, struct user_beancounter *parent) +{ + struct user_beancounter *ub; + struct hlist_node *ptr; + + hlist_for_each_entry (ub, ptr, hash, ub_hash) + if (ub->ub_uid == uid && ub->parent == parent) + return get_beancounter(ub); + + return NULL; +} + +struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_hash_fun(uid)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, uid, NULL); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) + free_ub(new_ub); + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub); + memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + goto fail_free; + new_ub->ub_uid = uid; + goto retry; + +fail_free: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} +EXPORT_SYMBOL(get_beancounter_byuid); + +struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, + int id, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_subhash_fun(p, id)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, id, p); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) { + put_beancounter(new_ub->parent); + free_ub(new_ub); + } + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating sub %p\n", new_ub); + memset(new_ub, 0, sizeof(*new_ub)); + init_beancounter_nolimits(new_ub); + init_beancounter_store(new_ub); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + 
goto fail_free;
+	new_ub->ub_uid = id;
+	new_ub->parent = get_beancounter(p);
+	goto retry;
+
+fail_free:
+	kmem_cache_free(ub_cachep, new_ub);
+	return NULL;
+}
+EXPORT_SYMBOL(get_subbeancounter_byid);
+
+static void put_warn(struct user_beancounter *ub)
+{
+	char id[64];
+
+	print_ub_uid(ub, id, sizeof(id));
+	printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n",
+			atomic_read(&ub->ub_refcount), id, ub);
+}
+
+#ifdef CONFIG_UBC_KEEP_UNUSED
+#define release_beancounter(ub)	do { } while (0)
+#else
+static int verify_res(struct user_beancounter *ub, int resource,
+		unsigned long held)
+{
+	char id[64];
+
+	if (likely(held == 0))
+		return 1;
+
+	print_ub_uid(ub, id, sizeof(id));
+	printk(KERN_WARNING "Ub %s holds %lu in %s on put\n",
+			id, held, ub_rnames[resource]);
+	return 0;
+}
+
+static inline void bc_verify_held(struct user_beancounter *ub)
+{
+	int i, clean;
+
+	clean = 1;
+	for (i = 0; i < UB_RESOURCES; i++)
+		clean &= verify_res(ub, i, ub->ub_parms[i].held);
+
+	clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages);
+	clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages);
+	clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages);
+	clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages);
+
+	ub_debug_trace(!clean, 5, 60*HZ);
+}
+
+static void bc_free_rcu(struct rcu_head *rcu)
+{
+	struct user_beancounter *ub;
+
+	ub = container_of(rcu, struct user_beancounter, rcu);
+	free_ub(ub);
+}
+
+static void delayed_release_beancounter(void *data)
+{
+	struct user_beancounter *ub, *parent;
+	unsigned long flags;
+
+	ub = (struct user_beancounter *)data;
+again:
+	local_irq_save(flags);
+	if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) {
+		/* raced with get_beancounter_byuid */
+		local_irq_restore(flags);
+		return;
+	}
+
+	hlist_del(&ub->ub_hash);
+	list_del_rcu(&ub->ub_list);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	bc_verify_held(ub);
+	ub_free_counters(ub);
+	bc_fini_ioprio(&ub->iopriv);
+	parent = ub->parent;
+
+	call_rcu(&ub->rcu, bc_free_rcu);
+	if (parent) {
+		ub = parent;
+		goto again;
+	}
+}
+
+static inline void release_beancounter(struct user_beancounter *ub)
+{
+	struct execute_work *ew;
+
+	ew = &ub->cleanup;
+	INIT_WORK(&ew->work, delayed_release_beancounter, ub);
+	schedule_work(&ew->work);
+}
+#endif
+
+void __put_beancounter(struct user_beancounter *ub)
+{
+	unsigned long flags;
+
+	/* equivalent to atomic_dec_and_lock_irqsave() */
+	local_irq_save(flags);
+	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
+		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
+			put_warn(ub);
+		local_irq_restore(flags);
+		return;
+	}
+
+	if (unlikely(ub == get_ub0())) {
+		printk(KERN_ERR "Trying to put ub0\n");
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+		return;
+	}
+
+	/* prevent get_beancounter_byuid + put_beancounter() reentrance */
+	atomic_inc(&ub->ub_refcount);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	release_beancounter(ub);
+}
+EXPORT_SYMBOL(__put_beancounter);
+
+/*
+ * Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n",
+			val, resource, ub, ub->ub_parms[resource].held);
+	/*
+	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
+	 * at the moment is possible so an overflow is impossible.
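+	 *
+	 * A worked example of the severity levels checked below: with
+	 * barrier = 100 and limit = 200, a UB_HARD charge fails as soon as
+	 * held would exceed 100, a UB_SOFT charge fails only above 200,
+	 * and a UB_FORCE charge never fails and only updates the
+	 * held/maxheld statistics.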
+ */ + ub->ub_parms[resource].held += val; + + switch (strict) { + case UB_HARD: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].barrier) + break; + case UB_SOFT: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].limit) + break; + case UB_FORCE: + ub_adjust_maxheld(ub, resource); + return 0; + default: + BUG(); + } + + if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) + printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", + ub_rnames[resource], ub->ub_uid); + ub->ub_parms[resource].failcnt++; + ub->ub_parms[resource].held -= val; + return -ENOMEM; +} + +int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + int retval; + struct user_beancounter *p, *q; + unsigned long flags; + + retval = -EINVAL; + if (val > UB_MAXVALUE) + goto out; + + local_irq_save(flags); + for (p = ub; p != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + retval = __charge_beancounter_locked(p, resource, val, strict); + spin_unlock(&p->ub_lock); + if (retval) + goto unroll; + } +out_restore: + local_irq_restore(flags); +out: + return retval; + +unroll: + for (q = ub; q != p; q = q->parent) { + spin_lock(&q->ub_lock); + __uncharge_beancounter_locked(q, resource, val); + spin_unlock(&q->ub_lock); + } + goto out_restore; +} + +EXPORT_SYMBOL(charge_beancounter); + +void __charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __charge_beancounter_locked(p, resource, val, UB_FORCE); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__charge_beancounter_notop); + +void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", + val, held, ub_rnames[resource], id); + ub_debug_trace(1, 10, 10*HZ); +} + +void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val) +{ + ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + if (ub->ub_parms[resource].held < val) { + uncharge_warn(ub, resource, + val, ub->ub_parms[resource].held); + val = ub->ub_parms[resource].held; + } + ub->ub_parms[resource].held -= val; +} + +void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) +{ + unsigned long flags; + struct user_beancounter *p; + + for (p = ub; p != NULL; p = p->parent) { + spin_lock_irqsave(&p->ub_lock, flags); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock_irqrestore(&p->ub_lock, flags); + } +} + +EXPORT_SYMBOL(uncharge_beancounter); + +void __uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__uncharge_beancounter_notop); + + +/* + * Rate limiting stuff. 
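+ *
+ * This is a token-bucket style limiter: each message consumes one
+ * entry of a bucket of size "burst", and the bucket drains at a rate
+ * of one entry per "interval" jiffies.  With the defaults set below
+ * (burst 4, interval 300*HZ) this allows a burst of 4 messages and a
+ * sustained rate of about one message per 300 seconds.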
+ */ +int ub_ratelimit(struct ub_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} +EXPORT_SYMBOL(ub_ratelimit); + + +/* + * Initialization + * + * struct user_beancounter contains + * - limits and other configuration settings, + * with a copy stored for accounting purposes, + * - structural fields: lists, spinlocks and so on. + * + * Before these parts are initialized, the structure should be memset + * to 0 or copied from a known clean structure. That takes care of a lot + * of fields not initialized explicitly. + */ + +static void init_beancounter_struct(struct user_beancounter *ub) +{ + ub->ub_magic = UB_MAGIC; + atomic_set(&ub->ub_refcount, 1); + spin_lock_init(&ub->ub_lock); + INIT_LIST_HEAD(&ub->ub_tcp_sk_list); + INIT_LIST_HEAD(&ub->ub_other_sk_list); +#ifdef CONFIG_UBC_DEBUG_KMEM + INIT_LIST_HEAD(&ub->ub_cclist); +#endif + bc_init_ioprio(&ub->iopriv); +} + +static void init_beancounter_store(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + memcpy(&ub->ub_store[k], &ub->ub_parms[k], + sizeof(struct ubparm)); + } +} + +static void init_beancounter_nolimits(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + ub->ub_parms[k].limit = UB_MAXVALUE; + /* FIXME: whether this is right for physpages and guarantees? */ + ub->ub_parms[k].barrier = UB_MAXVALUE; + } + + /* FIXME: set unlimited rate? */ + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +static void init_beancounter_syslimits(struct user_beancounter *ub) +{ + unsigned long mp; + extern int max_threads; + int k; + + mp = num_physpages; + ub->ub_parms[UB_KMEMSIZE].limit = + mp > (192*1024*1024 >> PAGE_SHIFT) ? 
+ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; + ub->ub_parms[UB_LOCKEDPAGES].limit = 8; + ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; + ub->ub_parms[UB_SHMPAGES].limit = 64; + ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; + ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; + ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ + ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ + ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; + ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ + ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ + ub->ub_parms[UB_NUMFLOCK].limit = 1024; + ub->ub_parms[UB_NUMPTY].limit = 16; + ub->ub_parms[UB_NUMSIGINFO].limit = 1024; + ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; + ub->ub_parms[UB_NUMFILE].limit = 1024; + + for (k = 0; k < UB_RESOURCES; k++) + ub->ub_parms[k].barrier = ub->ub_parms[k].limit; + + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +#ifdef CONFIG_SMP +static struct percpu_data ub0_percpu; +#endif +static struct ub_percpu_struct ub0_percpu_data[NR_CPUS]; + +void __init ub_init_early(void) +{ + struct user_beancounter *ub; + + init_cache_counters(); + ub = get_ub0(); + memset(ub, 0, sizeof(*ub)); + ub->ub_uid = 0; + init_beancounter_nolimits(ub); + init_beancounter_store(ub); + init_beancounter_struct(ub); + ub->ub_percpu = static_percpu_ptr(&ub0_percpu, ub0_percpu_data); + + memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); + (void)set_exec_ub(ub); + current->task_bc.task_ub = get_beancounter(ub); + __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE); + current->task_bc.fork_sub = get_beancounter(ub); + ub_init_task_bc(¤t->task_bc); + init_mm.mm_ub = get_beancounter(ub); + + hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); + list_add(&ub->ub_list, &ub_list_head); +} + +void __init ub_init_late(void) +{ + ub_cachep = kmem_cache_create("user_beancounters", + sizeof(struct user_beancounter), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); + + memset(&default_beancounter, 0, sizeof(default_beancounter)); +#ifdef CONFIG_UBC_UNLIMITED + init_beancounter_nolimits(&default_beancounter); +#else + init_beancounter_syslimits(&default_beancounter); +#endif + init_beancounter_store(&default_beancounter); + init_beancounter_struct(&default_beancounter); +} diff -uprN linux-2.6.18/kernel/ub/io_acct.c linux-2.6.18.ovz/kernel/ub/io_acct.c --- linux-2.6.18/kernel/ub/io_acct.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/io_acct.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,512 @@ +/* + * kernel/ub/io_acct.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct mempool_s *pb_pool; + +#define PB_MIN_IO (1024) + +static inline struct page_beancounter *io_pb_alloc(void) +{ + return mempool_alloc(pb_pool, GFP_ATOMIC); +} + +static inline void io_pb_free(struct page_beancounter *pb) +{ + mempool_free(pb, pb_pool); +} + +struct page_beancounter **page_pblist(struct page *page) +{ + struct page_beancounter **pb, *iopb; + + pb = &page_pbc(page); + iopb = iopb_to_pb(*pb); + + return iopb == NULL ? pb : &iopb->page_pb_list; +} + +/* + * We save the context page was set dirty to use it later + * when the real write starts. 
If the page is mapped then
+ * the IO pb is stored like this:
+ *
+ * Before saving:
+ *
+ * +- page -------+
+ * |     ...      |
+ * |   page_pb  +-+-+
+ * +--------------+ |
+ *                  |  +-----+    +-----+          +-----+
+ *                  +->| pb1 | -> | pb2 | - ... -> | pbN | -+
+ *                     +-----+    +-----+          +-----+  |
+ *                        ^                                 |
+ *                        +---------------------------------+
+ *
+ * After saving:
+ *
+ * +- page -------+      +- io pb ------+
+ * |     ...      |      |     ...      |
+ * |   page_pb  +------> | page_pb_list +-+
+ * +--------------+      +--------------+ |
+ *                                        |
+ *                  +---------------------+
+ *                  |
+ *                  |  +-----+    +-----+          +-----+
+ *                  +->| pb1 | -> | pb2 | - ... -> | pbN | -+
+ *                     +-----+    +-----+          +-----+  |
+ *                        ^                                 |
+ *                        +---------------------------------+
+ *
+ * And the page_pblist(...) function returns a pointer to the place
+ * that points to this pbX ring.
+ */
+
+#ifdef CONFIG_UBC_DEBUG_IO
+static LIST_HEAD(pb_io_list);
+static unsigned long anon_pages, not_released;
+
+static inline void io_debug_save(struct page_beancounter *pb,
+		struct page_beancounter *mpb)
+{
+	pb->io_debug = (mpb == NULL);
+	list_add(&pb->io_list, &pb_io_list);
+}
+
+static inline void io_debug_release(struct page_beancounter *pb)
+{
+	list_del(&pb->io_list);
+}
+
+void ub_io_release_debug(struct page *page)
+{
+	struct page_beancounter *pb;
+	static int once = 0;
+
+	pb = page_pbc(page);
+	if (likely(iopb_to_pb(pb) == NULL))
+		return;
+
+	if (!once) {
+		printk("BUG: Page has an IO bc but is not expected to\n");
+		dump_stack();
+		once = 1;
+	}
+
+	spin_lock(&pb_lock);
+	not_released++;
+	pb = iopb_to_pb(pb);
+	page_pbc(page) = NULL;
+	io_debug_release(pb);
+	pb->ub->io_pb_held--;
+	spin_unlock(&pb_lock);
+
+	put_beancounter(pb->ub);
+	io_pb_free(pb);
+}
+
+static inline int io_debug_precheck_save(struct page *page)
+{
+	if (unlikely(PageAnon(page))) {
+		anon_pages++;
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline int io_debug_precheck_release(struct page *page)
+{
+	return 0;
+}
+#else
+#define io_debug_save(pb, mpb)		do { } while (0)
+#define io_debug_release(pb)		do { } while (0)
+#define io_debug_precheck_save(page)	(0)
+#define io_debug_precheck_release(p)	(0)
+#endif
+
+static inline void set_page_io(struct page *page, struct page_beancounter *pb,
+		struct page_beancounter *mapped_pb)
+{
+	unsigned long val;
+
+	val = (unsigned long)pb | PAGE_IO_MARK;
+	pb->page = page;
+
+	page_pbc(page) = (struct page_beancounter *)val;
+	io_debug_save(pb, mapped_pb);
+	pb->ub->io_pb_held++;
+}
+
+static inline void put_page_io(struct page *page, struct page_beancounter *pb)
+{
+	pb->ub->io_pb_held--;
+	io_debug_release(pb);
+	page_pbc(page) = pb->page_pb_list;
+}
+
+void ub_io_save_context(struct page *page, size_t bytes_dirtied)
+{
+	struct user_beancounter *ub;
+	struct page_beancounter *pb, *mapped_pb, *io_pb;
+
+	if (unlikely(in_interrupt())) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	/*
+	 * FIXME - this can happen from atomic context and
+	 * it's probably not that good to lose some requests
+	 */
+
+	pb = io_pb_alloc();
+	io_pb = NULL;
+
+	spin_lock(&pb_lock);
+	if (io_debug_precheck_save(page))
+		goto out_unlock;
+
+	mapped_pb = page_pbc(page);
+	io_pb = iopb_to_pb(mapped_pb);
+	if (io_pb != NULL) {
+		/*
+		 * this page has an IO - release it and force a new one
+		 * We could also race with page cleaning - see below
+		 */
+		mapped_pb = io_pb->page_pb_list;
+		put_page_io(page, io_pb);
+	}
+
+	/*
+	 * If the page is mapped we must save the context
+	 * it maps to. If the page isn't mapped we use the current
+	 * context, as this is a regular write.
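+	 *
+	 * For example: if a page was dirtied by a task in beancounter A,
+	 * the IO pb saved here references A, so when writeback later
+	 * submits the page, ub_io_release_context() will charge the
+	 * written bytes to A no matter which task actually issues the IO.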
+ */ + + if (mapped_pb != NULL) + ub = top_beancounter(mapped_pb->ub); + else + ub = get_io_ub(); + + if (!PageDirty(page)) { + /* + * race with clear_page_dirty(_for_io) - account + * writes for ub_io_release_context() + */ + if (io_pb != NULL) + io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE; + if (pb != NULL) + io_pb_free(pb); + goto out_unlock; + } + + if (pb == NULL) { + ub->bytes_dirty_missed += bytes_dirtied; + goto out_unlock; + } + + /* + * the page may become clean here, but the context will be seen + * in ub_io_release_context() + */ + + pb->ub = get_beancounter(ub); + pb->page_pb_list = mapped_pb; + ub->bytes_dirtied += bytes_dirtied; + + set_page_io(page, pb, mapped_pb); + +out_unlock: + spin_unlock(&pb_lock); + + if (io_pb != NULL) { + put_beancounter(io_pb->ub); + io_pb_free(io_pb); + } +} + +void ub_io_release_context(struct page *page, size_t wrote) +{ + struct page_beancounter *pb; + + if (io_debug_precheck_release(page)) + return; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + spin_lock(&pb_lock); + pb = iopb_to_pb(page_pbc(page)); + if (unlikely(pb == NULL)) + /* + * this may happen if we failed to allocate + * context in ub_io_save_context or raced with it + */ + goto out_unlock; + + if (wrote) + pb->ub->bytes_wrote += wrote; + + put_page_io(page, pb); +out_unlock: + spin_unlock(&pb_lock); + + if (pb != NULL) { + put_beancounter(pb->ub); + io_pb_free(pb); + } +} + +void __init ub_init_io(struct kmem_cache *pb_cachep) +{ + pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep); + if (pb_pool == NULL) + panic("Can't create pb_pool"); +} + +#ifdef CONFIG_PROC_FS +#define in_flight(var) (var > var##_done ? var - var##_done : 0) + +static int bc_ioacct_show(struct seq_file *f, void *v) +{ + int i; + unsigned long long read, write, cancel; + unsigned long sync, sync_done; + unsigned long fsync, fsync_done; + unsigned long fdsync, fdsync_done; + unsigned long frsync, frsync_done; + unsigned long reads, writes; + unsigned long long rchar, wchar; + struct user_beancounter *ub; + + ub = seq_beancounter(f); + + read = write = cancel = 0; + sync = sync_done = fsync = fsync_done = + fdsync = fdsync_done = frsync = frsync_done = 0; + reads = writes = 0; + rchar = wchar = 0; + for_each_online_cpu(i) { + struct ub_percpu_struct *ub_percpu; + ub_percpu = per_cpu_ptr(ub->ub_percpu, i); + + read += ub_percpu->bytes_read; + write += ub_percpu->bytes_wrote; + cancel += ub_percpu->bytes_cancelled; + + sync += ub_percpu->sync; + fsync += ub_percpu->fsync; + fdsync += ub_percpu->fdsync; + frsync += ub_percpu->frsync; + sync_done += ub_percpu->sync_done; + fsync_done += ub_percpu->fsync_done; + fdsync_done += ub_percpu->fdsync_done; + frsync_done += ub_percpu->frsync_done; + + reads += ub_percpu->read; + writes += ub_percpu->write; + rchar += ub_percpu->rchar; + wchar += ub_percpu->wchar; + } + + seq_printf(f, bc_proc_llu_fmt, "read", read); + seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write); + seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied); + seq_printf(f, bc_proc_llu_fmt, "cancel", cancel); + seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync)); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync)); + 
seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync)); + + seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads); + seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar); + seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes); + seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar); + + seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held); + return 0; +} + +static struct bc_proc_entry bc_ioacct_entry = { + .name = "ioacct", + .u.show = bc_ioacct_show, +}; + +#ifdef CONFIG_UBC_DEBUG_IO +#define PTR_SIZE (int)(sizeof(void *) * 2) +#define INT_SIZE (int)(sizeof(int) * 2) + +static int bc_io_show(struct seq_file *f, void *v) +{ + struct list_head *lh; + struct page_beancounter *pb; + struct page *pg; + + lh = (struct list_head *)v; + if (lh == &pb_io_list) { + seq_printf(f, "Races: anon %lu missed %lu\n", + anon_pages, not_released); + + seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s " + "%-*s %-*s %-1s %-*s %-*s\n", + PTR_SIZE, "pb", "", + PTR_SIZE, "page", "flg", + INT_SIZE, "cnt", INT_SIZE, "mcnt", + PTR_SIZE, "pb_list", + PTR_SIZE, "page_pb", "", + PTR_SIZE, "mapping", + INT_SIZE, "ub"); + return 0; + } + + pb = list_entry(lh, struct page_beancounter, io_list); + pg = pb->page; + seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n", + pb, pb->io_debug ? 'e' : 'm', pg, + PageDirty(pg) ? 'D' : 'd', + PageAnon(pg) ? 'A' : 'a', + PageWriteback(pg) ? 'W' : 'w', + PageLocked(pg) ? 'L' : 'l', + INT_SIZE, page_count(pg), + INT_SIZE, page_mapcount(pg), + pb->page_pb_list, page_pbc(pg), + iopb_to_pb(page_pbc(pg)) == pb ? ' ' : '!', + pg->mapping, pb->ub->ub_uid); + return 0; +} + +static void *bc_io_start(struct seq_file *f, loff_t *ppos) +{ + loff_t pos; + struct list_head *lh; + + pos = *ppos; + spin_lock(&pb_lock); + if (pos == 0) + return &pb_io_list; + + list_for_each (lh, &pb_io_list) + if (pos-- == 1) + return lh; + return NULL; +} + +static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct list_head *lh; + + (*ppos)++; + lh = (struct list_head *)v; + return lh->next == &pb_io_list ? NULL : lh->next; +} + +static void bc_io_stop(struct seq_file *f, void *v) +{ + spin_unlock(&pb_lock); +} + +static struct seq_operations bc_io_seq_ops = { + .start = bc_io_start, + .next = bc_io_next, + .stop = bc_io_stop, + .show = bc_io_show, +}; + +static int bc_io_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &bc_io_seq_ops); +} +static struct file_operations bc_io_debug_ops = { + .open = bc_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_ioacct_debug_entry = { + .name = "ioacct_debug", + .u.fops = &bc_io_debug_ops, +}; +#endif + +static int bc_ioacct_notify(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + unsigned long *vm_events; + unsigned long long bin, bout; + int i; + + if (event != VIRTINFO_VMSTAT) + return old_ret; + + ub = top_beancounter(get_exec_ub()); + + /* Think over: do we need to account here bytes_dirty_missed? 
*/ + bout = ub->bytes_wrote; + bin = 0; + for_each_online_cpu(i) { + bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote; + bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read; + } + + /* convert to Kbytes */ + bout >>= 10; + bin >>= 10; + + vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS; + vm_events[PGPGOUT] = (unsigned long)bout; + vm_events[PGPGIN] = (unsigned long)bin; + return NOTIFY_OK; +} + +static struct vnotifier_block bc_ioacct_nb = { + .notifier_call = bc_ioacct_notify, +}; + +static int __init bc_ioacct_init(void) +{ +#ifdef CONFIG_UBC_DEBUG_IO + bc_register_proc_root_entry(&bc_ioacct_debug_entry); +#endif + bc_register_proc_entry(&bc_ioacct_entry); + + virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb); + return 0; +} + +late_initcall(bc_ioacct_init); +#endif diff -uprN linux-2.6.18/kernel/ub/io_prio.c linux-2.6.18.ovz/kernel/ub/io_prio.c --- linux-2.6.18/kernel/ub/io_prio.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/io_prio.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,277 @@ +/* + * kernel/ub/io_prio.c + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Vasily Tarasov + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct cfq_bc_data *__find_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + + list_for_each_entry(cfq_bc, &iopriv->cfq_bc_head, cfq_bc_list) + if (cfq_bc->cfqd == cfqd) + return cfq_bc; + + return NULL; +} + +struct cfq_bc_data *bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t gfp_mask) +{ + struct cfq_bc_data *cfq_bc_new; + struct cfq_bc_data *cfq_bc; + unsigned long flags; + + read_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + read_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + + if (cfq_bc) + return cfq_bc; + + cfq_bc_new = kzalloc(sizeof(*cfq_bc_new), gfp_mask); + if (!cfq_bc_new) + return NULL; + + cfq_init_cfq_bc(cfq_bc_new); + cfq_bc_new->cfqd = cfqd; + cfq_bc_new->ub_iopriv = iopriv; + + write_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + if (cfq_bc) + kfree(cfq_bc_new); + else { + list_add_tail(&cfq_bc_new->cfq_bc_list, + &iopriv->cfq_bc_head); + cfq_bc = cfq_bc_new; + } + write_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + + return cfq_bc; +} + +void bc_init_ioprio(struct ub_iopriv *iopriv) +{ + INIT_LIST_HEAD(&iopriv->cfq_bc_head); + rwlock_init(&iopriv->cfq_bc_list_lock); + iopriv->ioprio = UB_IOPRIO_BASE; +} + +static void inline bc_cfq_bc_check_empty(struct cfq_bc_data *cfq_bc) +{ + int i; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + BUG_ON(!list_empty(&cfq_bc->rr_list[i])); + + BUG_ON(!list_empty(&cfq_bc->cur_rr)); + BUG_ON(!list_empty(&cfq_bc->busy_rr)); + BUG_ON(!list_empty(&cfq_bc->idle_rr)); +} + +static void bc_release_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + struct cfq_data *cfqd; + elevator_t *eq; + int i; + + cfqd = cfq_bc->cfqd; + eq = cfqd->queue->elevator; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + if (cfq_bc->async_cfqq[i]) { + eq->ops->put_queue(cfq_bc->async_cfqq[i]); + cfq_bc->async_cfqq[i] = NULL; + } + /* + * Note: this cfq_bc is already not in active list, + * but can be still pointed from cfqd as active. 
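+	 * Resetting cfqd->active_cfq_bc below prevents the scheduler
+	 * from dereferencing this cfq_bc after it is freed; the next
+	 * bc_set_active() call will simply pick a new active bc.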
+ */ + cfqd->active_cfq_bc = NULL; + + bc_cfq_bc_check_empty(cfq_bc); + list_del(&cfq_bc->cfq_bc_list); + kfree(cfq_bc); +} + +void bc_fini_ioprio(struct ub_iopriv *iopriv) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + unsigned long flags; + spinlock_t *queue_lock; + + /* + * Don't get cfq_bc_list_lock since ub is already dead, + * but async cfqqs are still in hash list, consequently + * queue_lock should be hold. + */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &iopriv->cfq_bc_head, cfq_bc_list) { + queue_lock = cfq_bc->cfqd->queue->queue_lock; + spin_lock_irqsave(queue_lock, flags); + bc_release_cfq_bc(cfq_bc); + spin_unlock_irqrestore(queue_lock, flags); + } +} + +void bc_cfq_exit_queue(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct user_beancounter *ub; + + local_irq_disable(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + bc_release_cfq_bc(cfq_bc); + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + local_irq_enable(); +} + +int bc_expired(struct cfq_data *cfqd) +{ + return time_after(jiffies, cfqd->slice_end) ? 1 : 0; +} + +static inline int bc_empty(struct cfq_bc_data *cfq_bc) +{ + /* + * consider BC as empty only if there is no requests + * in elevator _and_ in driver + */ + if (!cfq_bc->rqnum && !cfq_bc->on_dispatch) + return 1; + + return 0; +} + +static inline unsigned long bc_time_slice_by_ioprio(unsigned int ioprio, + unsigned int base_slice) +{ + return base_slice + + (base_slice * (ioprio - UB_IOPRIO_MIN)) + / (UB_IOPRIO_MAX - UB_IOPRIO_MIN - 1); +} + +static inline void bc_set_active(struct cfq_data *cfqd) +{ + /* if no active BCs then keep this as an active one */ + if (list_empty(&cfqd->act_cfq_bc_head)) + return; + + cfqd->active_cfq_bc = list_first_entry(&cfqd->act_cfq_bc_head, + struct cfq_bc_data, act_cfq_bc_list); + list_move_tail(&cfqd->active_cfq_bc->act_cfq_bc_list, + &cfqd->act_cfq_bc_head); + cfqd->slice_end = jiffies + + bc_time_slice_by_ioprio(cfqd->active_cfq_bc->ub_iopriv->ioprio, + cfqd->cfq_ub_slice); +} + +void bc_schedule_active(struct cfq_data *cfqd) +{ + if (bc_expired(cfqd) || !cfqd->active_cfq_bc || + bc_empty(cfqd->active_cfq_bc)) + bc_set_active(cfqd); +} + +void bc_inc_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + if (!cfq_bc->rqnum) + list_add_tail(&cfq_bc->act_cfq_bc_list, + &cfqq->cfqd->act_cfq_bc_head); + + cfq_bc->rqnum++; +} + +void bc_dec_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + cfq_bc->rqnum--; + + if (!cfq_bc->rqnum) + list_del(&cfq_bc->act_cfq_bc_list); +} + +unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + struct user_beancounter *ub; + + if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX) + return -ERANGE; + + ub = get_beancounter_byuid(ubid, 0); + if (!ub) + return -ESRCH; + + ub->iopriv.ioprio = ioprio; + put_beancounter(ub); + + return 0; +} + +struct user_beancounter *bc_io_switch_context(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *old_ub = NULL; + + pb = page_iopb(page); + pb = iopb_to_pb(pb); + if (pb) { + get_beancounter(pb->ub); + old_ub = set_exec_ub(pb->ub); + } + + return old_ub; +} + +void bc_io_restore_context(struct user_beancounter *ub) +{ + struct user_beancounter *old_ub; + + if (ub) { + old_ub = set_exec_ub(ub); + put_beancounter(old_ub); + } +} + 
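+
+/*
+ * A sketch of how the two context helpers above are meant to be used
+ * by a writeback path (illustrative only, not a caller from this
+ * patch):
+ *
+ *	struct user_beancounter *old_ub;
+ *
+ *	old_ub = bc_io_switch_context(page);
+ *	... submit the IO on behalf of the page's owner ...
+ *	bc_io_restore_context(old_ub);
+ */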
+EXPORT_SYMBOL(bc_io_switch_context);
+EXPORT_SYMBOL(bc_io_restore_context);
+EXPORT_SYMBOL(__find_cfq_bc);
+EXPORT_SYMBOL(bc_fini_ioprio);
+EXPORT_SYMBOL(bc_init_ioprio);
+EXPORT_SYMBOL(bc_findcreate_cfq_bc);
+EXPORT_SYMBOL(bc_cfq_exit_queue);
+EXPORT_SYMBOL(bc_expired);
+EXPORT_SYMBOL(bc_schedule_active);
+EXPORT_SYMBOL(bc_inc_rqnum);
+EXPORT_SYMBOL(bc_dec_rqnum);
diff -uprN linux-2.6.18/kernel/ub/ub_dcache.c linux-2.6.18.ovz/kernel/ub/ub_dcache.c
--- linux-2.6.18/kernel/ub/ub_dcache.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.18.ovz/kernel/ub/ub_dcache.c	2007-06-13 06:55:07.000000000 -0400
@@ -0,0 +1,676 @@
+/*
+ * kernel/ub/ub_dcache.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * Locking
+ *				traverse	dcache_lock	d_lock
+ *	ub_dentry_charge	+		-		+
+ *	ub_dentry_uncharge	+		+		-
+ *	ub_dentry_charge_nofail	+		+		-
+ *
+ * d_inuse changes are atomic, with special handling of the "not in use"
+ * <-> "in use" (-1 <-> 0) transitions.  We have two sources of
+ * non-atomicity here: (1) in many operations we need to change d_inuse
+ * of both a dentry and its parent, and (2) on state transitions we need
+ * to adjust the account.
+ *
+ * Regarding (1): we do not have (and do not want) a single lock covering
+ * all operations, so in general it's impossible to get a consistent view
+ * of a tree with respect to d_inuse counters (except by swsuspend).  It
+ * also means that if a dentry with a d_inuse of 0 gets one new in-use
+ * child and loses one, its d_inuse counter will follow either the
+ * 0 -> 1 -> 0 path or the 0 -> -1 -> 0 path, and we can't say which.
+ * Note that the path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since
+ * an uncharge can be done only after the return from a charge (with
+ * d_genocide being the only apparent exception).
+ * Regarding (2): there is a similar uncertainty with the dcache account.
+ * If the account is at the limit while one more dentry starts to be used
+ * and another one is put, the account will either hit the limit (and an
+ * error will be returned), or the decrement will happen before the
+ * increment.
+ *
+ * These races do not really matter.
+ * The only things we want are:
+ *  - if a system is suspended with no in-use dentries, all d_inuse
+ *    counters should be correct (-1);
+ *  - d_inuse counters should always be >= -1.
+ * This holds if ->parent references are accessed and maintained properly.
+ * In subtle moments (like d_move) dentries exchanging their parents
+ * should both be in-use.  At d_genocide time, lookups and charges are
+ * assumed to be impossible.
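+ *
+ * A concrete example of the counter transitions described above: an
+ * unused dentry has d_inuse == -1; the first real user moves it to 0
+ * (charging kmemsize/dcachesize), further users move it to 1, 2, ...,
+ * and the last put brings it back to -1 and uncharges.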
+ */ + +/* + * Hierarchical accounting + * UB argument must NOT be NULL + */ + +static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) + goto out_mem; + if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) + goto out_dcache; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_dcache: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); +out_mem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_uncharge_dcache(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); + __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static int charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + struct user_beancounter *p, *q; + + for (p = ub; p != NULL; p = p->parent) { + if (do_charge_dcache(p, size, sv)) + goto unroll; + } + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_uncharge_dcache(q, size); + return -ENOMEM; +} + +void uncharge_dcache(struct user_beancounter *ub, unsigned long size) +{ + for (; ub != NULL; ub = ub->parent) + do_uncharge_dcache(ub, size); +} + +/* + * Simple helpers to do maintain account and d_ub field. + */ + +static inline int d_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) { + put_beancounter(ub); + return -1; + } + d_bc->d_ub = ub; + return 0; +} + +static inline void d_forced_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + charge_dcache(ub, d_bc->d_ubsize, UB_FORCE); + d_bc->d_ub = ub; +} + +/* + * Minor helpers + */ + +extern kmem_cache_t *dentry_cache; +extern kmem_cache_t *inode_cachep; +static struct rw_semaphore ub_dentry_alloc_sem; + +static inline unsigned int dentry_memusage(void) +{ + return dentry_cache->objuse; +} + +static inline unsigned int inode_memusage(void) +{ + return inode_cachep->objuse; +} + +static inline unsigned long d_charge_size(struct dentry *dentry) +{ + /* dentry's d_name is already set to appropriate value (see d_alloc) */ + return inode_cachep->objuse + dentry_cache->objuse + + (dname_external(dentry) ? + kmem_obj_memusage((void *)dentry->d_name.name) : 0); +} + +/* + * Entry points from dcache.c + */ + +/* + * Set initial d_inuse on d_alloc. + * Called with no locks, preemption disabled. 
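+ * If the charge fails, __ub_dentry_alloc() returns -ENOMEM and the
+ * dentry allocation as a whole is expected to fail.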
+ */
+int __ub_dentry_alloc(struct dentry *dentry)
+{
+	struct dentry_beancounter *d_bc;
+
+	d_bc = &dentry->dentry_bc;
+	d_bc->d_ub = get_beancounter(get_exec_ub());
+	atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in ub_dcache.h */
+	d_bc->d_ubsize = d_charge_size(dentry);
+
+	if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD))
+		goto failure;
+	return 0;
+
+failure:
+	put_beancounter(d_bc->d_ub);
+	d_bc->d_ub = NULL;
+	return -ENOMEM;
+}
+
+void __ub_dentry_alloc_start(void)
+{
+	down_read(&ub_dentry_alloc_sem);
+	current->task_bc.dentry_alloc = 1;
+}
+
+void __ub_dentry_alloc_end(void)
+{
+	current->task_bc.dentry_alloc = 0;
+	up_read(&ub_dentry_alloc_sem);
+}
+
+/*
+ * It is assumed that the parent is already in use, so the upward
+ * traversal is limited to one ancestor only.
+ * Called under d_lock and rcu_read_lock.
+ */
+int __ub_dentry_charge(struct dentry *dentry)
+{
+	struct dentry_beancounter *d_bc;
+	struct dentry *parent;
+	int ret;
+
+	if (ub_dget_testone(dentry)) {
+		d_bc = &dentry->dentry_bc;
+		/* state transition -1 => 0 */
+		if (d_charge(d_bc))
+			goto failure;
+
+		if (dentry != dentry->d_parent) {
+			parent = dentry->d_parent;
+			if (ub_dget_testone(parent))
+				BUG();
+		}
+	}
+	return 0;
+
+failure:
+	/*
+	 * Here we would like to fail the lookup.
+	 * It is not easy: if d_lookup fails, callers expect that a dentry
+	 * with the given name doesn't exist, and create a new one.
+	 * So, first we forcibly charge for this dentry.
+	 * Then we try to remove it from the cache safely.  If that turns
+	 * out to be possible, we can return an error.
+	 */
+	d_forced_charge(d_bc);
+
+	if (dentry != dentry->d_parent) {
+		parent = dentry->d_parent;
+		if (ub_dget_testone(parent))
+			BUG();
+	}
+
+	ret = 0;
+	if (spin_trylock(&dcache_lock)) {
+		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_lock);
+			rcu_read_unlock();
+			shrink_dcache_parent(dentry);
+			rcu_read_lock();
+			spin_lock(&dcache_lock);
+			spin_lock(&dentry->d_lock);
+		}
+		if (atomic_read(&dentry->d_count) == 1) {
+			__d_drop(dentry);
+			ret = -1;
+		}
+		spin_unlock(&dcache_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Go up the tree decreasing d_inuse.
+ * Called under dcache_lock.
+ */
+void __ub_dentry_uncharge(struct dentry *dentry)
+{
+	struct dentry *parent;
+	struct user_beancounter *ub;
+	unsigned long size;
+
+	/* go up until the state doesn't change or the root is reached */
+	size = dentry->dentry_bc.d_ubsize;
+	ub = dentry->dentry_bc.d_ub;
+	while (ub_dput_testzero(dentry)) {
+		/* state transition 0 => -1 */
+		uncharge_dcache(ub, size);
+		put_beancounter(ub);
+
+		parent = dentry->d_parent;
+		if (dentry == parent)
+			break;
+
+		dentry = parent;
+		size = dentry->dentry_bc.d_ubsize;
+		ub = dentry->dentry_bc.d_ub;
+	}
+}
+
+/*
+ * Forced charge for __dget_locked, where the API doesn't allow
+ * returning an error.
+ * Called under dcache_lock.
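+ * Unlike __ub_dentry_charge() above, this path may not fail, so the
+ * charge is forced (UB_FORCE) even if it drives the beancounter over
+ * its barrier and limit.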
+ */ +void __ub_dentry_charge_nofail(struct dentry *dentry) +{ + struct dentry *parent; + + while (ub_dget_testone(dentry)) { + /* state transition -1 => 0 */ + d_forced_charge(&dentry->dentry_bc); + + parent = dentry->d_parent; + if (dentry == parent) + break; + dentry = parent; + } +} + +/* + * Adaptive accounting + */ + +int ub_dentry_on; +int ub_dentry_alloc_barrier; +EXPORT_SYMBOL(ub_dentry_on); + +static DEFINE_PER_CPU(int, checkcnt); +static unsigned long checklowat = 0; +static unsigned long checkhiwat = ULONG_MAX; + +static int sysctl_ub_dentry_chk = 10; +#define sysctl_ub_lowat sysctl_ub_watermark[0] +#define sysctl_ub_hiwat sysctl_ub_watermark[1] +static DECLARE_RWSEM(ub_dentry_alloc_sem); +/* 1024th of lowmem size */ +static unsigned int sysctl_ub_watermark[2] = {0, 100}; + + +static int ub_dentry_acctinit(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + + d_bc = &dentry->dentry_bc; + d_bc->d_ub = NULL; + atomic_set(&d_bc->d_inuse, -1); + if (dname_external(dentry)) { + struct page *page; + page = virt_to_page(dentry->d_name.name); + if (!PageSlab(page) || page_get_cache(page) == NULL) { + printk("Problem with name, dentry %p, parent %p, " + "name %p len %d\n", + dentry, dentry->d_parent, + dentry->d_name.name, + dentry->d_name.len); + printk(" de %p name %.10s\n", + dentry, dentry->d_name.name); + d_bc->d_ubsize = 0; + return 0; + } + } + d_bc->d_ubsize = d_charge_size(dentry); + return 0; +} + +static int ub_dentry_acctcount(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + struct dentry *child; + int count; + + count = 0; + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) + count++; + + d_bc = &dentry->dentry_bc; + count = atomic_read(&dentry->d_count) - count; + if (count) { + __ub_dentry_charge_nofail(dentry); + if (count > 1) + atomic_add(count - 1, &d_bc->d_inuse); + } + + return 0; +} + +static int ub_dentry_acctdrop(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + + d_bc = &dentry->dentry_bc; + if (atomic_read(&d_bc->d_inuse) < 0) + return 0; + atomic_set(&d_bc->d_inuse, -1); + uncharge_dcache(d_bc->d_ub, d_bc->d_ubsize); + put_beancounter(d_bc->d_ub); + return 0; +} + +extern void kmem_cache_free_block(kmem_cache_t *cachep, + struct kmem_list3 *l3, void **objpp, + int nr_objects, int node); + +static int ub_dentry_walk_node(int (*fun)(struct dentry *), int node) +{ + kmem_cache_t *cachep; + struct array_cache *ac; + struct slab *slabp; + char *objp; + int cpu, i, sz, r, n; + struct kmem_list3 *l3; + unsigned long map[PAGE_SIZE / sizeof(struct dentry) + / BITS_PER_LONG + 1]; + + cachep = dentry_cache; + if (cachep->num >= sizeof(map) * 8) + return -E2BIG; + + l3 = cachep->nodelists[node]; + /* drain all CPU caches to have up-to-date free map */ + +#ifdef CONFIG_NUMA + /* walk through all nodes and drain alien caches */ + for_each_online_node (n) { + if (!cachep->nodelists[n]->alien) + continue; + ac = cachep->nodelists[n]->alien[node]; + if (!ac) + continue; + kmem_cache_free_block(cachep, cachep->nodelists[node], + ac->entry, ac->avail, node); + ac->avail = 0; + } +#endif + + ac = l3->shared; + kmem_cache_free_block(cachep, l3, ac->entry, ac->avail, node); + ac->avail = 0; + for_each_online_cpu(cpu) { + ac = cachep->array[cpu]; + n = cpu_to_node(cpu); + kmem_cache_free_block(cachep, cachep->nodelists[n], + ac->entry, ac->avail, n); + ac->avail = 0; + } + + list_for_each_entry(slabp, &l3->slabs_full, list) { + touch_nmi_watchdog(); + for (i = 0, objp = slabp->s_mem; + i < cachep->num; + i++, objp += 
cachep->buffer_size) { +#if SLAB_DEBUG + r = (*fun)((struct dentry *) + (objp + cachep->obj_offset)); +#else + r = (*fun)((struct dentry *)objp); +#endif + if (r) + return r; + } + } + + list_for_each_entry(slabp, &l3->slabs_partial, list) { + touch_nmi_watchdog(); + memset(map, 0xff, sizeof(map)); + for (i = slabp->free, r = 0; + i != BUFCTL_END; + i = slab_bufctl(slabp)[i], r++) { + if (r > cachep->num) + return -1; + __clear_bit(i, map); + } + sz = sizeof(map) * BITS_PER_LONG; + for (i = find_first_bit(map, sz); + i < cachep->num; + i = find_next_bit(map, sz, i + 1)) { + objp = slabp->s_mem + i * cachep->buffer_size; +#if SLAB_DEBUG + r = (*fun)((struct dentry *) + (objp + cachep->obj_offset)); +#else + r = (*fun)((struct dentry *)objp); +#endif + if (r) + return r; + } + } + + return 0; +} + +static int ub_dentry_walk(int (*fun)(struct dentry *)) +{ + int node; + int err; + + for_each_online_node (node) { + if ((err = ub_dentry_walk_node(fun, node)) != 0) + return err; + } + return 0; +} + +static int ub_dentry_accton(void *data) +{ + struct user_beancounter *ub; + int err; + + ub = get_exec_ub(); + set_exec_ub(get_ub0()); + err = ub_dentry_walk(&ub_dentry_acctinit); + if (!err) + err = ub_dentry_walk(&ub_dentry_acctcount); + set_exec_ub(ub); + if (err == 0) + ub_dentry_on = 1; + return err; +} + +static int ub_dentry_acctoff(void *data) +{ + int ret; + ret = ub_dentry_walk(&ub_dentry_acctdrop); + if (ret == 0) + ub_dentry_on = 0; + return ret; +} + +/* + * Main function turning dcache accounting on and off. + * Called with preemption disabled (for caller's convenience). + */ +static void ub_dentry_switch(int onoff, unsigned long pages, int (*fun)(void *)) +{ + static char *s[] = { "off", "on" }; + unsigned long start_jiffies; + int err, tm; + + start_jiffies = jiffies; + preempt_enable(); + ub_dentry_alloc_barrier = 1; + /* ensure ub_dentry_alloc_barrier is visible on all CPUs */ + mb(); + synchronize_rcu(); + down_write(&ub_dentry_alloc_sem); + if (ub_dentry_on == onoff) + goto done; + + printk("UBC: preparing to turn dcache accounting %s, " + "size %lu pages, watermarks %lu %lu\n", + s[onoff], pages, checklowat, checkhiwat); + err = stop_machine_run(fun, NULL, NR_CPUS); + if (err) { + printk(KERN_ERR "UBC: ERROR: dcache accounting switch %d\n", + err); + preempt_disable(); + checklowat = 0; + checkhiwat = ULONG_MAX; + sysctl_ub_dentry_chk = INT_MAX; + preempt_enable(); + } else { + tm = jiffies_to_msecs(jiffies - start_jiffies); + printk("UBC: turning dcache accounting %s succeeded, " + "usage %lu, time %u.%03u\n", + s[onoff], + get_ub0()->ub_parms[UB_DCACHESIZE].held, + tm / 1000, tm % 1000); + } + +done: + ub_dentry_alloc_barrier = 0; + up_write(&ub_dentry_alloc_sem); + preempt_disable(); +} + +void ub_dentry_checkup(void) +{ + int *p; + unsigned long pages; + + preempt_disable(); + p = &__get_cpu_var(checkcnt); + if (++*p > sysctl_ub_dentry_chk) { + *p = 0; + pages = dentry_cache->grown + - dentry_cache->reaped + - dentry_cache->shrunk; + pages <<= dentry_cache->gfporder; + if (ub_dentry_on) { + if (pages < checklowat) + ub_dentry_switch(0, pages, &ub_dentry_acctoff); + } else { + if (pages >= checkhiwat) + ub_dentry_switch(1, pages, &ub_dentry_accton); + } + } + preempt_enable(); +} + +static void ub_dentry_set_limits(unsigned long pages, unsigned long cap) +{ + down_write(&ub_dentry_alloc_sem); + preempt_disable(); + checklowat = (pages >> 10) * sysctl_ub_lowat; + checkhiwat = (pages >> 10) * sysctl_ub_hiwat; + if (checkhiwat > cap) { + checkhiwat = cap; + checklowat = cap / 
sysctl_ub_hiwat * sysctl_ub_lowat; + } + preempt_enable(); + up_write(&ub_dentry_alloc_sem); +} + +static int ub_dentry_proc_handler(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + r = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!r && write) + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + ULONG_MAX); + return r; +} + +static ctl_table ub_dentry_sysctl_table[] = { + { + .ctl_name = 1000, + .procname = "dentry_check", + .data = &sysctl_ub_dentry_chk, + .maxlen = sizeof(sysctl_ub_dentry_chk), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 1001, + .procname = "dentry_watermark", + .data = &sysctl_ub_lowat, + .maxlen = sizeof(sysctl_ub_lowat) * 2, + .mode = 0644, + .proc_handler = &ub_dentry_proc_handler, + }, + { .ctl_name = 0 } +}; +static ctl_table ub_dentry_sysctl_root[] = { + { + .ctl_name = 23681, + .procname = "ubc", + .mode = 0555, + .child = ub_dentry_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static int __init ub_dentry_init(void) +{ + /* + * Initial watermarks are limited, to limit walk time. + * 384MB translates into 0.8 sec on PIII 866MHz. + */ + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + 384 * 1024 * 1024 / PAGE_SIZE); + if (register_sysctl_table(ub_dentry_sysctl_root, 0) == NULL) + return -ENOMEM; + return 0; +} +__initcall(ub_dentry_init); diff -uprN linux-2.6.18/kernel/ub/ub_mem.c linux-2.6.18.ovz/kernel/ub/ub_mem.c --- linux-2.6.18/kernel/ub/ub_mem.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_mem.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,446 @@ +/* + * kernel/ub/ub_mem.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Initialization + */ + +/* + * Slab accounting + */ + +#ifdef CONFIG_UBC_DEBUG_KMEM + +#define CC_HASH_SIZE 1024 +static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; +spinlock_t cc_lock; + +static void __free_cache_counters(struct user_beancounter *ub, + kmem_cache_t *cachep) +{ + struct ub_cache_counter *cc, **pprev, *del; + int i; + unsigned long flags; + + del = NULL; + spin_lock_irqsave(&cc_lock, flags); + for (i = 0; i < CC_HASH_SIZE; i++) { + pprev = &cc_hash[i]; + cc = cc_hash[i]; + while (cc != NULL) { + if (cc->ub != ub && cc->cachep != cachep) { + pprev = &cc->next; + cc = cc->next; + continue; + } + + list_del(&cc->ulist); + *pprev = cc->next; + cc->next = del; + del = cc; + cc = *pprev; + } + } + spin_unlock_irqrestore(&cc_lock, flags); + + while (del != NULL) { + cc = del->next; + kfree(del); + del = cc; + } +} + +void ub_free_counters(struct user_beancounter *ub) +{ + __free_cache_counters(ub, NULL); +} + +void ub_kmemcache_free(kmem_cache_t *cachep) +{ + __free_cache_counters(NULL, cachep); +} + +void __init init_cache_counters(void) +{ + memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); + spin_lock_init(&cc_lock); +} + +#define cc_hash_fun(ub, cachep) ( \ + (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ + ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ + ) & (CC_HASH_SIZE - 1)) + +static int change_slab_charged(struct user_beancounter *ub, + kmem_cache_t *cachep, long val) +{ + struct ub_cache_counter *cc, *new_cnt, **pprev; + unsigned long flags; + + new_cnt = NULL; +again: + spin_lock_irqsave(&cc_lock, flags); + cc = cc_hash[cc_hash_fun(ub, cachep)]; + while (cc) { + if (cc->ub == ub && cc->cachep == cachep) + goto found; + cc = cc->next; + } + + if (new_cnt != NULL) + goto insert; + + spin_unlock_irqrestore(&cc_lock, flags); + + new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC); + if (new_cnt == NULL) + return -ENOMEM; + + new_cnt->counter = 0; + new_cnt->ub = ub; + new_cnt->cachep = cachep; + goto again; + +insert: + pprev = &cc_hash[cc_hash_fun(ub, cachep)]; + new_cnt->next = *pprev; + *pprev = new_cnt; + list_add(&new_cnt->ulist, &ub->ub_cclist); + cc = new_cnt; + new_cnt = NULL; + +found: + cc->counter += val; + spin_unlock_irqrestore(&cc_lock, flags); + if (new_cnt) + kfree(new_cnt); + return 0; +} + +static inline int inc_slab_charged(struct user_beancounter *ub, + kmem_cache_t *cachep) +{ + return change_slab_charged(ub, cachep, 1); +} + +static inline void dec_slab_charged(struct user_beancounter *ub, + kmem_cache_t *cachep) +{ + if (change_slab_charged(ub, cachep, -1) < 0) + BUG(); +} + +#include + +#define inc_pages_charged(ub, order) ub_percpu_add(ub, \ + pages_charged, 1 << order) +#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \ + pages_charged, 1 << order) + +#ifdef CONFIG_PROC_FS +static int bc_kmem_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + struct ub_cache_counter *cc; + long pages, vmpages, pbc; + int i; + + ub = seq_beancounter(f); + + pages = vmpages = pbc = 0; + for_each_online_cpu(i) { + pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged; + vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged; + pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs; + } + if (pages < 0) + pages = 0; + if (vmpages < 0) + vmpages = 0; + 
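+	/*
+	 * The per-cpu sums above can transiently go negative (charges
+	 * and uncharges may land on different cpus), hence the clamping
+	 * before they are reported.
+	 */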
+ seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc, + sizeof(struct page_beancounter)); + + spin_lock_irq(&cc_lock); + list_for_each_entry (cc, &ub->ub_cclist, ulist) { + kmem_cache_t *cachep; + + cachep = cc->cachep; + seq_printf(f, bc_proc_lu_lu_fmt, + cachep->name, cc->counter, + (unsigned long)cachep->objuse); + } + spin_unlock_irq(&cc_lock); + return 0; +} + +static struct bc_proc_entry bc_kmem_debug_entry = { + .name = "kmem_debug", + .u.show = bc_kmem_debug_show, +}; + +static int __init bc_kmem_debug_init(void) +{ + bc_register_proc_entry(&bc_kmem_debug_entry); + return 0; +} + +late_initcall(bc_kmem_debug_init); +#endif + +#else +#define inc_slab_charged(ub, cache) (0) +#define dec_slab_charged(ub, cache) do { } while (0) +#define inc_pages_charged(ub, cache) (0) +#define dec_pages_charged(ub, cache) do { } while (0) +#endif + +static inline struct user_beancounter **slab_ub_ref(kmem_cache_t *cachep, + void *objp) +{ + struct slab *slabp; + int objnr; + + BUG_ON(!(cachep->flags & SLAB_UBC)); + slabp = virt_to_slab(objp); + objnr = (objp - slabp->s_mem) / cachep->buffer_size; + return slab_ubcs(cachep, slabp) + objnr; +} + +struct user_beancounter *slab_ub(void *objp) +{ + struct user_beancounter **ub_ref; + + ub_ref = slab_ub_ref(virt_to_cache(objp), objp); + return *ub_ref; +} + +EXPORT_SYMBOL(slab_ub); + +#define UB_KMEM_QUANT (PAGE_SIZE * 4) + +/* called with IRQ disabled */ +static int ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, + enum ub_severity strict) +{ + struct task_beancounter *tbc; + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub || size > UB_KMEM_QUANT) + goto just_charge; + if (tbc->kmem_precharged >= size) { + tbc->kmem_precharged -= size; + return 0; + } + + if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) { + tbc->kmem_precharged += UB_KMEM_QUANT - size; + return 0; + } + +just_charge: + return charge_beancounter(ub, UB_KMEMSIZE, size, strict); +} + +/* called with IRQ disabled */ +static void ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + struct task_beancounter *tbc; + + if (size > UB_MAXVALUE) { + printk("ub_kmemsize_uncharge: size %lu\n", size); + dump_stack(); + } + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub) + goto just_uncharge; + + tbc->kmem_precharged += size; + if (tbc->kmem_precharged < UB_KMEM_QUANT * 2) + return; + size = tbc->kmem_precharged - UB_KMEM_QUANT; + tbc->kmem_precharged -= size; + +just_uncharge: + uncharge_beancounter(ub, UB_KMEMSIZE, size); +} + +static inline int should_charge(kmem_cache_t *cachep, gfp_t flags) +{ + if (!(cachep->flags & SLAB_UBC)) + return 0; + if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) + return 0; + return 1; +} + +#define should_uncharge(cachep) should_charge(cachep, __GFP_UBC) + +/* called with IRQ disabled */ +int ub_slab_charge(kmem_cache_t *cachep, void *objp, gfp_t flags) +{ + unsigned int size; + struct user_beancounter *ub; + + if (!should_charge(cachep, flags)) + return 0; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + return 0; + + size = CHARGE_SIZE(cachep->objuse); + if (ub_kmemsize_charge(ub, size, + (flags & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) + goto out_err; + + if (inc_slab_charged(ub, cachep) < 0) { + ub_kmemsize_uncharge(ub, size); + goto out_err; + } + *slab_ub_ref(cachep, objp) = ub; + return 0; + +out_err: + put_beancounter(ub); + return -ENOMEM; +} + +/* called with IRQ disabled */ +void ub_slab_uncharge(kmem_cache_t *cachep, void *objp) +{ + unsigned int size; + struct user_beancounter **ub_ref; + + if (!should_uncharge(cachep)) + return; + + ub_ref = slab_ub_ref(cachep, objp); + if (*ub_ref == NULL) + return; + + dec_slab_charged(*ub_ref, cachep); + size = CHARGE_SIZE(cachep->objuse); + ub_kmemsize_uncharge(*ub_ref, size); + put_beancounter(*ub_ref); + *ub_ref = NULL; +} + +/* + * Pages accounting + */ + +int ub_page_charge(struct page *page, int order, gfp_t mask) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = NULL; + if (!(mask & __GFP_UBC)) + goto out; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + goto out; + + local_irq_save(flags); + if (ub_kmemsize_charge(ub, CHARGE_ORDER(order), + (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) + goto err; + + inc_pages_charged(ub, order); + local_irq_restore(flags); +out: + BUG_ON(page_ub(page) != NULL); + page_ub(page) = ub; + return 0; + +err: + local_irq_restore(flags); + BUG_ON(page_ub(page) != NULL); + put_beancounter(ub); + return -ENOMEM; +} + +void ub_page_uncharge(struct page *page, int order) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = page_ub(page); + if (ub == NULL) + return; + + BUG_ON(ub->ub_magic != UB_MAGIC); + dec_pages_charged(ub, order); + local_irq_save(flags); + ub_kmemsize_uncharge(ub, CHARGE_ORDER(order)); + local_irq_restore(flags); + put_beancounter(ub); + page_ub(page) = NULL; +} + +/* + * takes init_mm.page_table_lock + * some outer lock to protect pages from vmalloced area must be held + */ +struct user_beancounter *vmalloc_ub(void *obj) +{ + struct page *pg; + + pg = vmalloc_to_page(obj); + if (pg == NULL) + return NULL; + + return page_ub(pg); +} + +EXPORT_SYMBOL(vmalloc_ub); + +struct user_beancounter *mem_ub(void *obj) +{ + struct user_beancounter *ub; + + if ((unsigned long)obj >= VMALLOC_START && + (unsigned long)obj < VMALLOC_END) + ub = vmalloc_ub(obj); + else + ub = slab_ub(obj); + + return ub; +} + +EXPORT_SYMBOL(mem_ub); diff -uprN linux-2.6.18/kernel/ub/ub_misc.c linux-2.6.18.ovz/kernel/ub/ub_misc.c --- linux-2.6.18/kernel/ub/ub_misc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_misc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,454 @@ +/* + * kernel/ub/ub_misc.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_FILE_MINQUANT 3 +#define UB_FILE_MAXQUANT 10 +#define UB_FILE_INIQUANT 4 + +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize); + +static inline unsigned long ub_file_kmemsize(unsigned long nr) +{ + return CHARGE_SIZE(filp_cachep->objuse) * nr; +} + +/* + * Task staff + */ + +static void init_task_sub(struct task_struct *parent, + struct task_struct *tsk, + struct task_beancounter *old_bc) +{ + struct task_beancounter *new_bc; + struct user_beancounter *sub; + + new_bc = &tsk->task_bc; + sub = old_bc->fork_sub; + new_bc->fork_sub = get_beancounter(sub); + new_bc->task_fnode = NULL; + new_bc->task_freserv = old_bc->task_freserv; + old_bc->task_freserv = NULL; + memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); + new_bc->pgfault_handle = 0; + new_bc->pgfault_allot = 0; +} + +void ub_init_task_bc(struct task_beancounter *tbc) +{ + tbc->file_precharged = 0; + tbc->file_quant = UB_FILE_INIQUANT; + tbc->file_count = 0; + + tbc->kmem_precharged = 0; + tbc->dentry_alloc = 0; +} + +int ub_task_charge(struct task_struct *parent, struct task_struct *task) +{ + struct task_beancounter *old_bc; + struct task_beancounter *new_bc; + struct user_beancounter *ub, *pub; + unsigned long file_nr, kmemsize; + unsigned long flags; + + old_bc = &parent->task_bc; + ub = old_bc->fork_sub; + new_bc = &task->task_bc; + new_bc->task_ub = get_beancounter(ub); + new_bc->exec_ub = get_beancounter(ub); + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC, + 1, UB_HARD) < 0)) + goto out_numproc; + + ub_init_task_bc(new_bc); + file_nr = ub_file_precharge(new_bc, pub, &kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + charge_beancounter_notop(ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize); + } + + init_task_sub(parent, task, old_bc); + return 0; + +out_numproc: + spin_unlock_irqrestore(&pub->ub_lock, flags); + __put_beancounter_batch(ub, 2); + return -ENOMEM; +} + +extern atomic_t dbgpre; + +void ub_task_uncharge(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long file_nr, file_kmemsize; + unsigned long flags; + + task_bc = &task->task_bc; + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + __uncharge_beancounter_locked(pub, UB_NUMPROC, 1); + file_nr = task_bc->file_precharged; + if (likely(file_nr)) + __uncharge_beancounter_locked(pub, + UB_NUMFILE, file_nr); + + /* see comment in ub_file_charge */ + task_bc->file_precharged = 0; + file_kmemsize = ub_file_kmemsize(file_nr); + if (likely(file_kmemsize)) + __uncharge_beancounter_locked(pub, + UB_KMEMSIZE, file_kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + uncharge_beancounter_notop(task_bc->task_ub, + UB_NUMFILE, file_nr); + __put_beancounter_batch(task_bc->task_ub, file_nr); + } + if (likely(file_kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, + UB_KMEMSIZE, file_kmemsize); +} + +void ub_task_put(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long kmemsize, flags; + + task_bc = &task->task_bc; 
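/*
 * The kmem_precharged value drained just below is the per-task reserve
 * kept by ub_kmemsize_charge()/ub_kmemsize_uncharge() earlier in this
 * patch: tasks charge UB_KMEMSIZE in whole quanta and satisfy small
 * requests from the reserve without touching the shared counters.  A
 * minimal userspace model of that batching (error paths and locking
 * omitted; QUANT and the 2 * QUANT drain threshold follow the kernel
 * code; not part of the patch):
 */
#if 0
#include <stdio.h>

#define QUANT		(4096UL * 4)	/* UB_KMEM_QUANT with 4k pages */

static unsigned long held;		/* the shared, "expensive" counter */
static unsigned long precharged;	/* per-task reserve */

static void charge(unsigned long size)
{
	if (precharged >= size) {	/* fast path: no shared state touched */
		precharged -= size;
		return;
	}
	held += QUANT;			/* slow path: grab a whole quantum */
	precharged += QUANT - size;
}

static void uncharge(unsigned long size)
{
	precharged += size;
	if (precharged < QUANT * 2)	/* cache up to two quanta */
		return;
	held -= precharged - QUANT;	/* return the excess, keep one quantum */
	precharged = QUANT;
}

int main(void)
{
	charge(100);
	uncharge(100);
	printf("held=%lu precharged=%lu\n", held, precharged);
	return 0;
}
#endif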
+
+	pub = top_beancounter(task_bc->task_ub);
+	spin_lock_irqsave(&pub->ub_lock, flags);
+	kmemsize = task_bc->kmem_precharged;
+	task_bc->kmem_precharged = 0;
+	if (likely(kmemsize))
+		__uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize);
+	spin_unlock_irqrestore(&pub->ub_lock, flags);
+	if (likely(kmemsize))
+		uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize);
+
+	put_beancounter(task_bc->exec_ub);
+	put_beancounter(task_bc->task_ub);
+	put_beancounter(task_bc->fork_sub);
+	/* can't be freed elsewhere, failures are possible in the middle of fork */
+	if (task_bc->task_freserv != NULL)
+		kfree(task_bc->task_freserv);
+
+	task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
+	task_bc->task_ub = (struct user_beancounter *)0xdead100c;
+	BUG_ON(task_bc->kmem_precharged != 0);
+}
+
+/*
+ * Files and file locks.
+ */
+/*
+ * For NUMFILE we do not take the lock and call the charge function
+ * for every file.  We try to charge in batches, keeping a local reserve
+ * on the task.  For experimental purposes, the batch size is adaptive
+ * and depends on the numfile barrier, the number of processes, and the
+ * history of successes and failures of batch charges.
+ *
+ * Per-task fields have the following meaning:
+ * file_precharged	number of files charged to the beancounter in advance,
+ * file_quant		logarithm of the batch size,
+ * file_count		counter of charge successes, to reduce batch size
+ *			fluctuations.
+ */
+static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
+		struct user_beancounter *ub, unsigned long *kmemsize)
+{
+	unsigned long n, kmem;
+
+	n = 1UL << task_bc->file_quant;
+	if (ub->ub_parms[UB_NUMPROC].held >
+			(ub->ub_parms[UB_NUMFILE].barrier >>
+				task_bc->file_quant))
+		goto nopre;
+	if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD)))
+		goto nopre;
+	kmem = ub_file_kmemsize(n);
+	if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE,
+					kmem, UB_HARD)))
+		goto nopre_kmem;
+
+	task_bc->file_precharged += n;
+	get_beancounter_batch(task_bc->task_ub, n);
+	task_bc->file_count++;
+	if (task_bc->file_quant < UB_FILE_MAXQUANT &&
+			task_bc->file_count >= task_bc->file_quant) {
+		task_bc->file_quant++;
+		task_bc->file_count = 0;
+	}
+	*kmemsize = kmem;
+	return n;
+
+nopre_kmem:
+	__uncharge_beancounter_locked(ub, UB_NUMFILE, n);
+nopre:
+	if (task_bc->file_quant > UB_FILE_MINQUANT)
+		task_bc->file_quant--;
+	task_bc->file_count = 0;
+	return 0;
+}
+
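/*
 * A userspace model of the adaptive batching implemented by
 * ub_file_precharge() above: the batch (2^file_quant) grows after
 * "file_quant" consecutive successes and shrinks on every refusal.
 * Charging is reduced to a single held-vs-limit check; the MIN/MAX
 * quant constants follow the defines at the top of this file, the
 * limit value is illustrative.  Not part of the patch.
 */
#if 0
#include <stdio.h>

static unsigned long held, limit = 1000;	/* files */
static unsigned int quant = 4, count;		/* UB_FILE_INIQUANT */

static unsigned long precharge(void)
{
	unsigned long n = 1UL << quant;

	if (held + n > limit) {			/* batch refused */
		if (quant > 3)			/* UB_FILE_MINQUANT */
			quant--;
		count = 0;
		return 0;
	}
	held += n;				/* batch charged */
	if (quant < 10 && ++count >= quant) {	/* UB_FILE_MAXQUANT */
		quant++;
		count = 0;
	}
	return n;
}

int main(void)
{
	int i;

	for (i = 0; i < 32; i++)
		printf("precharged %3lu files, quant now %u\n",
				precharge(), quant);
	return 0;
}
#endif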
+int ub_file_charge(struct file *f)
+{
+	struct user_beancounter *ub, *pub;
+	struct task_beancounter *task_bc;
+	unsigned long file_nr, kmem;
+	unsigned long flags;
+	int err;
+
+	task_bc = &current->task_bc;
+	ub = get_exec_ub();
+	if (unlikely(ub != task_bc->task_ub))
+		goto just_charge;
+
+	if (likely(task_bc->file_precharged > 0)) {
+		/*
+		 * Files are put via RCU in 2.6.16, so an IRQ can happen
+		 * during this decrement and the resulting ub_file_uncharge()
+		 * call would mess up file_precharged.
+		 *
+		 * ub_task_uncharge() is called via RCU as well, so no
+		 * protection is needed there.
+		 *
+		 * Xemul
+		 */
+
+		local_irq_save(flags);
+		task_bc->file_precharged--;
+		local_irq_restore(flags);
+
+		f->f_ub = ub;
+		return 0;
+	}
+
+	pub = top_beancounter(ub);
+	spin_lock_irqsave(&pub->ub_lock, flags);
+	file_nr = ub_file_precharge(task_bc, pub, &kmem);
+	if (unlikely(!file_nr))
+		goto last_try;
+	spin_unlock(&pub->ub_lock);
+	task_bc->file_precharged--;
+	local_irq_restore(flags);
+
+	charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
+	charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+	f->f_ub = ub;
+	return 0;
+
+just_charge:
+	pub = top_beancounter(ub);
+	spin_lock_irqsave(&pub->ub_lock, flags);
+last_try:
+	kmem = ub_file_kmemsize(1);
+	err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD);
+	if (likely(!err)) {
+		err = __charge_beancounter_locked(pub, UB_KMEMSIZE,
+				kmem, UB_HARD);
+		if (unlikely(err))
+			__uncharge_beancounter_locked(pub, UB_NUMFILE, 1);
+	}
+	spin_unlock_irqrestore(&pub->ub_lock, flags);
+	if (likely(!err)) {
+		charge_beancounter_notop(ub, UB_NUMFILE, 1);
+		charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+		f->f_ub = get_beancounter(ub);
+	}
+	return err;
+}
+
+void ub_file_uncharge(struct file *f)
+{
+	struct user_beancounter *ub, *pub;
+	struct task_beancounter *task_bc;
+	unsigned long nr;
+
+	ub = f->f_ub;
+	task_bc = &current->task_bc;
+	if (likely(ub == task_bc->task_ub)) {
+		task_bc->file_precharged++;
+		pub = top_beancounter(ub);
+		if (ub_barrier_farnr(pub, UB_NUMFILE) &&
+				ub_barrier_farsz(pub, UB_KMEMSIZE))
+			return;
+		if (task_bc->file_precharged < (1UL << task_bc->file_quant))
+			return;
+		nr = task_bc->file_precharged
+				- (1UL << (task_bc->file_quant - 1));
+		task_bc->file_precharged -= nr;
+		__put_beancounter_batch(ub, nr);
+		uncharge_beancounter(ub, UB_NUMFILE, nr);
+		uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr));
+	} else {
+		uncharge_beancounter(ub, UB_NUMFILE, 1);
+		uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1));
+		put_beancounter(ub);
+	}
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	/* No need for get_beancounter() here: the reference was already
+	 * taken when the object was charged in slab */
+	ub = slab_ub(fl);
+	if (ub == NULL)
+		return 0;
+
+	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
+	if (!err)
+		fl->fl_charged = 1;
+	return err;
+}
+
+void ub_flock_uncharge(struct file_lock *fl)
+{
+	struct user_beancounter *ub;
+
+	/* The ub reference will be put by the slab free path */
+	ub = slab_ub(fl);
+	if (ub == NULL || !fl->fl_charged)
+		return;
+
+	uncharge_beancounter(ub, UB_NUMFLOCK, 1);
+	fl->fl_charged = 0;
+}
+
+/*
+ * Signal handling
+ */
+
+static int do_ub_siginfo_charge(struct user_beancounter *ub,
+		unsigned long size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD))
+		goto out_kmem;
+
+	if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD))
+		goto out_num;
+
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	return 0;
+
+out_num:
+	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
+out_kmem:
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+	return -ENOMEM;
+}
+
+static void do_ub_siginfo_uncharge(struct user_beancounter *ub,
+		unsigned long size)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
+	__uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub)
+{
+	unsigned long size;
+	struct user_beancounter *p, *q;
+
+	size = CHARGE_SIZE(kmem_obj_memusage(sq));
+	for (p = ub; p != NULL; p = p->parent) {
+		if (do_ub_siginfo_charge(p, size))
+			goto unroll;
+	}
+
+	sq->sig_ub = get_beancounter(ub);
+	return 0;
+
+unroll:
+	for (q = ub; q != p; q = q->parent)
+		do_ub_siginfo_uncharge(q, size);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(ub_siginfo_charge);
+
+void ub_siginfo_uncharge(struct sigqueue *sq)
+{
+	unsigned long size;
+	struct user_beancounter *ub, *p;
+
+	p = ub = sq->sig_ub;
+	sq->sig_ub = NULL;
+	size =
CHARGE_SIZE(kmem_obj_memusage(sq)); + for (; ub != NULL; ub = ub->parent) + do_ub_siginfo_uncharge(ub, size); + put_beancounter(p); +} + +/* + * PTYs + */ + +int ub_pty_charge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + int retval; + + ub = slab_ub(tty); + retval = 0; + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + !test_bit(TTY_CHARGED, &tty->flags)) { + retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); + if (!retval) + set_bit(TTY_CHARGED, &tty->flags); + } + return retval; +} + +void ub_pty_uncharge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + + ub = slab_ub(tty); + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + test_bit(TTY_CHARGED, &tty->flags)) { + uncharge_beancounter(ub, UB_NUMPTY, 1); + clear_bit(TTY_CHARGED, &tty->flags); + } +} diff -uprN linux-2.6.18/kernel/ub/ub_net.c linux-2.6.18.ovz/kernel/ub/ub_net.c --- linux-2.6.18/kernel/ub/ub_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_net.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,1145 @@ +/* + * linux/kernel/ub/ub_net.c + * + * Copyright (C) 1998-2004 Andrey V. Savochkin + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - sizeof(struct inode) charge + * = tcp_mem_schedule() feedback based on ub limits + * + measures so that one socket won't exhaust all send buffers, + * see bug in bugzilla + * = sk->socket check for NULL in snd_wakeups + * (tcp_write_space checks for NULL itself) + * + in tcp_close(), orphaned socket abortion should be based on ubc + * resources (same in tcp_out_of_resources) + * Beancounter should also have separate orphaned socket counter... + * + for rcv, in-order segment should be accepted + * if only barrier is exceeded + * = tcp_rmem_schedule() feedback based on ub limits + * - repair forward_alloc mechanism for receive buffers + * It's idea is that some buffer space is pre-charged so that receive fast + * path doesn't need to take spinlocks and do other heavy stuff + * + tcp_prune_queue actions based on ub limits + * + window adjustments depending on available buffers for receive + * - window adjustments depending on available buffers for send + * + race around usewreserv + * + avoid allocating new page for each tiny-gram, see letter from ANK + * + rename ub_sock_lock + * + sk->sleep wait queue probably can be used for all wakeups, and + * sk->ub_wait is unnecessary + * + for UNIX sockets, the current algorithm will lead to + * UB_UNIX_MINBUF-sized messages only for non-blocking case + * - charge for af_packet sockets + * + all datagram sockets should be charged to NUMUNIXSOCK + * - we do not charge for skb copies and clones staying in device queues + * + live-lock if number of sockets is big and buffer limits are small + * [diff-ubc-dbllim3] + * - check that multiple readers/writers on the same socket won't cause fatal + * consequences + * - check allocation/charge orders + * + There is potential problem with callback_lock. In *snd_wakeup we take + * beancounter first, in sock_def_error_report - callback_lock first. + * then beancounter. This is not a problem if callback_lock taken + * readonly, but anyway... 
+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator + * General kernel problems: + * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC + * notification won't get signals + * - datagram_poll looks racy + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* by some reason it is not used currently */ +#define UB_SOCK_MAINTAIN_WMEMPRESSURE 0 + + +/* Skb truesize definition. Bad place. Den */ + +static inline int skb_chargesize_head(struct sk_buff *skb) +{ + return skb_charge_size(skb->end - skb->head + + sizeof(struct skb_shared_info)); +} + +int skb_charge_fullsize(struct sk_buff *skb) +{ + int chargesize; + struct sk_buff *skbfrag; + + chargesize = skb_chargesize_head(skb) + + PAGE_SIZE * skb_shinfo(skb)->nr_frags; + if (likely(skb_shinfo(skb)->frag_list == NULL)) + return chargesize; + for (skbfrag = skb_shinfo(skb)->frag_list; + skbfrag != NULL; + skbfrag = skbfrag->next) { + chargesize += skb_charge_fullsize(skbfrag); + } + return chargesize; +} +EXPORT_SYMBOL(skb_charge_fullsize); + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size); + +int __ub_too_many_orphans(struct sock *sk, int count) +{ + struct user_beancounter *ub; + + if (sock_has_ubc(sk)) { + ub = top_beancounter(sock_bc(sk)->ub); + if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) + return 1; + } + return 0; +} + +/* + * Queueing + */ + +static void ub_sock_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_other_sk_list)) { + p = ub->ub_other_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * See comments in ub_tcp_snd_wakeup. + * Locking note: both unix_write_space and + * sock_def_write_space take callback_lock themselves. + * We take it here just to be on the safe side and to + * act the same way as ub_tcp_snd_wakeup does. 
+ */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, + UB_OTHERSOCKBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +static void ub_tcp_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_tcp_sk_list)) { + p = ub->ub_tcp_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * Send async notifications and wake up. + * Locking note: we get callback_lock here because + * tcp_write_space is over-optimistic about calling context + * (socket lock is presumed). So we get the lock here although + * it belongs to the callback. + */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) +{ + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long added_reserv; + + if (!sock_has_ubc(sk)) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); + added_reserv = -skbc->poll_reserv; + if (!ub_sock_makewreserv_locked(sk, res, size)) { + /* + * It looks a bit hackish, but it is compatible with both + * wait_for_xx_ubspace and poll. + * This __set_current_state is equivalent to a wakeup event + * right after spin_unlock_irqrestore. 
+ */ + __set_current_state(TASK_RUNNING); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (added_reserv) + charge_beancounter_notop(skbc->ub, res, added_reserv); + return; + } + + ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); + skbc->ub_waitspc = size; + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "re-adding socket to beancounter %p.\n", ub); + goto out; + } + + switch (res) { + case UB_TCPSNDBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_tcp_sk_list); + break; + case UB_OTHERSOCKBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_other_sk_list); + break; + default: + BUG(); + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +EXPORT_SYMBOL(ub_sock_snd_queue_add); + +long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + set_current_state(TASK_INTERRUPTIBLE); + if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) + break; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); + timeo = schedule_timeout(timeo); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return timeo; +} + +void ub_sock_sndqueuedel(struct sock *sk) +{ + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long flags; + + if (!sock_has_ubc(sk)) + return; + skbc = sock_bc(sk); + + /* race with write_space callback of other socket */ + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + list_del_init(&skbc->ub_sock_list); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Helpers + */ + +static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + skb_bc(skb)->ub = sock_bc(sk)->ub; + skb_bc(skb)->charged = size; + skb_bc(skb)->resource = resource; +} + +void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + if (!sock_has_ubc(sk)) + return; + + if (sock_bc(sk)->ub == NULL) + BUG(); + + __ub_skb_set_charge(skb, sk, size, resource); + + /* Ugly. Ugly. Skb in sk writequeue can live without ref to sk */ + if (skb->sk == NULL) + skb->sk = sk; +} + +EXPORT_SYMBOL(ub_skb_set_charge); + +static inline void ub_skb_set_uncharge(struct sk_buff *skb) +{ + skb_bc(skb)->ub = NULL; + skb_bc(skb)->charged = 0; + skb_bc(skb)->resource = 0; +} + +static void ub_update_rmem_thres(struct sock_beancounter *skub) +{ + struct user_beancounter *ub; + + if (skub && skub->ub) { + ub = top_beancounter(skub->ub); + ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / + (ub->ub_parms[UB_NUMTCPSOCK].held + 1); + } +} + +static inline void ub_sock_wcharge_dec(struct sock *sk, + unsigned long chargesize) +{ + /* The check sk->sk_family != PF_NETLINK is made as the skb is + * queued to the kernel end of socket while changed to the user one. 
+ * Den */ + if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) { + if (sock_bc(sk)->ub_wcharged > chargesize) + sock_bc(sk)->ub_wcharged -= chargesize; + else + sock_bc(sk)->ub_wcharged = 0; + } +} + +/* + * Charge socket number + */ + +static inline void sk_alloc_beancounter(struct sock *sk) +{ + struct sock_beancounter *skbc; + + skbc = sock_bc(sk); + memset(skbc, 0, sizeof(struct sock_beancounter)); +} + +static inline void sk_free_beancounter(struct sock *sk) +{ +} + +static int __sock_charge(struct sock *sk, int res) +{ + struct sock_beancounter *skbc; + struct user_beancounter *cub, *ub; + unsigned long added_reserv, added_forw; + unsigned long flags; + + cub = get_exec_ub(); + if (unlikely(cub == NULL)) + return 0; + + sk_alloc_beancounter(sk); + skbc = sock_bc(sk); + INIT_LIST_HEAD(&skbc->ub_sock_list); + + ub = top_beancounter(cub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0)) + goto out_limit; + + added_reserv = 0; + added_forw = 0; + if (res == UB_NUMTCPSOCK) { + added_reserv = skb_charge_size(MAX_TCP_HEADER + + 1500 - sizeof(struct iphdr) - + sizeof(struct tcphdr)); + added_reserv *= 4; + ub->ub_parms[UB_TCPSNDBUF].held += added_reserv; + if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) { + ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv; + added_reserv = 0; + } + skbc->poll_reserv = added_reserv; + + added_forw = SK_STREAM_MEM_QUANTUM * 4; + ub->ub_parms[UB_TCPRCVBUF].held += added_forw; + if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + ub->ub_parms[UB_TCPRCVBUF].held -= added_forw; + added_forw = 0; + } + skbc->forw_space = added_forw; + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + charge_beancounter_notop(cub, res, 1); + if (added_reserv) + charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv); + if (added_forw) + charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw); + + skbc->ub = get_beancounter(cub); + return 0; + +out_limit: + spin_unlock_irqrestore(&ub->ub_lock, flags); + sk_free_beancounter(sk); + return -ENOMEM; +} + +int ub_tcp_sock_charge(struct sock *sk) +{ + int ret; + + ret = __sock_charge(sk, UB_NUMTCPSOCK); + ub_update_rmem_thres(sock_bc(sk)); + + return ret; +} + +int ub_other_sock_charge(struct sock *sk) +{ + return __sock_charge(sk, UB_NUMOTHERSOCK); +} + +EXPORT_SYMBOL(ub_other_sock_charge); + +int ub_sock_charge(struct sock *sk, int family, int type) +{ + return (IS_TCP_SOCK(family, type) ? + ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); +} +EXPORT_SYMBOL(ub_sock_charge); + +/* + * Uncharge socket number + */ + +void ub_sock_uncharge(struct sock *sk) +{ + int is_tcp_sock; + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long reserv, forw; + + if (unlikely(!sock_has_ubc(sk))) + return; + + is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); + skbc = sock_bc(sk); + ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); + + ub = top_beancounter(skbc->ub); + + spin_lock_irqsave(&ub->ub_lock, flags); + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "ub_sock_uncharge: removing from ub(%p) queue.\n", + skbc); + list_del_init(&skbc->ub_sock_list); + } + + reserv = skbc->poll_reserv; + forw = skbc->forw_space; + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? 
UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + ub_sock_wcharge_dec(sk, reserv); + if (unlikely(skbc->ub_wcharged)) + printk(KERN_WARNING + "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", + skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); + skbc->poll_reserv = 0; + skbc->forw_space = 0; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + put_beancounter(skbc->ub); + sk_free_beancounter(sk); +} + +/* + * Special case for netlink_dump - (un)charges precalculated size + */ + +int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) +{ + int ret; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + ret = charge_beancounter(sock_bc(sk)->ub, + UB_DGRAMRCVBUF, chargesize, UB_HARD); + if (ret < 0) + return ret; + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return ret; +} + +/* + * Poll reserve accounting + * + * This is the core of socket buffer management (along with queueing/wakeup + * functions. The rest of buffer accounting either call these functions, or + * repeat parts of their logic for some simpler cases. + */ + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size) +{ + unsigned long wcharge_added; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + skbc = sock_bc(sk); + if (skbc->poll_reserv >= size) /* no work to be done */ + goto out; + + ub = top_beancounter(skbc->ub); + ub->ub_parms[bufid].held += size - skbc->poll_reserv; + + wcharge_added = 0; + /* + * Logic: + * 1) when used memory hits barrier, we set wmem_pressure; + * wmem_pressure is reset under barrier/2; + * between barrier/2 and barrier we limit per-socket buffer growth; + * 2) each socket is guaranteed to get (limit-barrier)/maxsockets + * calculated on the base of memory eaten after the barrier is hit + */ + skbc = sock_bc(sk); +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub_hfbarrier_hit(ub, bufid)) { + if (ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 0; + } +#endif + if (ub_barrier_hit(ub, bufid)) { +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 1; +#endif + if (sk->sk_family == PF_NETLINK) + goto unroll; + wcharge_added = size - skbc->poll_reserv; + skbc->ub_wcharged += wcharge_added; + if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit + + ub->ub_parms[bufid].barrier > + ub->ub_parms[bufid].limit) + goto unroll_wch; + } + if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) + goto unroll; + + ub_adjust_maxheld(ub, bufid); + skbc->poll_reserv = size; +out: + return 0; + +unroll_wch: + skbc->ub_wcharged -= wcharge_added; +unroll: + ub_debug(UBD_NET_SEND, + "makewres: deny " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_parms[bufid].failcnt++; + 
ub->ub_parms[bufid].held -= size - skbc->poll_reserv; + return -ENOMEM; +} + +int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + skbc = sock_bc(sk); + + /* + * This function provides that there is sufficient reserve upon return + * only if sk has only one user. We can check poll_reserv without + * serialization and avoid locking if the reserve already exists. + */ + if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size)) + return 0; + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, bufid, size); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, bufid, added_reserv); + + return err; +} + +EXPORT_SYMBOL(ub_sock_make_wreserv); + +int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* optimize for the case if socket has sufficient reserve */ + ub_sock_make_wreserv(sk, bufid, size); + skbc = sock_bc(sk); + if (likely(skbc->poll_reserv >= size)) { + skbc->poll_reserv -= size; + return 0; + } + return -ENOMEM; +} + +EXPORT_SYMBOL(ub_sock_get_wreserv); + +static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long extra; + unsigned long flags; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + + extra = 0; + spin_lock_irqsave(&ub->ub_lock, flags); + skbc->poll_reserv += size; + if (skbc->poll_reserv > ressize) { + extra = skbc->poll_reserv - ressize; + ub_sock_wcharge_dec(sk, extra); + skbc->poll_reserv = ressize; + + __uncharge_beancounter_locked(ub, bufid, extra); + if (bufid == UB_TCPSNDBUF) + ub_tcp_snd_wakeup(ub); + else + ub_sock_snd_wakeup(ub); + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (extra) + uncharge_beancounter_notop(skbc->ub, bufid, extra); +} + +void ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + if (unlikely(!sock_has_ubc(sk))) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + /* check if the reserve can be kept */ + if (ub_barrier_farsz(ub, bufid)) { + skbc->poll_reserv += size; + return; + } + ub_sock_do_ret_wreserv(sk, bufid, size, ressize); +} + +/* + * UB_DGRAMRCVBUF + */ + +int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, + chargesize, UB_HARD)) + return -ENOMEM; + + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return 0; +} + +EXPORT_SYMBOL(ub_sockrcvbuf_charge); + +static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) +{ + uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + +/* + * UB_TCPRCVBUF + */ + +int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int retval; + unsigned long flags; + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long chargesize; + + if 
(unlikely(!sock_has_ubc(sk))) + return 0; + skbc = sock_bc(sk); + + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->forw_space >= chargesize)) { + skbc->forw_space -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + return 0; + } + + /* + * Memory pressure reactions: + * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) + * 2) set UB_RMEM_SHRINK and tcp_clamp_window() + * tcp_collapse_queues() if rmem_alloc > rcvbuf + * 3) drop OFO, tcp_purge_ofo() + * 4) drop all. + * Currently, we do #2 and #3 at once (which means that current + * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, + * for example...) + * On memory pressure we jump from #0 to #3, and when the pressure + * subsides, to #1. + */ + retval = 0; + ub = top_beancounter(sock_bc(sk)->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[UB_TCPRCVBUF].held += chargesize; + if (ub->ub_parms[UB_TCPRCVBUF].held > + ub->ub_parms[UB_TCPRCVBUF].barrier && + strict != UB_FORCE) + goto excess; + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + +out: + if (retval == 0) { + charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, + chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + } + return retval; + +excess: + ub->ub_rmem_pressure = UB_RMEM_SHRINK; + if (strict == UB_HARD) + retval = -ENOMEM; + if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) + retval = -ENOMEM; + /* + * We try to leave numsock*maxadvmss as a reserve for sockets not + * queueing any data yet (if the difference between the barrier and the + * limit is enough for this reserve). + */ + if (ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss + > ub->ub_parms[UB_TCPRCVBUF].limit && + atomic_read(&sk->sk_rmem_alloc)) + retval = -ENOMEM; + if (retval) { + ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; + ub->ub_parms[UB_TCPRCVBUF].failcnt++; + } + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + goto out; +} +EXPORT_SYMBOL(ub_sock_tcp_chargerecv); + +static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + unsigned long held, bar; + int prev_pres; + struct user_beancounter *ub; + + ub = top_beancounter(skb_bc(skb)->ub); + if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged; + ub_skb_set_uncharge(skb); + return; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { + printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", + skb_bc(skb)->charged, + ub, ub->ub_parms[UB_TCPRCVBUF].held); + /* ass-saving bung */ + skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; + } + ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; + held = ub->ub_parms[UB_TCPRCVBUF].held; + bar = ub->ub_parms[UB_TCPRCVBUF].barrier; + prev_pres = ub->ub_rmem_pressure; + if (held <= bar - (bar >> 2)) + ub->ub_rmem_pressure = UB_RMEM_EXPAND; + else if (held <= bar) + ub->ub_rmem_pressure = UB_RMEM_KEEP; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + + +/* + * UB_OTHERSOCKBUF and UB_TCPSNDBUF + */ + +static void ub_socksndbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + struct user_beancounter *ub, *cub; + unsigned long chargesize; + + cub = skb_bc(skb)->ub; + ub = top_beancounter(cub); + chargesize = skb_bc(skb)->charged; + + 
spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize); + if (skb->sk != NULL && sock_has_ubc(skb->sk)) + ub_sock_wcharge_dec(skb->sk, chargesize); + ub_sock_snd_wakeup(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize); + ub_skb_set_uncharge(skb); +} + +/* expected to be called under socket lock */ +static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) +{ + /* + * ub_sock_ret_wreserv call is abused here, we just want to uncharge + * skb size. However, to reduce duplication of the code doing + * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call + * a function that already does all of this. 2006/04/27 SAW + */ + ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged, + sock_bc(skb->sk)->poll_reserv); + ub_skb_set_uncharge(skb); +} + +void ub_skb_uncharge(struct sk_buff *skb) +{ + switch (skb_bc(skb)->resource) { + case UB_TCPSNDBUF: + ub_tcpsndbuf_uncharge(skb); + break; + case UB_TCPRCVBUF: + ub_tcprcvbuf_uncharge(skb); + break; + case UB_DGRAMRCVBUF: + ub_sockrcvbuf_uncharge(skb); + break; + case UB_OTHERSOCKBUF: + ub_socksndbuf_uncharge(skb); + break; + } +} + +EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ + +/* + * Other sock reserve managment + */ + +int ub_sock_getwres_other(struct sock *sk, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* + * Nothing except beancounter lock protects skbc->poll_reserv. + * So, take the lock and do the job. + * Dances with added_reserv repeat ub_sock_make_wreserv. + */ + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size); + added_reserv += skbc->poll_reserv; + if (!err) + skbc->poll_reserv -= size; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv); + + return err; +} +EXPORT_SYMBOL(ub_sock_getwres_other); + +void ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + + ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); +} + +/* + * TCP send buffers accouting. Paged part + */ + +int ub_sock_tcp_chargepage(struct sock *sk) +{ + struct sock_beancounter *skbc; + unsigned long extra; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE); + if (likely(skbc->poll_reserv >= PAGE_SIZE)) { + skbc->poll_reserv -= PAGE_SIZE; + return 0; + } + + /* + * Ok, full page is not available. + * However, this function must succeed if poll previously indicated + * that write is possible. We better make a forced charge here + * than reserve a whole page in poll. 
+ */ + err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE); + if (unlikely(err < 0)) + goto out; + if (skbc->poll_reserv < PAGE_SIZE) { + extra = PAGE_SIZE - skbc->poll_reserv; + err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra, + UB_FORCE); + if (err < 0) + goto out; + skbc->poll_reserv += extra; + } + skbc->poll_reserv -= PAGE_SIZE; + return 0; + +out: + return err; +} + +void ub_sock_tcp_detachpage(struct sock *sk) +{ + struct sk_buff *skb; + + if (unlikely(!sock_has_ubc(sk))) + return; + + /* The page is just detached from socket. The last skb in queue + with paged part holds referrence to it */ + skb = skb_peek_tail(&sk->sk_write_queue); + if (skb == NULL) { + /* If the queue is empty - all data is sent and page is about + to be freed */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE, + sock_bc(sk)->poll_reserv); + } else { + /* Last skb is a good aproximation for a last skb with + paged part */ + skb_bc(skb)->charged += PAGE_SIZE; + } +} + +/* + * TCPSNDBUF charge functions below are called in the following cases: + * - sending of SYN, SYN-ACK, FIN, the latter charge is forced by + * some technical reasons in TCP code; + * - fragmentation of TCP packets. + * These functions are allowed but not required to use poll_reserv. + * Originally, these functions didn't do that, since it didn't make + * any sense. Now, since poll_reserv now has a function of general reserve, + * they use it. + */ +int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int ret; + unsigned long chargesize; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->poll_reserv >= chargesize)) { + skbc->poll_reserv -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + /* XXX hack, see ub_skb_set_charge */ + skb->sk = sk; + return 0; + } + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF, + chargesize, strict); + /* + * Note: this check is not equivalent of the corresponding check + * in makewreserv. It's similar in spirit, but an equivalent check + * would be too long and complicated here. 
+ */ + if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF)) + skbc->ub_wcharged += chargesize; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (likely(!ret)) { + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + } + return ret; +} +EXPORT_SYMBOL(ub_sock_tcp_chargesend); + +void ub_sock_tcp_unchargesend(struct sock *sk, unsigned long size) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + /* see ub_tcpsndbuf_uncharge */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, sock_bc(sk)->poll_reserv); +} + +/* + * Initialization + */ + +int __init skbc_cache_init(void) +{ + return 0; +} diff -uprN linux-2.6.18/kernel/ub/ub_oom.c linux-2.6.18.ovz/kernel/ub/ub_oom.c --- linux-2.6.18/kernel/ub/ub_oom.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_oom.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_OOM_TIMEOUT (5 * HZ) + +int oom_generation; +int oom_kill_counter; +static DEFINE_SPINLOCK(oom_lock); +static DECLARE_WAIT_QUEUE_HEAD(oom_wq); + +static inline int ub_oom_completed(struct task_struct *tsk) +{ + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + /* we were oom killed - just die */ + return 1; + if (tsk->task_bc.oom_generation != oom_generation) + /* some task was succesfully killed */ + return 1; + return 0; +} + +static void ub_clear_oom(void) +{ + struct user_beancounter *ub; + + rcu_read_lock(); + for_each_beancounter(ub) + ub->ub_oom_noproc = 0; + rcu_read_unlock(); +} + +/* Called with cpuset_lock held */ +int ub_oom_lock(void) +{ + int timeout; + DEFINE_WAIT(oom_w); + struct task_struct *tsk; + + tsk = current; + + spin_lock(&oom_lock); + if (!oom_kill_counter) + goto out_do_oom; + + timeout = UB_OOM_TIMEOUT; + while (1) { + if (ub_oom_completed(tsk)) { + spin_unlock(&oom_lock); + return -EINVAL; + } + + if (timeout == 0) + break; + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&oom_wq, &oom_w); + spin_unlock(&oom_lock); + cpuset_unlock(); + + timeout = schedule_timeout(timeout); + + cpuset_lock(); + spin_lock(&oom_lock); + remove_wait_queue(&oom_wq, &oom_w); + } + +out_do_oom: + ub_clear_oom(); + return 0; +} + +static inline long ub_current_overdraft(struct user_beancounter *ub) +{ + return ub->ub_parms[UB_OOMGUARPAGES].held + + ((ub->ub_parms[UB_KMEMSIZE].held + + ub->ub_parms[UB_TCPSNDBUF].held + + ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_OTHERSOCKBUF].held + + ub->ub_parms[UB_DGRAMRCVBUF].held) + >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier; +} + +int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk) +{ + struct user_beancounter *mm_ub; + + if (ub == NULL) + return 0; + + task_lock(tsk); + if (tsk->mm == NULL) + mm_ub = NULL; + else + mm_ub = tsk->mm->mm_ub; + + while (mm_ub != NULL && mm_ub != ub) + mm_ub = mm_ub->parent; + task_unlock(tsk); + + return mm_ub != ub; +} + +struct user_beancounter *ub_oom_select_worst(void) +{ + struct user_beancounter *ub, *walkp; + long ub_maxover; + + ub_maxover = 0; + ub = NULL; + + rcu_read_lock(); + for_each_beancounter (walkp) { + long ub_overdraft; + + if (walkp->parent != NULL) + continue; + if (walkp->ub_oom_noproc) + continue; + + ub_overdraft = ub_current_overdraft(walkp); + if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) { + put_beancounter(ub); + ub = walkp; + ub_maxover = ub_overdraft; + } + } + + if (ub) + ub->ub_oom_noproc = 1; + rcu_read_unlock(); + + return ub; +} + 
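/*
 * ub_oom_select_worst() above picks the top-level beancounter with the
 * largest "overdraft": pages held beyond the OOMGUARPAGES guarantee,
 * with the byte-based buffer resources converted to pages.  A
 * self-contained userspace model of ub_current_overdraft() and the
 * selection loop (beancounter names, values and the 4k page size are
 * illustrative; RCU and reference counting are omitted; not part of the
 * patch):
 */
#if 0
#include <stdio.h>

#define DEMO_PAGE_SHIFT	12

struct demo_bc {
	const char *name;
	long guar_held;		/* UB_OOMGUARPAGES held, pages */
	long guar_barrier;	/* UB_OOMGUARPAGES barrier: the guarantee */
	long buf_bytes;		/* kmemsize + socket buffers, bytes */
};

static long overdraft(const struct demo_bc *bc)
{
	/* same formula as ub_current_overdraft() */
	return bc->guar_held + (bc->buf_bytes >> DEMO_PAGE_SHIFT)
			- bc->guar_barrier;
}

int main(void)
{
	struct demo_bc bcs[] = {
		{ "ve101", 5000, 4096, 1 << 20 },	/* over by 1160 pages */
		{ "ve102", 3000, 4096, 8 << 20 },	/* over by 952 pages */
	};
	const struct demo_bc *worst = NULL;
	long over, max = 0;
	unsigned int i;

	for (i = 0; i < sizeof(bcs) / sizeof(bcs[0]); i++) {
		over = overdraft(&bcs[i]);
		if (over > max) {	/* mirrors ub_oom_select_worst() */
			max = over;
			worst = &bcs[i];
		}
	}
	printf("worst: %s, overdraft %ld pages\n",
			worst ? worst->name : "none", max);
	return 0;
}
#endif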
+void ub_oom_mm_killed(struct user_beancounter *ub) +{ + static struct ub_rate_info ri = { 5, 60*HZ }; + + /* increment is serialized with oom_lock */ + ub->ub_parms[UB_OOMGUARPAGES].failcnt++; + + if (ub_ratelimit(&ri)) + show_mem(); +} + +void ub_oom_unlock(void) +{ + spin_unlock(&oom_lock); +} + +void ub_oom_task_dead(struct task_struct *tsk) +{ + spin_lock(&oom_lock); + oom_kill_counter = 0; + oom_generation++; + + printk("OOM killed process %s (pid=%d, ve=%d) exited, " + "free=%u gen=%d.\n", + tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env), + nr_free_pages(), oom_generation); + /* if there is time to sleep in ub_oom_lock -> sleep will continue */ + wake_up_all(&oom_wq); + spin_unlock(&oom_lock); +} + +void ub_out_of_memory(struct user_beancounter *scope) +{ + struct user_beancounter *ub; + struct task_struct *p; + + cpuset_lock(); + spin_lock(&oom_lock); + ub_clear_oom(); + ub = get_beancounter(scope); + + read_lock(&tasklist_lock); +retry: + p = oom_select_bad_process(ub); + if (p == NULL || PTR_ERR(p) == -1UL) + goto unlock; + + if (oom_kill_process(p, "UB Out of memory")) + goto retry; + + put_beancounter(ub); + +unlock: + read_unlock(&tasklist_lock); + spin_unlock(&oom_lock); + cpuset_unlock(); +} +EXPORT_SYMBOL(ub_out_of_memory); diff -uprN linux-2.6.18/kernel/ub/ub_page_bc.c linux-2.6.18.ovz/kernel/ub/ub_page_bc.c --- linux-2.6.18/kernel/ub/ub_page_bc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_page_bc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,437 @@ +/* + * kernel/ub/ub_page_bc.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static kmem_cache_t *pb_cachep; +spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; +static struct page_beancounter **pb_hash_table; +static unsigned int pb_hash_mask; + +/* + * Auxiliary staff + */ + +static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.next, struct page_beancounter, + page_list); +} + +static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.prev, struct page_beancounter, + page_list); +} + +/* + * Held pages manipulation + */ +static inline void set_held_pages(struct user_beancounter *bc) +{ + /* all three depend on ub_held_pages */ + __ub_update_physpages(bc); + __ub_update_oomguarpages(bc); + __ub_update_privvm(bc); +} + +static inline void do_dec_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages -= value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void dec_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_dec_held_pages(ub, value); +} + +static inline void do_inc_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages += value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void inc_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_inc_held_pages(ub, value); +} + +/* + * Alloc - free + */ + +inline int pb_alloc(struct page_beancounter **pbc) +{ + *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL); + if (*pbc != NULL) { + (*pbc)->next_hash = NULL; + (*pbc)->pb_magic 
= PB_MAGIC; + } + return (*pbc == NULL); +} + +inline void pb_free(struct page_beancounter **pb) +{ + if (*pb != NULL) { + kmem_cache_free(pb_cachep, *pb); + *pb = NULL; + } +} + +void pb_free_list(struct page_beancounter **p_pb) +{ + struct page_beancounter *list, *pb; + + list = *p_pb; + if (list == PBC_COPY_SAME) + return; + + while (list) { + pb = list; + list = list->next_hash; + pb_free(&pb); + } + *p_pb = NULL; +} + +/* + * head -> -> -> ... + */ +static int __alloc_list(struct page_beancounter **head, int num) +{ + struct page_beancounter *pb; + + while (num > 0) { + if (pb_alloc(&pb)) + return -1; + pb->next_hash = *head; + *head = pb; + num--; + } + + return num; +} + +/* + * Ensure that the list contains at least num elements. + * p_pb points to an initialized list, may be of the zero length. + * + * mm->page_table_lock should be held + */ +int pb_alloc_list(struct page_beancounter **p_pb, int num) +{ + struct page_beancounter *list; + + for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); + if (!num) + return 0; + + /* + * *p_pb(after) *p_pb (before) + * \ \ + * -...-> -> ... + */ + if (__alloc_list(p_pb, num) < 0) + goto nomem; + return 0; + +nomem: + pb_free_list(p_pb); + return -ENOMEM; +} + +/* + * Allocates a page_beancounter for each + * user_beancounter in a hash + */ +int pb_alloc_all(struct page_beancounter **pbs) +{ + int need_alloc; + struct user_beancounter *ub; + + need_alloc = 0; + rcu_read_lock(); + for_each_beancounter(ub) + need_alloc++; + rcu_read_unlock(); + + if (!__alloc_list(pbs, need_alloc)) + return 0; + + pb_free_list(pbs); + return -ENOMEM; +} + +/* + * Hash routines + */ + +static inline int pb_hash(struct user_beancounter *ub, struct page *page) +{ + return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; +} + +/* pb_lock should be held */ +static inline void insert_pb(struct page_beancounter *p, struct page *page, + struct user_beancounter *ub, int hash) +{ + p->page = page; + p->ub = get_beancounter(ub); + p->next_hash = pb_hash_table[hash]; + pb_hash_table[hash] = p; + inc_pbc_count(ub); +} + +/* + * Heart + */ + +static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, + int hash) +{ + struct page_beancounter *p; + + for (p = pb_hash_table[hash]; + p != NULL && (p->page != page || p->ub != bc); + p = p->next_hash); + if (p == NULL) + return -1; + + PB_COUNT_INC(p->refcount); + return 0; +} + +static void __pb_add_ref(struct page *page, struct user_beancounter *bc, + struct page_beancounter **ppb, int hash) +{ + struct page_beancounter *head, *p, **hp; + int shift; + + p = *ppb; + *ppb = p->next_hash; + + insert_pb(p, page, bc, hash); + hp = page_pblist(page); + head = *hp; + + if (head != NULL) { + /* + * Move the first element to the end of the list. + * List head (pb_head) is set to the next entry. + * Note that this code works even if head is the only element + * on the list (because it's cyclic). + */ + BUG_ON(head->pb_magic != PB_MAGIC); + *hp = next_page_pb(head); + PB_SHIFT_INC(head->refcount); + shift = PB_SHIFT_GET(head->refcount); + /* + * Update user beancounter, the share of head has been changed. + * Note that the shift counter is taken after increment. 
+ */ + dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); + /* add the new page beancounter to the end of the list */ + head = *hp; + list_add_tail(&p->page_list, &head->page_list); + } else { + *hp = p; + shift = 0; + INIT_LIST_HEAD(&p->page_list); + } + + p->refcount = PB_REFCOUNT_MAKE(shift, 1); + /* update user beancounter for the new page beancounter */ + inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); +} + +void pb_add_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (__pb_dup_ref(page, bc, hash)) + __pb_add_ref(page, bc, p_pb, hash); + spin_unlock(&pb_lock); +} + +void pb_dup_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (*page_pblist(page) == NULL) + /* + * pages like ZERO_PAGE must not be accounted in pbc + * so on fork we just skip them + */ + goto out_unlock; + + if (unlikely(*p_pb != PBC_COPY_SAME)) + __pb_add_ref(page, bc, p_pb, hash); + else if (unlikely(__pb_dup_ref(page, bc, hash))) + WARN_ON(1); +out_unlock: + spin_unlock(&pb_lock); +} + +void pb_remove_ref(struct page *page, struct mm_struct *mm) +{ + int hash; + struct user_beancounter *bc; + struct page_beancounter *p, **q, *f; + int shift, shiftt; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + for (q = pb_hash_table + hash, p = *q; + p != NULL && (p->page != page || p->ub != bc); + q = &p->next_hash, p = *q); + if (p == NULL) + goto out_unlock; + + PB_COUNT_DEC(p->refcount); + if (PB_COUNT_GET(p->refcount)) + /* + * More references from the same user beancounter exist. + * Nothing needs to be done. + */ + goto out_unlock; + + /* remove from the hash list */ + f = p; + *q = p->next_hash; + + shift = PB_SHIFT_GET(p->refcount); + + dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); + + q = page_pblist(page); + if (*q == p) { + if (list_empty(&p->page_list)) { + *q = NULL; + goto out_free; + } + + *q = next_page_pb(p); + } + list_del(&p->page_list); + + /* Now balance the list. Move the tail and adjust its shift counter. */ + p = prev_page_pb(*q); + shiftt = PB_SHIFT_GET(p->refcount); + *q = p; + PB_SHIFT_DEC(p->refcount); + + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + + /* + * If the shift counter of the moved beancounter is different from the + * removed one's, repeat the procedure for one more tail beancounter + */ + if (shiftt > shift) { + p = prev_page_pb(*q); + *q = p; + PB_SHIFT_DEC(p->refcount); + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + } +out_free: + dec_pbc_count(f->ub); + spin_unlock(&pb_lock); + + put_beancounter(f->ub); + pb_free(&f); + return; + +out_unlock: + spin_unlock(&pb_lock); +} + +struct user_beancounter *pb_grab_page_ub(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *ub; + + spin_lock(&pb_lock); + pb = *page_pblist(page); + ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : + get_beancounter(pb->ub)); + spin_unlock(&pb_lock); + return ub; +} + +void __init ub_init_pbc(void) +{ + unsigned long hash_size; + + pb_cachep = kmem_cache_create("page_beancounter", + sizeof(struct page_beancounter), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); + hash_size = num_physpages >> 2; + for (pb_hash_mask = 1; + (hash_size & pb_hash_mask) != hash_size; + pb_hash_mask = (pb_hash_mask << 1) + 1); + hash_size = pb_hash_mask + 1; + printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); + pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); + memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); + + ub_init_io(pb_cachep); +} diff -uprN linux-2.6.18/kernel/ub/ub_pages.c linux-2.6.18.ovz/kernel/ub/ub_pages.c --- linux-2.6.18/kernel/ub/ub_pages.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_pages.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,549 @@ +/* + * kernel/ub/ub_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + if (!pte_none(*pte) && pte_present(*pte)) + (*ret)++; + } while (pte++, addr += PAGE_SIZE, (addr != end)); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + next = pages_in_pte_range(vma, pmd, addr, next, ret); + } while (pmd++, addr = next, (addr != end)); + + return addr; +} + +static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + next = pages_in_pmd_range(vma, pud, addr, next, ret); + } while (pud++, addr = next, (addr != end)); + + return addr; +} + +unsigned long pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + unsigned long ret; + + ret = 0; + BUG_ON(addr >= end); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = pages_in_pud_range(vma, pgd, addr, next, &ret); + } while (pgd++, addr = next, (addr != end)); + return ret; +} + +void fastcall __ub_update_physpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages + + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); + ub_adjust_maxheld(ub, UB_PHYSPAGES); +} + +void fastcall __ub_update_oomguarpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_OOMGUARPAGES].held = + ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; + ub_adjust_maxheld(ub, UB_OOMGUARPAGES); +} + +void fastcall __ub_update_privvm(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PRIVVMPAGES].held = + (ub->ub_held_pages >> 
UB_PAGE_WEIGHT_SHIFT) + + ub->ub_unused_privvmpages + + ub->ub_parms[UB_SHMPAGES].held; + ub_adjust_maxheld(ub, UB_PRIVVMPAGES); +} + +static inline int __charge_privvm_locked(struct user_beancounter *ub, + unsigned long s, enum ub_severity strict) +{ + if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) + return -ENOMEM; + + ub->ub_unused_privvmpages += s; + return 0; +} + +static void __unused_privvm_dec_locked(struct user_beancounter *ub, + long size) +{ + /* catch possible overflow */ + if (ub->ub_unused_privvmpages < size) { + uncharge_warn(ub, UB_UNUSEDPRIVVM, + size, ub->ub_unused_privvmpages); + size = ub->ub_unused_privvmpages; + } + ub->ub_unused_privvmpages -= size; + __ub_update_privvm(ub); +} + +void __ub_unused_privvm_dec(struct mm_struct *mm, long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long count) +{ + if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + __ub_unused_privvm_dec(mm, count); +} + +void ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_unused_privvmpages += size; + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_protected_charge(struct mm_struct *mm, unsigned long size, + unsigned long newflags, struct vm_area_struct *vma) +{ + unsigned long flags; + struct file *file; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return PRIVVM_NO_CHARGE; + + flags = vma->vm_flags; + if (!((newflags ^ flags) & VM_WRITE)) + return PRIVVM_NO_CHARGE; + + file = vma->vm_file; + if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) + return PRIVVM_NO_CHARGE; + + if (flags & VM_WRITE) + return PRIVVM_TO_SHARED; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) + goto err; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_TO_PRIVATE; + +err: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_ERROR; +} + +int ub_memory_charge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file, int sv) +{ + struct user_beancounter *ub, *ubl; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + size >>= PAGE_SHIFT; + if (size > UB_MAXVALUE) + return -EINVAL; + + BUG_ON(sv != UB_SOFT && sv != UB_HARD); + + if (vm_flags & VM_LOCKED) { + if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) + goto out_err; + } + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ubl = top_beancounter(ub); + spin_lock_irqsave(&ubl->ub_lock, flags); + if (__charge_privvm_locked(ubl, size, sv)) + goto out_private; + spin_unlock_irqrestore(&ubl->ub_lock, flags); + } + return 0; + +out_private: + spin_unlock_irqrestore(&ubl->ub_lock, flags); + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); +out_err: + return -ENOMEM; +} + +void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + 
return; + + size >>= PAGE_SHIFT; + + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); + } +} + +int ub_locked_charge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + +int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + + +static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_tmpfs_respages++; + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_inc(ub); +} + +static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + /* catch possible overflow */ + if (ub->ub_tmpfs_respages < size) { + uncharge_warn(ub, UB_TMPFSPAGES, + size, ub->ub_tmpfs_respages); + size = ub->ub_tmpfs_respages; + } + ub->ub_tmpfs_respages -= size; + /* update the most interesting values */ + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_sub(ub, size); +} + +int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) +{ + int ret; + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); + if (ret == 0) + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + return ret; +} + +void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +#ifdef CONFIG_USER_SWAP_ACCOUNTING +static inline void do_ub_swapentry_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_swap_pages++; + __ub_update_oomguarpages(ub); +
spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, + struct user_beancounter *ub) +{ + si->swap_ubs[num] = get_beancounter(ub); + for (; ub != NULL; ub = ub->parent) + do_ub_swapentry_inc(ub); +} +EXPORT_SYMBOL(ub_swapentry_inc); + +static inline void do_ub_swapentry_dec(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_swap_pages <= 0) + uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); + else + ub->ub_swap_pages--; + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter *ub, *ubp; + + ub = si->swap_ubs[num]; + si->swap_ubs[num] = NULL; + for (ubp = ub; ubp != NULL; ubp = ubp->parent) + do_ub_swapentry_dec(ubp); + put_beancounter(ub); +} +EXPORT_SYMBOL(ub_swapentry_dec); + +int ub_swap_init(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter **ubs; + + ubs = vmalloc(num * sizeof(struct user_beancounter *)); + if (ubs == NULL) + return -ENOMEM; + + memset(ubs, 0, num * sizeof(struct user_beancounter *)); + si->swap_ubs = ubs; + return 0; +} + +void ub_swap_fini(struct swap_info_struct *si) +{ + if (si->swap_ubs) { + vfree(si->swap_ubs); + si->swap_ubs = NULL; + } +} +#endif + +static int vmguar_enough_memory(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + + if (event != VIRTINFO_ENOUGHMEM) + return old_ret; + /* + * If it's a kernel thread, we don't care about it. + * Added so that aufsd can run smoothly over ramfs. + */ + if (!current->mm) + return NOTIFY_DONE; + + ub = top_beancounter(current->mm->mm_ub); + if (ub->ub_parms[UB_PRIVVMPAGES].held > + ub->ub_parms[UB_VMGUARPAGES].barrier) + return old_ret; + + return NOTIFY_OK; +} + +static struct vnotifier_block vmguar_notifier_block = { + .notifier_call = vmguar_enough_memory +}; + +static int __init init_vmguar_notifier(void) +{ + virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); + return 0; +} + +static void __exit fini_vmguar_notifier(void) +{ + virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); +} + +module_init(init_vmguar_notifier); +module_exit(fini_vmguar_notifier); + +#ifdef CONFIG_PROC_FS +static int bc_vmaux_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + unsigned long swap, unmap; + int i; + + ub = seq_beancounter(f); + + swap = unmap = 0; + for_each_online_cpu(i) { + swap += per_cpu_ptr(ub->ub_percpu, i)->swapin; + unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap; + } + + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM], + ub->ub_unused_privvmpages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], + ub->ub_tmpfs_respages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES], + ub->ub_swap_pages); + + seq_printf(f, bc_proc_lu_fmt, "swapin", swap); + seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); + return 0; +} +static struct bc_proc_entry bc_vmaux_entry = { + .name = "vmaux", + .u.show = bc_vmaux_show, +}; + +static int __init bc_vmaux_init(void) +{ + bc_register_proc_entry(&bc_vmaux_entry); + return 0; +} + +late_initcall(bc_vmaux_init); +#endif diff -uprN linux-2.6.18/kernel/ub/ub_proc.c linux-2.6.18.ovz/kernel/ub/ub_proc.c --- linux-2.6.18/kernel/ub/ub_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_proc.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,672 @@ +/* + * kernel/ub/ub_proc.c + * + *
Copyright (C) 2006 OpenVZ. SWsoft Inc. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Generic output formats */ +#if BITS_PER_LONG == 32 +const char *bc_proc_lu_fmt = "\t%-20s %10lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n"; +#else +const char *bc_proc_lu_fmt = "\t%-20s %21lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n"; +#endif + +#if BITS_PER_LONG == 32 +static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n"; +static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n"; +#else +static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n"; +static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n"; +#endif + +static void ub_show_res(struct seq_file *f, struct user_beancounter *ub, + int r, int show_uid) +{ + int len; + char ub_uid[64]; + + if (show_uid && r == 0) { + len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2); + ub_uid[len] = ':'; + ub_uid[len + 1] = '\0'; + } else + strcpy(ub_uid, ""); + + seq_printf(f, res_fmt, ub_uid, ub_rnames[r], + ub->ub_parms[r].held, + ub->ub_parms[r].maxheld, + ub->ub_parms[r].barrier, + ub->ub_parms[r].limit, + ub->ub_parms[r].failcnt); +} + +static void __show_resources(struct seq_file *f, struct user_beancounter *ub, + int show_uid) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + if (strcmp(ub_rnames[i], "dummy") != 0) + ub_show_res(f, ub, i, show_uid); + + for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++) + ub_show_res(f, ub, i, show_uid); +} + +static int bc_resources_show(struct seq_file *f, void *v) +{ + __show_resources(f, seq_beancounter(f), 0); + return 0; +} + +static struct bc_proc_entry bc_resources_entry = { + .name = "resources", + .u.show = bc_resources_show, +}; + +static int bc_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + char buf[64]; + + ub = seq_beancounter(f); + print_ub_uid(ub, buf, sizeof(buf)); + seq_printf(f, "uid: %s\n", buf); + seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount)); + + seq_printf(f, "bc: %p\n", ub); + seq_printf(f, "par: %p\n", ub->parent); + seq_printf(f, "priv: %p\n", ub->private_data); + return 0; +} + +static struct bc_proc_entry bc_debug_entry = { + .name = "debug", + .u.show = bc_debug_show, +}; + +static int ub_show(struct seq_file *f, void *v) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + ub_show_res(f, (struct user_beancounter *)v, i, 1); + return 0; +} + +static int res_show(struct seq_file *f, void *v) +{ + __show_resources(f, (struct user_beancounter *)v, 1); + return 0; +} + +static int ub_accessible(struct user_beancounter *exec, + struct user_beancounter *target) +{ + struct user_beancounter *p, *q; + + p = top_beancounter(exec); + q = top_beancounter(target); + + return (p == get_ub0() || p == q); +} + +static void ub_show_header(struct seq_file *f) +{ + seq_printf(f, "Version: 2.5\n"); + seq_printf(f, head_fmt, "uid", "resource", + "held", "maxheld", "barrier", "limit", "failcnt"); +} + +static void *ub_start(struct seq_file *f, loff_t *ppos) +{ + struct user_beancounter *ub; + struct user_beancounter *exec_ub; + unsigned long pos; + + pos = *ppos; + if (pos == 0) + ub_show_header(f); + + exec_ub = get_exec_ub(); + + rcu_read_lock(); + for_each_beancounter(ub) { + if 
(ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + if (pos-- == 0) + return ub; + } + return NULL; +} + +static void *ub_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct user_beancounter *ub; + struct list_head *entry; + struct user_beancounter *exec_ub; + + exec_ub = get_exec_ub(); + ub = (struct user_beancounter *)v; + + entry = &ub->ub_list; + + list_for_each_continue_rcu(entry, &ub_list_head) { + ub = list_entry(entry, struct user_beancounter, ub_list); + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + + (*ppos)++; + return ub; + } + return NULL; +} + +static void ub_stop(struct seq_file *f, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations ub_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = ub_show, +}; + +static int ub_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &ub_seq_ops); +} + +static struct file_operations ub_file_operations = { + .open = ub_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations res_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = res_show, +}; + +static int res_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &res_seq_ops); +} + +static struct file_operations resources_operations = { + .open = res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_all_resources_entry = { + .name = "resources", + .u.fops = &resources_operations, +}; + +/* + * Generic showing stuff + */ + +static int cookies, num_entries; +static struct bc_proc_entry *bc_entries __read_mostly; +static struct bc_proc_entry *bc_root_entries __read_mostly; +static DEFINE_SPINLOCK(bc_entries_lock); + +void bc_register_proc_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_entries; + bc_entries = e; + num_entries++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_entry); + +void bc_register_proc_root_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_root_entries; + bc_root_entries = e; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_root_entry); + +/* + * small helpers + */ + +static inline int bc_make_ino(struct user_beancounter *ub) +{ + int ret; + + ret = 0xbc000000; + if (ub->parent) + ret |= (ub->ub_uid << 12); + ret |= ub->ub_uid; + return ret; +} + +static inline int bc_make_file_ino(struct bc_proc_entry *de) +{ + return 0xbe000000 + de->cookie; +} + +static int bc_d_delete(struct dentry *d) +{ + return 1; +} + +static void bc_d_release(struct dentry *d) +{ + put_beancounter((struct user_beancounter *)d->d_fsdata); +} + +static struct inode_operations bc_entry_iops; +static struct file_operations bc_entry_fops; +static struct dentry_operations bc_dentry_ops = { + .d_delete = bc_d_delete, + .d_release = bc_d_release, +}; + +/* + * common directory operations' helpers + */ + +static int bc_readdir(struct file *file, filldir_t filler, void *data, + struct user_beancounter *parent) +{ + int err = 0, len, ino; + loff_t pos, filled; + struct user_beancounter *ub, *prev; + char buf[64]; + struct bc_proc_entry *pde; + + if (!(capable(CAP_DAC_OVERRIDE) && 
capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + pos = file->f_pos; + if (pos == 0) { + err = (*filler)(data, ".", 1, pos, + file->f_dentry->d_inode->i_ino, DT_DIR); + if (err < 0) + goto out; + + pos++; + } + + if (pos == 1) { + err = (*filler)(data, "..", 2, pos, + parent_ino(file->f_dentry), DT_DIR); + if (err < 0) + goto out; + + pos++; + } + + filled = 2; + for (pde = (parent == NULL ? bc_root_entries : bc_entries); + pde != NULL; pde = pde->next) { + if (filled++ < pos) + continue; + + err = (*filler)(data, pde->name, strlen(pde->name), pos, + bc_make_file_ino(pde), DT_REG); + if (err < 0) + goto out; + + pos++; + } + + rcu_read_lock(); + prev = NULL; + ub = list_entry(&ub_list_head, struct user_beancounter, ub_list); + while (1) { + ub = list_entry(rcu_dereference(ub->ub_list.next), + struct user_beancounter, ub_list); + if (&ub->ub_list == &ub_list_head) + break; + + if (ub->parent != parent) + continue; + + if (filled++ < pos) + continue; + + if (!get_beancounter_rcu(ub)) + continue; + + rcu_read_unlock(); + put_beancounter(prev); + + len = print_ub_uid(ub, buf, sizeof(buf)); + ino = bc_make_ino(ub); + + err = (*filler)(data, buf, len, pos, ino, DT_DIR); + if (err < 0) { + put_beancounter(ub); + goto out; + } + + rcu_read_lock(); + prev = ub; + pos++; + } + rcu_read_unlock(); + put_beancounter(prev); +out: + file->f_pos = pos; + return err; +} + +static int bc_looktest(struct inode *ino, void *data) +{ + return ino->i_op == &bc_entry_iops && ino->u.generic_ip == data; +} + +static int bc_lookset(struct inode *ino, void *data) +{ + struct user_beancounter *ub; + + ub = (struct user_beancounter *)data; + ino->u.generic_ip = data; + ino->i_ino = bc_make_ino(ub); + ino->i_fop = &bc_entry_fops; + ino->i_op = &bc_entry_iops; + ino->i_mode = S_IFDIR | S_IRUSR | S_IXUGO; + /* subbeancounters are not included, but who cares? 
*/ + ino->i_nlink = num_entries + 2; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir, + struct dentry *dentry) +{ + struct inode *ino; + + ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub); + if (ino == NULL) + goto out_put; + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + dentry->d_fsdata = ub; + d_add(dentry, ino); + return NULL; + +out_put: + put_beancounter(ub); + return ERR_PTR(-ENOENT); +} + +/* + * files (bc_proc_entry) manipulations + */ + +static struct dentry *bc_lookup_file(struct inode *dir, + struct dentry *dentry, struct bc_proc_entry *root, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *)) +{ + struct bc_proc_entry *pde; + struct inode *ino; + + for (pde = root; pde != NULL; pde = pde->next) + if (strcmp(pde->name, dentry->d_name.name) == 0) + break; + + if (pde == NULL) + return ERR_PTR(-ESRCH); + + ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde); + if (ino == NULL) + return ERR_PTR(-ENOENT); + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + d_add(dentry, ino); + return NULL; +} + +static int bc_file_open(struct inode *ino, struct file *filp) +{ + struct bc_proc_entry *de; + struct user_beancounter *ub; + + de = (struct bc_proc_entry *)ino->u.generic_ip; + ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata; + BUG_ON(ub->ub_magic != UB_MAGIC); + + /* + * ub can't disappear: we hold d_parent, it holds the beancounter + */ + return single_open(filp, de->u.show, ub); +} + +static struct file_operations bc_file_ops = { + .open = bc_file_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int bc_looktest_entry(struct inode *ino, void *data) +{ + return ino->i_fop == &bc_file_ops && ino->u.generic_ip == data; +} + +static int bc_lookset_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->u.generic_ip = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = &bc_file_ops; + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_entries, + bc_looktest_entry, bc_lookset_entry); +} + +static int bc_looktest_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + return ino->i_fop == de->u.fops && ino->u.generic_ip == data; +} + +static int bc_lookset_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->u.generic_ip = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = de->u.fops; + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_root_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_root_entries, + bc_looktest_root_entry, bc_lookset_root_entry); +} + +/* + * /proc/bc/.../ directory operations + */ + +static int bc_entry_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, + (struct user_beancounter *)file->f_dentry->d_fsdata); +} + +static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *par, *ub; + struct dentry *de; + + if
(!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '.') + return ERR_PTR(-ENOENT); + + par = (struct user_beancounter *)dir->u.generic_ip; + if (par->ub_uid != id) + return ERR_PTR(-ENOENT); + + id = simple_strtol(end + 1, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_subbeancounter_byid(par, id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_entry_fops = { + .read = generic_read_dir, + .readdir = bc_entry_readdir, +}; + +static struct inode_operations bc_entry_iops = { + .lookup = bc_entry_lookup, +}; + +/* + * /proc/bc directory operations + */ + +static int bc_root_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, NULL); +} + +static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_root_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_beancounter_byuid(id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_root_fops = { + .read = generic_read_dir, + .readdir = bc_root_readdir, +}; + +static struct inode_operations bc_root_iops = { + .lookup = bc_root_lookup, +}; + +static int __init ub_init_proc(void) +{ + struct proc_dir_entry *entry; + struct proc_dir_entry *bc_proc_root; + + bc_proc_root = create_proc_entry("bc", + S_IFDIR | S_IRUGO | S_IXUGO, NULL); + if (bc_proc_root == NULL) + panic("Can't create /proc/bc entry"); + + bc_proc_root->proc_fops = &bc_root_fops; + bc_proc_root->proc_iops = &bc_root_iops; + + bc_register_proc_entry(&bc_resources_entry); +#ifdef CONFIG_UBC_DEBUG + bc_register_proc_entry(&bc_debug_entry); +#endif + bc_register_proc_root_entry(&bc_all_resources_entry); + + entry = create_proc_glob_entry("user_beancounters", S_IRUGO, NULL); + entry->proc_fops = &ub_file_operations; + return 0; +} + +core_initcall(ub_init_proc); diff -uprN linux-2.6.18/kernel/ub/ub_stat.c linux-2.6.18.ovz/kernel/ub/ub_stat.c --- linux-2.6.18/kernel/ub/ub_stat.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_stat.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,453 @@ +/* + * kernel/ub/ub_stat.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
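+ *
+ * Periodic beancounter statistics: the ubstatd thread snapshots each
+ * beancounter's resource usage into ub_store, and userspace reads the
+ * snapshots (and requests signal notifications) via the ubstat call.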
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(ubs_notify_list); +static long ubs_min_interval; +static ubstattime_t ubs_start_time, ubs_end_time; +static struct timer_list ubs_timer; + +static int ubstat_get_list(void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub, *ubp; + long *page, *ptr, *end; + int len; + + page = (long *)__get_free_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + retval = 0; + ubp = NULL; + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + *ptr++ = ub->ub_uid; + if (ptr != end) + continue; + + get_beancounter(ub); + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + ubp = ub; + + len = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (copy_to_user(buf, page, len)) { + retval = -EFAULT; + goto out_put; + } + retval += len; + if (len < PAGE_SIZE) + goto out_put; + buf += len; + size -= len; + + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + } + spin_unlock_irq(&ub_hash_lock); + + /* the reference held in ubp is dropped once, at out_put */ + size = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (size > 0 && copy_to_user(buf, page, size)) { + retval = -EFAULT; + goto out_put; + } + retval += size; + +out_put: + put_beancounter(ubp); + free_page((unsigned long)page); + return retval; +} + +static int ubstat_gettime(void __user *buf, long size) +{ + ubgettime_t data; + int retval; + + spin_lock(&ubs_notify_lock); + data.start_time = ubs_start_time; + data.end_time = ubs_end_time; + data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; + spin_unlock(&ubs_notify_lock); + + retval = min_t(long, sizeof(data), size); + if (copy_to_user(buf, &data, retval)) + retval = -EFAULT; + return retval; +} + +static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) +{ + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[1]; + } *data; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + + data->param[0].maxheld = ub->ub_store[res].maxheld; + data->param[0].failcnt = ub->ub_store[res].failcnt; + + return sizeof(*data); +} + +static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + data->param[resource].maxheld = ub->ub_store[resource].maxheld; + data->param[resource].failcnt = ub->ub_store[resource].failcnt; + wrote += sizeof(data->param[resource]); + } + + return wrote; +} + +static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, + int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size <
wrote + sizeof(data->param[resource])) + break; + /* The beginning of ubstatparmf_t matches struct ubparm. */ + memcpy(&data->param[resource], &ub->ub_store[resource], + sizeof(ub->ub_store[resource])); + data->param[resource].__unused1 = 0; + data->param[resource].__unused2 = 0; + wrote += sizeof(data->param[resource]); + } + return wrote; +} + +static int ubstat_get_stat(struct user_beancounter *ub, long cmd, + void __user *buf, long size) +{ + void *kbuf; + int retval; + + kbuf = (void *)__get_free_page(GFP_KERNEL); + if (kbuf == NULL) + return -ENOMEM; + + spin_lock(&ubs_notify_lock); + switch (UBSTAT_CMD(cmd)) { + case UBSTAT_READ_ONE: + retval = -EINVAL; + if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) + break; + retval = ubstat_do_read_one(ub, + UBSTAT_PARMID(cmd), kbuf); + break; + case UBSTAT_READ_ALL: + retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); + break; + case UBSTAT_READ_FULL: + retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); + break; + default: + retval = -EINVAL; + } + spin_unlock(&ubs_notify_lock); + + if (retval > 0) { + retval = min_t(long, retval, size); + if (copy_to_user(buf, kbuf, retval)) + retval = -EFAULT; + } + + free_page((unsigned long)kbuf); + return retval; +} + +static int ubstat_handle_notifrq(ubnotifrq_t *req) +{ + int retval; + struct ub_stat_notify *new_notify; + struct list_head *entry; + struct task_struct *tsk_to_free; + + /* + * validate the request before allocating anything, + * so that the error path cannot leak new_notify + */ + if (req->maxinterval < 1) + return -EINVAL; + if (req->maxinterval > TIME_MAX_SEC) + req->maxinterval = TIME_MAX_SEC; + + new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL); + if (new_notify == NULL) + return -ENOMEM; + + tsk_to_free = NULL; + INIT_LIST_HEAD(&new_notify->list); + + spin_lock(&ubs_notify_lock); + list_for_each(entry, &ubs_notify_list) { + struct ub_stat_notify *notify; + + notify = list_entry(entry, struct ub_stat_notify, list); + if (notify->task == current) { + kfree(new_notify); + new_notify = notify; + break; + } + } + + if (req->maxinterval < ubs_min_interval) { + unsigned long dif; + + ubs_min_interval = req->maxinterval; + dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ; + if (dif > req->maxinterval) + mod_timer(&ubs_timer, + ubs_timer.expires - + (dif - req->maxinterval) * HZ); + } + + if (entry != &ubs_notify_list) { + list_del(&new_notify->list); + tsk_to_free = new_notify->task; + } + if (req->signum) { + new_notify->task = current; + get_task_struct(new_notify->task); + new_notify->signum = req->signum; + list_add(&new_notify->list, &ubs_notify_list); + } else + kfree(new_notify); + retval = 0; + spin_unlock(&ubs_notify_lock); + if (tsk_to_free != NULL) + put_task_struct(tsk_to_free); + return retval; +} + +/* + * former sys_ubstat + */ +long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub; + + if (func == UBSTAT_UBPARMNUM) + return UB_RESOURCES; + if (func == UBSTAT_UBLIST) + return ubstat_get_list(buf, size); + if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + if (func == UBSTAT_GETTIME) { + retval = ubstat_gettime(buf, size); + goto notify; + } + + ub = get_exec_ub(); + if (ub != NULL && ub->ub_uid == arg1) + get_beancounter(ub); + else /* FIXME must be if (ve_is_super) */ + ub = get_beancounter_byuid(arg1, 0); + + if (ub == NULL) + return -ESRCH; + + retval = ubstat_get_stat(ub, func, buf, size); + put_beancounter(ub); +notify: + /* Handle request for notification */ + if (retval >= 0) { + ubnotifrq_t notifrq; + int err; + + err = -EFAULT;
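+ /*
+ * arg2 carries a ubnotifrq_t with the requested
+ * notification interval and signal number
+ */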
+ if (!copy_from_user(&notifrq, (void __user *)arg2, + sizeof(notifrq))) + err = ubstat_handle_notifrq(&notifrq); + if (err) + retval = err; + } + + return retval; +} + +static void ubstat_save_onestat(struct user_beancounter *ub) +{ + int resource; + + /* called with local irq disabled */ + spin_lock(&ub->ub_lock); + for (resource = 0; resource < UB_RESOURCES; resource++) { + memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], + sizeof(struct ubparm)); + ub->ub_parms[resource].minheld = + ub->ub_parms[resource].maxheld = + ub->ub_parms[resource].held; + } + spin_unlock(&ub->ub_lock); +} + +static void ubstat_save_statistics(void) +{ + unsigned long flags; + struct user_beancounter *ub; + + local_irq_save(flags); + for_each_beancounter (ub) + ubstat_save_onestat(ub); + local_irq_restore(flags); +} + +static void ubstatd_timeout(unsigned long __data) +{ + struct task_struct *p; + + p = (struct task_struct *) __data; + wake_up_process(p); +} + +/* + * Safe wrapper for send_sig. It prevents a race with release_task + * for sighand. + * Should be called under tasklist_lock. + */ +static void task_send_sig(struct ub_stat_notify *notify) +{ + if (likely(notify->task->sighand != NULL)) + send_sig(notify->signum, notify->task, 1); +} + +static inline void do_notifies(void) +{ + LIST_HEAD(notif_free_list); + struct ub_stat_notify *notify; + struct ub_stat_notify *tmp; + + spin_lock(&ubs_notify_lock); + ubs_start_time = ubs_end_time; + /* + * the expression below relies on time being unsigned long and + * arithmetic promotion rules + */ + ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; + mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); + ubs_min_interval = TIME_MAX_SEC; + /* save statistics accumulated for the interval */ + ubstat_save_statistics(); + /* send signals */ + read_lock(&tasklist_lock); + while (!list_empty(&ubs_notify_list)) { + notify = list_entry(ubs_notify_list.next, + struct ub_stat_notify, list); + task_send_sig(notify); + list_del(&notify->list); + list_add(&notify->list, &notif_free_list); + } + read_unlock(&tasklist_lock); + spin_unlock(&ubs_notify_lock); + + list_for_each_entry_safe(notify, tmp, &notif_free_list, list) { + put_task_struct(notify->task); + kfree(notify); + } +} + +/* + * Kernel thread + */ +static int ubstatd(void *unused) +{ + /* daemonize call will take care of signals */ + daemonize("ubstatd"); + + ubs_timer.data = (unsigned long)current; + ubs_timer.function = ubstatd_timeout; + add_timer(&ubs_timer); + + while (1) { + set_task_state(current, TASK_INTERRUPTIBLE); + if (time_after(ubs_timer.expires, jiffies)) { + schedule(); + try_to_freeze(); + continue; + } + + __set_task_state(current, TASK_RUNNING); + do_notifies(); + } + return 0; +} + +static int __init ubstatd_init(void) +{ + init_timer(&ubs_timer); + ubs_timer.expires = TIME_MAX_JIF; + ubs_min_interval = TIME_MAX_SEC; + ubs_start_time = ubs_end_time = 0; + + kernel_thread(ubstatd, NULL, 0); + return 0; +} + +module_init(ubstatd_init); diff -uprN linux-2.6.18/kernel/ub/ub_sys.c linux-2.6.18.ovz/kernel/ub/ub_sys.c --- linux-2.6.18/kernel/ub/ub_sys.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ub/ub_sys.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,174 @@ +/* + * kernel/ub/ub_sys.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
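+ *
+ * System call entry points for user beancounters:
+ * getluid/setluid, setublimit and ubstat.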
+ * + */ + +#include +#include +#include +#include + +#include + +/* + * The (rather boring) getluid syscall + */ +asmlinkage long sys_getluid(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (ub == NULL) + return -EINVAL; + + return ub->ub_uid; +} + +/* + * The setluid syscall + */ +asmlinkage long sys_setluid(uid_t uid) +{ + struct user_beancounter *ub; + struct task_beancounter *task_bc; + int error; + + task_bc = &current->task_bc; + + /* You may not disown a setluid */ + error = -EINVAL; + if (uid == (uid_t)-1) + goto out; + + /* You may only set an ub as root */ + error = -EPERM; + if (!capable(CAP_SETUID)) + goto out; + /* + * Once set, the ub is irrevocable for everyone + * unless it is set from ve0. + */ + if (!ve_is_super(get_exec_env())) + goto out; + + /* Ok - set up a beancounter entry for this user */ + error = -ENOBUFS; + ub = get_beancounter_byuid(uid, 1); + if (ub == NULL) + goto out; + + ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " + "for %.20s pid %d\n", + ub, atomic_read(&ub->ub_refcount), + current->comm, current->pid); + /* install bc */ + error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub); + if (!(error & NOTIFY_FAIL)) { + put_beancounter(task_bc->exec_ub); + task_bc->exec_ub = ub; + if (!(error & NOTIFY_OK)) { + put_beancounter(task_bc->fork_sub); + task_bc->fork_sub = get_beancounter(ub); + } + error = 0; + } else { + put_beancounter(ub); + error = -ENOBUFS; + } +out: + return error; +} + +long do_setublimit(uid_t uid, unsigned long resource, + unsigned long *new_limits) +{ + int error; + unsigned long flags; + struct user_beancounter *ub; + + error = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) + goto out; + + if (!ve_is_super(get_exec_env())) + goto out; + + error = -EINVAL; + if (resource >= UB_RESOURCES) + goto out; + + error = -EINVAL; + if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) + goto out; + + error = -ENOENT; + ub = get_beancounter_byuid(uid, 0); + if (ub == NULL) { + ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); + goto out; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[resource].barrier = new_limits[0]; + ub->ub_parms[resource].limit = new_limits[1]; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + put_beancounter(ub); + + error = 0; +out: + return error; +} + +/* + * The setbeanlimit syscall + */ +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, + unsigned long __user *limits) +{ + unsigned long new_limits[2]; + + if (copy_from_user(&new_limits, limits, sizeof(new_limits))) + return -EFAULT; + + return do_setublimit(uid, resource, new_limits); +} + +extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size); +asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + return do_ubstat(func, arg1, arg2, buf, size); +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_setublimit(uid_t uid, int resource, + unsigned int __user *limits) +{ + unsigned int u_new_limits[2]; + unsigned long new_limits[2]; + + if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) + return -EFAULT; + + new_limits[0] = u_new_limits[0]; + new_limits[1] = u_new_limits[1]; + + return do_setublimit(uid, resource, new_limits); +} + +asmlinkage long compat_sys_ubstat(int func, unsigned int arg1, + unsigned int arg2, compat_uptr_t *buf, long size) +{ + return sys_ubstat(func, arg1, arg2, buf, size); +} +#endif diff -uprN
linux-2.6.18/kernel/user.c linux-2.6.18.ovz/kernel/user.c --- linux-2.6.18/kernel/user.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/kernel/user.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include /* * UID task count cache, to get fast user lookup in "alloc_uid" @@ -24,7 +25,20 @@ #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) +#define __uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) + +#ifdef CONFIG_VE +#define UIDHASH_MASK_VE (UIDHASH_SZ_VE - 1) +#define __uidhashfn_ve(uid) (((uid >> UIDHASH_BITS_VE) ^ uid) & \ + UIDHASH_MASK_VE) +#define __uidhashentry_ve(uid, envid) ((envid)->uidhash_table + \ + __uidhashfn_ve(uid)) +#define uidhashentry_ve(uid) (ve_is_super(get_exec_env()) ? \ + __uidhashentry(uid) : \ + __uidhashentry_ve(uid, get_exec_env())) +#else +#define uidhashentry_ve(uid) __uidhashentry(uid) +#endif static kmem_cache_t *uid_cachep; static struct list_head uidhash_table[UIDHASH_SZ]; @@ -96,7 +110,7 @@ struct user_struct *find_user(uid_t uid) unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(uid)); + ret = uid_hash_find(uid, uidhashentry_ve(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -119,10 +133,11 @@ void free_uid(struct user_struct *up) local_irq_restore(flags); } } +EXPORT_SYMBOL_GPL(free_uid); struct user_struct * alloc_uid(uid_t uid) { - struct list_head *hashent = uidhashentry(uid); + struct list_head *hashent = uidhashentry_ve(uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -172,6 +187,7 @@ struct user_struct * alloc_uid(uid_t uid } return up; } +EXPORT_SYMBOL_GPL(alloc_uid); void switch_uid(struct user_struct *new_user) { @@ -190,21 +206,21 @@ void switch_uid(struct user_struct *new_ free_uid(old_user); suid_keys(current); } - +EXPORT_SYMBOL_GPL(switch_uid); static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_LIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(0)); + uid_hash_insert(&root_user, __uidhashentry(0)); spin_unlock_irq(&uidhash_lock); return 0; diff -uprN linux-2.6.18/kernel/utsname.c linux-2.6.18.ovz/kernel/utsname.c --- linux-2.6.18/kernel/utsname.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/utsname.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include +#include +#include +#include + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return 0; + +#ifndef CONFIG_VE + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } +#endif + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} diff -uprN linux-2.6.18/kernel/ve/Makefile linux-2.6.18.ovz/kernel/ve/Makefile --- linux-2.6.18/kernel/ve/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/Makefile 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,16 @@ +# +# +# kernel/ve/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-y = ve.o veowner.o hooks.o devperms.o + +obj-$(CONFIG_VZ_DEV) += vzdev.o +obj-$(CONFIG_VZ_WDOG) += vzwdog.o +obj-$(CONFIG_VE_CALLS) += vzmon.o + +vzmon-objs = vecalls.o diff -uprN linux-2.6.18/kernel/ve/devperms.c linux-2.6.18.ovz/kernel/ve/devperms.c --- linux-2.6.18/kernel/ve/devperms.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/devperms.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,418 @@ +/* + * linux/kernel/ve/devperms.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Devices permissions routines, + * character and block devices separately + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Rules applied in the following order: + * MAJOR!=0, MINOR!=0 + * MAJOR!=0, MINOR==0 + * MAJOR==0, MINOR==0 + */ + +struct devperms_struct { + dev_t dev; /* device id */ + unsigned char mask; + unsigned type; + envid_t veid; + + struct hlist_node hash; + struct rcu_head rcu; +}; + +static struct devperms_struct default_major_perms[] = { + { + MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(PTY_MASTER_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(PTY_SLAVE_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, +}; + +static struct devperms_struct default_minor_perms[] = { + { + MKDEV(MEM_MAJOR, 3), /* null */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 5), /* zero */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 7), /* full */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(TTYAUX_MAJOR, 0), /* tty */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(TTYAUX_MAJOR, 2), /* ptmx */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 8), /* random */ + S_IROTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 9), /* urandom */ + S_IROTH, + S_IFCHR + }, +}; + +static struct devperms_struct default_deny_perms = { + MKDEV(0, 0), + 0, + S_IFCHR, +}; + +static inline struct devperms_struct *find_default_devperms(int type, dev_t dev) +{ + int i; + + /* XXX all defaults perms are S_IFCHR */ + if (type != S_IFCHR) + return &default_deny_perms; + + for (i = 0; i < ARRAY_SIZE(default_minor_perms); i++) + if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) && + MINOR(dev) == MINOR(default_minor_perms[i].dev)) + return &default_minor_perms[i]; + + for (i = 0; i < ARRAY_SIZE(default_major_perms); i++) + if (MAJOR(dev) == MAJOR(default_major_perms[i].dev)) + return &default_major_perms[i]; + + return &default_deny_perms; +} + +#define DEVPERMS_HASH_SZ 512 +#define devperms_hashfn(id, dev) \ + ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \ + (DEVPERMS_HASH_SZ - 1) + +static DEFINE_SPINLOCK(devperms_hash_lock); +static struct hlist_head devperms_hash[DEVPERMS_HASH_SZ]; + +static inline struct devperms_struct *find_devperms(envid_t veid, + int type, + dev_t dev) +{ + struct hlist_head *table; + struct devperms_struct *perms; + struct hlist_node *h; + + table = &devperms_hash[devperms_hashfn(veid, dev)]; + hlist_for_each_entry_rcu (perms, h, table, hash) + if (perms->type == type && perms->veid == veid && + MAJOR(perms->dev) == MAJOR(dev) && + MINOR(perms->dev) == MINOR(dev)) + return perms; + + return NULL; +} + +static void free_devperms(struct rcu_head *rcu) +{ + struct devperms_struct *perms; + + perms = container_of(rcu, struct devperms_struct, rcu); + kfree(perms); +} + +/* API calls */ + +void clean_device_perms_ve(envid_t veid) +{ + int i; + struct devperms_struct *p; + struct hlist_node *n, *tmp; + + spin_lock(&devperms_hash_lock); + for (i = 0; i < DEVPERMS_HASH_SZ; i++) + hlist_for_each_entry_safe (p, n, tmp, &devperms_hash[i], hash) + if (p->veid == veid) { + hlist_del_rcu(&p->hash); + call_rcu(&p->rcu, free_devperms); + } + spin_unlock(&devperms_hash_lock); +} + +EXPORT_SYMBOL(clean_device_perms_ve); + +/* + * Mode is a mask of + * FMODE_READ for read access (configurable by S_IROTH) + * FMODE_WRITE for write access 
(configurable by S_IWOTH) + * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP) + */ + +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) +{ + struct devperms_struct *p; + struct ve_struct *ve; + envid_t veid; + char mask; + + ve = get_exec_env(); + veid = ve->veid; + rcu_read_lock(); + + p = find_devperms(veid, dev_type | VE_USE_MINOR, dev); + if (p != NULL) + goto end; + + p = find_devperms(veid, dev_type | VE_USE_MAJOR, MKDEV(MAJOR(dev),0)); + if (p != NULL) + goto end; + + p = find_devperms(veid, dev_type, MKDEV(0,0)); + if (p != NULL) + goto end; + + if (ve->features & VE_FEATURE_DEF_PERMS) { + p = find_default_devperms(dev_type, dev); + if (p != NULL) + goto end; + } + + rcu_read_unlock(); + return -ENODEV; + +end: + mask = p->mask; + rcu_read_unlock(); + + /* + * map the FMODE_READ/FMODE_WRITE/FMODE_QUOTACTL bits of access_mode + * to the S_IROTH/S_IWOTH/S_IXGRP bits used in mask + */ + access_mode = "\000\004\002\006\010\014\012\016"[access_mode]; + return ((mask & access_mode) == access_mode) ? 0 : -EACCES; +} + +EXPORT_SYMBOL(get_device_perms_ve); + +int set_device_perms_ve(envid_t veid, unsigned type, dev_t dev, unsigned mask) +{ + struct devperms_struct *perms, *new_perms; + struct hlist_head *htable; + + new_perms = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL); + if (new_perms == NULL) + return -ENOMEM; + + spin_lock(&devperms_hash_lock); + perms = find_devperms(veid, type, dev); + if (perms != NULL) { + kfree(new_perms); + perms->mask = mask & S_IALLUGO; + } else { + switch (type & VE_USE_MASK) { + case 0: + dev = 0; + break; + case VE_USE_MAJOR: + dev = MKDEV(MAJOR(dev),0); + break; + } + + new_perms->veid = veid; + new_perms->dev = dev; + new_perms->type = type; + new_perms->mask = mask & S_IALLUGO; + + htable = &devperms_hash[devperms_hashfn(new_perms->veid, + new_perms->dev)]; + hlist_add_head_rcu(&new_perms->hash, htable); + } + spin_unlock(&devperms_hash_lock); + return 0; +} + +EXPORT_SYMBOL(set_device_perms_ve); + +#ifdef CONFIG_PROC_FS +static int devperms_seq_show(struct seq_file *m, void *v) +{ + struct devperms_struct *dp; + char dev_s[32], type_c; + unsigned use, type; + dev_t dev; + + dp = (struct devperms_struct *)v; + if (dp == (struct devperms_struct *)1L) { + seq_printf(m, "Version: 2.7\n"); + return 0; + } + + use = dp->type & VE_USE_MASK; + type = dp->type & S_IFMT; + dev = dp->dev; + + if ((use | VE_USE_MINOR) == use) + snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev)); + else if ((use | VE_USE_MAJOR) == use) + snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev)); + else + snprintf(dev_s, sizeof(dev_s), "*:*"); + + if (type == S_IFCHR) + type_c = 'c'; + else if (type == S_IFBLK) + type_c = 'b'; + else + type_c = '?'; + + seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s); + return 0; +} + +static void *devperms_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t cpos; + long slot; + struct devperms_struct *dp; + struct hlist_node *h; + + cpos = *pos; + rcu_read_lock(); + + if (cpos-- == 0) + return (void *)1L; + + for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++) + hlist_for_each_entry_rcu (dp, h, &devperms_hash[slot], hash) + if (cpos-- == 0) { + m->private = (void *)slot; + return dp; + } + return NULL; +} + +static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + long slot; + struct hlist_node *next; + struct devperms_struct *dp; + + dp = (struct devperms_struct *)v; + + if (unlikely(dp == (struct devperms_struct *)1L)) + slot = 0; + else { + next = rcu_dereference(dp->hash.next); + if (next != NULL) + goto out; + + slot = (long)m->private + 1; + } + + for (; slot < DEVPERMS_HASH_SZ; slot++) { + next =
rcu_dereference(devperms_hash[slot].first); + if (next == NULL) + continue; + + m->private = (void *)slot; + goto out; + } + return NULL; + +out: + (*pos)++; + return hlist_entry(next, struct devperms_struct, hash); +} + +static void devperms_seq_stop(struct seq_file *m, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations devperms_seq_op = { + .start = devperms_seq_start, + .next = devperms_seq_next, + .stop = devperms_seq_stop, + .show = devperms_seq_show, +}; + +static int devperms_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &devperms_seq_op); +} + +struct file_operations proc_devperms_ops = { + .open = devperms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +EXPORT_SYMBOL(proc_devperms_ops); +#endif + +/* Initialisation */ + +static struct devperms_struct original_perms[] = +{ + { + MKDEV(0,0), + S_IROTH | S_IWOTH, + S_IFCHR, + 0, + }, + { + MKDEV(0,0), + S_IXGRP | S_IROTH | S_IWOTH, + S_IFBLK, + 0, + }, +}; + +static int __init init_devperms_hash(void) +{ + hlist_add_head(&original_perms[0].hash, + &devperms_hash[devperms_hashfn(0, + original_perms[0].dev)]); + hlist_add_head(&original_perms[1].hash, + &devperms_hash[devperms_hashfn(0, + original_perms[1].dev)]); + return 0; +} + +core_initcall(init_devperms_hash); diff -uprN linux-2.6.18/kernel/ve/hooks.c linux-2.6.18.ovz/kernel/ve/hooks.c --- linux-2.6.18/kernel/ve/hooks.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/hooks.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,114 @@ +/* + * linux/kernel/ve/hooks.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include + +static struct list_head ve_hooks[VE_MAX_CHAINS]; +static DECLARE_RWSEM(ve_hook_sem); + +void ve_hook_register(int chain, struct ve_hook *vh) +{ + struct list_head *lh; + struct ve_hook *tmp; + + BUG_ON(chain >= VE_MAX_CHAINS); + + down_write(&ve_hook_sem); + list_for_each(lh, &ve_hooks[chain]) { + tmp = list_entry(lh, struct ve_hook, list); + if (vh->priority < tmp->priority) + break; + } + + list_add_tail(&vh->list, lh); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_register); + +void ve_hook_unregister(struct ve_hook *vh) +{ + down_write(&ve_hook_sem); + list_del(&vh->list); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_unregister); + +static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve) +{ + int err; + + err = 0; + if (try_module_get(vh->owner)) { + err = vh->init(ve); + module_put(vh->owner); + } + return err; +} + +static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve) +{ + if (vh->fini != NULL && try_module_get(vh->owner)) { + vh->fini(ve); + module_put(vh->owner); + } +} + +int ve_hook_iterate_init(int chain, void *ve) +{ + struct ve_hook *vh; + int err; + + err = 0; + + down_read(&ve_hook_sem); + list_for_each_entry(vh, &ve_hooks[chain], list) + if ((err = ve_hook_init(vh, ve)) < 0) + break; + + if (err) + list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + + up_read(&ve_hook_sem); + return err; +} + +EXPORT_SYMBOL(ve_hook_iterate_init); + +void ve_hook_iterate_fini(int chain, void *ve) +{ + struct ve_hook *vh; + + down_read(&ve_hook_sem); + list_for_each_entry_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + up_read(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_iterate_fini); + +static int __init ve_hooks_init(void) +{ + int i;
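+ /* all chains start empty; subsystems attach via ve_hook_register() */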
+ + for (i = 0; i < VE_MAX_CHAINS; i++) + INIT_LIST_HEAD(&ve_hooks[i]); + return 0; +} + +core_initcall(ve_hooks_init); + diff -uprN linux-2.6.18/kernel/ve/ve.c linux-2.6.18.ovz/kernel/ve/ve.c --- linux-2.6.18/kernel/ve/ve.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/ve.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,164 @@ +/* + * linux/kernel/ve/ve.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * 've.c' helper file performing VE sub-system initialization + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned long vz_rstamp = 0x37e0f59d; + +#ifdef CONFIG_MODULES +struct module no_module = { .state = MODULE_STATE_GOING }; +EXPORT_SYMBOL(no_module); +#endif + +INIT_KSYM_MODULE(ip_tables); +INIT_KSYM_MODULE(ip6_tables); +INIT_KSYM_MODULE(iptable_filter); +INIT_KSYM_MODULE(ip6table_filter); +INIT_KSYM_MODULE(iptable_mangle); +INIT_KSYM_MODULE(ip6table_mangle); +INIT_KSYM_MODULE(ip_conntrack); +INIT_KSYM_MODULE(ip_conntrack_ftp); +INIT_KSYM_MODULE(ip_conntrack_irc); +INIT_KSYM_MODULE(ip_nat); +INIT_KSYM_MODULE(iptable_nat); +INIT_KSYM_MODULE(ip_nat_ftp); +INIT_KSYM_MODULE(ip_nat_irc); + +INIT_KSYM_CALL(int, init_netfilter, (void)); +INIT_KSYM_CALL(int, init_iptables, (void)); +INIT_KSYM_CALL(int, init_ip6tables, (void)); +INIT_KSYM_CALL(int, init_iptable_filter, (void)); +INIT_KSYM_CALL(int, init_ip6table_filter, (void)); +INIT_KSYM_CALL(int, init_iptable_mangle, (void)); +INIT_KSYM_CALL(int, init_ip6table_mangle, (void)); +INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); +INIT_KSYM_CALL(int, init_ip_ct_ftp, (void)); +INIT_KSYM_CALL(int, init_ip_ct_irc, (void)); +INIT_KSYM_CALL(int, ip_nat_init, (void)); +INIT_KSYM_CALL(int, init_iptable_nat, (void)); +INIT_KSYM_CALL(int, init_iptable_nat_ftp, (void)); +INIT_KSYM_CALL(int, init_iptable_nat_irc, (void)); +INIT_KSYM_CALL(void, fini_iptable_nat_irc, (void)); +INIT_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); +INIT_KSYM_CALL(void, fini_iptable_nat, (void)); +INIT_KSYM_CALL(void, ip_nat_cleanup, (void)); +INIT_KSYM_CALL(void, fini_ip_ct_irc, (void)); +INIT_KSYM_CALL(void, fini_ip_ct_ftp, (void)); +INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); +INIT_KSYM_CALL(void, fini_ip6table_filter, (void)); +INIT_KSYM_CALL(void, fini_iptable_filter, (void)); +INIT_KSYM_CALL(void, fini_ip6table_mangle, (void)); +INIT_KSYM_CALL(void, fini_iptable_mangle, (void)); +INIT_KSYM_CALL(void, fini_ip6tables, (void)); +INIT_KSYM_CALL(void, fini_iptables, (void)); +INIT_KSYM_CALL(void, fini_netfilter, (void)); + +#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) +INIT_KSYM_MODULE(vzmon); +INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); + +void do_env_free(struct ve_struct *env) +{ + KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); +} +EXPORT_SYMBOL(do_env_free); +#endif + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +INIT_KSYM_MODULE(vzethdev); +INIT_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +struct ve_struct ve0 = { + .ve_list = LIST_HEAD_INIT(ve0.ve_list), + .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), + .start_jiffies = INITIAL_JIFFIES, +#ifdef CONFIG_NET + ._net_dev_tail = &ve0._net_dev_base, + .ifindex = -1, +#endif +#ifdef CONFIG_UNIX98_PTYS + .devpts_config = &devpts_config, 
+#endif + .ve_ns = &init_nsproxy, + .is_running = 1, +}; + +EXPORT_SYMBOL(ve0); + +#ifdef CONFIG_SMP +static struct percpu_data ve0_cpu_stats; +#endif +static struct ve_cpu_stats ve0_cpu_stats_data[NR_CPUS]; + +LIST_HEAD(ve_list_head); +rwlock_t ve_list_lock = RW_LOCK_UNLOCKED; + +LIST_HEAD(ve_cleanup_list); +DEFINE_SPINLOCK(ve_cleanup_lock); +struct task_struct *ve_cleanup_thread; + +EXPORT_SYMBOL(ve_list_lock); +EXPORT_SYMBOL(ve_list_head); +EXPORT_SYMBOL(ve_cleanup_lock); +EXPORT_SYMBOL(ve_cleanup_list); +EXPORT_SYMBOL(ve_cleanup_thread); + +void init_ve0(void) +{ + struct ve_struct *ve; + + ve = get_ve0(); + (void)get_ve(ve); + atomic_set(&ve->pcounter, 1); + + ve->cpu_stats = static_percpu_ptr(&ve0_cpu_stats, + ve0_cpu_stats_data); + + list_add(&ve->ve_list, &ve_list_head); +} + +void ve_cleanup_schedule(struct ve_struct *ve) +{ + BUG_ON(ve_cleanup_thread == NULL); + + spin_lock(&ve_cleanup_lock); + list_add_tail(&ve->cleanup_list, &ve_cleanup_list); + spin_unlock(&ve_cleanup_lock); + + wake_up_process(ve_cleanup_thread); +} diff -uprN linux-2.6.18/kernel/ve/vecalls.c linux-2.6.18.ovz/kernel/ve/vecalls.c --- linux-2.6.18/kernel/ve/vecalls.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/vecalls.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,2961 @@ +/* + * linux/kernel/ve/vecalls.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + */ + +/* + * 'vecalls.c' is a file with basic VE support. It provides basic primitives + * along with an initialization script + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_FAIRSCHED +#include +#endif + +#include +#include + +int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ +EXPORT_SYMBOL(nr_ve); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags); +static int alloc_ve_tty_drivers(struct ve_struct* ve); +static void free_ve_tty_drivers(struct ve_struct* ve); +static int register_ve_tty_drivers(struct ve_struct* ve); +static void unregister_ve_tty_drivers(struct ve_struct* ve); +static int init_ve_tty_drivers(struct ve_struct *); +static void fini_ve_tty_drivers(struct ve_struct *); +static void clear_termios(struct tty_driver* driver); +#ifdef CONFIG_INET +static void ve_mapped_devs_cleanup(struct ve_struct *ve); +#endif + +static void vecalls_exit(void); + +struct ve_struct *__find_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + + for_each_ve(ve) { + if (ve->veid == veid) + return ve; + } + return NULL; +} +EXPORT_SYMBOL(__find_ve_by_id); + +struct ve_struct *get_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + get_ve(ve); + read_unlock(&ve_list_lock); + return ve; +} +EXPORT_SYMBOL(get_ve_by_id); + +/* + * real_put_ve() MUST be used instead of put_ve() inside vecalls.
+ */ +void real_do_env_free(struct ve_struct *ve); +static inline void real_put_ve(struct ve_struct *ve) +{ + if (ve && atomic_dec_and_test(&ve->counter)) { + if (atomic_read(&ve->pcounter) > 0) + BUG(); + if (ve->is_running) + BUG(); + real_do_env_free(ve); + } +} + +static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) +{ + struct ve_struct *ve; + struct vz_cpu_stat *vstat; + int retval; + int i, cpu; + unsigned long tmp; + + if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) + return -EPERM; + if (veid == 0) + return -ESRCH; + + vstat = kzalloc(sizeof(*vstat), GFP_KERNEL); + if (!vstat) + return -ENOMEM; + + retval = -ESRCH; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + if (ve == NULL) + goto out_unlock; + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user); + vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice); + vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system); + vstat->idle_clk += __ve_sched_get_idle_time(ve, cpu); + } + vstat->uptime_clk = get_cycles() - ve->start_cycles; + vstat->uptime_jif = (unsigned long)cputime64_to_clock_t( + get_jiffies_64() - ve->start_jiffies); + for (i = 0; i < 3; i++) { + tmp = ve->avenrun[i] + (FIXED_1/200); + vstat->avenrun[i].val_int = LOAD_INT(tmp); + vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); + } + read_unlock(&ve_list_lock); + + retval = 0; + if (copy_to_user(buf, vstat, sizeof(*vstat))) + retval = -EFAULT; +out_free: + kfree(vstat); + return retval; + +out_unlock: + read_unlock(&ve_list_lock); + goto out_free; +} + +static int real_setdevperms(envid_t veid, unsigned type, + dev_t dev, unsigned mask) +{ + struct ve_struct *ve; + int err; + + if (!capable(CAP_SETVEID) || veid == 0) + return -EPERM; + + if ((ve = get_ve_by_id(veid)) == NULL) + return -ESRCH; + + down_read(&ve->op_sem); + err = -ESRCH; + if (ve->is_running) + err = set_device_perms_ve(veid, type, dev, mask); + up_read(&ve->op_sem); + real_put_ve(ve); + return err; +} + +/********************************************************************** + ********************************************************************** + * + * FS-related helpers to VE start/stop + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_SYSCTL +static inline int register_ve_sysctltables(struct ve_struct *ve) +{ + /* + * code to register kernel sysctl tables used to be here; + * it registered the utsname and ipc ones only, but since + * we now have namespaces for them both, nothing needs to be done + * here.
+ */ + return 0; +} + +static inline void unregister_ve_sysctltables(struct ve_struct *ve) +{ +} + +static inline void free_ve_sysctltables(struct ve_struct *ve) +{ +} +#endif + +/********************************************************************** + ********************************************************************** + * + * VE start: subsystems + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_INET +#include +#include +#include +#include + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int init_fini_ve_mibs6(struct ve_struct *ve, int fini) +{ + if (fini) + goto fini; + + if (!(ve->_ipv6_statistics[0] = alloc_percpu(struct ipstats_mib))) + goto out1; + if (!(ve->_ipv6_statistics[1] = alloc_percpu(struct ipstats_mib))) + goto out2; + if (!(ve->_icmpv6_statistics[0] = alloc_percpu(struct icmpv6_mib))) + goto out3; + if (!(ve->_icmpv6_statistics[1] = alloc_percpu(struct icmpv6_mib))) + goto out4; + if (!(ve->_udp_stats_in6[0] = alloc_percpu(struct udp_mib))) + goto out5; + if (!(ve->_udp_stats_in6[1] = alloc_percpu(struct udp_mib))) + goto out6; + return 0; + +fini: + free_percpu(ve->_udp_stats_in6[1]); +out6: + free_percpu(ve->_udp_stats_in6[0]); +out5: + free_percpu(ve->_icmpv6_statistics[1]); +out4: + free_percpu(ve->_icmpv6_statistics[0]); +out3: + free_percpu(ve->_ipv6_statistics[1]); +out2: + free_percpu(ve->_ipv6_statistics[0]); +out1: + return -ENOMEM; +} +#else +static int init_fini_ve_mibs6(struct ve_struct *ve, int fini) { return 0; } +#endif + +static int init_fini_ve_mibs(struct ve_struct *ve, int fini) +{ + if (fini) + goto fini; + + if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib))) + goto out1; + if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib))) + goto out2; + if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib))) + goto out3; + if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib))) + goto out4; + if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib))) + goto out5; + if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib))) + goto out6; + if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib))) + goto out7; + if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib))) + goto out8; + if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib))) + goto out9; + if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib))) + goto out10; + if (init_fini_ve_mibs6(ve, fini)) + goto out11; + return 0; + +fini: + init_fini_ve_mibs6(ve, fini); +out11: + free_percpu(ve->_udp_statistics[1]); +out10: + free_percpu(ve->_udp_statistics[0]); +out9: + free_percpu(ve->_tcp_statistics[1]); +out8: + free_percpu(ve->_tcp_statistics[0]); +out7: + free_percpu(ve->_icmp_statistics[1]); +out6: + free_percpu(ve->_icmp_statistics[0]); +out5: + free_percpu(ve->_ip_statistics[1]); +out4: + free_percpu(ve->_ip_statistics[0]); +out3: + free_percpu(ve->_net_statistics[1]); +out2: + free_percpu(ve->_net_statistics[0]); +out1: + return -ENOMEM; +} + +static inline int init_ve_mibs(struct ve_struct *ve) +{ + return init_fini_ve_mibs(ve, 0); +} + +static inline void fini_ve_mibs(struct ve_struct *ve) +{ + (void)init_fini_ve_mibs(ve, 1); +} + +static void veloop_setup(struct net_device *dev) +{ + int padded; + padded = dev->padded; + memcpy(dev, &templ_loopback_dev, sizeof(struct net_device)); + dev->padded = padded; +} + +static int init_ve_netdev(void) +{ + struct ve_struct *ve; + struct net_device_stats 
*stats; + int err; + + ve = get_exec_env(); + INIT_HLIST_HEAD(&ve->_net_dev_head); + ve->_net_dev_base = NULL; + ve->_net_dev_tail = &ve->_net_dev_base; + + err = -ENOMEM; + ve->_loopback_dev = alloc_netdev(0, templ_loopback_dev.name, + veloop_setup); + if (ve->_loopback_dev == NULL) + goto out; + + ve->_loopback_stats = alloc_percpu(struct net_device_stats); + if (ve->_loopback_stats == NULL) + goto out_free_netdev; + if (loopback_dev.get_stats != NULL) { + stats = kzalloc(sizeof(struct net_device_stats), GFP_KERNEL); + if (stats != NULL) { + ve->_loopback_dev->priv = stats; + ve->_loopback_dev->get_stats = loopback_dev.get_stats; + ve->_loopback_dev->destructor = loopback_dev.destructor; + } + } + err = register_netdev(ve->_loopback_dev); + if (err) + goto out_free_stats; + return 0; + +out_free_stats: + if (ve->_loopback_dev->priv != NULL) + kfree(ve->_loopback_dev->priv); + free_percpu(ve->_loopback_stats); +out_free_netdev: + free_netdev(ve->_loopback_dev); +out: + return err; +} + +static void fini_ve_netdev(void) +{ + struct ve_struct *ve; + struct net_device *dev; + + ve = get_exec_env(); + while (1) { + rtnl_lock(); + /* + * loopback is special, it can be referenced in fib's, + * so it must be freed the last. Doing so is + * sufficient to guarantee absence of such references. + */ + if (dev_base == ve->_loopback_dev) + dev = dev_base->next; + else + dev = dev_base; + if (dev == NULL) + break; + unregister_netdevice(dev); + rtnl_unlock(); + free_netdev(dev); + } + unregister_netdevice(ve->_loopback_dev); + rtnl_unlock(); + free_netdev(ve->_loopback_dev); + ve->_loopback_dev = NULL; + + free_percpu(ve->_loopback_stats); + ve->_loopback_stats = NULL; +} +#else +#define init_ve_mibs(ve) (0) +#define fini_ve_mibs(ve) do { } while (0) +#define init_ve_netdev() (0) +#define fini_ve_netdev() do { } while (0) +#endif + +static int prepare_proc_root(struct ve_struct *ve) +{ + struct proc_dir_entry *de; + + de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); + if (de == NULL) + return -ENOMEM; + + memcpy(de + 1, "/proc", 6); + de->name = (char *)(de + 1); + de->namelen = 5; + de->mode = S_IFDIR | S_IRUGO | S_IXUGO; + de->nlink = 2; + atomic_set(&de->count, 1); + + ve->proc_root = de; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int init_ve_proc(struct ve_struct *ve) +{ + int err; + struct proc_dir_entry *de; + + err = prepare_proc_root(ve); + if (err) + goto out_root; + + err = register_ve_fs_type(ve, &proc_fs_type, + &ve->proc_fstype, &ve->proc_mnt); + if (err) + goto out_reg; + + err = -ENOMEM; + de = create_proc_entry("kmsg", S_IRUSR, NULL); + if (!de) + goto out_kmsg; + de->proc_fops = &proc_kmsg_operations; + + /* create necessary /proc subdirs in VE local proc tree */ + err = -ENOMEM; + de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + if (!de) + goto out_vz; + + proc_net = proc_mkdir("net", NULL); + if (!proc_net) + goto out_net; + + if (ve_snmp_proc_init()) + goto out_snmp; + + return 0; + +out_snmp: + remove_proc_entry("net", NULL); +out_net: + remove_proc_entry("vz", NULL); +out_vz: + remove_proc_entry("kmsg", NULL); +out_kmsg: + unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); + ve->proc_mnt = NULL; +out_reg: + /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ + ; +out_root: + return err; +} + +static void fini_ve_proc(struct ve_struct *ve) +{ + ve_snmp_proc_fini(); + remove_proc_entry("net", NULL); + proc_net = NULL; + remove_proc_entry("vz", NULL); + remove_proc_entry("kmsg", NULL); + unregister_ve_fs_type(ve->proc_fstype, 
ve->proc_mnt); + ve->proc_mnt = NULL; +} + +static void free_ve_proc(struct ve_struct *ve) +{ + /* the proc filesystem frees proc_dir_entries only on remove_proc_entry(), + so we check that everything was removed and nothing was lost */ + if (ve->proc_root && ve->proc_root->subdir) { + struct proc_dir_entry *p = ve->proc_root; + printk(KERN_WARNING "VE: %d: proc entry /proc", ve->veid); + while ((p = p->subdir) != NULL) + printk("/%s", p->name); + printk(" is not removed!\n"); + } + + kfree(ve->proc_root); + kfree(ve->proc_fstype); + + ve->proc_fstype = NULL; + ve->proc_root = NULL; +} +#else +#define init_ve_proc(ve) (0) +#define fini_ve_proc(ve) do { } while (0) +#define free_ve_proc(ve) do { } while (0) +#endif + +#ifdef CONFIG_SYSCTL +static int init_ve_sysctl(struct ve_struct *ve) +{ + int err; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + ve->proc_sys_root = proc_mkdir("sys", NULL); + if (ve->proc_sys_root == NULL) + goto out_proc; +#endif + INIT_LIST_HEAD(&ve->sysctl_lh); + err = register_ve_sysctltables(ve); + if (err) + goto out_reg; + + err = devinet_sysctl_init(ve); + if (err) + goto out_dev; + + err = addrconf_sysctl_init(ve); + if (err) + goto out_dev6; + + return 0; + +out_dev6: + devinet_sysctl_fini(ve); +out_dev: + unregister_ve_sysctltables(ve); + free_ve_sysctltables(ve); +out_reg: +#ifdef CONFIG_PROC_FS + remove_proc_entry("sys", NULL); +out_proc: +#endif + return err; +} + +static void fini_ve_sysctl(struct ve_struct *ve) +{ + addrconf_sysctl_fini(ve); + devinet_sysctl_fini(ve); + unregister_ve_sysctltables(ve); + remove_proc_entry("sys", NULL); +} + +static void free_ve_sysctl(struct ve_struct *ve) +{ + addrconf_sysctl_free(ve); + devinet_sysctl_free(ve); + free_ve_sysctltables(ve); +} +#else +#define init_ve_sysctl(ve) (0) +#define fini_ve_sysctl(ve) do { } while (0) +#define free_ve_sysctl(ve) do { } while (0) +#endif + +#ifdef CONFIG_UNIX98_PTYS +#include + +/* + * DEVPTS needs virtualization: each environment should see its own list of + * pseudo-terminals. + * To implement it we need to have separate devpts superblocks for each + * VE, and each VE should mount its own one. + * Thus, separate vfsmount structures are required. + * To minimize intrusion into vfsmount lookup code, separate file_system_type + * structures are created. + * + * In addition to this, a patch for the character device itself is required, + * as the file system itself is used only for MINOR/MAJOR lookup.
+ */ + +static int init_ve_devpts(struct ve_struct *ve) +{ + int err; + + err = -ENOMEM; + ve->devpts_config = kzalloc(sizeof(struct devpts_config), GFP_KERNEL); + if (ve->devpts_config == NULL) + goto out; + + ve->devpts_config->mode = 0600; + err = register_ve_fs_type(ve, &devpts_fs_type, + &ve->devpts_fstype, &ve->devpts_mnt); + if (err) { + kfree(ve->devpts_config); + ve->devpts_config = NULL; + } +out: + return err; +} + +static void fini_ve_devpts(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); + /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->devpts_mnt = NULL; + kfree(ve->devpts_config); + ve->devpts_config = NULL; +} +#else +#define init_ve_devpts(ve) (0) +#define fini_ve_devpts(ve) do { } while (0) +#endif + +static int init_ve_shmem(struct ve_struct *ve) +{ + return register_ve_fs_type(ve, + &tmpfs_fs_type, + &ve->shmem_fstype, + &ve->shmem_mnt); +} + +static void fini_ve_shmem(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); + /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->shmem_mnt = NULL; +} + +static inline int init_ve_sysfs_root(struct ve_struct *ve) +{ + struct sysfs_dirent *sysfs_root; + + sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); + if (sysfs_root == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&sysfs_root->s_sibling); + INIT_LIST_HEAD(&sysfs_root->s_children); + sysfs_root->s_type = SYSFS_ROOT; + ve->sysfs_root = sysfs_root; + return 0; +} + +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) +static inline int init_ve_netclass(struct ve_struct *ve) +{ + struct class *nc; + int err; + + nc = kzalloc(sizeof(*nc), GFP_KERNEL); + if (!nc) + return -ENOMEM; + + nc->name = net_class.name; + nc->release = net_class.release; + nc->uevent = net_class.uevent; + + err = class_register(nc); + if (!err) { + ve->net_class = nc; + return 0; + } + kfree(nc); + return err; +} + +static inline void fini_ve_netclass(struct ve_struct *ve) +{ + class_unregister(ve->net_class); + kfree(ve->net_class); + ve->net_class = NULL; +} +#else +static inline int init_ve_netclass(struct ve_struct *ve) { return 0; } +static inline void fini_ve_netclass(struct ve_struct *ve) { ; } +#endif + +static int init_ve_sysfs(struct ve_struct *ve) +{ + struct subsystem *subsys; + int err; + +#ifdef CONFIG_SYSFS + err = 0; + if (ve->features & VE_FEATURE_SYSFS) { + err = init_ve_sysfs_root(ve); + if (err != 0) + goto out; + err = register_ve_fs_type(ve, + &sysfs_fs_type, + &ve->sysfs_fstype, + &ve->sysfs_mnt); + } + if (err != 0) + goto out_fs_type; +#endif + err = -ENOMEM; + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (subsys == NULL) + goto out_class_obj; + /* ick, this is ugly, the things we go through to keep from showing up + * in sysfs... */ + memcpy(&subsys->kset.kobj.name, &class_obj_subsys.kset.kobj.name, + sizeof(subsys->kset.kobj.name)); + subsys->kset.ktype = class_obj_subsys.kset.ktype; + subsys->kset.uevent_ops = class_obj_subsys.kset.uevent_ops; + subsystem_init(subsys); + if (!subsys->kset.subsys) + subsys->kset.subsys = subsys; + ve->class_obj_subsys = subsys; + + err = -ENOMEM; + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (subsys == NULL) + goto out_class_subsys; + /* ick, this is ugly, the things we go through to keep from showing up + * in sysfs... 
*/ + memcpy(&subsys->kset.kobj.name, &class_subsys.kset.kobj.name, + sizeof(subsys->kset.kobj.name)); + subsys->kset.ktype = class_subsys.kset.ktype; + subsys->kset.uevent_ops = class_subsys.kset.uevent_ops; + ve->class_subsys = subsys; + err = subsystem_register(subsys); + if (err != 0) + goto out_register; + + err = init_ve_netclass(ve); + if (err) + goto out_nc; + + ve->tty_class = init_ve_tty_class(); + if (IS_ERR(ve->tty_class)) { + err = PTR_ERR(ve->tty_class); + ve->tty_class = NULL; + goto out_tty_class_register; + } + + return err; + +out_tty_class_register: + fini_ve_netclass(ve); +out_nc: + subsystem_unregister(subsys); +out_register: + kfree(ve->class_subsys); +out_class_subsys: + kfree(ve->class_obj_subsys); +out_class_obj: +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +out_fs_type: + kfree(ve->sysfs_root); + ve->sysfs_root = NULL; +#endif + ve->class_subsys = NULL; + ve->class_obj_subsys = NULL; +out: + return err; +} + +static void fini_ve_sysfs(struct ve_struct *ve) +{ + fini_ve_tty_class(ve->tty_class); + fini_ve_netclass(ve); + subsystem_unregister(ve->class_subsys); + kfree(ve->class_subsys); + kfree(ve->class_obj_subsys); + + ve->class_subsys = NULL; + ve->class_obj_subsys = NULL; +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + ve->sysfs_mnt = NULL; + kfree(ve->sysfs_root); + ve->sysfs_root = NULL; + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +#endif +} + +static void free_ve_filesystems(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSFS + kfree(ve->sysfs_fstype); + ve->sysfs_fstype = NULL; +#endif + kfree(ve->shmem_fstype); + ve->shmem_fstype = NULL; + + kfree(ve->devpts_fstype); + ve->devpts_fstype = NULL; + + free_ve_proc(ve); +} + +static int init_printk(struct ve_struct *ve) +{ + struct ve_prep_printk { + wait_queue_head_t log_wait; + unsigned long log_start; + unsigned long log_end; + unsigned long logged_chars; + } *tmp; + + tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + init_waitqueue_head(&tmp->log_wait); + ve->_log_wait = &tmp->log_wait; + ve->_log_start = &tmp->log_start; + ve->_log_end = &tmp->log_end; + ve->_logged_chars = &tmp->logged_chars; + /* ve->log_buf will be initialized later by ve_log_init() */ + return 0; +} + +static void fini_printk(struct ve_struct *ve) +{ + /* + * there is no spinlock protection here because nobody can use + * log_buf at the moments when this code is called. + */ + kfree(ve->log_buf); + kfree(ve->_log_wait); +} + +static void fini_venet(struct ve_struct *ve) +{ +#ifdef CONFIG_INET + tcp_v4_kill_ve_sockets(ve); + ve_mapped_devs_cleanup(ve); + synchronize_net(); +#endif +} + +static int init_ve_sched(struct ve_struct *ve, unsigned int vcpus) +{ +#ifdef CONFIG_FAIRSCHED + int err; + + /* + * We refuse to switch to an already existing node since nodes + * keep a pointer to their ve_struct... 
+ */ + err = sys_fairsched_mknod(0, 1, ve->veid); + if (err < 0) { + printk(KERN_WARNING "Can't create fairsched node %d\n", + ve->veid); + return err; + } + err = sys_fairsched_vcpus(ve->veid, vcpus); + if (err) { + printk(KERN_WARNING "Can't set fairsched vcpus on node %d\n", + ve->veid); + goto cleanup; + } + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) { + printk(KERN_WARNING "Can't switch to fairsched node %d\n", + ve->veid); + goto cleanup; + } +#endif + ve_sched_attach(ve); + return 0; + +#ifdef CONFIG_FAIRSCHED +cleanup: + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't clean fairsched node %d\n", + ve->veid); + return err; +#endif +} + +static void fini_ve_sched(struct ve_struct *ve) +{ +#ifdef CONFIG_FAIRSCHED + if (task_vsched_id(current) == ve->veid) + if (sys_fairsched_mvpr(current->pid, fairsched_init_node.id)) + printk(KERN_WARNING "Can't leave fairsched node %d\n", + ve->veid); + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't remove fairsched node %d\n", + ve->veid); +#endif +} + +/* + * Namespaces + */ + +static inline int init_ve_namespaces(struct ve_struct *ve, + struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = get_nsproxy(tsk->nsproxy); + + err = copy_namespaces(CLONE_NAMESPACES_MASK, tsk); + if (err < 0) { + put_nsproxy(cur); + return err; + } + + ve->ve_ns = get_nsproxy(tsk->nsproxy); + memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release, + sizeof(virt_utsname.release)); + *old = cur; + return 0; +} + +static inline void fini_ve_namespaces(struct ve_struct *ve, + struct nsproxy *old) +{ + struct task_struct *tsk; + struct nsproxy *cur; + + if (old) { + tsk = current; + cur = tsk->nsproxy; + tsk->nsproxy = old; + put_nsproxy(cur); + } + + if (ve->ve_ns->ipc_ns) + shm_clean_ns(ve->ve_ns->ipc_ns); + + put_nsproxy(ve->ve_ns); + ve->ve_ns = NULL; +} + +static inline void switch_ve_namespaces(struct ve_struct *ve, + struct task_struct *tsk) +{ + struct nsproxy *old_ns; + struct nsproxy *new_ns; + + BUG_ON(tsk != current); + old_ns = tsk->nsproxy; + new_ns = ve->ve_ns; + + if (old_ns != new_ns) { + tsk->nsproxy = get_nsproxy(new_ns); + put_nsproxy(old_ns); + } +} + +static __u64 get_ve_features(env_create_param_t *data, int datalen) +{ + __u64 known_features; + + if (datalen < sizeof(struct env_create_param3)) + /* this version of vzctl is aware of VE_FEATURES_OLD only */ + known_features = VE_FEATURES_OLD; + else + known_features = data->known_features; + + /* + * known features are set as required + * yet unknown features are set as in VE_FEATURES_DEF + */ + return (data->feature_mask & known_features) | + (VE_FEATURES_DEF & ~known_features); +} + +static int init_ve_struct(struct ve_struct *ve, envid_t veid, + u32 class_id, env_create_param_t *data, int datalen, + struct task_struct *init_tsk) +{ + int n; + + (void)get_ve(ve); + ve->veid = veid; + ve->class_id = class_id; + ve->init_entry = init_tsk; + ve->features = get_ve_features(data, datalen); + INIT_LIST_HEAD(&ve->vetask_lh); + init_rwsem(&ve->op_sem); +#ifdef CONFIG_NET + ve->ifindex = -1; +#endif + + for(n = 0; n < UIDHASH_SZ_VE; ++n) + INIT_LIST_HEAD(&ve->uidhash_table[n]); + + ve->start_timespec = ve->init_entry->start_time; + /* The value is wrong, but it is never compared to process + * start times */ + ve->start_jiffies = get_jiffies_64(); + ve->start_cycles = get_cycles(); + ve->virt_pids = glob_virt_pids; + + return 0; +} + +/********************************************************************** 
+ ********************************************************************** + * + * /proc/meminfo virtualization + * + ********************************************************************** + **********************************************************************/ +static int ve_set_meminfo(envid_t veid, unsigned long val) +{ +#ifdef CONFIG_USER_RESOURCE + struct ve_struct *ve; + + ve = get_ve_by_id(veid); + if (!ve) + return -EINVAL; + + ve->meminfo_val = val; + real_put_ve(ve); + return 0; +#else + return -ENOTTY; +#endif +} + +static int init_ve_meminfo(struct ve_struct *ve) +{ + ve->meminfo_val = 0; + return 0; +} + +static inline void fini_ve_meminfo(struct ve_struct *ve) +{ +} + +static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) +{ + read_lock(&tsk->fs->lock); + ve->fs_rootmnt = tsk->fs->rootmnt; + ve->fs_root = tsk->fs->root; + read_unlock(&tsk->fs->lock); + mark_tree_virtual(ve->fs_rootmnt, ve->fs_root); +} + +static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) +{ + /* required for real_setdevperms from register_ve_ above */ + memcpy(&ve->ve_cap_bset, &tsk->cap_effective, sizeof(kernel_cap_t)); + cap_lower(ve->ve_cap_bset, CAP_SETVEID); +} + +static int ve_list_add(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + if (__find_ve_by_id(ve->veid) != NULL) + goto err_exists; + + list_add(&ve->ve_list, &ve_list_head); + nr_ve++; + write_unlock_irq(&ve_list_lock); + return 0; + +err_exists: + write_unlock_irq(&ve_list_lock); + return -EEXIST; +} + +static void ve_list_del(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + list_del(&ve->ve_list); + nr_ve--; + write_unlock_irq(&ve_list_lock); +} + +static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) +{ + spin_lock(&task_capability_lock); + cap_mask(tsk->cap_effective, ve->ve_cap_bset); + cap_mask(tsk->cap_inheritable, ve->ve_cap_bset); + cap_mask(tsk->cap_permitted, ve->ve_cap_bset); + spin_unlock(&task_capability_lock); +} + +void ve_move_task(struct task_struct *tsk, struct ve_struct *new) +{ + struct ve_struct *old; + + might_sleep(); + BUG_ON(tsk != current); + BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk))); + + /* this prohibits ptracing of a task that entered a VE from the host system */ + tsk->mm->vps_dumpable = 0; + /* setup capabilities before enter */ + set_task_ve_caps(tsk, new); + + old = tsk->ve_task_info.owner_env; + tsk->ve_task_info.owner_env = new; + tsk->ve_task_info.exec_env = new; + + write_lock_irq(&tasklist_lock); + list_del_rcu(&tsk->ve_task_info.vetask_list); + write_unlock_irq(&tasklist_lock); + + synchronize_rcu(); + + write_lock_irq(&tasklist_lock); + list_add_tail_rcu(&tsk->ve_task_info.vetask_list, + &new->vetask_lh); + write_unlock_irq(&tasklist_lock); + + atomic_dec(&old->pcounter); + real_put_ve(old); + + atomic_inc(&new->pcounter); + get_ve(new); +} + +EXPORT_SYMBOL(ve_move_task); + +#ifdef CONFIG_VE_IPTABLES +extern int init_netfilter(void); +extern void fini_netfilter(void); +#define init_ve_netfilter() init_netfilter() +#define fini_ve_netfilter() fini_netfilter() + +#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ +({ \ + int ret = 0; \ + if (VE_IPT_CMP(mask, full_mask) && \ + VE_IPT_CMP((ve)->_iptables_modules, \ + full_mask & ~(full_mask##_MOD))) { \ + ret = KSYMERRCALL(1, mod, name, args); \ + if (ret == 0) \ + (ve)->_iptables_modules |= \ + full_mask##_MOD; \ + if (ret == 1) \ + ret = 0; \ + } \ + ret; \ +}) + +#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ +({ \ + if (VE_IPT_CMP(mask,
full_mask##_MOD)) \ + KSYMSAFECALL_VOID(mod, name, args); \ +}) + + +static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, + int init_or_cleanup) +{ + int err; + + /* Remove when userspace will start supplying IPv6-related bits. */ + init_mask &= ~VE_IP_IPTABLES6; + init_mask &= ~VE_IP_FILTER6; + init_mask &= ~VE_IP_MANGLE6; + init_mask &= ~VE_IP_IPTABLE_NAT_MOD; + if ((init_mask & VE_IP_IPTABLES) == VE_IP_IPTABLES) + init_mask |= VE_IP_IPTABLES6; + if ((init_mask & VE_IP_FILTER) == VE_IP_FILTER) + init_mask |= VE_IP_FILTER6; + if ((init_mask & VE_IP_MANGLE) == VE_IP_MANGLE) + init_mask |= VE_IP_MANGLE6; + if ((init_mask & VE_IP_NAT) == VE_IP_NAT) + init_mask |= VE_IP_IPTABLE_NAT; + + err = 0; + if (!init_or_cleanup) + goto cleanup; + + /* init part */ +#if defined(CONFIG_IP_NF_IPTABLES) || \ + defined(CONFIG_IP_NF_IPTABLES_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, + ip_tables, init_iptables, ()); + if (err < 0) + goto err_iptables; +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || \ + defined(CONFIG_IP6_NF_IPTABLES_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES6, + ip6_tables, init_ip6tables, ()); + if (err < 0) + goto err_ip6tables; +#endif +#if defined(CONFIG_IP_NF_CONNTRACK) || \ + defined(CONFIG_IP_NF_CONNTRACK_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, + ip_conntrack, init_iptable_conntrack, ()); + if (err < 0) + goto err_iptable_conntrack; +#endif +#if defined(CONFIG_IP_NF_FTP) || \ + defined(CONFIG_IP_NF_FTP_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_FTP, + ip_conntrack_ftp, init_ip_ct_ftp, ()); + if (err < 0) + goto err_iptable_ftp; +#endif +#if defined(CONFIG_IP_NF_IRC) || \ + defined(CONFIG_IP_NF_IRC_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_IRC, + ip_conntrack_irc, init_ip_ct_irc, ()); + if (err < 0) + goto err_iptable_irc; +#endif +#if defined(CONFIG_IP_NF_NAT) || \ + defined(CONFIG_IP_NF_NAT_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, + ip_nat, ip_nat_init, ()); + if (err < 0) + goto err_iptable_nat; + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLE_NAT, + iptable_nat, init_iptable_nat, ()); + if (err < 0) + goto err_iptable_nat2; +#endif +#if defined(CONFIG_IP_NF_NAT_FTP) || \ + defined(CONFIG_IP_NF_NAT_FTP_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_FTP, + ip_nat_ftp, init_iptable_nat_ftp, ()); + if (err < 0) + goto err_iptable_nat_ftp; +#endif +#if defined(CONFIG_IP_NF_NAT_IRC) || \ + defined(CONFIG_IP_NF_NAT_IRC_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_IRC, + ip_nat_irc, init_iptable_nat_irc, ()); + if (err < 0) + goto err_iptable_nat_irc; +#endif +#if defined(CONFIG_IP_NF_FILTER) || \ + defined(CONFIG_IP_NF_FILTER_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, + iptable_filter, init_iptable_filter, ()); + if (err < 0) + goto err_iptable_filter; +#endif +#if defined(CONFIG_IP6_NF_FILTER) || \ + defined(CONFIG_IP6_NF_FILTER_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER6, + ip6table_filter, init_ip6table_filter, ()); + if (err < 0) + goto err_ip6table_filter; +#endif +#if defined(CONFIG_IP_NF_MANGLE) || \ + defined(CONFIG_IP_NF_MANGLE_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, + iptable_mangle, init_iptable_mangle, ()); + if (err < 0) + goto err_iptable_mangle; +#endif +#if defined(CONFIG_IP6_NF_MANGLE) || \ + defined(CONFIG_IP6_NF_MANGLE_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE6, + ip6table_mangle, init_ip6table_mangle, ()); + if (err < 0) + goto err_ip6table_mangle; +#endif + return 0; + +/* 
------------------------------------------------------------------------- */ + +cleanup: +#if defined(CONFIG_IP6_NF_MANGLE) || \ + defined(CONFIG_IP6_NF_MANGLE_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE6, + ip6table_mangle, fini_ip6table_mangle, ()); +err_ip6table_mangle: +#endif +#if defined(CONFIG_IP_NF_MANGLE) || \ + defined(CONFIG_IP_NF_MANGLE_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, + iptable_mangle, fini_iptable_mangle, ()); +err_iptable_mangle: +#endif +#if defined(CONFIG_IP6_NF_FILTER) || \ + defined(CONFIG_IP6_NF_FILTER_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER6, + ip6table_filter, fini_ip6table_filter, ()); +err_ip6table_filter: +#endif +#if defined(CONFIG_IP_NF_FILTER) || \ + defined(CONFIG_IP_NF_FILTER_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, + iptable_filter, fini_iptable_filter, ()); +err_iptable_filter: +#endif +#if defined(CONFIG_IP_NF_NAT_IRC) || \ + defined(CONFIG_IP_NF_NAT_IRC_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_IRC, + ip_nat_irc, fini_iptable_nat_irc, ()); +err_iptable_nat_irc: +#endif +#if defined(CONFIG_IP_NF_NAT_FTP) || \ + defined(CONFIG_IP_NF_NAT_FTP_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_FTP, + ip_nat_ftp, fini_iptable_nat_ftp, ()); +err_iptable_nat_ftp: +#endif +#if defined(CONFIG_IP_NF_NAT) || \ + defined(CONFIG_IP_NF_NAT_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLE_NAT, + iptable_nat, fini_iptable_nat, ()); +err_iptable_nat2: + KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, + ip_nat, ip_nat_cleanup, ()); +err_iptable_nat: +#endif +#if defined(CONFIG_IP_NF_IRC) || \ + defined(CONFIG_IP_NF_IRC_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_IRC, + ip_conntrack_irc, fini_ip_ct_irc, ()); +err_iptable_irc: +#endif +#if defined(CONFIG_IP_NF_FTP) || \ + defined(CONFIG_IP_NF_FTP_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_FTP, + ip_conntrack_ftp, fini_ip_ct_ftp, ()); +err_iptable_ftp: +#endif +#if defined(CONFIG_IP_NF_CONNTRACK) || \ + defined(CONFIG_IP_NF_CONNTRACK_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, + ip_conntrack, fini_iptable_conntrack, ()); +err_iptable_conntrack: +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || \ + defined(CONFIG_IP6_NF_IPTABLES_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES6, + ip6_tables, fini_ip6tables, ()); +err_ip6tables: +#endif +#if defined(CONFIG_IP_NF_IPTABLES) || \ + defined(CONFIG_IP_NF_IPTABLES_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, + ip_tables, fini_iptables, ()); +err_iptables: +#endif + ve->_iptables_modules = 0; + + return err; +} + +static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + return do_ve_iptables(ve, init_mask, 1); +} + +static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + (void)do_ve_iptables(ve, init_mask, 0); +} + +#else +#define init_ve_iptables(x, y) (0) +#define fini_ve_iptables(x, y) do { } while (0) +#define init_ve_netfilter() (0) +#define fini_ve_netfilter() do { } while (0) +#endif + +static inline int init_ve_cpustats(struct ve_struct *ve) +{ + ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); + return ve->cpu_stats == NULL ? 
-ENOMEM : 0; +} + +static inline void free_ve_cpustats(struct ve_struct *ve) +{ + free_percpu(ve->cpu_stats); + ve->cpu_stats = NULL; +} + +static int alone_in_pgrp(struct task_struct *tsk) +{ + struct task_struct *p; + int alone = 0; + + read_lock(&tasklist_lock); + do_each_task_pid_all(tsk->pid, PIDTYPE_PGID, p) { + if (p != tsk) + goto out; + } while_each_task_pid_all(tsk->pid, PIDTYPE_PGID, p); + do_each_task_pid_all(tsk->pid, PIDTYPE_SID, p) { + if (p != tsk) + goto out; + } while_each_task_pid_all(tsk->pid, PIDTYPE_SID, p); + alone = 1; +out: + read_unlock(&tasklist_lock); + return alone; +} + +static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + struct task_struct *tsk; + struct ve_struct *old; + struct ve_struct *old_exec; + struct ve_struct *ve; + __u64 init_mask; + int err; + struct nsproxy *old_ns; + + tsk = current; + old = VE_TASK_INFO(tsk)->owner_env; + + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + return -EINVAL; + + if (tsk->signal->tty) { + printk("ERR: VE init has controlling terminal\n"); + return -EINVAL; + } + if (tsk->signal->pgrp != tsk->pid || + tsk->signal->session != tsk->pid) { + int may_setsid; + + read_lock(&tasklist_lock); + may_setsid = !tsk->signal->leader && + !find_task_by_pid_type_all(PIDTYPE_PGID, tsk->pid); + read_unlock(&tasklist_lock); + + if (!may_setsid) { + printk("ERR: VE init is process group leader\n"); + return -EINVAL; + } + } + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID and must fail. */ + if (!alone_in_pgrp(tsk)) { + printk("ERR: VE init is not alone in process group\n"); + return -EINVAL; + } + + + VZTRACE("%s: veid=%d classid=%d pid=%d\n", + __FUNCTION__, veid, class_id, current->pid); + + err = -ENOMEM; + ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL); + if (ve == NULL) + goto err_struct; + + init_ve_struct(ve, veid, class_id, data, datalen, tsk); + __module_get(THIS_MODULE); + down_write(&ve->op_sem); + if (flags & VE_LOCK) + ve->is_locked = 1; + + /* + * this should be done before adding to list + * because if calc_load_ve finds this ve in + * list it will be very surprised + */ + if ((err = init_ve_cpustats(ve)) < 0) + goto err_cpu_stats; + + if ((err = ve_list_add(ve)) < 0) + goto err_exist; + + /* this should be done before context switching */ + if ((err = init_printk(ve)) < 0) + goto err_log_wait; + + old_exec = set_exec_env(ve); + + if ((err = init_ve_sched(ve, data->total_vcpus)) < 0) + goto err_sched; + + /* move user to VE */ + if ((err = set_user(0, 0)) < 0) + goto err_set_user; + + set_ve_root(ve, tsk); + + if ((err = init_ve_namespaces(ve, &old_ns))) + goto err_ns; + + if ((err = init_ve_mibs(ve))) + goto err_mibs; + + if ((err = init_ve_proc(ve))) + goto err_proc; + + if ((err = init_ve_sysctl(ve))) + goto err_sysctl; + + if ((err = init_ve_sysfs(ve))) + goto err_sysfs; + + if ((err = ve_arp_init(ve)) < 0) + goto err_route; + if ((err = ve_ndisc_init(ve)) < 0) + goto err_route; + + if ((err = init_ve_route(ve)) < 0) + goto err_route; + + if ((err = init_ve_route6(ve)) < 0) + goto err_route; + + if ((err = init_ve_netdev())) + goto err_dev; + + if ((err = init_ve_tty_drivers(ve)) < 0) + goto err_tty; + + if ((err = init_ve_shmem(ve))) + goto err_shmem; + + if ((err = init_ve_devpts(ve))) + goto err_devpts; + + if((err = init_ve_meminfo(ve))) + goto err_meminf; + + set_ve_caps(ve, tsk); + + /* It is safe to initialize netfilter here as routing initialization and + 
interface setup will be done below. This means that NO skb can be + passed inside. Den */ + /* iptables ve initialization for non ve0; + ve0 init is in module_init */ + if ((err = init_ve_netfilter()) < 0) + goto err_netfilter; + + init_mask = data ? data->iptables_mask : VE_IP_DEFAULT; + if ((err = init_ve_iptables(ve, init_mask)) < 0) + goto err_iptables; + + if ((err = alloc_vpid(tsk->pids[PIDTYPE_PID].pid, 1)) < 0) + goto err_vpid; + + if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0) + goto err_ve_hook; + + put_nsproxy(old_ns); + + /* finally: set vpids and move inside */ + ve_move_task(tsk, ve); + + set_virt_pid(tsk, 1); + set_virt_tgid(tsk, 1); + + set_special_pids(tsk->pid, tsk->pid); + current->signal->tty_old_pgrp = 0; + set_virt_pgid(tsk, 1); + set_virt_sid(tsk, 1); + + ve->is_running = 1; + up_write(&ve->op_sem); + + printk(KERN_INFO "VE: %d: started\n", veid); + return veid; + +err_ve_hook: + free_vpid(tsk->pids[PIDTYPE_PID].pid); +err_vpid: + fini_venet(ve); + fini_ve_iptables(ve, init_mask); +err_iptables: + fini_ve_netfilter(); +err_netfilter: + fini_ve_meminfo(ve); +err_meminf: + fini_ve_devpts(ve); +err_devpts: + fini_ve_shmem(ve); +err_shmem: + fini_ve_tty_drivers(ve); +err_tty: + fini_ve_netdev(); +err_dev: + fini_ve_route(ve); + fini_ve_route6(ve); +err_route: + ve_ndisc_fini(ve); + ve_arp_fini(ve); + fini_ve_sysfs(ve); +err_sysfs: + fini_ve_sysctl(ve); +err_sysctl: + fini_ve_proc(ve); +err_proc: + clean_device_perms_ve(ve->veid); + fini_ve_mibs(ve); +err_mibs: + /* free_ve_utsname() is called inside real_put_ve() */ ; + fini_ve_namespaces(ve, old_ns); +err_ns: + /* It is safe to restore current->envid here because + * ve_fairsched_detach does not use current->envid. */ + /* Really fairsched code uses current->envid in sys_fairsched_mknod + * only. It is correct if sys_fairsched_mknod is called from + * userspace. If sys_fairsched_mknod is called from + * ve_fairsched_attach, then node->envid and node->parent_node->envid + * are explicitly set to valid value after the call. 
*/ + /* FIXME */ + VE_TASK_INFO(tsk)->owner_env = old; + VE_TASK_INFO(tsk)->exec_env = old_exec; + /* move user back */ + if (set_user(0, 0) < 0) + printk(KERN_WARNING "Can't restore UID\n"); + +err_set_user: + fini_ve_sched(ve); +err_sched: + (void)set_exec_env(old_exec); + + /* we can jump here with an incorrect envid */ + VE_TASK_INFO(tsk)->owner_env = old; + fini_printk(ve); +err_log_wait: + /* cpustats will be freed in do_env_free */ + ve_list_del(ve); + up_write(&ve->op_sem); + + real_put_ve(ve); +err_struct: + printk(KERN_INFO "VE: %d: failed to start with err=%d\n", veid, err); + return err; + +err_exist: + free_ve_cpustats(ve); +err_cpu_stats: + kfree(ve); + goto err_struct; +} + + +/********************************************************************** + ********************************************************************** + * + * VE start/stop callbacks + * + ********************************************************************** + **********************************************************************/ + +int real_env_create(envid_t veid, unsigned flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + int status; + struct ve_struct *ve; + + if (!flags) { + status = get_exec_env()->veid; + goto out; + } + + status = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + status = -EINVAL; + if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) + goto out; + + status = -EINVAL; + ve = get_ve_by_id(veid); + if (ve) { + if (flags & VE_TEST) { + status = 0; + goto out_put; + } + if (flags & VE_EXCLUSIVE) { + status = -EACCES; + goto out_put; + } + if (flags & VE_CREATE) { + flags &= ~VE_CREATE; + flags |= VE_ENTER; + } + } else { + if (flags & (VE_TEST|VE_ENTER)) { + status = -ESRCH; + goto out; + } + } + + if (flags & VE_CREATE) { + status = do_env_create(veid, flags, class_id, data, datalen); + goto out; + } else if (flags & VE_ENTER) + status = do_env_enter(ve, flags); + + /* else: returning EINVAL */ + +out_put: + real_put_ve(ve); +out: + return status; +} +EXPORT_SYMBOL(real_env_create); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags) +{ + struct task_struct *tsk = current; + int err; + + VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); + + err = -EBUSY; + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_up; + if (ve->is_locked && !(flags & VE_SKIPLOCK)) + goto out_up; + err = -EINVAL; + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + goto out_up; + +#ifdef CONFIG_FAIRSCHED + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) + goto out_up; +#endif + ve_sched_attach(ve); + switch_ve_namespaces(ve, tsk); + ve_move_task(current, ve); + + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID. Do not fail, just leave + * it non-virtual. + */ + if (!is_virtual_pid(virt_pid(tsk)) && alone_in_pgrp(tsk)) { + pid_t vpid = alloc_vpid(tsk->pids[PIDTYPE_PID].pid, -1); + if (vpid > 0) { + set_virt_pid(tsk, vpid); + set_virt_tgid(tsk, vpid); + if (tsk->signal->pgrp == tsk->pid) + set_virt_pgid(tsk, vpid); + if (tsk->signal->session == tsk->pid) + set_virt_sid(tsk, vpid); + } + } + /* Unlike VE_CREATE, we do not setsid() in VE_ENTER. + * The process is allowed to be in an external group/session. + * If the userspace caller wants, it can do setsid() after + * VE_ENTER.
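+ *
+ * (Editor's illustrative note, not part of the original patch: a
+ * userspace caller that does want a fresh session inside the VE
+ * would follow a successful enter with setsid(), e.g.
+ *
+ *	if (ioctl(vzfd, VZCTL_ENV_CREATE, &arg) >= 0)
+ *		setsid();
+ *
+ * where vzfd, arg and the VZCTL_ENV_CREATE request are assumptions
+ * about the vzctl ioctl API and are not defined in this file.)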
+ */ + err = VE_TASK_INFO(tsk)->owner_env->veid; + +out_up: + up_read(&ve->op_sem); + return err; +} + +static void env_cleanup(struct ve_struct *ve) +{ + struct ve_struct *old_ve; + + VZTRACE("real_do_env_cleanup\n"); + + down_read(&ve->op_sem); + old_ve = set_exec_env(ve); + + ve_hook_iterate_fini(VE_SS_CHAIN, ve); + + fini_venet(ve); + + /* no new packets in flight beyond this point */ + /* skb hold dst_entry, and in turn lies in the ip fragment queue */ + ip_fragment_cleanup(ve); + + fini_ve_netdev(); + fini_ve_route(ve); + fini_ve_route6(ve); + + ve_arp_fini(ve); + ve_ndisc_fini(ve); + + fini_ve_namespaces(ve, NULL); + + /* kill iptables */ + /* No skb belonging to VE can exist at this point as unregister_netdev + is an operation awaiting until ALL skb's gone */ + fini_ve_iptables(ve, ve->_iptables_modules); + fini_ve_netfilter(); + + fini_ve_sched(ve); + clean_device_perms_ve(ve->veid); + + fini_ve_devpts(ve); + fini_ve_shmem(ve); + fini_ve_sysfs(ve); + unregister_ve_tty_drivers(ve); + fini_ve_sysctl(ve); + fini_ve_proc(ve); + fini_ve_meminfo(ve); + + fini_ve_mibs(ve); + + (void)set_exec_env(old_ve); + fini_printk(ve); /* no printk can happen in ve context anymore */ + + ve_list_del(ve); + up_read(&ve->op_sem); + + real_put_ve(ve); +} + +static DECLARE_COMPLETION(vzmond_complete); +static volatile int stop_vzmond; + +static int vzmond_helper(void *arg) +{ + char name[18]; + struct ve_struct *ve; + + ve = (struct ve_struct *)arg; + snprintf(name, sizeof(name), "vzmond/%d", ve->veid); + daemonize(name); + env_cleanup(ve); + module_put_and_exit(0); +} + +static void do_pending_env_cleanups(void) +{ + int err; + struct ve_struct *ve; + + spin_lock(&ve_cleanup_lock); + while (1) { + if (list_empty(&ve_cleanup_list) || need_resched()) + break; + + ve = list_first_entry(&ve_cleanup_list, + struct ve_struct, cleanup_list); + list_del(&ve->cleanup_list); + spin_unlock(&ve_cleanup_lock); + + __module_get(THIS_MODULE); + err = kernel_thread(vzmond_helper, (void *)ve, 0); + if (err < 0) { + env_cleanup(ve); + module_put(THIS_MODULE); + } + + spin_lock(&ve_cleanup_lock); + } + spin_unlock(&ve_cleanup_lock); +} + +static inline int have_pending_cleanups(void) +{ + return !list_empty(&ve_cleanup_list); +} + +static int vzmond(void *arg) +{ + daemonize("vzmond"); + set_current_state(TASK_INTERRUPTIBLE); + + while (!stop_vzmond || have_pending_cleanups()) { + schedule(); + try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + + do_pending_env_cleanups(); + set_current_state(TASK_INTERRUPTIBLE); + if (have_pending_cleanups()) + __set_current_state(TASK_RUNNING); + } + + __set_task_state(current, TASK_RUNNING); + complete_and_exit(&vzmond_complete, 0); +} + +static int __init init_vzmond(void) +{ + int pid; + struct task_struct *tsk; + + pid = kernel_thread(vzmond, NULL, 0); + if (pid > 0) { + tsk = find_task_by_pid_all(pid); + BUG_ON(tsk == NULL); + ve_cleanup_thread = tsk; + } + return pid; +} + +static void fini_vzmond(void) +{ + stop_vzmond = 1; + wake_up_process(ve_cleanup_thread); + wait_for_completion(&vzmond_complete); + ve_cleanup_thread = NULL; + WARN_ON(!list_empty(&ve_cleanup_list)); +} + +void real_do_env_free(struct ve_struct *ve) +{ + VZTRACE("real_do_env_free\n"); + + free_ve_tty_drivers(ve); + free_ve_sysctl(ve); /* free per ve sysctl data */ + free_ve_filesystems(ve); + free_ve_cpustats(ve); + printk(KERN_INFO "VE: %d: stopped\n", VEID(ve)); + kfree(ve); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(real_do_env_free); + + 
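+
+/*
+ * (Editor's illustrative summary, not part of the original patch.)
+ * VE destruction is a two-stage affair: a dying VE is queued for
+ * asynchronous teardown and reaped by the vzmond daemon:
+ *
+ *	ve_cleanup_schedule(ve)
+ *	    queues ve on ve_cleanup_list under ve_cleanup_lock and
+ *	    wakes ve_cleanup_thread;
+ *	vzmond()
+ *	    -> do_pending_env_cleanups()
+ *	        -> kernel_thread(vzmond_helper)     one helper per VE
+ *	            -> env_cleanup(ve)              unwinds the init_ve_* steps
+ *	                -> real_put_ve(ve)
+ *	                    -> real_do_env_free(ve) on the last reference
+ *
+ * so the final kfree(ve) happens only after both the reference
+ * counter ("counter") and the process counter ("pcounter") have
+ * dropped to zero.
+ */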
+/********************************************************************** + ********************************************************************** + * + * VE TTY handling + * + ********************************************************************** + **********************************************************************/ + +static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, + struct ve_struct *ve) +{ + size_t size; + struct tty_driver *driver; + + driver = ub_kmalloc(sizeof(struct tty_driver), GFP_KERNEL); + if (!driver) + goto out; + + memcpy(driver, base, sizeof(struct tty_driver)); + + driver->driver_state = NULL; + + size = base->num * 3 * sizeof(void *); + if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { + void **p; + p = ub_kzalloc(size, GFP_KERNEL); + if (!p) + goto out_free; + + driver->ttys = (struct tty_struct **)p; + driver->termios = (struct termios **)(p + driver->num); + driver->termios_locked = (struct termios **) + (p + driver->num * 2); + } else { + driver->ttys = NULL; + driver->termios = NULL; + driver->termios_locked = NULL; + } + + driver->owner_env = ve; + driver->flags |= TTY_DRIVER_INSTALLED; + driver->refcount = 0; + + return driver; + +out_free: + kfree(driver); +out: + return NULL; +} + +static void free_ve_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + + clear_termios(driver); + kfree(driver->ttys); + kfree(driver); +} + +static int alloc_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + /* Traditional BSD devices */ + ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); + if (!ve->pty_driver) + goto out_mem; + + ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); + if (!ve->pty_slave_driver) + goto out_mem; + + ve->pty_driver->other = ve->pty_slave_driver; + ve->pty_slave_driver->other = ve->pty_driver; +#endif + +#ifdef CONFIG_UNIX98_PTYS + ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); + if (!ve->ptm_driver) + goto out_mem; + + ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); + if (!ve->pts_driver) + goto out_mem; + + ve->ptm_driver->other = ve->pts_driver; + ve->pts_driver->other = ve->ptm_driver; + + ve->allocated_ptys = ub_kmalloc(sizeof(*ve->allocated_ptys), + GFP_KERNEL); + if (!ve->allocated_ptys) + goto out_mem; + idr_init(ve->allocated_ptys); +#endif + return 0; + +out_mem: + free_ve_tty_drivers(ve); + return -ENOMEM; +} + +static void free_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + free_ve_tty_driver(ve->pty_driver); + free_ve_tty_driver(ve->pty_slave_driver); + ve->pty_driver = ve->pty_slave_driver = NULL; +#endif +#ifdef CONFIG_UNIX98_PTYS + free_ve_tty_driver(ve->ptm_driver); + free_ve_tty_driver(ve->pts_driver); + kfree(ve->allocated_ptys); + ve->ptm_driver = ve->pts_driver = NULL; + ve->allocated_ptys = NULL; +#endif +} + +static inline void __register_tty_driver(struct tty_driver *driver) +{ + list_add(&driver->tty_drivers, &tty_drivers); +} + +static inline void __unregister_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + list_del(&driver->tty_drivers); +} + +static int register_ve_tty_drivers(struct ve_struct* ve) +{ + write_lock_irq(&tty_driver_guard); +#ifdef CONFIG_UNIX98_PTYS + __register_tty_driver(ve->ptm_driver); + __register_tty_driver(ve->pts_driver); +#endif +#ifdef CONFIG_LEGACY_PTYS + __register_tty_driver(ve->pty_driver); + __register_tty_driver(ve->pty_slave_driver); +#endif + write_unlock_irq(&tty_driver_guard); + + return 0; +} + +static void unregister_ve_tty_drivers(struct ve_struct* ve) +{ + 
VZTRACE("unregister_ve_tty_drivers\n"); + + write_lock_irq(&tty_driver_guard); +#ifdef CONFIG_LEGACY_PTYS + __unregister_tty_driver(ve->pty_driver); + __unregister_tty_driver(ve->pty_slave_driver); +#endif +#ifdef CONFIG_UNIX98_PTYS + __unregister_tty_driver(ve->ptm_driver); + __unregister_tty_driver(ve->pts_driver); +#endif + write_unlock_irq(&tty_driver_guard); +} + +static int init_ve_tty_drivers(struct ve_struct *ve) +{ + int err; + + if ((err = alloc_ve_tty_drivers(ve))) + goto err_ttyalloc; + if ((err = register_ve_tty_drivers(ve))) + goto err_ttyreg; + return 0; + +err_ttyreg: + free_ve_tty_drivers(ve); +err_ttyalloc: + return err; +} + +static void fini_ve_tty_drivers(struct ve_struct *ve) +{ + unregister_ve_tty_drivers(ve); + free_ve_tty_drivers(ve); +} + +/* + * Free the termios and termios_locked structures because + * we don't want to get memory leaks when modular tty + * drivers are removed from the kernel. + */ +static void clear_termios(struct tty_driver *driver) +{ + int i; + struct termios *tp; + + if (driver->termios == NULL) + return; + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + } +} + + +/********************************************************************** + ********************************************************************** + * + * Pieces of VE network + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_NET +#include +#include +#include +#include +#include +#include +#endif + +#ifdef CONFIG_INET +static void ve_del_ip_addrs(struct net_device *dev) +{ + struct in_device *in_dev; + + in_dev = in_dev_get(dev); + if (in_dev == NULL) + return; + + while (in_dev->ifa_list != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 1); + } + in_dev_put(in_dev); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void ve_del_ipv6_addrs(struct net_device *dev) +{ + addrconf_ifdown(dev, 2); +} +#else +#define ve_del_ipv6_addrs(dev) do { } while (0) +#endif + +static int ve_netdev_cleanup(struct net_device *dev, int to_ve) +{ + int err; + + err = 0; + ve_del_ip_addrs(dev); + ve_del_ipv6_addrs(dev); + if ((dev->flags & IFF_UP) != 0) + err = dev_close(dev); + synchronize_net(); + dev_shutdown(dev); + dev_mc_discard(dev); + free_divert_blk(dev); + synchronize_net(); + return err; +} + +static void __ve_dev_move(struct net_device *dev, struct ve_struct *ve_src, + struct ve_struct *ve_dst, struct user_beancounter *exec_ub) +{ + struct net_device **dp, *d; + struct user_beancounter *ub; + struct ve_struct *exec_ve; + + for (d = ve_src->_net_dev_base, dp = NULL; d != NULL; + dp = &d->next, d = d->next) { + if (d == dev) { + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + if (ve_src->_net_dev_tail == &dev->next) + ve_src->_net_dev_tail = dp; + if (dp) + *dp = dev->next; + dev->next = NULL; + break; + } + } + *ve_dst->_net_dev_tail = dev; + ve_dst->_net_dev_tail = &dev->next; + hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ve_dst)); + hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ve_dst)); + dev->owner_env = ve_dst; + + ub = netdev_bc(dev)->exec_ub; + netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); + put_beancounter(ub); + + write_unlock_bh(&dev_base_lock); + + exec_ve = set_exec_env(ve_src); + 
call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + (void)set_exec_env(ve_dst); + call_netdevice_notifiers(NETDEV_REGISTER, dev); + (void)set_exec_env(exec_ve); + + write_lock_bh(&dev_base_lock); +} + +static int ve_dev_add(envid_t veid, char *dev_name) +{ + int err; + struct net_device *dev; + struct ve_struct *ve; + struct hlist_node *p; + struct hlist_head *head; + + dev = NULL; + err = -ESRCH; + + ve = get_ve_by_id(veid); + if (ve == NULL) + goto out; + + rtnl_lock(); + + read_lock(&dev_base_lock); + hlist_for_each(p, dev_name_hash(dev_name, get_ve0())) { + struct net_device *d = hlist_entry(p, struct net_device, + name_hlist); + if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { + dev = d; + break; + } + } + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = -EPERM; + if (!ve_is_dev_movable(dev)) + goto out_unlock; + + err = -EINVAL; + if (dev->flags & (IFF_SLAVE|IFF_MASTER)) + goto out_unlock; + + /* Check for existence of name */ + head = dev_name_hash(dev->name, ve); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + err = -EEXIST; + goto out_unlock; + } + } + + ve_netdev_cleanup(dev, 1); + + write_lock_bh(&dev_base_lock); + __ve_dev_move(dev, get_ve0(), ve, get_exec_ub()); + write_unlock_bh(&dev_base_lock); + + err = 0; + +out_unlock: + rtnl_unlock(); + real_put_ve(ve); + + if (dev == NULL) + printk(KERN_WARNING "Device %s not found\n", dev_name); + +out: + return err; +} + +static int ve_dev_del(envid_t veid, char *dev_name) +{ + int err; + struct net_device *dev; + struct ve_struct *ve, *old_exec; + struct hlist_node *p; + + dev = NULL; + err = -ESRCH; + + ve = get_ve_by_id(veid); + if (ve == NULL) + goto out; + + rtnl_lock(); + + read_lock(&dev_base_lock); + hlist_for_each(p, dev_name_hash(dev_name, ve)) { + struct net_device *d = hlist_entry(p, struct net_device, + name_hlist); + if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { + dev = d; + break; + } + } + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = -EPERM; + if (!ve_is_dev_movable(dev)) + goto out_unlock; + + old_exec = set_exec_env(ve); + ve_netdev_cleanup(dev, 0); + (void)set_exec_env(old_exec); + + write_lock_bh(&dev_base_lock); + __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); + write_unlock_bh(&dev_base_lock); + + err = 0; + +out_unlock: + rtnl_unlock(); + real_put_ve(ve); + + if (dev == NULL) + printk(KERN_WARNING "Device %s not found\n", dev_name); + +out: + return err; +} + +int real_ve_dev_map(envid_t veid, int op, char *dev_name) +{ + int err; + err = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + switch (op) + { + case VE_NETDEV_ADD: + err = ve_dev_add(veid, dev_name); + break; + case VE_NETDEV_DEL: + err = ve_dev_del(veid, dev_name); + break; + default: + err = -EINVAL; + break; + } +out: + return err; +} + +static void ve_mapped_devs_cleanup(struct ve_struct *ve) +{ + struct net_device *dev; + + rtnl_lock(); + write_lock_bh(&dev_base_lock); +restart: + for (dev = ve->_net_dev_base; dev != NULL; dev = dev->next) + { + if ((dev->features & NETIF_F_VENET) || + (dev == ve->_loopback_dev)) /* Skip loopback dev */ + continue; + write_unlock_bh(&dev_base_lock); + ve_netdev_cleanup(dev, 0); + write_lock_bh(&dev_base_lock); + __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); + goto restart; + } + write_unlock_bh(&dev_base_lock); + rtnl_unlock(); +} +#endif + + +/********************************************************************** + 
********************************************************************** + * + * VE information via /proc + * + ********************************************************************** + **********************************************************************/ +#ifdef CONFIG_PROC_FS +#if BITS_PER_LONG == 32 +#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) +#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" +#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" +#else +#define VESTAT_LINE_WIDTH (12 * 21) +#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" +#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" +#endif + +static int vestat_seq_show(struct seq_file *m, void *v) +{ + struct list_head *entry; + struct ve_struct *ve; + struct ve_struct *curve; + int cpu; + unsigned long user_ve, nice_ve, system_ve; + unsigned long long uptime; + cycles_t uptime_cycles, idle_time, strv_time, used; + + entry = (struct list_head *)v; + ve = list_entry(entry, struct ve_struct, ve_list); + + curve = get_exec_env(); + if (entry == ve_list_head.next || + (!ve_is_super(curve) && ve == curve)) { + /* print header */ + seq_printf(m, "%-*s\n", + VESTAT_LINE_WIDTH - 1, + "Version: 2.2"); + seq_printf(m, VESTAT_HEAD_FMT, "VEID", + "user", "nice", "system", + "uptime", "idle", + "strv", "uptime", "used", + "maxlat", "totlat", "numsched"); + } + + if (ve == get_ve0()) + return 0; + + user_ve = nice_ve = system_ve = 0; + idle_time = strv_time = used = 0; + + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + user_ve += st->user; + nice_ve += st->nice; + system_ve += st->system; + used += st->used_time; + idle_time += __ve_sched_get_idle_time(ve, cpu); + } + uptime_cycles = get_cycles() - ve->start_cycles; + uptime = get_jiffies_64() - ve->start_jiffies; + + seq_printf(m, VESTAT_LINE_FMT, ve->veid, + user_ve, nice_ve, system_ve, + (unsigned long long)uptime, + (unsigned long long)idle_time, + (unsigned long long)strv_time, + (unsigned long long)uptime_cycles, + (unsigned long long)used, + (unsigned long long)ve->sched_lat_ve.last.maxlat, + (unsigned long long)ve->sched_lat_ve.last.totlat, + ve->sched_lat_ve.last.count); + return 0; +} + +static void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct *curve; + struct list_head *entry; + loff_t l; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + l = *pos; + list_for_each(entry, &ve_list_head) { + if (l == 0) + return entry; + l--; + } + return NULL; +} + +static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *entry; + + entry = (struct list_head *)v; + if (!ve_is_super(get_exec_env())) + return NULL; + (*pos)++; + return entry->next == &ve_list_head ? 
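+		/* hitting the list head ends the seq_file iteration */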
NULL : entry->next; +} + +static void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} + +static struct seq_operations vestat_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = vestat_seq_show +}; + +static int vestat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vestat_seq_op); +} + +static struct file_operations proc_vestat_operations = { + .open = vestat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static inline unsigned long ve_used_mem(struct user_beancounter *ub) +{ + extern int glob_ve_meminfo; + return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held : + ub->ub_parms[UB_PRIVVMPAGES].held ; +} + +static inline void ve_mi_replace(struct meminfo *mi) +{ +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *ub; + unsigned long meminfo_val; + unsigned long nodettram; + unsigned long usedmem; + + meminfo_val = get_exec_env()->meminfo_val; + + if(!meminfo_val) + return; /* No virtualization */ + + nodettram = mi->si.totalram; + ub = current->mm->mm_ub; + usedmem = ve_used_mem(ub); + + memset(mi, 0, sizeof(*mi)); + + mi->si.totalram = (meminfo_val > nodettram) ? + nodettram : meminfo_val; + mi->si.freeram = (mi->si.totalram > usedmem) ? + (mi->si.totalram - usedmem) : 0; +#else + return; +#endif +} + +static int meminfo_call(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + if (event != VIRTINFO_MEMINFO) + return old_ret; + + ve_mi_replace((struct meminfo *)arg); + + return NOTIFY_OK; +} + + +static struct vnotifier_block meminfo_notifier_block = { + .notifier_call = meminfo_call +}; + +static int __init init_vecalls_proc(void) +{ + struct proc_dir_entry *de; + + de = create_proc_glob_entry_mod("vz/vestat", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de == NULL) { + /* create "vz" subdirectory, if not exist */ + (void) create_proc_glob_entry("vz", + S_IFDIR|S_IRUGO|S_IXUGO, NULL); + de = create_proc_glob_entry_mod("vz/vestat", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + } + if (de) + de->proc_fops = &proc_vestat_operations; + else + printk(KERN_WARNING + "VZMON: can't make vestat proc entry\n"); + + de = create_proc_entry_mod("vz/devperms", S_IFREG | S_IRUSR, NULL, + THIS_MODULE); + if (de) + de->proc_fops = &proc_devperms_ops; + else + printk(KERN_WARNING + "VZMON: can't make devperms proc entry\n"); + + virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); + + return 0; +} + +static void fini_vecalls_proc(void) +{ + remove_proc_entry("vz/devperms", NULL); + remove_proc_entry("vz/vestat", NULL); + virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); +} +#else +#define init_vecalls_proc() (0) +#define fini_vecalls_proc() do { } while (0) +#endif /* CONFIG_PROC_FS */ + + +/********************************************************************** + ********************************************************************** + * + * User ctl + * + ********************************************************************** + **********************************************************************/ + +int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VZCTL_MARK_ENV_TO_DOWN: { + /* Compatibility issue */ + err = 0; + } + break; + case VZCTL_SETDEVPERMS: { + /* Device type was mistakenly declared as dev_t + * in the old user-kernel interface. + * That's wrong, dev_t is a kernel internal type. 
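+		 * (The device number itself is converted back to a kernel
+		 * dev_t with new_decode_dev() below.)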
+ * I use `unsigned' not having anything better in mind. + * 2001/08/11 SAW */ + struct vzctl_setdevperms s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_setdevperms(s.veid, s.type, + new_decode_dev(s.dev), s.mask); + } + break; +#ifdef CONFIG_INET + case VZCTL_VE_NETDEV: { + struct vzctl_ve_netdev d; + char *s; + err = -EFAULT; + if (copy_from_user(&d, (void __user *)arg, sizeof(d))) + break; + err = -ENOMEM; + s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); + if (s == NULL) + break; + err = -EFAULT; + if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { + s[IFNAMSIZ] = 0; + err = real_ve_dev_map(d.veid, d.op, s); + } + kfree(s); + } + break; +#endif + case VZCTL_ENV_CREATE: { + struct vzctl_env_create s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_env_create(s.veid, s.flags, s.class_id, + NULL, 0); + } + break; + case VZCTL_ENV_CREATE_DATA: { + struct vzctl_env_create_data s; + env_create_param_t *data; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err=-EINVAL; + if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || + s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || + s.data == 0) + break; + err = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + break; + + err = -EFAULT; + if (copy_from_user(data, (void __user *)s.data, + s.datalen)) + goto free_data; + err = real_env_create(s.veid, s.flags, s.class_id, + data, s.datalen); +free_data: + kfree(data); + } + break; + case VZCTL_GET_CPU_STAT: { + struct vzctl_cpustatctl s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_get_cpu_stat(s.veid, s.cpustat); + } + break; + case VZCTL_VE_MEMINFO: { + struct vzctl_ve_meminfo s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_set_meminfo(s.veid, s.val); + } + break; + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_vzcalls_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err; + + switch(cmd) { + case VZCTL_GET_CPU_STAT: { + /* FIXME */ + } + case VZCTL_COMPAT_ENV_CREATE_DATA: { + struct compat_vzctl_env_create_data cs; + struct vzctl_env_create_data __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.flags, &s->flags) || + put_user(cs.class_id, &s->class_id) || + put_user(compat_ptr(cs.data), &s->data) || + put_user(cs.datalen, &s->datalen)) + break; + err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA, + (unsigned long)s); + break; + } +#ifdef CONFIG_NET + case VZCTL_COMPAT_VE_NETDEV: { + struct compat_vzctl_ve_netdev cs; + struct vzctl_ve_netdev __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.op, &s->op) || + put_user(compat_ptr(cs.dev_name), &s->dev_name)) + break; + err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s); + break; + } +#endif + case VZCTL_COMPAT_VE_MEMINFO: { + struct compat_vzctl_ve_meminfo cs; + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = ve_set_meminfo(cs.veid, cs.val); + break; + } + default: + err = vzcalls_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo vzcalls = { + .type = VZCTLTYPE, + .ioctl = vzcalls_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = 
compat_vzcalls_ioctl, +#endif + .owner = THIS_MODULE, +}; + + +/********************************************************************** + ********************************************************************** + * + * Init/exit stuff + * + ********************************************************************** + **********************************************************************/ + +static int __init init_vecalls_symbols(void) +{ + KSYMRESOLVE(real_do_env_free); + KSYMMODRESOLVE(vzmon); + return 0; +} + +static void fini_vecalls_symbols(void) +{ + KSYMMODUNRESOLVE(vzmon); + KSYMUNRESOLVE(real_do_env_free); +} + +static inline __init int init_vecalls_ioctls(void) +{ + vzioctl_register(&vzcalls); + return 0; +} + +static inline void fini_vecalls_ioctls(void) +{ + vzioctl_unregister(&vzcalls); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *table_header; + +static ctl_table kernel_table[] = { + { + .ctl_name = KERN_VE_ALLOW_KTHREADS, + .procname = "ve_allow_kthreads", + .data = &ve_allow_kthreads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; + +static ctl_table root_table[] = { + {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table}, + { 0 } +}; + +static int init_vecalls_sysctl(void) +{ + table_header = register_sysctl_table(root_table, 0); + if (!table_header) + return -ENOMEM ; + return 0; +} + +static void fini_vecalls_sysctl(void) +{ + unregister_sysctl_table(table_header); +} +#else +static int init_vecalls_sysctl(void) { return 0; } +static void fini_vecalls_sysctl(void) { ; } +#endif + +static int __init vecalls_init(void) +{ + int err; + + err = init_vecalls_sysctl(); + if (err) + goto out_vzmond; + + init_rwsem(&get_ve0()->op_sem); + + err = init_vzmond(); + if (err < 0) + goto out_sysctl; + + err = init_vecalls_symbols(); + if (err < 0) + goto out_sym; + + err = init_vecalls_proc(); + if (err < 0) + goto out_proc; + + err = init_vecalls_ioctls(); + if (err < 0) + goto out_ioctls; + + return 0; + +out_ioctls: + fini_vecalls_proc(); +out_proc: + fini_vecalls_symbols(); +out_sym: + fini_vzmond(); +out_sysctl: + fini_vecalls_sysctl(); +out_vzmond: + return err; +} + +static void vecalls_exit(void) +{ + fini_vecalls_ioctls(); + fini_vecalls_proc(); + fini_vecalls_symbols(); + fini_vzmond(); + fini_vecalls_sysctl(); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Control"); +MODULE_LICENSE("GPL v2"); + +module_init(vecalls_init) +module_exit(vecalls_exit) diff -uprN linux-2.6.18/kernel/ve/veowner.c linux-2.6.18.ovz/kernel/ve/veowner.c --- linux-2.6.18/kernel/ve/veowner.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/veowner.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,259 @@ +/* + * kernel/ve/veowner.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
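+ *
+ * Sets up VE0 (the host environment): ownership of the initial
+ * process, the loopback device, the split /proc tree and the
+ * virtualized sysctl hierarchy.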
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +void prepare_ve0_process(struct task_struct *tsk) +{ + set_virt_pid(tsk, tsk->pid); + set_virt_tgid(tsk, tsk->tgid); + if (tsk->signal) { + set_virt_pgid(tsk, tsk->signal->pgrp); + set_virt_sid(tsk, tsk->signal->session); + } + VE_TASK_INFO(tsk)->exec_env = get_ve0(); + VE_TASK_INFO(tsk)->owner_env = get_ve0(); + VE_TASK_INFO(tsk)->sleep_time = 0; + VE_TASK_INFO(tsk)->wakeup_stamp = 0; + VE_TASK_INFO(tsk)->sched_time = 0; + seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); + + if (tsk->pid) { + list_add_rcu(&tsk->ve_task_info.vetask_list, + &get_ve0()->vetask_lh); + atomic_inc(&get_ve0()->pcounter); + } +} + +#ifdef CONFIG_NET +void prepare_ve0_loopback(void) +{ + get_ve0()->_loopback_dev = &loopback_dev; +} +#endif + +/* + * ------------------------------------------------------------------------ + * proc entries + * ------------------------------------------------------------------------ + */ + +#ifdef CONFIG_PROC_FS +static void proc_move(struct proc_dir_entry *ddir, + struct proc_dir_entry *sdir, + const char *name) +{ + struct proc_dir_entry **p, *q; + int len; + + len = strlen(name); + for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p) + if (proc_match(len, name, q)) + break; + if (q == NULL) + return; + *p = q->next; + q->parent = ddir; + q->next = ddir->subdir; + ddir->subdir = q; +} +static void prepare_proc_misc(void) +{ + static char *table[] = { + "loadavg", + "uptime", + "meminfo", + "version", + "stat", + "filesystems", + "locks", + "swaps", + "mounts", + "net", + "cpuinfo", + "sysvipc", + "sys", + "fs", + "vz", + "cmdline", + "vmstat", + "modules", + NULL, + }; + char **p; + + for (p = table; *p != NULL; p++) + proc_move(&proc_root, ve0.proc_root, *p); +} +int prepare_proc(void) +{ + struct ve_struct *envid; + struct proc_dir_entry *de; + struct proc_dir_entry *ve_root; + + envid = set_exec_env(&ve0); + ve_root = ve0.proc_root->subdir; + /* move the whole tree to be visible in VE0 only */ + ve0.proc_root->subdir = proc_root.subdir; + for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next) + de->parent = ve0.proc_root; + de->parent = ve0.proc_root; + de->next = ve_root; + + /* move back into the global scope some specific entries */ + proc_root.subdir = NULL; + prepare_proc_misc(); + proc_net = proc_mkdir("net", ve0.proc_root); + proc_net_stat = proc_mkdir("stat", proc_net); + proc_mkdir("vz", NULL); +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_root_fs = proc_mkdir("fs", NULL); + /* XXX proc_tty_init(); */ + + /* XXX process inodes */ + + (void)set_exec_env(envid); + + (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + return 0; +} + +static struct proc_dir_entry ve0_proc_root = { + .name = "/proc", + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2 +}; + +void prepare_ve0_proc_root(void) +{ + ve0.proc_root = &ve0_proc_root; +} +#endif + +/* + * ------------------------------------------------------------------------ + * Virtualized sysctl + * ------------------------------------------------------------------------ + */ +extern int ve_area_access_check; +#ifdef CONFIG_INET +static ctl_table vz_ipv4_route_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_SRC_CHECK, + .procname = "src_check", + .data = &ip_rt_src_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; +static 
ctl_table vz_ipv4_table[] = { + {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table}, + { 0 } +}; +static ctl_table vz_net_table[] = { + {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table}, + { 0 } +}; +#endif +static ctl_table vz_fs_table[] = { + { + .ctl_name = 226, + .procname = "ve-area-access-check", + .data = &ve_area_access_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; +static ctl_table root_table2[] = { +#ifdef CONFIG_INET + {CTL_NET, "net", NULL, 0, 0555, vz_net_table}, +#endif + {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table}, + { 0 } +}; +int prepare_sysctl(void) +{ + struct ve_struct *envid; + + envid = set_exec_env(&ve0); + register_sysctl_table(root_table2, 0); + (void)set_exec_env(envid); + return 0; +} + +void prepare_ve0_sysctl(void) +{ + INIT_LIST_HEAD(&ve0.sysctl_lh); +#ifdef CONFIG_SYSCTL + ve0.proc_sys_root = proc_mkdir("sys", NULL); +#endif +} + +/* + * ------------------------------------------------------------------------ + * XXX init_ve_system + * ------------------------------------------------------------------------ + */ + +void init_ve_system(void) +{ + struct task_struct *init_entry, *p, *tsk; + struct ve_struct *ve; + unsigned long flags; + int i; + + ve = get_ve0(); + + init_entry = child_reaper; + ve->init_entry = init_entry; + /* if ve_move_task to VE0 (e.g. in cpt code) * + * occurs, ve_cap_bset on VE0 is required */ + ve->ve_cap_bset = CAP_INIT_EFF_SET; + +#ifdef CONFIG_INET + ve->_ipv4_devconf = &ipv4_devconf; + ve->_ipv4_devconf_dflt = &ipv4_devconf_dflt; +#endif + + read_lock(&init_entry->fs->lock); + ve->fs_rootmnt = init_entry->fs->rootmnt; + ve->fs_root = init_entry->fs->root; + read_unlock(&init_entry->fs->lock); + + /* common prepares */ +#ifdef CONFIG_PROC_FS + prepare_proc(); +#endif + prepare_sysctl(); +} diff -uprN linux-2.6.18/kernel/ve/vzdev.c linux-2.6.18.ovz/kernel/ve/vzdev.c --- linux-2.6.18/kernel/ve/vzdev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/vzdev.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,154 @@ +/* + * kernel/ve/vzdev.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
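+ *
+ * Implements the /dev/vzctl character device and dispatches VZCTL_*
+ * ioctls to handlers registered via vzioctl_register(), taking a
+ * module reference around each call.  Illustrative userspace usage
+ * (sketch only, error handling omitted):
+ *
+ *	int fd = open("/dev/vzctl", O_RDWR);
+ *	ioctl(fd, VZCTL_ENV_CREATE, &env_create_arg);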
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VZCTL_MAJOR 126 +#define VZCTL_NAME "vzctl" + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Interface"); +MODULE_LICENSE("GPL v2"); + +static LIST_HEAD(ioctls); +static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; + +static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd) +{ + struct vzioctlinfo *h; + + spin_lock(&ioctl_lock); + list_for_each_entry(h, &ioctls, list) { + if (h->type == _IOC_TYPE(cmd)) + goto found; + } + h = NULL; +found: + if (h && !try_module_get(h->owner)) + h = NULL; + spin_unlock(&ioctl_lock); + return h; +} + +static void vzctl_put_handler(struct vzioctlinfo *h) +{ + if (!h) + return; + + module_put(h->owner); +} + +long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOTTY; + h = vzctl_get_handler(cmd); + if (h && h->ioctl) + err = (*h->ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOIOCTLCMD; + h = vzctl_get_handler(cmd); + if (h && h->compat_ioctl) + err = (*h->compat_ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +void vzioctl_register(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_add(&inf->list, &ioctls); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_register); + +void vzioctl_unregister(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_del_init(&inf->list); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_unregister); + +/* + * Init/exit stuff. + */ +static struct file_operations vzctl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vzctl_ioctl, + .compat_ioctl = compat_vzctl_ioctl, +}; + +static struct class *vzctl_class; + +static void __exit vzctl_exit(void) +{ + class_device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); + class_destroy(vzctl_class); + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +} + +static int __init vzctl_init(void) +{ + int ret; + struct class_device *class_err; + + ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); + if (ret < 0) + goto out; + + vzctl_class = class_create(THIS_MODULE, "vzctl"); + if (IS_ERR(vzctl_class)) { + ret = PTR_ERR(vzctl_class); + goto out_cleandev; + } + + class_err = class_device_create(vzctl_class, NULL, MKDEV(VZCTL_MAJOR, 0), + NULL, VZCTL_NAME); + if (IS_ERR(class_err)) { + ret = PTR_ERR(class_err); + goto out_rmclass; + } + + goto out; + +out_rmclass: + class_destroy(vzctl_class); +out_cleandev: + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +out: + return ret; +} + +module_init(vzctl_init) +module_exit(vzctl_exit); diff -uprN linux-2.6.18/kernel/ve/vzevent.c linux-2.6.18.ovz/kernel/ve/vzevent.c --- linux-2.6.18/kernel/ve/vzevent.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/vzevent.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NETLINK_UEVENT 15 +#define VZ_EVGRP_ALL 0x01 + +#define KOBJECT_UEVENT (defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)) + +#if KOBJECT_UEVENT +#ifdef NETLINK_KOBJECT_UEVENT +#if NETLINK_KOBJECT_UEVENT != 15 +#error "netlink conflict?" 
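+/*
+ * The private NETLINK_UEVENT protocol number above must match the
+ * kernel's NETLINK_KOBJECT_UEVENT when the latter is defined; failing
+ * the build here beats silently using a conflicting protocol.  In the
+ * raw-netlink fallback below, listeners on multicast group VZ_EVGRP_ALL
+ * receive plain-text messages such as "start@<veid>" and "stop@<veid>".
+ */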
+#endif +#endif + +static int do_vzevent_send(int event, char *msg, int len) +{ + struct kobject ktmp; + + memset(&ktmp, 0, sizeof(ktmp)); + ktmp.parent = NULL; + ktmp.k_name = msg; + kobject_uevent(&ktmp, event); + return 0; +} +#else /* KOBJECT_UEVENT */ +static struct sock *vzev_sock; + +static char *action_to_string(int action) +{ + switch (action) { + case KOBJ_MOUNT: + return "mount"; + case KOBJ_UMOUNT: + return "umount"; + case KOBJ_START: + return "start"; + case KOBJ_STOP: + return "stop"; + default: + return NULL; + } +} + +static int do_vzevent_send(int event, char *msg, int len) +{ + struct sk_buff *skb; + char *buf, *action; + int alen; + + action = action_to_string(event); + alen = strlen(action); + + skb = alloc_skb(len + 1 + alen, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + buf = skb_put(skb, len + 1 + alen); + memcpy(buf, action, alen); + buf[alen] = '@'; + memcpy(buf + alen + 1, msg, len); + (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL); + return 0; +} +#endif + +int vzevent_send(int event, const char *attrs_fmt, ...) +{ + va_list args; + int len, err; + struct ve_struct *ve; + char *page; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + goto out; + + va_start(args, attrs_fmt); + len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args); + va_end(args); + + ve = set_exec_env(get_ve0()); + err = do_vzevent_send(event, page, len); + (void)set_exec_env(ve); + free_page((unsigned long)page); +out: + return err; +} +EXPORT_SYMBOL(vzevent_send); + +static int ve_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_START, "%d", ve->veid); + return 0; +} + +static void ve_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_STOP, "%d", ve->veid); +} + +static struct ve_hook ve_start_stop_hook = { + .init = ve_start, + .fini = ve_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_AFTERALL, +}; + +static int __init init_vzevent(void) +{ +#if !KOBJECT_UEVENT + vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE); + if (vzev_sock == NULL) + return -ENOMEM; +#endif + ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook); + return 0; +} + +static void __exit exit_vzevent(void) +{ + ve_hook_unregister(&ve_start_stop_hook); +#if !KOBJECT_UEVENT + sock_release(vzev_sock->sk_socket); +#endif +} + +MODULE_LICENSE("GPL"); + +module_init(init_vzevent); +module_exit(exit_vzevent); diff -uprN linux-2.6.18/kernel/ve/vzwdog.c linux-2.6.18.ovz/kernel/ve/vzwdog.c --- linux-2.6.18/kernel/ve/vzwdog.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.18.ovz/kernel/ve/vzwdog.c 2007-06-13 06:55:07.000000000 -0400 @@ -0,0 +1,282 @@ +/* + * kernel/ve/vzwdog.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
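+ *
+ * Runs a kernel thread that periodically dumps system state to the
+ * kernel log: interrupt counts, memory and per-node page statistics,
+ * disk I/O, scheduling/allocation latencies and process counts.  The
+ * period is the "sleep_timeout" module parameter, in seconds.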
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Staff regading kernel thread polling VE validity */ +static int sleep_timeout = 60; +static struct task_struct *wdog_thread_tsk; + +extern void show_mem(void); + +static struct file *intr_file; +static char page[PAGE_SIZE]; + +static void parse_irq_list(int len) +{ + int i, k, skip; + for (i = 0; i < len; ) { + k = i; + while (i < len && page[i] != '\n' && page[i] != ':') + i++; + skip = 0; + if (i < len && page[i] != '\n') { + i++; /* skip ':' */ + while (i < len && (page[i] == ' ' || page[i] == '0')) + i++; + skip = (i < len && (page[i] < '0' || page[i] > '9')); + while (i < len && page[i] != '\n') + i++; + } + if (!skip) + printk("%.*s\n", i - k, page + k); + if (i < len) + i++; /* skip '\n' */ + } +} + +extern loff_t vfs_llseek(struct file *file, loff_t, int); +extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *); +extern struct file *filp_open(const char *filename, int flags, int mode); +extern int filp_close(struct file *filp, fl_owner_t id); +static void show_irq_list(void) +{ + mm_segment_t fs; + int r; + + fs = get_fs(); + set_fs(KERNEL_DS); + vfs_llseek(intr_file, 0, 0); + r = vfs_read(intr_file, (void __user *)page, sizeof(page), + &intr_file->f_pos); + set_fs(fs); + + if (r > 0) + parse_irq_list(r); +} + +static void show_alloc_latency(void) +{ + static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { + "A0", + "L0", + "H0", + "L1", + "H1" + }; + int i; + + printk("lat: "); + for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { + struct kstat_lat_struct *p; + cycles_t maxlat, avg0, avg1, avg2; + + p = &kstat_glob.alloc_lat[i]; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("%s %Lu (%Lu %Lu %Lu)", + alloc_descr[i], + (unsigned long long)maxlat, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); + } + printk("\n"); +} + +static void show_schedule_latency(void) +{ + struct kstat_lat_pcpu_struct *p; + cycles_t maxlat, totlat, avg0, avg1, avg2; + unsigned long count; + + p = &kstat_glob.sched_lat; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + totlat = p->last.totlat; + count = p->last.count; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", + (unsigned long long)maxlat, + (unsigned long long)totlat, + count, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); +} + +static void show_header(void) +{ + struct timeval tv; + + do_gettimeofday(&tv); + preempt_disable(); + printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", + tv.tv_sec, (long)tv.tv_usec, + (unsigned long long)get_jiffies_64(), + smp_processor_id()); +#ifdef CONFIG_FAIRSCHED + printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", + cycles_per_jiffy, HZ); +#else + printk("*** jiffies_per_second %u ***\n", HZ); +#endif + preempt_enable(); +} + +static void show_pgdatinfo(void) +{ + pg_data_t *pgdat; + + printk("pgdat:"); + for_each_online_pgdat(pgdat) { + printk(" %d: %lu,%lu,%lu", + pgdat->node_id, + pgdat->node_start_pfn, + pgdat->node_present_pages, + pgdat->node_spanned_pages); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(",%p", pgdat->node_mem_map); +#endif + } + printk("\n"); +} + +static void 
show_diskio(void) +{ + struct gendisk *gd; + char buf[BDEVNAME_SIZE]; + + printk("disk_io: "); + + down_read(&block_subsys.rwsem); + list_for_each_entry(gd, &block_subsys.kset.list, kobj.entry) { + char *name; + name = disk_name(gd, 0, buf); + if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && + isdigit(name[4])) + continue; + if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && + isdigit(name[3])) + continue; + printk("(%u,%u) %s r(%lu %lu %lu) w(%lu %lu %lu)\n", + gd->major, gd->first_minor, + name, + disk_stat_read(gd, ios[READ]), + disk_stat_read(gd, sectors[READ]), + disk_stat_read(gd, merges[READ]), + disk_stat_read(gd, ios[WRITE]), + disk_stat_read(gd, sectors[WRITE]), + disk_stat_read(gd, merges[WRITE])); + } + up_read(&block_subsys.rwsem); + + printk("\n"); +} + +static void show_nrprocs(void) +{ + unsigned long _nr_running, _nr_sleeping, + _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; + + _nr_running = nr_running(); + _nr_unint = nr_uninterruptible(); + _nr_sleeping = nr_sleeping(); + _nr_zombie = nr_zombie; + _nr_dead = atomic_read(&nr_dead); + _nr_stopped = nr_stopped(); + + printk("VEnum: %d, proc R %lu, S %lu, D %lu, " + "Z %lu, X %lu, T %lu (tot %d)\n", + nr_ve, _nr_running, _nr_sleeping, _nr_unint, + _nr_zombie, _nr_dead, _nr_stopped, nr_threads); +} + +static void wdog_print(void) +{ + show_header(); + show_irq_list(); + show_pgdatinfo(); + show_mem(); + show_diskio(); + show_schedule_latency(); + show_alloc_latency(); + show_nrprocs(); +} + +static int wdog_loop(void* data) +{ + while (1) { + wdog_print(); + try_to_freeze(); + + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule_timeout(sleep_timeout*HZ); + } + return 0; +} + +static int __init wdog_init(void) +{ + struct file *file; + + file = filp_open("/proc/interrupts", 0, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + intr_file = file; + + wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog"); + if (IS_ERR(wdog_thread_tsk)) { + filp_close(intr_file, NULL); + return -EBUSY; + } + return 0; +} + +static void __exit wdog_exit(void) +{ + kthread_stop(wdog_thread_tsk); + filp_close(intr_file, NULL); +} + +module_param(sleep_timeout, int, 0666); +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo WDOG"); +MODULE_LICENSE("GPL v2"); + +module_init(wdog_init) +module_exit(wdog_exit) diff -uprN linux-2.6.18/lib/Kconfig.debug linux-2.6.18.ovz/lib/Kconfig.debug --- linux-2.6.18/lib/Kconfig.debug 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/Kconfig.debug 2007-06-13 06:55:07.000000000 -0400 @@ -39,6 +39,14 @@ config UNUSED_SYMBOLS you really need it, and what the merge plan to the mainline kernel for your module is. +config SYSRQ_DEBUG + bool "Debugging via sysrq keys" + depends on MAGIC_SYSRQ + help + Say Y if you want to extend functionality of magic key. It will + provide you with some debugging facilities such as dumping and + writing memory, resolving symbols and some other. 
+ config DEBUG_KERNEL bool "Kernel debugging" help @@ -64,7 +72,7 @@ config LOG_BUF_SHIFT config DETECT_SOFTLOCKUP bool "Detect Soft Lockups" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !SCHED_VCPU default y help Say Y here to enable the kernel to detect "soft lockups", diff -uprN linux-2.6.18/lib/audit.c linux-2.6.18.ovz/lib/audit.c --- linux-2.6.18/lib/audit.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/audit.c 2007-06-13 06:55:07.000000000 -0400 @@ -28,8 +28,10 @@ int audit_classify_syscall(int abi, unsi switch(syscall) { case __NR_open: return 2; +#ifdef __NR_openat case __NR_openat: return 3; +#endif #ifdef __NR_socketcall case __NR_socketcall: return 4; diff -uprN linux-2.6.18/lib/bust_spinlocks.c linux-2.6.18.ovz/lib/bust_spinlocks.c --- linux-2.6.18/lib/bust_spinlocks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/bust_spinlocks.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,26 +12,21 @@ #include #include #include - +#include void bust_spinlocks(int yes) { + if (printk_no_wake) + return; + if (yes) { oops_in_progress = 1; } else { - int loglevel_save = console_loglevel; #ifdef CONFIG_VT unblank_screen(); #endif oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk() will give klogd - * and the blanked console a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; + wake_up_klogd(); } } diff -uprN linux-2.6.18/lib/kobject.c linux-2.6.18.ovz/lib/kobject.c --- linux-2.6.18/lib/kobject.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/kobject.c 2007-06-13 06:55:07.000000000 -0400 @@ -516,6 +516,8 @@ void subsystem_init(struct subsystem * s kset_init(&s->kset); } +EXPORT_SYMBOL(subsystem_init); + /** * subsystem_register - register a subsystem. * @s: the subsystem we're registering. 
diff -uprN linux-2.6.18/lib/kobject_uevent.c linux-2.6.18.ovz/lib/kobject_uevent.c --- linux-2.6.18/lib/kobject_uevent.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/kobject_uevent.c 2007-06-13 06:55:07.000000000 -0400 @@ -50,6 +50,10 @@ static char *action_to_string(enum kobje return "offline"; case KOBJ_ONLINE: return "online"; + case KOBJ_START: + return "start"; + case KOBJ_STOP: + return "stop"; default: return NULL; } diff -uprN linux-2.6.18/lib/smp_processor_id.c linux-2.6.18.ovz/lib/smp_processor_id.c --- linux-2.6.18/lib/smp_processor_id.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/lib/smp_processor_id.c 2007-06-13 06:55:07.000000000 -0400 @@ -7,11 +7,26 @@ #include #include +#ifdef CONFIG_VCPU +/* We can not guarantee pcpu affinity if use VCPU extention */ +static inline int run_on_single_cpu(int cpu) { return 0; } +#else +static inline int run_on_single_cpu(int cpu) +{ + cpumask_t this_mask; + + this_mask = cpumask_of_cpu(cpu); + if (cpus_equal(current->cpus_allowed, this_mask)) + return 1; + + return 0; +} +#endif + unsigned int debug_smp_processor_id(void) { unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); - cpumask_t this_mask; if (likely(preempt_count)) goto out; @@ -23,9 +38,7 @@ unsigned int debug_smp_processor_id(void * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ - this_mask = cpumask_of_cpu(this_cpu); - - if (cpus_equal(current->cpus_allowed, this_mask)) + if (run_on_single_cpu(this_cpu)) goto out; /* diff -uprN linux-2.6.18/mm/filemap.c linux-2.6.18.ovz/mm/filemap.c --- linux-2.6.18/mm/filemap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/filemap.c 2007-06-13 06:55:07.000000000 -0400 @@ -40,6 +40,8 @@ #include +#include + static ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); @@ -118,6 +120,7 @@ void __remove_from_page_cache(struct pag radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + ub_io_release_debug(page); mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); } diff -uprN linux-2.6.18/mm/filemap_xip.c linux-2.6.18.ovz/mm/filemap_xip.c --- linux-2.6.18/mm/filemap_xip.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/filemap_xip.c 2007-06-13 06:55:07.000000000 -0400 @@ -15,6 +15,7 @@ #include #include #include "filemap.h" +#include /* * This is a file read routine for execute in place files, and uses @@ -190,6 +191,8 @@ __xip_unmap (struct address_space * mapp flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff -uprN linux-2.6.18/mm/fremap.c linux-2.6.18.ovz/mm/fremap.c --- linux-2.6.18/mm/fremap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/fremap.c 2007-06-13 06:55:07.000000000 -0400 @@ -20,6 +20,8 @@ #include #include +#include + static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -34,6 +36,7 @@ static int zap_pte(struct mm_struct *mm, if (pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page); + pb_remove_ref(page, mm); page_cache_release(page); } } else { @@ -57,6 +60,10 @@ int install_page(struct mm_struct *mm, s pte_t *pte; pte_t pte_val; spinlock_t *ptl; + struct page_beancounter *pbc; + + if (unlikely(pb_alloc(&pbc))) + goto out_nopb; 
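+	/*
+	 * The page_beancounter preallocated above is consumed by
+	 * pb_add_ref() once the new pte is installed; pb_free() on the
+	 * exit path releases it if it went unused.
+	 */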
pte = get_locked_pte(mm, addr, &ptl); if (!pte) @@ -75,11 +82,14 @@ int install_page(struct mm_struct *mm, s if (page_mapcount(page) > INT_MAX/2) goto unlock; - if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) + if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) { + ub_unused_privvm_dec(mm, vma); inc_mm_counter(mm, file_rss); + } flush_icache_page(vma, page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); + pb_add_ref(page, mm, &pbc); page_add_file_rmap(page); pte_val = *pte; update_mmu_cache(vma, addr, pte_val); @@ -88,6 +98,8 @@ int install_page(struct mm_struct *mm, s unlock: pte_unmap_unlock(pte, ptl); out: + pb_free(&pbc); +out_nopb: return err; } EXPORT_SYMBOL(install_page); @@ -110,6 +122,7 @@ int install_file_pte(struct mm_struct *m if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { update_hiwater_rss(mm); + ub_unused_privvm_inc(mm, vma); dec_mm_counter(mm, file_rss); } @@ -227,4 +240,5 @@ asmlinkage long sys_remap_file_pages(uns return err; } +EXPORT_SYMBOL_GPL(sys_remap_file_pages); diff -uprN linux-2.6.18/mm/memory.c linux-2.6.18.ovz/mm/memory.c --- linux-2.6.18/mm/memory.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/memory.c 2007-06-13 06:55:07.000000000 -0400 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,9 @@ #include #include +#include +#include + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -104,18 +108,21 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); } +EXPORT_SYMBOL_GPL(pgd_clear_bad); void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +EXPORT_SYMBOL_GPL(pud_clear_bad); void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } +EXPORT_SYMBOL_GPL(pmd_clear_bad); /* * Note: this doesn't free the actual pages themselves. That @@ -318,6 +325,7 @@ int __pte_alloc(struct mm_struct *mm, pm spin_unlock(&mm->page_table_lock); return 0; } +EXPORT_SYMBOL_GPL(__pte_alloc); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { @@ -416,6 +424,7 @@ struct page *vm_normal_page(struct vm_ar */ return pfn_to_page(pfn); } +EXPORT_SYMBOL_GPL(vm_normal_page); /* * copy one vm_area from one task to the other. 
Assumes the page tables @@ -426,7 +435,7 @@ struct page *vm_normal_page(struct vm_ar static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + unsigned long addr, int *rss, struct page_beancounter **pbc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -481,6 +490,7 @@ copy_one_pte(struct mm_struct *dst_mm, s if (page) { get_page(page); page_dup_rmap(page); + pb_dup_ref(page, dst_mm, pbc); rss[!!PageAnon(page)]++; } @@ -488,20 +498,36 @@ out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); } +#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) +#ifdef CONFIG_USER_RESOURCE +#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) +#else +#define same_ub(mm1, mm2) (1) +#endif + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[2], rss_tot; + struct page_beancounter *pbc; + int err; + err = -ENOMEM; + pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL; again: + if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) + goto out; rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) - return -ENOMEM; + goto out; + src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -522,22 +548,31 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss, &pbc); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); + rss_tot = rss[0] + rss[1]; + ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(dst_pte - 1, dst_ptl); cond_resched(); if (addr != end) goto again; - return 0; + + err = 0; +out: + pb_free_list(&pbc); + return err; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + pud_t *dst_pud, pud_t *src_pud, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -552,14 +587,16 @@ static inline int copy_pmd_range(struct if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + pgd_t *dst_pgd, pgd_t *src_pgd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -574,19 +611,20 @@ static inline int copy_pud_range(struct if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct 
*dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) +int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, + unsigned long addr, size_t size) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = vma->vm_mm; pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long end = addr + size; /* * Don't copy ptes where a page fault will fill them correctly. @@ -609,11 +647,22 @@ int copy_page_range(struct mm_struct *ds if (pgd_none_or_clear_bad(src_pgd)) continue; if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pgd++, src_pgd++, addr = next, addr != end); return 0; } +EXPORT_SYMBOL_GPL(__copy_page_range); + +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *dst_vma, struct vm_area_struct *vma) +{ + if (dst_vma->vm_mm != dst) + BUG(); + if (vma->vm_mm != src) + BUG(); + return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); +} static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, @@ -625,6 +674,7 @@ static unsigned long zap_pte_range(struc spinlock_t *ptl; int file_rss = 0; int anon_rss = 0; + int rss; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { @@ -678,6 +728,7 @@ static unsigned long zap_pte_range(struc file_rss--; } page_remove_rmap(page); + pb_remove_ref(page, mm); tlb_remove_page(tlb, page); continue; } @@ -692,6 +743,8 @@ static unsigned long zap_pte_range(struc pte_clear_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + rss = -(file_rss + anon_rss); + ub_unused_privvm_add(mm, vma, rss); add_mm_rss(mm, file_rss, anon_rss); pte_unmap_unlock(pte - 1, ptl); @@ -1104,21 +1157,27 @@ static int zeromap_pte_range(struct mm_s { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - return -ENOMEM; + return -EAGAIN; do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + + if (unlikely(!pte_none(*pte))) { + err = -EEXIST; + pte++; + break; + } page_cache_get(page); page_add_file_rmap(page); inc_mm_counter(mm, file_rss); - BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -1126,16 +1185,18 @@ static inline int zeromap_pmd_range(stru { pmd_t *pmd; unsigned long next; + int err; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - return -ENOMEM; + return -EAGAIN; do { next = pmd_addr_end(addr, end); - if (zeromap_pte_range(mm, pmd, addr, next, prot)) - return -ENOMEM; + err = zeromap_pte_range(mm, pmd, addr, next, prot); + if (err) + break; } while (pmd++, addr = next, addr != end); - return 0; + return err; } static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, @@ -1143,16 +1204,18 @@ static inline int zeromap_pud_range(stru { pud_t *pud; unsigned long next; + int err; pud = pud_alloc(mm, pgd, addr); if (!pud) - return -ENOMEM; + return -EAGAIN; do { next = pud_addr_end(addr, end); - if (zeromap_pmd_range(mm, pud, addr, next, prot)) - return -ENOMEM; + err = zeromap_pmd_range(mm, pud, addr, next, prot); + if (err) + break; } while (pud++, addr = next, addr != end); - return 0; + return err; } int zeromap_page_range(struct 
vm_area_struct *vma, @@ -1173,6 +1236,8 @@ int zeromap_page_range(struct vm_area_st if (err) break; } while (pgd++, addr = next, addr != end); + + ub_unused_privvm_sub(mm, vma, size >> PAGE_SHIFT); return err; } @@ -1459,6 +1524,7 @@ static int do_wp_page(struct mm_struct * struct page *old_page, *new_page; pte_t entry; int reuse, ret = VM_FAULT_MINOR; + struct page_beancounter *pbc; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) @@ -1507,6 +1573,7 @@ static int do_wp_page(struct mm_struct * flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ClearPageCheckpointed(old_page); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); @@ -1521,6 +1588,9 @@ static int do_wp_page(struct mm_struct * gotten: pte_unmap_unlock(page_table, ptl); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; if (old_page == ZERO_PAGE(address)) { @@ -1541,20 +1611,31 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { page_remove_rmap(old_page); + pb_remove_ref(old_page, mm); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } - } else + } else { + ub_unused_privvm_dec(mm, vma); inc_mm_counter(mm, anon_rss); + } flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); lazy_mmu_prot_update(entry); - ptep_establish(vma, address, page_table, entry); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush(vma, address, page_table); + set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + pb_add_ref(new_page, mm, &pbc); /* Free the old page.. */ new_page = old_page; @@ -1564,10 +1645,13 @@ gotten: page_cache_release(new_page); if (old_page) page_cache_release(old_page); + pb_free(&pbc); unlock: pte_unmap_unlock(page_table, ptl); return ret; oom: + pb_free(&pbc); +oom_nopb: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; @@ -1926,10 +2010,16 @@ static int do_swap_page(struct mm_struct swp_entry_t entry; pte_t pte; int ret = VM_FAULT_MINOR; + struct page_beancounter *pbc; + cycles_t start; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; + goto out_nostat; + if (unlikely(pb_alloc(&pbc))) + return VM_FAULT_OOM; + + start = get_cycles(); entry = pte_to_swp_entry(orig_pte); if (is_migration_entry(entry)) { migration_entry_wait(mm, pmd, address); @@ -1977,6 +2067,7 @@ static int do_swap_page(struct mm_struct /* The page isn't present yet, go ahead with the fault. 
*/ inc_mm_counter(mm, anon_rss); + ub_percpu_inc(mm->mm_ub, swapin); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1986,10 +2077,11 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); + try_to_remove_exclusive_swap_page(page); unlock_page(page); if (write_access) { @@ -2005,9 +2097,15 @@ static int do_swap_page(struct mm_struct unlock: pte_unmap_unlock(page_table, ptl); out: + pb_free(&pbc); + spin_lock_irq(&kstat_glb_lock); + KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); + spin_unlock_irq(&kstat_glb_lock); +out_nostat: return ret; out_nomap: pte_unmap_unlock(page_table, ptl); + pb_free(&pbc); unlock_page(page); page_cache_release(page); return ret; @@ -2025,11 +2123,15 @@ static int do_anonymous_page(struct mm_s struct page *page; spinlock_t *ptl; pte_t entry; + struct page_beancounter *pbc; if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage(vma, address); @@ -2045,7 +2147,10 @@ static int do_anonymous_page(struct mm_s inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); page_add_new_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); } else { + pbc = NULL; + /* Map the ZERO_PAGE - vm_page_prot is readonly */ page = ZERO_PAGE(address); page_cache_get(page); @@ -2059,18 +2164,22 @@ static int do_anonymous_page(struct mm_s page_add_file_rmap(page); } + ub_unused_privvm_dec(mm, vma); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); unlock: + pb_free(&pbc); pte_unmap_unlock(page_table, ptl); return VM_FAULT_MINOR; release: page_cache_release(page); goto unlock; oom: + pb_free(&pbc); +oom_nopb: return VM_FAULT_OOM; } @@ -2098,6 +2207,7 @@ static int do_no_page(struct mm_struct * unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; + struct page_beancounter *pbc; pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); @@ -2107,6 +2217,9 @@ static int do_no_page(struct mm_struct * sequence = mapping->truncate_count; smp_rmb(); /* serializes i_size against truncate_count */ } + + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; retry: new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* @@ -2119,9 +2232,9 @@ retry: /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) - return VM_FAULT_SIGBUS; + goto bus_nopg; if (new_page == NOPAGE_OOM) - return VM_FAULT_OOM; + goto oom_nopg; /* * Should we do an early C-O-W break? @@ -2193,6 +2306,8 @@ retry: inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } + pb_add_ref(new_page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); } else { /* One of our sibling threads was faster, back out. 
*/ page_cache_release(new_page); @@ -2204,10 +2319,18 @@ retry: lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); + pb_free(&pbc); return ret; oom: page_cache_release(new_page); +oom_nopg: + pb_free(&pbc); +oom_nopb: return VM_FAULT_OOM; + +bus_nopg: + pb_free(&pbc); + return VM_FAULT_SIGBUS; } /* @@ -2326,6 +2449,27 @@ int __handle_mm_fault(struct mm_struct * pmd_t *pmd; pte_t *pte; +#ifdef CONFIG_VZ_GENCALLS + do { + int ret; +#ifdef CONFIG_USER_RESOURCE + struct task_beancounter *tbc; + + tbc = ¤t->task_bc; + if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) && + tbc->pgfault_allot) { + tbc->pgfault_allot--; + break; /* skip notifier */ + } +#endif + ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN, + (void *)1); + if (ret & NOTIFY_FAIL) + return VM_FAULT_SIGBUS; + if (ret & NOTIFY_OK) + return VM_FAULT_MINOR; /* retry */ + } while (0); +#endif __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); @@ -2376,6 +2520,8 @@ int __pud_alloc(struct mm_struct *mm, pg } #endif /* __PAGETABLE_PUD_FOLDED */ +EXPORT_SYMBOL_GPL(__pud_alloc); + #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. @@ -2410,6 +2556,8 @@ int __pmd_alloc(struct mm_struct *mm, pu } #endif /* __PAGETABLE_PMD_FOLDED */ +EXPORT_SYMBOL_GPL(__pmd_alloc); + int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -2429,6 +2577,8 @@ int make_pages_present(unsigned long add return ret == len ? 0 : -1; } +EXPORT_SYMBOL(make_pages_present); + /* * Map a vmalloc()-space virtual address to the physical page. */ diff -uprN linux-2.6.18/mm/mempolicy.c linux-2.6.18.ovz/mm/mempolicy.c --- linux-2.6.18/mm/mempolicy.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mempolicy.c 2007-06-13 06:55:07.000000000 -0400 @@ -916,7 +916,7 @@ asmlinkage long sys_migrate_pages(pid_t /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? find_task_by_pid(pid) : current; + task = pid ? find_task_by_pid_ve(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; diff -uprN linux-2.6.18/mm/mempool.c linux-2.6.18.ovz/mm/mempool.c --- linux-2.6.18/mm/mempool.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mempool.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include static void add_element(mempool_t *pool, void *element) { @@ -78,6 +79,8 @@ mempool_t *mempool_create_node(int min_n init_waitqueue_head(&pool->wait); pool->alloc = alloc_fn; pool->free = free_fn; + if (alloc_fn == mempool_alloc_slab) + kmem_mark_nocharge((kmem_cache_t *)pool_data); /* * First pre-allocate the guaranteed number of buffers. @@ -119,6 +122,7 @@ int mempool_resize(mempool_t *pool, int unsigned long flags; BUG_ON(new_min_nr <= 0); + gfp_mask &= ~__GFP_UBC; spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -212,6 +216,7 @@ void * mempool_alloc(mempool_t *pool, gf gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ + gfp_mask &= ~__GFP_UBC; gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); diff -uprN linux-2.6.18/mm/migrate.c linux-2.6.18.ovz/mm/migrate.c --- linux-2.6.18/mm/migrate.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/migrate.c 2007-06-13 06:55:07.000000000 -0400 @@ -880,7 +880,7 @@ asmlinkage long sys_move_pages(pid_t pid /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? 
find_task_by_pid(pid) : current; + task = pid ? find_task_by_pid_all(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; @@ -950,7 +950,8 @@ asmlinkage long sys_move_pages(pid_t pid goto out; pm[i].node = node; - } + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ } /* End marker */ pm[nr_pages].node = MAX_NUMNODES; diff -uprN linux-2.6.18/mm/mincore.c linux-2.6.18.ovz/mm/mincore.c --- linux-2.6.18/mm/mincore.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mincore.c 2007-06-13 06:55:07.000000000 -0400 @@ -1,7 +1,7 @@ /* * linux/mm/mincore.c * - * Copyright (C) 1994-1999 Linus Torvalds + * Copyright (C) 1994-2006 Linus Torvalds */ /* @@ -38,46 +38,51 @@ static unsigned char mincore_page(struct return present; } -static long mincore_vma(struct vm_area_struct * vma, - unsigned long start, unsigned long end, unsigned char __user * vec) +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) { - long error, i, remaining; - unsigned char * tmp; - - error = -ENOMEM; - if (!vma->vm_file) - return error; - - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (end > vma->vm_end) - end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - - error = -EAGAIN; - tmp = (unsigned char *) __get_free_page(GFP_KERNEL); - if (!tmp) - return error; - - /* (end - start) is # of pages, and also # of bytes in "vec */ - remaining = (end - start), + unsigned long i, nr, pgoff; + struct vm_area_struct *vma = find_vma(current->mm, addr); - error = 0; - for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { - int j = 0; - long thispiece = (remaining < PAGE_SIZE) ? - remaining : PAGE_SIZE; + /* + * find_vma() didn't find anything above us, or we're + * in an unmapped hole in the address space: ENOMEM. + */ + if (!vma || addr < vma->vm_start) + return -ENOMEM; - while (j < thispiece) - tmp[j++] = mincore_page(vma, start++); + /* + * Ok, got it. But check whether it's a segment we support + * mincore() on. Right now, we don't do any anonymous mappings. + * + * FIXME: This is just stupid. And returning ENOMEM is + * stupid too. We should just look at the page tables. But + * this is what we've traditionally done, so we'll just + * continue doing it. + */ + if (!vma->vm_file) + return -ENOMEM; - if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { - error = -EFAULT; - break; - } - } + /* + * Calculate how many pages there are left in the vma, and + * what the pgoff is for our address. + */ + nr = (vma->vm_end - addr) >> PAGE_SHIFT; + if (nr > pages) + nr = pages; + + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + + /* And then we just fill the sucker in.. */ + for (i = 0 ; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma, pgoff); - free_page((unsigned long) tmp); - return error; + return nr; } /* @@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_s asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec) { - int index = 0; - unsigned long end, limit; - struct vm_area_struct * vma; - size_t max; - int unmapped_error = 0; - long error; + long retval; + unsigned long pages; + unsigned char *tmp; - /* check the arguments */ + /* Check the start address: needs to be page-aligned.. 
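do_mincore() above turns an address into a file-page index in two steps: nr, the number of pages left in the vma, caps the chunk, and pgoff is the page offset of addr within the vma shifted by vm_pgoff, the mapping's starting offset in the file. A standalone check of that arithmetic, with PAGE_SHIFT fixed at 12 and made-up mapping bounds:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* made-up mapping: file pages from 0x30 mapped at 0x400000..0x408000 */
        unsigned long vm_start = 0x400000, vm_end = 0x408000, vm_pgoff = 0x30;
        unsigned long addr = 0x402000, pages = 100;

        unsigned long nr = (vm_end - addr) >> PAGE_SHIFT;  /* pages left in vma */
        if (nr > pages)
                nr = pages;
        unsigned long pgoff = (addr - vm_start) >> PAGE_SHIFT;
        pgoff += vm_pgoff;

        printf("nr=%lu pgoff=%#lx\n", nr, pgoff);          /* nr=6 pgoff=0x32 */
        return 0;
}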
*/ if (start & ~PAGE_CACHE_MASK) - goto einval; - - limit = TASK_SIZE; - if (start >= limit) - goto enomem; + return -EINVAL; - if (!len) - return 0; + /* ..and we need to be passed a valid user-space range */ + if (!access_ok(VERIFY_READ, (void __user *) start, len)) + return -ENOMEM; - max = limit - start; - len = PAGE_CACHE_ALIGN(len); - if (len > max || !len) - goto enomem; + /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + pages = len >> PAGE_SHIFT; + pages += (len & ~PAGE_MASK) != 0; - end = start + len; + if (!access_ok(VERIFY_WRITE, vec, pages)) + return -EFAULT; - /* check the output buffer whilst holding the lock */ - error = -EFAULT; - down_read(&current->mm->mmap_sem); - - if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) - goto out; - - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - */ - error = 0; + tmp = (void *) __get_free_page(GFP_USER); + if (!tmp) + return -EAGAIN; - vma = find_vma(current->mm, start); - while (vma) { - /* Here start < vma->vm_end. */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - } + retval = 0; + while (pages) { + /* + * Do at most PAGE_SIZE entries per iteration, due to + * the temporary buffer size. + */ + down_read(&current->mm->mmap_sem); + retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + up_read(&current->mm->mmap_sem); - /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = mincore_vma(vma, start, end, - &vec[index]); - if (error) - goto out; - } - error = unmapped_error; - goto out; + if (retval <= 0) + break; + if (copy_to_user(vec, tmp, retval)) { + retval = -EFAULT; + break; } - - /* Here vma->vm_start <= start < vma->vm_end < end.
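The sys_mincore() rewrite in progress here (its loop tail continues just below) drives do_mincore() through a single bounce page: at most PAGE_SIZE status bytes are produced per pass under mmap_sem, the semaphore is dropped, the chunk is copied out, and the window advances. The control flow reduces to this user-space skeleton, where the worker is stubbed and CHUNK stands in for PAGE_SIZE:

#include <stdio.h>
#include <string.h>

#define CHUNK 4096      /* stands in for PAGE_SIZE */

/* stub: classify up to n items starting at start; <= 0 means error */
static long do_chunk(unsigned long start, unsigned char *vec, unsigned long n)
{
        memset(vec, 1, n);
        return (long)n;
}

static long run(unsigned long start, unsigned long pages, unsigned char *out)
{
        static unsigned char tmp[CHUNK];        /* the one bounce buffer */
        long retval = 0;

        while (pages) {
                /* the lock would be taken here ... */
                retval = do_chunk(start, tmp, pages < CHUNK ? pages : CHUNK);
                /* ... and dropped here, before touching user memory */
                if (retval <= 0)
                        break;
                memcpy(out, tmp, retval);       /* models copy_to_user() */
                pages -= retval;
                out += retval;
                start += (unsigned long)retval << 12;   /* 12 = PAGE_SHIFT */
                retval = 0;
        }
        return retval;
}

int main(void)
{
        static unsigned char vec[10000];

        printf("%ld\n", run(0x400000, sizeof(vec), vec));  /* 0 on success */
        return 0;
}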
*/ - error = mincore_vma(vma, start, vma->vm_end, &vec[index]); - if (error) - goto out; - index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; - start = vma->vm_end; - vma = vma->vm_next; + pages -= retval; + vec += retval; + start += retval << PAGE_SHIFT; + retval = 0; } - - /* we found a hole in the area queried if we arrive here */ - error = -ENOMEM; - -out: - up_read(&current->mm->mmap_sem); - return error; - -einval: - return -EINVAL; -enomem: - return -ENOMEM; + free_page((unsigned long) tmp); + return retval; } diff -uprN linux-2.6.18/mm/mlock.c linux-2.6.18.ovz/mm/mlock.c --- linux-2.6.18/mm/mlock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mlock.c 2007-06-13 06:55:07.000000000 -0400 @@ -8,9 +8,11 @@ #include #include #include +#include #include #include +#include static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) @@ -25,6 +27,14 @@ static int mlock_fixup(struct vm_area_st goto out; } + if (newflags & VM_LOCKED) { + ret = ub_locked_charge(mm, end - start); + if (ret < 0) { + *prev = vma; + goto out; + } + } + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -38,13 +48,13 @@ static int mlock_fixup(struct vm_area_st if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) - goto out; + goto out_uncharge; } if (end != vma->vm_end) { ret = split_vma(mm, vma, end, 0); if (ret) - goto out; + goto out_uncharge; } success: @@ -63,13 +73,19 @@ success: pages = -pages; if (!(newflags & VM_IO)) ret = make_pages_present(start, end); - } + } else + ub_locked_uncharge(mm, end - start); vma->vm_mm->locked_vm -= pages; out: if (ret == -ENOMEM) ret = -EAGAIN; return ret; + +out_uncharge: + if (newflags & VM_LOCKED) + ub_locked_uncharge(mm, end - start); + goto out; } static int do_mlock(unsigned long start, size_t len, int on) @@ -146,6 +162,7 @@ asmlinkage long sys_mlock(unsigned long up_write(&current->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mlock); asmlinkage long sys_munlock(unsigned long start, size_t len) { @@ -158,6 +175,7 @@ asmlinkage long sys_munlock(unsigned lon up_write(&current->mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(sys_munlock); static int do_mlockall(int flags) { diff -uprN linux-2.6.18/mm/mmap.c linux-2.6.18.ovz/mm/mmap.c --- linux-2.6.18/mm/mmap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mmap.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -34,9 +35,12 @@ #define arch_mmap_check(addr, len, flags) (0) #endif +#include + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); /* * WARNING: the debugging will use recursive algorithms so never enable this @@ -91,6 +95,18 @@ int __vm_enough_memory(long pages, int c vm_acct_memory(pages); +#ifdef CONFIG_USER_RESOURCE + switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, + (void *)pages) + & (NOTIFY_OK | NOTIFY_FAIL)) { + case NOTIFY_OK: + return 0; + case NOTIFY_FAIL: + vm_unacct_memory(pages); + return -ENOMEM; + } +#endif + /* * Sometimes we want to use more memory than we have */ @@ -217,6 +233,9 @@ static struct vm_area_struct *remove_vma struct vm_area_struct *next = vma->vm_next; might_sleep(); + +
ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, + vma->vm_flags, vma->vm_file); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -264,7 +283,7 @@ asmlinkage unsigned long sys_brk(unsigne goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) goto out; set_brk: mm->brk = brk; @@ -1048,6 +1067,10 @@ munmap_back: } } + if (ub_memory_charge(mm, len, vm_flags, file, + (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) + goto charge_error; + /* * Can we just expand an old private anonymous mapping? * The VM_SHARED test is necessary because shmem_zero_setup @@ -1063,7 +1086,8 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1163,6 +1187,8 @@ unmap_and_free_vma: free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: + ub_memory_uncharge(mm, len, vm_flags, file); +charge_error: if (charged) vm_unacct_memory(charged); return error; @@ -1492,12 +1518,16 @@ static int acct_stack_growth(struct vm_a return -ENOMEM; } + if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, + vma->vm_file, UB_SOFT)) + goto fail_charge; + /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory(grow)) - return -ENOMEM; + goto fail_sec; /* Ok, everything looks good - let it rip */ mm->total_vm += grow; @@ -1505,6 +1535,11 @@ static int acct_stack_growth(struct vm_a mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; + +fail_sec: + ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); +fail_charge: + return -ENOMEM; } #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) @@ -1767,6 +1802,7 @@ int split_vma(struct mm_struct * mm, str return 0; } +EXPORT_SYMBOL_GPL(split_vma); /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the @@ -1860,7 +1896,7 @@ static inline void verify_mm_writelocked * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -unsigned long do_brk(unsigned long addr, unsigned long len) +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma, * prev; @@ -1919,8 +1955,11 @@ unsigned long do_brk(unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (ub_memory_charge(mm, len, flags, NULL, soft)) + goto fail_charge; + if (security_vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; + goto fail_sec; /* Can we just expand an old private anonymous mapping? */ if (vma_merge(mm, prev, addr, addr + len, flags, @@ -1930,11 +1969,10 @@ unsigned long do_brk(unsigned long addr, /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; - } + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (soft == UB_SOFT ? 
__GFP_SOFT_UBC : 0)); + if (!vma) + goto fail_alloc; vma->vm_mm = mm; vma->vm_start = addr; @@ -1951,8 +1989,19 @@ out: make_pages_present(addr, addr + len); } return addr; + +fail_alloc: + vm_unacct_memory(len >> PAGE_SHIFT); +fail_sec: + ub_memory_uncharge(mm, len, flags, NULL); +fail_charge: + return -ENOMEM; } +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + return __do_brk(addr, len, UB_SOFT); +} EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ diff -uprN linux-2.6.18/mm/mmzone.c linux-2.6.18.ovz/mm/mmzone.c --- linux-2.6.18/mm/mmzone.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mmzone.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(v return NODE_DATA(first_online_node); } -EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ +EXPORT_SYMBOL(first_online_pgdat); /* June 2006 */ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(st return NULL; return NODE_DATA(nid); } -EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ +EXPORT_SYMBOL(next_online_pgdat); /* June 2006 */ /* diff -uprN linux-2.6.18/mm/mprotect.c linux-2.6.18.ovz/mm/mprotect.c --- linux-2.6.18/mm/mprotect.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mprotect.c 2007-06-13 06:55:07.000000000 -0400 @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -26,6 +27,8 @@ #include #include +#include + static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { @@ -127,12 +130,20 @@ mprotect_fixup(struct vm_area_struct *vm pgprot_t newprot; pgoff_t pgoff; int error; + unsigned long ch_size; + int ch_dir; if (newflags == oldflags) { *pprev = vma; return 0; } + error = -ENOMEM; + ch_size = nrpages - pages_in_vma_range(vma, start, end); + ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); + if (ch_dir == PRIVVM_ERROR) + goto fail_ch; + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we @@ -145,7 +156,7 @@ mprotect_fixup(struct vm_area_struct *vm if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) - return -ENOMEM; + goto fail_sec; newflags |= VM_ACCOUNT; } } @@ -196,10 +207,16 @@ success: change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); + if (ch_dir == PRIVVM_TO_SHARED) + __ub_unused_privvm_dec(mm, ch_size); return 0; fail: vm_unacct_memory(charged); +fail_sec: + if (ch_dir == PRIVVM_TO_PRIVATE) + __ub_unused_privvm_dec(mm, ch_size); +fail_ch: return error; } @@ -301,3 +318,4 @@ out: up_write(&current->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mprotect); diff -uprN linux-2.6.18/mm/mremap.c linux-2.6.18.ovz/mm/mremap.c --- linux-2.6.18/mm/mremap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/mremap.c 2007-06-13 06:55:07.000000000 -0400 @@ -23,6 +23,8 @@ #include #include +#include + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -166,17 +168,21 @@ static unsigned long move_vma(struct vm_ unsigned long hiwater_vm; int split = 0; + if (ub_memory_charge(mm, new_len, vm_flags, + vma->vm_file, UB_HARD)) + goto err; + /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping.
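__do_brk() above now acquires three things in order (the beancounter charge, the security/overcommit accounting, the vma allocation) and the new fail_alloc/fail_sec/fail_charge labels unwind them in exactly the reverse order, so each failure point falls through only the cleanups it owes. The shape of that idiom as a compilable toy, with stub functions and -12 standing in for -ENOMEM:

#include <stdio.h>

static int charge(void)     { return 0; }   /* 0 = ok */
static void uncharge(void)  { puts("uncharge"); }
static int account(void)    { return 0; }
static void unaccount(void) { puts("unaccount"); }
static void *get_obj(int fail) { static int x; return fail ? NULL : &x; }

static long toy_do_brk(int fail_at_alloc)
{
        void *vma;

        if (charge())
                goto fail_charge;
        if (account())
                goto fail_sec;
        vma = get_obj(fail_at_alloc);
        if (!vma)
                goto fail_alloc;
        return 0;                 /* success */

fail_alloc:
        unaccount();              /* undo the accounting taken second */
fail_sec:
        uncharge();               /* undo the charge taken first */
fail_charge:
        return -12;               /* models -ENOMEM */
}

int main(void)
{
        printf("%ld\n", toy_do_brk(1));   /* unaccount, uncharge, -12 */
        return 0;
}

Stacked labels like this keep every acquisition and its release textually paired, which is why the same pattern recurs in mprotect_fixup(), move_vma() and mlock_fixup() in this patch.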
*/ if (mm->map_count >= sysctl_max_map_count - 3) - return -ENOMEM; + goto err_nomem; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) - return -ENOMEM; + goto err_nomem; moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { @@ -235,7 +241,13 @@ static unsigned long move_vma(struct vm_ new_addr + new_len); } - return new_addr; + if (new_addr != -ENOMEM) + return new_addr; + +err_nomem: + ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); +err: + return -ENOMEM; } /* @@ -359,7 +371,15 @@ unsigned long do_mremap(unsigned long ad max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? */ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long len; + int pages; + + len = new_len - old_len; + pages = len >> PAGE_SHIFT; + ret = -ENOMEM; + if (ub_memory_charge(mm, len, vma->vm_flags, + vma->vm_file, UB_HARD)) + goto out; vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); diff -uprN linux-2.6.18/mm/msync.c linux-2.6.18.ovz/mm/msync.c --- linux-2.6.18/mm/msync.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/msync.c 2007-06-13 06:55:07.000000000 -0400 @@ -146,10 +146,10 @@ static int msync_interval(struct vm_area asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int unmapped_error = 0; int error = -EINVAL; - int done = 0; if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; @@ -169,64 +169,58 @@ asmlinkage long sys_msync(unsigned long * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, start); - if (!vma) { - error = -ENOMEM; - goto out_unlock; - } - do { + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + for (;;) { unsigned long nr_pages_dirtied = 0; struct file *file; + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { - unmapped_error = -ENOMEM; start = vma->vm_start; - } - /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = msync_interval(vma, start, end, flags, - &nr_pages_dirtied); - if (error) - goto out_unlock; - } - error = unmapped_error; - done = 1; - } else { - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = msync_interval(vma, start, vma->vm_end, flags, - &nr_pages_dirtied); - if (error) + if (start >= end) goto out_unlock; + unmapped_error = -ENOMEM; } /* Here vma->vm_start <= start < vma->vm_end.
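The msync loop being rebuilt here (it continues just below) observes one rule throughout: vma is meaningless once mmap_sem has been dropped for a blocking operation, so after balance_dirty_pages_ratelimited_nr() or do_fsync() the code retakes the semaphore and repeats find_vma(mm, start) instead of trusting the stale pointer. The same drop-then-revalidate pattern in a small pthread sketch (a hypothetical lookup table stands in for the vma list):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static int table[8];                          /* stands in for the vma list */

static int *lookup(int key) { return &table[key & 7]; }  /* models find_vma() */

static void process(int key)
{
        int *item;

        pthread_rwlock_rdlock(&map_lock);
        item = lookup(key);
        (void)*item;
        /* about to block: drop the lock, as sys_msync() drops mmap_sem */
        pthread_rwlock_unlock(&map_lock);

        usleep(1000);                  /* the blocking work (do_fsync() etc.) */

        /* anything found under the old lock is stale now: re-lookup */
        pthread_rwlock_rdlock(&map_lock);
        item = lookup(key);
        *item += 1;
        pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
        process(3);
        printf("%d\n", table[3]);
        return 0;
}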
*/ + error = msync_interval(vma, start, min(end, vma->vm_end), + flags, &nr_pages_dirtied); + if (error) + goto out_unlock; file = vma->vm_file; start = vma->vm_end; if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { get_file(file); - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); balance_dirty_pages_ratelimited_nr(file->f_mapping, nr_pages_dirtied); fput(file); - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, start); + if (start >= end) + goto out; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); } else if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); error = do_fsync(file, 0); fput(file); - down_read(&current->mm->mmap_sem); - if (error) - goto out_unlock; - vma = find_vma(current->mm, start); + if (error || start >= end) + goto out; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); } else { + if (start >= end) + goto out_unlock; vma = vma->vm_next; } - } while (vma && !done); + } out_unlock: - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); out: - return error; + return error ? : unmapped_error; } diff -uprN linux-2.6.18/mm/oom_kill.c linux-2.6.18.ovz/mm/oom_kill.c --- linux-2.6.18/mm/oom_kill.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/oom_kill.c 2007-06-13 06:55:07.000000000 -0400 @@ -17,11 +17,16 @@ #include #include +#include +#include #include #include #include #include +#include +#include + int sysctl_panic_on_oom; /* #define DEBUG */ @@ -179,15 +184,15 @@ static inline int constrained_alloc(stru * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints) +struct task_struct *oom_select_bad_process(struct user_beancounter *ub) { struct task_struct *g, *p; struct task_struct *chosen = NULL; struct timespec uptime; - *ppoints = 0; + unsigned long chosen_points = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + do_each_thread_all(g, p) { unsigned long points; int releasing; @@ -199,6 +204,8 @@ static struct task_struct *select_bad_pr /* If p's nodes don't overlap ours, it won't help to kill p. */ if (!cpuset_excl_nodes_overlap(p)) continue; + if (ub_oom_task_skip(ub, p)) + continue; /* * This is in the process of releasing memory so wait for it @@ -212,11 +219,11 @@ static struct task_struct *select_bad_pr return p; points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > chosen_points || !chosen) { chosen = p; - *ppoints = points; + chosen_points = points; } - } while_each_thread(g, p); + } while_each_thread_all(g, p); return chosen; } @@ -253,13 +260,16 @@ static void __oom_kill_task(struct task_ set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); + ub_oom_task_killed(p); } static int oom_kill_task(struct task_struct *p, const char *message) { struct mm_struct *mm; + struct user_beancounter *ub; struct task_struct *g, *q; + task_lock(p); mm = p->mm; /* WARNING: mm may not be dereferenced since we did not obtain its @@ -271,30 +281,36 @@ static int oom_kill_task(struct task_str * However, this is of no concern to us. */ - if (mm == NULL || mm == &init_mm) + if (mm == NULL || mm == &init_mm) { + task_unlock(p); return 1; + } + + ub = get_beancounter(mm_ub(mm)); + task_unlock(p); __oom_kill_task(p, message); /* * kill all processes that share the ->mm (i.e.
all threads), * but are in a different thread group */ - do_each_thread(g, q) + do_each_thread_all(g, q) { if (q->mm == mm && q->tgid != p->tgid) __oom_kill_task(q, message); - while_each_thread(g, q); + } while_each_thread_all(g, q); + ub_oom_mm_killed(ub); + put_beancounter(ub); return 0; } -static int oom_kill_process(struct task_struct *p, unsigned long points, - const char *message) +int oom_kill_process(struct task_struct *p, const char *message) { struct task_struct *c; struct list_head *tsk; - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " - "children.\n", p->pid, p->comm, points); + printk(KERN_ERR "Out of Memory: Kill process %d (%s) and children.\n", + p->pid, p->comm); /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); @@ -317,16 +333,26 @@ static int oom_kill_process(struct task_ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct task_struct *p; - unsigned long points = 0; + struct user_beancounter *ub; + + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) + & (NOTIFY_OK | NOTIFY_FAIL)) + return; + + ub = NULL; + + cpuset_lock(); + if (ub_oom_lock()) + goto out_cpuset; if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order); dump_stack(); show_mem(); + show_slab_info(); } - cpuset_lock(); read_lock(&tasklist_lock); /* @@ -335,36 +361,41 @@ void out_of_memory(struct zonelist *zone */ switch (constrained_alloc(zonelist, gfp_mask)) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, points, - "No available memory (MPOL_BIND)"); + oom_kill_process(current, "No available memory (MPOL_BIND)"); break; case CONSTRAINT_CPUSET: - oom_kill_process(current, points, - "No available memory in cpuset"); + oom_kill_process(current, "No available memory in cpuset"); break; case CONSTRAINT_NONE: if (sysctl_panic_on_oom) panic("out of memory. panic_on_oom is selected\n"); retry: + put_beancounter(ub); + /* * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points); + ub = ub_oom_select_worst(); + p = oom_select_bad_process(ub); if (PTR_ERR(p) == -1UL) goto out; /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { + if (ub != NULL) + goto retry; + read_unlock(&tasklist_lock); + ub_oom_unlock(); cpuset_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, points, "Out of memory")) + if (oom_kill_process(p, "Out of memory")) goto retry; break; @@ -372,6 +403,10 @@ retry: out: read_unlock(&tasklist_lock); + ub_oom_unlock(); + put_beancounter(ub); + +out_cpuset: cpuset_unlock(); /* diff -uprN linux-2.6.18/mm/page-writeback.c linux-2.6.18.ovz/mm/page-writeback.c --- linux-2.6.18/mm/page-writeback.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/page-writeback.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include +#include +#include + /* * The maximum number of pages to writeout in a single bdflush/kupdate * operation. 
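The OOM rework above keeps the victim search a plain maximum scan: every thread gets a badness() score and the highest scorer wins, with the new ub_oom_task_skip() filter restricting candidates to the beancounter that ub_oom_select_worst() picked, and the retry loop falling back to a NULL (unrestricted) beancounter when that set turns out empty. The selection core, minus the kernel-side filters, is just this (toy data, made-up scores):

#include <stdio.h>

struct task { const char *comm; unsigned long points; };

/* models oom_select_bad_process(): scan everything, keep the worst */
static struct task *select_victim(struct task *t, int n)
{
        struct task *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                unsigned long points = t[i].points;   /* badness(p, uptime) */
                if (points > chosen_points || !chosen) {
                        chosen = &t[i];
                        chosen_points = points;
                }
        }
        return chosen;
}

int main(void)
{
        struct task tasks[] = { {"init", 1}, {"leaky", 900}, {"shell", 40} };

        printf("%s\n", select_victim(tasks, 3)->comm);   /* leaky */
        return 0;
}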
We do this so we don't hold I_LOCK against an inode for @@ -575,12 +579,15 @@ int write_one_page(struct page *page, in .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; + struct user_beancounter *old_ub; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); + old_ub = bc_io_switch_context(page); + if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); @@ -593,6 +600,9 @@ int write_one_page(struct page *page, in } else { unlock_page(page); } + + bc_io_restore_context(old_ub); + return ret; } EXPORT_SYMBOL(write_one_page); @@ -614,6 +624,9 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { + int acct; + + acct = 0; if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -623,9 +636,11 @@ int __set_page_dirty_nobuffers(struct pa mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + acct = 1; + } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -635,6 +650,8 @@ int __set_page_dirty_nobuffers(struct pa __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); } return 1; } @@ -712,8 +729,13 @@ int test_clear_page_dirty(struct page *p radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { __dec_zone_page_state(page, NR_FILE_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, + flags); + ub_io_release_context(page, 0); + return 1; + } write_unlock_irqrestore(&mapping->tree_lock, flags); return 1; } @@ -744,8 +766,10 @@ int clear_page_dirty_for_io(struct page if (mapping) { if (TestClearPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); + ub_io_release_context(page, PAGE_CACHE_SIZE); + } return 1; } return 0; diff -uprN linux-2.6.18/mm/page_alloc.c linux-2.6.18.ovz/mm/page_alloc.c --- linux-2.6.18/mm/page_alloc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/page_alloc.c 2007-06-13 06:55:07.000000000 -0400 @@ -42,6 +42,9 @@ #include #include "internal.h" +#include +#include + /* * MCD - HACK: Find somewhere to initialize this EARLY, or make this * initializer cleaner @@ -71,6 +74,7 @@ static void __free_pages_ok(struct page */ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; +EXPORT_SYMBOL(nr_swap_pages); EXPORT_SYMBOL(totalram_pages); /* @@ -391,8 +395,11 @@ static inline int free_pages_check(struc 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); - if (PageDirty(page)) + if (PageDirty(page)) { + ub_io_release_context(page, 0); __ClearPageDirty(page); + } else + ub_io_release_debug(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. 
But we shall soon need @@ -454,6 +461,7 @@ static void __free_pages_ok(struct page return; kernel_map_pages(page, 1 << order, 0); + ub_page_uncharge(page, order); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); @@ -550,7 +558,8 @@ static int prep_new_page(struct page *pa page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); + 1 << PG_checked | 1 << PG_mappedtodisk | + 1 << PG_checkpointed); set_page_private(page, 0); set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); @@ -727,6 +736,7 @@ static void fastcall free_hot_cold_page( kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + ub_page_uncharge(page, 0); local_irq_save(flags); __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); @@ -903,6 +913,28 @@ get_page_from_freelist(gfp_t gfp_mask, u return page; } +static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order, + struct page *page, cycles_t time) +{ + int ind; + unsigned long flags; + + time = (jiffies - time) * cycles_per_jiffy; + if (!(gfp_mask & __GFP_WAIT)) + ind = 0; + else if (!(gfp_mask & __GFP_HIGHMEM)) + ind = (order > 0 ? 2 : 1); + else + ind = (order > 0 ? 4 : 3); + spin_lock_irqsave(&kstat_glb_lock, flags); + KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); + if (!page) + kstat_glob.alloc_fails[ind]++; + spin_unlock_irqrestore(&kstat_glb_lock, flags); +} + +int alloc_fail_warn; + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -918,6 +950,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i int do_retry; int alloc_flags; int did_some_progress; + cycles_t start; might_sleep_if(wait); @@ -929,6 +962,7 @@ restart: return NULL; } + start = jiffies; page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) @@ -968,6 +1002,7 @@ restart: if (page) goto got_pg; +rebalance: /* This allocation should allow future memory freeing. */ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) @@ -991,7 +1026,6 @@ nofail_alloc: if (!wait) goto nopage; -rebalance: cond_resched(); /* We now go into synchronous reclaim */ @@ -1048,14 +1082,23 @@ rebalance: } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + __alloc_collect_stats(gfp_mask, order, NULL, start); + if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && + printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." 
" order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); show_mem(); } + return NULL; + got_pg: + __alloc_collect_stats(gfp_mask, order, page, start); + if (ub_page_charge(page, order, gfp_mask)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1139,6 +1182,19 @@ unsigned int nr_free_pages(void) EXPORT_SYMBOL(nr_free_pages); +unsigned int nr_free_lowpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_NORMAL].free_pages; + + return pages; +} +EXPORT_SYMBOL(nr_free_lowpages); + + #ifdef CONFIG_NUMA unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) { @@ -1673,6 +1729,8 @@ void __meminit memmap_init_zone(unsigned for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); init_page_count(page); @@ -1845,8 +1903,10 @@ static inline void free_zone_pagesets(in for_each_zone(zone) { struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + /* Free per_cpu_pageset if it is slab allocated */ + if (pset != &boot_pageset[cpu]) + kfree(pset); zone_pcp(zone, cpu) = NULL; - kfree(pset); } } @@ -2008,6 +2068,7 @@ static void __meminit free_area_init_cor #ifdef CONFIG_NUMA zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) / 100; + zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); @@ -2016,7 +2077,7 @@ static void __meminit free_area_init_cor zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); @@ -2318,6 +2379,22 @@ int sysctl_min_unmapped_ratio_sysctl_han sysctl_min_unmapped_ratio) / 100; return 0; } + +int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_slab_pages = (zone->present_pages * + sysctl_min_slab_ratio) / 100; + return 0; +} #endif /* diff -uprN linux-2.6.18/mm/readahead.c linux-2.6.18.ovz/mm/readahead.c --- linux-2.6.18/mm/readahead.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/readahead.c 2007-06-13 06:55:07.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) @@ -143,6 +144,7 @@ int read_cache_pages(struct address_spac page_cache_release(page); continue; } + task_io_account_read(PAGE_CACHE_SIZE); ret = filler(data, page); if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add(&lru_pvec); diff -uprN linux-2.6.18/mm/rmap.c linux-2.6.18.ovz/mm/rmap.c --- linux-2.6.18/mm/rmap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/rmap.c 2007-06-13 06:55:07.000000000 -0400 @@ -54,6 +54,9 @@ #include #include +#include +#include + #include struct kmem_cache *anon_vma_cachep; @@ -115,6 +118,7 @@ int anon_vma_prepare(struct vm_area_stru } return 0; } +EXPORT_SYMBOL_GPL(anon_vma_prepare); void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { @@ -143,6 +147,7 @@ void anon_vma_link(struct vm_area_struct spin_unlock(&anon_vma->lock); } } +EXPORT_SYMBOL_GPL(anon_vma_link); void anon_vma_unlink(struct vm_area_struct *vma) { @@ -179,14 
+184,15 @@ static void anon_vma_ctor(void *data, st void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, + anon_vma_ctor, NULL); } /* * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; @@ -204,6 +210,7 @@ out: rcu_read_unlock(); return anon_vma; } +EXPORT_SYMBOL_GPL(page_lock_anon_vma); /* * At what user virtual address is page expected in vma? @@ -531,6 +538,13 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); + + /* + * Well, when a page is unmapped, we cannot keep PG_checkpointed + * flag, it is not accessible via process VM and we have no way + * to reset its state + */ + ClearPageCheckpointed(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); } @@ -622,6 +636,9 @@ static int try_to_unmap_one(struct page page_remove_rmap(page); + ub_unused_privvm_inc(mm, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); page_cache_release(page); out_unmap: @@ -712,6 +729,9 @@ static void try_to_unmap_cluster(unsigne set_page_dirty(page); page_remove_rmap(page); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff -uprN linux-2.6.18/mm/shmem.c linux-2.6.18.ovz/mm/shmem.c --- linux-2.6.18/mm/shmem.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/shmem.c 2007-06-13 06:55:07.000000000 -0400 @@ -50,6 +50,8 @@ #include #include +#include + /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994 @@ -211,7 +213,7 @@ static void shmem_free_blocks(struct ino * * It has to be called with the spinlock held. 
*/ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long swp_freed) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; @@ -221,6 +223,8 @@ static void shmem_recalc_inode(struct in info->alloced -= freed; shmem_unacct_blocks(info->flags, freed); shmem_free_blocks(inode, freed); + if (freed > swp_freed) + ub_tmpfs_respages_sub(info, freed - swp_freed); } } @@ -326,6 +330,11 @@ static void shmem_swp_set(struct shmem_i struct page *page = kmap_atomic_to_page(entry); set_page_private(page, page_private(page) + incdec); } + + if (incdec == 1) + ub_tmpfs_respages_dec(info); + else + ub_tmpfs_respages_inc(info); } /* @@ -342,14 +351,24 @@ static swp_entry_t *shmem_swp_alloc(stru struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct page *page = NULL; swp_entry_t *entry; + unsigned long ub_val; if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return ERR_PTR(-EINVAL); + ub_val = 0; + if (info->next_index <= index) { + ub_val = index + 1 - info->next_index; + if (ub_shmpages_charge(info, ub_val)) + return ERR_PTR(-ENOSPC); + } + while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); + if (sgp == SGP_READ) { + entry = shmem_swp_map(ZERO_PAGE(0)); + goto out; + } /* * Test free_blocks against 1 not 0, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: @@ -359,7 +378,8 @@ static swp_entry_t *shmem_swp_alloc(stru spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); - return ERR_PTR(-ENOSPC); + entry = ERR_PTR(-ENOSPC); + goto out; } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; @@ -367,31 +387,43 @@ static swp_entry_t *shmem_swp_alloc(stru } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | + __GFP_ZERO | __GFP_UBC); if (page) set_page_private(page, 0); spin_lock(&info->lock); if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); + entry = ERR_PTR(-ENOMEM); + goto out_block; } if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { entry = ERR_PTR(-EINVAL); - break; + goto out_dir; } - if (info->next_index <= index) + if (info->next_index <= index) { + ub_val = 0; info->next_index = index + 1; + } } if (page) { /* another task gave its page, or truncated the file */ shmem_free_blocks(inode, 1); shmem_dir_free(page); } - if (info->next_index <= index && !IS_ERR(entry)) + if (info->next_index <= index) info->next_index = index + 1; return entry; + +out_dir: + shmem_dir_free(page); +out_block: + shmem_free_blocks(inode, 1); +out: + if (ub_val) + ub_shmpages_uncharge(info, ub_val); + return entry; } /* @@ -484,6 +516,7 @@ static void shmem_truncate_range(struct return; spin_lock(&info->lock); + ub_shmpages_uncharge(info, info->next_index - idx); info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; @@ -510,7 +543,12 @@ static void shmem_truncate_range(struct size = SHMEM_NR_DIRECT; nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); } - if (!topdir) + + /* + * If there are no indirect blocks or we are punching a hole + * below indirect blocks, nothing to be done. 
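shmem_swp_alloc() above charges for index-space growth up front: when the faulting index lies at or beyond next_index, the difference index + 1 - next_index is charged through ub_shmpages_charge(), and the charge is either kept (ub_val is zeroed once next_index is advanced) or handed back through ub_shmpages_uncharge() on the error paths. The bookkeeping in isolation, as a compilable sketch with invented helper names:

#include <stdio.h>

static long charged;

static int shmpages_charge(long v)    { charged += v; return 0; }
static void shmpages_uncharge(long v) { charged -= v; }

/* models the ub_val logic of shmem_swp_alloc() */
static int touch_index(unsigned long *next_index, unsigned long index, int fail)
{
        long ub_val = 0;

        if (*next_index <= index) {
                ub_val = index + 1 - *next_index;
                if (shmpages_charge(ub_val))
                        return -1;
        }
        if (fail)
                goto out;                /* allocation failed somewhere */

        if (*next_index <= index) {
                *next_index = index + 1; /* growth succeeded ... */
                ub_val = 0;              /* ... so the charge is kept */
        }
out:
        if (ub_val)
                shmpages_uncharge(ub_val);
        return fail ? -1 : 0;
}

int main(void)
{
        unsigned long next = 4;

        touch_index(&next, 9, 0);        /* charges 6 pages, keeps them */
        touch_index(&next, 20, 1);       /* charges 11, gives them back */
        printf("next=%lu charged=%ld\n", next, charged);  /* next=10 charged=6 */
        return 0;
}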
+ */ + if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) goto done2; BUG_ON(limit <= SHMEM_NR_DIRECT); @@ -613,7 +651,7 @@ done2: info->swapped -= nr_swaps_freed; if (nr_pages_to_free) shmem_free_blocks(inode, nr_pages_to_free); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, nr_swaps_freed); spin_unlock(&info->lock); /* @@ -696,6 +734,7 @@ static void shmem_delete_inode(struct in sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } + shmi_ub_put(info); clear_inode(inode); } @@ -817,6 +856,12 @@ int shmem_unuse(swp_entry_t entry, struc return found; } +#ifdef CONFIG_USER_RESOURCE +#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) +#else +#define shm_get_swap_page(info) (get_swap_page(NULL)) +#endif + /* * Move the page from the page cache to the swap cache. */ @@ -837,12 +882,12 @@ static int shmem_writepage(struct page * info = SHMEM_I(inode); if (info->flags & VM_LOCKED) goto redirty; - swap = get_swap_page(); + swap = shm_get_swap_page(info); if (!swap.val) goto redirty; spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); goto unlock; @@ -1030,7 +1075,7 @@ repeat: goto failed; spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) { spin_unlock(&info->lock); @@ -1198,6 +1243,7 @@ repeat: spin_unlock(&info->lock); flush_dcache_page(filepage); SetPageUptodate(filepage); + ub_tmpfs_respages_inc(info); } done: if (*pagep != filepage) { @@ -1299,28 +1345,6 @@ shmem_get_policy(struct vm_area_struct * } #endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) -{ - struct inode *inode = file->f_dentry->d_inode; - struct shmem_inode_info *info = SHMEM_I(inode); - int retval = -ENOMEM; - - spin_lock(&info->lock); - if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) - goto out_nomem; - info->flags |= VM_LOCKED; - } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); - info->flags &= ~VM_LOCKED; - } - retval = 0; -out_nomem: - spin_unlock(&info->lock); - return retval; -} - int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); @@ -1357,6 +1381,7 @@ shmem_get_inode(struct super_block *sb, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); + shmi_ub_set(info, get_exec_ub()); spin_lock_init(&info->lock); INIT_LIST_HEAD(&info->swaplist); @@ -2222,6 +2247,10 @@ static struct vm_operations_struct shmem #endif }; +int is_shmem_mapping(struct address_space *map) +{ + return (map != NULL && map->a_ops == &shmem_aops); +} static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) @@ -2229,13 +2258,19 @@ static int shmem_get_sb(struct file_syst return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } -static struct file_system_type tmpfs_fs_type = { +struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; +EXPORT_SYMBOL(tmpfs_fs_type); + +#ifdef CONFIG_VE +#define shm_mnt (get_exec_env()->shmem_mnt) +#else static struct vfsmount *shm_mnt; +#endif static int __init init_tmpfs(void) { @@ -2270,6 +2305,36 @@ out3: } module_init(init_tmpfs) +static inline int shm_charge_ahead(struct inode *inode) +{ +#ifdef 
CONFIG_USER_RESOURCE + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + swp_entry_t *entry; + + if (!inode->i_size) + return 0; + idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + /* + * Just touch info to allocate space for entry and + * make all UBC checks + */ + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, SGP_CACHE); + if (IS_ERR(entry)) + goto err; + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + return 0; + +err: + spin_unlock(&info->lock); + return PTR_ERR(entry); +#else + return 0; +#endif +} + /* * shmem_file_setup - get an unlinked file living in tmpfs * @@ -2317,6 +2382,10 @@ struct file *shmem_file_setup(char *name d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + error = shm_charge_ahead(inode); + if (error) + goto close_file; + file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_mapping = inode->i_mapping; @@ -2332,6 +2401,7 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /* * shmem_zero_setup - setup a shared anonymous mapping @@ -2349,6 +2419,8 @@ int shmem_zero_setup(struct vm_area_stru if (vma->vm_file) fput(vma->vm_file); + else if (vma->vm_flags & VM_WRITE) + __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; return 0; diff -uprN linux-2.6.18/mm/slab.c linux-2.6.18.ovz/mm/slab.c --- linux-2.6.18/mm/slab.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/slab.c 2007-06-13 06:55:07.000000000 -0400 @@ -108,32 +108,19 @@ #include #include #include +#include +#include #include #include #include #include -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, - * SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ +#include -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif +#define DEBUG SLAB_DEBUG +#define STATS SLAB_STATS +#define FORCED_DEBUG SLAB_FORCED_DEBUG /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -176,131 +163,17 @@ SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE) #endif /* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. - * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. 
- * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256. - */ - -typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) - -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU. This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking. We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. - */ -struct slab_rcu { - struct rcu_head head; - struct kmem_cache *cachep; - void *addr; -}; - -/* - * struct array_cache - * - * Purpose: - * - LIFO ordering, to hand out cache-warm objects from _alloc - * - reduce the number of linked list operations - * - reduce spinlock operations - * - * The limit is stored in the per-cpu structure to reduce the data cache - * footprint. - * - */ -struct array_cache { - unsigned int avail; - unsigned int limit; - unsigned int batchcount; - unsigned int touched; - spinlock_t lock; - void *entry[0]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - * [0] is for gcc 2.95. It should really be []. - */ -}; - -/* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - -/* - * The slab lists for all objects. - */ -struct kmem_list3 { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -}; - -/* * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) @@ -371,82 +244,6 @@ static void kmem_list3_init(struct kmem_ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache. 
- */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; -/* 3) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor) (void *, struct kmem_cache *, unsigned long); - - /* de-constructor func */ - void (*dtor) (void *, struct kmem_cache *, unsigned long); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif -}; - -#define CFLGS_OFF_SLAB (0x80000000UL) -#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) - #define BATCHREFILL_LIMIT 16 /* * Optimization question: fewer reaps means less probability for unnessary @@ -458,12 +255,14 @@ struct kmem_cache { #define REAPTIMEOUT_CPUC (2*HZ) #define REAPTIMEOUT_LIST3 (4*HZ) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_INC_SHRUNK(x) ((x)->shrunk++) + #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -486,8 +285,6 @@ struct kmem_cache { #define STATS_INC_ACTIVE(x) do { } while (0) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -579,67 +376,14 @@ static void **dbg_userword(struct kmem_c static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; /* - * Functions for storing/retrieving the cachep and or slab from the page - * allocator. These are used to find the slab an obj belongs to. With kfree(), - * these are used to find the cache which an obj belongs to. 
- */ -static inline void page_set_cache(struct page *page, struct kmem_cache *cache) -{ - page->lru.next = (struct list_head *)cache; -} - -static inline struct kmem_cache *page_get_cache(struct page *page) -{ - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); - BUG_ON(!PageSlab(page)); - return (struct kmem_cache *)page->lru.next; -} - -static inline void page_set_slab(struct page *page, struct slab *slab) -{ - page->lru.prev = (struct list_head *)slab; -} - -static inline struct slab *page_get_slab(struct page *page) -{ - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); - BUG_ON(!PageSlab(page)); - return (struct slab *)page->lru.prev; -} - -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ - struct page *page = virt_to_page(obj); - return page_get_cache(page); -} - -static inline struct slab *virt_to_slab(const void *obj) -{ - struct page *page = virt_to_page(obj); - return page_get_slab(page); -} - -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, - unsigned int idx) -{ - return slab->s_mem + cache->buffer_size * idx; -} - -static inline unsigned int obj_to_index(struct kmem_cache *cache, - struct slab *slab, void *obj) -{ - return (unsigned)(obj - slab->s_mem) / cache->buffer_size; -} - -/* * These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include CACHE(ULONG_MAX) +#include + CACHE(ULONG_MAX) #undef CACHE }; EXPORT_SYMBOL(malloc_sizes); @@ -653,10 +397,17 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include + {NULL,}, +#undef CACHE +#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" }, +#include {NULL,} #undef CACHE }; +int malloc_cache_num; +EXPORT_SYMBOL(malloc_cache_num); + static struct arraycache_init initarray_cache __initdata = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = @@ -708,6 +459,7 @@ static inline void init_lock_keys(struct /* Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; +static spinlock_t cache_chain_lock; /* * vm_enough_memory() looks at this to determine how many slab-allocated pages @@ -748,6 +500,8 @@ static inline struct kmem_cache *__find_ { struct cache_sizes *csizep = malloc_sizes; + if (gfpflags & __GFP_UBC) + csizep += malloc_cache_num; #if DEBUG /* This happens if someone tries to call * kmem_cache_create(), or __kmalloc(), before @@ -774,9 +528,17 @@ struct kmem_cache *kmem_find_general_cac } EXPORT_SYMBOL(kmem_find_general_cachep); -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) +{ + size_t size_noub; + + size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); + return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); +} + +static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); } /* @@ -821,20 +583,23 @@ static void cache_estimate(unsigned long * into account. 
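With SLAB_UBC in place, every kmalloc size class now exists twice: malloc_sizes[] and cache_names[] carry a second "(UBC)" copy of each entry, malloc_cache_num records where that second half begins, and __find_general_cachep() simply offsets into it when __GFP_UBC is set. A toy version of the lookup, with four size classes and an invented flag value:

#include <stdio.h>

#define GFP_UBC 0x1000   /* invented bit, stands in for __GFP_UBC */

/* every size appears twice: plain caches first, then the "(UBC)" copies */
static const unsigned long sizes[] = { 32, 64, 128, ~0UL, 32, 64, 128, ~0UL };
static const int malloc_cache_num = 4;        /* offset of the UBC half */

static int find_general_cachep(unsigned long size, unsigned flags)
{
        int i = (flags & GFP_UBC) ? malloc_cache_num : 0;

        while (sizes[i] < size)
                i++;                    /* first size class that fits */
        return i;
}

int main(void)
{
        printf("%d %d\n",
               find_general_cachep(100, 0),         /* 2: plain size-128 */
               find_general_cachep(100, GFP_UBC));  /* 6: size-128(UBC) */
        return 0;
}

Keeping charged and uncharged objects in separate caches means a whole slab page belongs to one accounting class, which is what makes per-object charging tractable.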
*/ nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + (buffer_size + sizeof(kmem_bufctl_t) + + UB_EXTRA(flags)); /* * This calculated number will be either the right * amount, or one greater than what we want. */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) + if (slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size) nr_objs--; + BUG_ON(slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size); if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); + mgmt_size = slab_mgmt_size(nr_objs, align, flags); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -867,7 +632,7 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + per_cpu(reap_node, cpu) = node; } static void next_reap_node(void) @@ -1312,6 +1077,7 @@ static void init_list(struct kmem_cache cachep->nodelists[nodeid] = ptr; local_irq_enable(); } +static int offslab_limit; /* * Initialisation. Called after the page allocator have been initialised and @@ -1360,6 +1126,7 @@ void __init kmem_cache_init(void) /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); + spin_lock_init(&cache_chain_lock); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; @@ -1393,7 +1160,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS | SLAB_PANIC, NULL, NULL); if (INDEX_AC != INDEX_L3) { @@ -1401,12 +1168,13 @@ void __init kmem_cache_init(void) kmem_cache_create(names[INDEX_L3].name, sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS | SLAB_PANIC, NULL, NULL); } slab_early_init = 0; + for (i = 0; i < 2; i++) { while (sizes->cs_size != ULONG_MAX) { /* * For performance, all the general caches are L1 aligned. @@ -1419,20 +1187,29 @@ void __init kmem_cache_init(void) sizes->cs_cachep = kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS | SLAB_PANIC | + (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE, NULL, NULL); } init_lock_keys(sizes); + if (!(OFF_SLAB(sizes->cs_cachep))) + offslab_limit = sizes->cs_size; sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| - SLAB_PANIC, - NULL, NULL); + (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE| + SLAB_PANIC, NULL, NULL); sizes++; names++; } + + sizes++; + names++; + if (!i) + malloc_cache_num = sizes - malloc_sizes; + } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; @@ -1850,7 +1627,6 @@ static void set_up_list3s(struct kmem_ca static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1863,15 +1639,10 @@ static size_t calculate_slab_order(struc continue; if (flags & CFLGS_OFF_SLAB) { - /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). 
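cache_estimate() and calculate_slab_order() above now size the management area through slab_mgmt_size_noalign(), which appends per-object UBC state after the bufctl array: the struct slab plus nr_objs bufctls, rounded up to UB_ALIGN(flags), plus nr_objs * UB_EXTRA(flags). A standalone rendering of that formula; the constants below are invented for illustration, since the patch only shows that both macros depend on flags:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* stand-ins: a 32-byte struct slab, 4-byte bufctls, and an assumed
 * 8-byte UBC alignment with one pointer of extra state per object */
#define SLAB_HDR   32UL
#define BUFCTL_SZ   4UL
#define UB_ALIGN    8UL
#define UB_EXTRA    8UL

static unsigned long slab_mgmt_size_noalign(unsigned long nr_objs)
{
        unsigned long size_noub = SLAB_HDR + nr_objs * BUFCTL_SZ;

        return ALIGN(size_noub, UB_ALIGN) + nr_objs * UB_EXTRA;
}

int main(void)
{
        for (unsigned long n = 1; n <= 4; n++)
                printf("nr_objs=%lu mgmt=%lu\n", n, slab_mgmt_size_noalign(n));
        return 0;
}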
- */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + int slab_size; - if (num > offslab_limit) + slab_size = slab_mgmt_size_noalign(num, flags); + if (slab_size > offslab_limit) break; } @@ -2170,8 +1941,7 @@ kmem_cache_create (const char *name, siz cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = slab_mgmt_size(cachep->num, align, flags); /* * If the slab has been placed off-slab, and we have enough space then @@ -2184,8 +1954,7 @@ kmem_cache_create (const char *name, siz if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + slab_size = slab_mgmt_size_noalign(cachep->num, flags); } cachep->colour_off = cache_line_size(); @@ -2210,7 +1979,10 @@ kmem_cache_create (const char *name, siz setup_cpu_cache(cachep); /* cache setup completed, link it into the list */ + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); + set_cache_objuse(cachep); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", @@ -2321,6 +2093,7 @@ static int drain_freelist(struct kmem_ca BUG_ON(slabp->inuse); #endif list_del(&slabp->list); + STATS_INC_SHRUNK(cache); /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2402,13 +2175,17 @@ int kmem_cache_destroy(struct kmem_cache /* * the chain is never empty, cache_cache is never destroyed */ + spin_lock(&cache_chain_lock); list_del(&cachep->next); + spin_unlock(&cache_chain_lock); mutex_unlock(&cache_chain_mutex); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); mutex_lock(&cache_chain_mutex); + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); mutex_unlock(&cache_chain_mutex); unlock_cpu_hotplug(); return 1; @@ -2429,6 +2206,8 @@ int kmem_cache_destroy(struct kmem_cache kfree(l3); } } + + ub_kmemcache_free(cachep); kmem_cache_free(&cache_cache, cachep); unlock_cpu_hotplug(); return 0; @@ -2445,7 +2224,7 @@ static struct slab *alloc_slabmgmt(struc if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags, nodeid); + local_flags & (~__GFP_UBC), nodeid); if (!slabp) return NULL; } else { @@ -2456,14 +2235,10 @@ static struct slab *alloc_slabmgmt(struc slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; + init_slab_ubps(cachep, slabp); return slabp; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) -{ - return (kmem_bufctl_t *) (slabp + 1); -} - static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp, unsigned long ctor_flags) { @@ -2641,7 +2416,7 @@ static int cache_grow(struct kmem_cache * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. 
*/ - objp = kmem_getpages(cachep, flags, nodeid); + objp = kmem_getpages(cachep, flags & (~__GFP_UBC), nodeid); if (!objp) goto failed; @@ -2999,10 +2774,15 @@ static __always_inline void *__cache_all local_irq_save(save_flags); objp = ____cache_alloc(cachep, flags); - local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); + + if (objp && ub_slab_charge(cachep, objp, flags)) { + kmem_cache_free(cachep, objp); + objp = NULL; + } + local_irq_restore(save_flags); return objp; } @@ -3118,6 +2898,7 @@ static void free_block(struct kmem_cache /* fixup slab chains */ if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { + STATS_INC_SHRUNK(cachep); l3->free_objects -= cachep->num; slab_destroy(cachep, slabp); } else { @@ -3133,6 +2914,19 @@ static void free_block(struct kmem_cache } } +void kmem_cache_free_block(kmem_cache_t *cachep, struct kmem_list3 *l3, + void **objpp, int nr_objects, int node) +{ + unsigned long flags; + + if (!nr_objects) + return; + + spin_lock_irqsave(&l3->list_lock, flags); + free_block(cachep, objpp, nr_objects, node); + spin_unlock_irqrestore(&l3->list_lock, flags); +} + static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; @@ -3195,6 +2989,8 @@ static inline void __cache_free(struct k check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + ub_slab_uncharge(cachep, objp); + if (cache_free_alien(cachep, objp)) return; @@ -3308,11 +3104,15 @@ void *kmem_cache_alloc_node(struct kmem_ ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); - local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); + if (ptr && ub_slab_charge(cachep, ptr, flags)) { + kmem_cache_free(cachep, ptr); + ptr = NULL; + } + local_irq_restore(save_flags); return ptr; } EXPORT_SYMBOL(kmem_cache_alloc_node); @@ -3378,10 +3178,10 @@ EXPORT_SYMBOL(__kmalloc_track_caller); * * @size: how many bytes of memory are required. */ -void *__alloc_percpu(size_t size) +void *__alloc_percpu_mask(size_t size, gfp_t gfp) { int i; - struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); + struct percpu_data *pdata = kmalloc(sizeof(*pdata), gfp); if (!pdata) return NULL; @@ -3395,9 +3195,9 @@ void *__alloc_percpu(size_t size) int node = cpu_to_node(i); if (node_online(node)) - pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); + pdata->ptrs[i] = kmalloc_node(size, gfp, node); else - pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); + pdata->ptrs[i] = kmalloc(size, gfp); if (!pdata->ptrs[i]) goto unwind_oom; @@ -3416,7 +3216,7 @@ unwind_oom: kfree(pdata); return NULL; } -EXPORT_SYMBOL(__alloc_percpu); +EXPORT_SYMBOL(__alloc_percpu_mask); #endif /** @@ -3744,7 +3544,7 @@ static void cache_reap(void *unused) { struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); + int node; if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. 
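In the __cache_alloc() hunk above, the object is allocated first and only then charged to the beancounter; if the charge fails the object is freed and NULL returned, and the patch deliberately moves local_irq_restore() after the charge so the whole charge-or-roll-back step runs with interrupts still off. ub_slab_charge()/ub_slab_uncharge() are OpenVZ primitives; charge_quota() below is a hypothetical single-threaded stand-in for the same pattern:

#include <stdlib.h>

/* Hypothetical quota, standing in for a beancounter: 0 on success,
 * -1 when the charge would exceed the limit. */
static long quota_used;
static const long quota_limit = 1L << 20;

static int charge_quota(size_t size)
{
	if (quota_used + (long)size > quota_limit)
		return -1;
	quota_used += (long)size;
	return 0;
}

static void uncharge_quota(size_t size)
{
	quota_used -= (long)size;
}

/* Allocate first, then charge; undo the allocation if the charge
 * fails so the caller just sees NULL, like the patched slab path. */
static void *alloc_charged(size_t size)
{
	void *p = malloc(size);

	if (p && charge_quota(size) != 0) {
		free(p);
		p = NULL;
	}
	return p;
}

/* Mirror of the __cache_free() side: uncharge before freeing. */
static void free_charged(void *p, size_t size)
{
	if (p) {
		uncharge_quota(size);
		free(p);
	}
}

int main(void)
{
	void *obj = alloc_charged(4096);

	free_charged(obj, 4096);
	return 0;
}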
*/ @@ -3753,8 +3553,11 @@ static void cache_reap(void *unused) return; } + {KSTAT_PERF_ENTER(cache_reap) + preempt_disable(); list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); + node = numa_node_id(); /* * We only take the l3 lock if absolutely necessary and we @@ -3788,14 +3591,18 @@ static void cache_reap(void *unused) STATS_ADD_REAPED(searchp, freed); } next: + preempt_enable_no_resched(); cond_resched(); + preempt_disable(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); refresh_cpu_vm_stats(smp_processor_id()); + KSTAT_PERF_LEAVE(cache_reap)} /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + preempt_enable(); } #ifdef CONFIG_PROC_FS @@ -3817,12 +3624,80 @@ static void print_slabinfo_header(struct seq_puts(m, " : slabdata "); #if STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); } +#define SHOW_TOP_SLABS 10 + +static unsigned long get_cache_size(struct kmem_cache *cachep) +{ + unsigned long flags; + unsigned long slabs; + struct kmem_list3 *l3; + struct list_head *lh; + int node; + + slabs = 0; + + for_each_online_node (node) { + l3 = cachep->nodelists[node]; + if (l3 == NULL) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each (lh, &l3->slabs_full) + slabs++; + list_for_each (lh, &l3->slabs_partial) + slabs++; + list_for_each (lh, &l3->slabs_free) + slabs++; + spin_unlock_irqrestore(&l3->list_lock, flags); + } + + return slabs * (PAGE_SIZE << cachep->gfporder) + + (OFF_SLAB(cachep) ? + cachep->slabp_cache->buffer_size * slabs : 0); +} + +void show_slab_info(void) +{ + int i, j; + unsigned long size; + struct kmem_cache *ptr; + unsigned long sizes[SHOW_TOP_SLABS]; + struct kmem_cache *top[SHOW_TOP_SLABS]; + + memset(top, 0, sizeof(top)); + memset(sizes, 0, sizeof(sizes)); + + printk("Top %d caches:\n", SHOW_TOP_SLABS); + + spin_lock(&cache_chain_lock); + list_for_each_entry (ptr, &cache_chain, next) { + size = get_cache_size(ptr); + + j = 0; + for (i = 1; i < SHOW_TOP_SLABS; i++) + if (sizes[i] < sizes[j]) + j = i; + + if (size > sizes[j]) { + sizes[j] = size; + top[j] = ptr; + } + } + + for (i = 0; i < SHOW_TOP_SLABS; i++) + if (top[i]) + printk("%-21s: size %10lu objsize %10u\n", + top[i]->name, sizes[i], + top[i]->buffer_size); + spin_unlock(&cache_chain_lock); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; @@ -3910,7 +3785,7 @@ static int s_show(struct seq_file *m, vo if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", @@ -3923,6 +3798,7 @@ static int s_show(struct seq_file *m, vo unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; unsigned long reaped = cachep->reaped; + unsigned long shrunk = cachep->shrunk; unsigned long errors = cachep->errors; unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; @@ -3930,9 +3806,10 @@ static int s_show(struct seq_file *m, vo unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + node_frees, overflows, shrunk); } 
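show_slab_info() above selects the SHOW_TOP_SLABS largest caches without sorting: one pass over the chain, overwriting the smallest remembered slot whenever a larger cache turns up. That is O(n*k) time with fixed space, which suits a diagnostic path running under cache_chain_lock. The same selection in miniature:

#include <stdio.h>

#define TOP 3	/* stands in for SHOW_TOP_SLABS */

int main(void)
{
	const unsigned long sizes[] = { 40, 900, 12, 700, 55, 1000, 3 };
	unsigned long top[TOP] = { 0 };
	int i, j, n = (int)(sizeof(sizes) / sizeof(sizes[0]));

	for (i = 0; i < n; i++) {
		int min = 0;

		/* locate the smallest entry currently kept... */
		for (j = 1; j < TOP; j++)
			if (top[j] < top[min])
				min = j;
		/* ...and overwrite it if the new value is larger */
		if (sizes[i] > top[min])
			top[min] = sizes[i];
	}

	for (i = 0; i < TOP; i++)
		printf("top[%d] = %lu\n", i, top[i]);	/* 1000, 900, 700 */
	return 0;
}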
/* cpu stats */ { diff -uprN linux-2.6.18/mm/slob.c linux-2.6.18.ovz/mm/slob.c --- linux-2.6.18/mm/slob.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/slob.c 2007-06-13 06:55:07.000000000 -0400 @@ -345,16 +345,16 @@ EXPORT_SYMBOL(slab_reclaim_pages); #ifdef CONFIG_SMP -void *__alloc_percpu(size_t size) +void *__alloc_percpu_mask(size_t size, gfp_t gfp) { int i; - struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + struct percpu_data *pdata = kmalloc(sizeof (*pdata), gfp); if (!pdata) return NULL; for_each_possible_cpu(i) { - pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); + pdata->ptrs[i] = kmalloc(size, gfp); if (!pdata->ptrs[i]) goto unwind_oom; memset(pdata->ptrs[i], 0, size); @@ -372,7 +372,7 @@ unwind_oom: kfree(pdata); return NULL; } -EXPORT_SYMBOL(__alloc_percpu); +EXPORT_SYMBOL(__alloc_percpu_mask); void free_percpu(const void *objp) diff -uprN linux-2.6.18/mm/swap.c linux-2.6.18.ovz/mm/swap.c --- linux-2.6.18/mm/swap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/swap.c 2007-06-13 06:55:07.000000000 -0400 @@ -178,6 +178,8 @@ void fastcall lru_cache_add_active(struc put_cpu_var(lru_add_active_pvecs); } +EXPORT_SYMBOL(lru_cache_add_active); + static void __lru_add_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); @@ -196,6 +198,8 @@ void lru_add_drain(void) put_cpu(); } +EXPORT_SYMBOL(lru_add_drain); + #ifdef CONFIG_NUMA static void lru_add_drain_per_cpu(void *dummy) { diff -uprN linux-2.6.18/mm/swap_state.c linux-2.6.18.ovz/mm/swap_state.c --- linux-2.6.18/mm/swap_state.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/swap_state.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,9 @@ #include +#include +#include + /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_list, to make sync_page look nicer, and to allow @@ -43,6 +46,7 @@ struct address_space swapper_space = { .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, }; +EXPORT_SYMBOL(swapper_space); #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,14 +57,18 @@ static struct { unsigned long find_total; unsigned long noent_race; unsigned long exist_race; + unsigned long remove_race; } swap_cache_info; +EXPORT_SYMBOL(swap_cache_info); void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + printk("Swap cache: add %lu, delete %lu, find %lu/%lu, " + "race %lu+%lu+%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total, - swap_cache_info.noent_race, swap_cache_info.exist_race); + swap_cache_info.noent_race, swap_cache_info.exist_race, + swap_cache_info.remove_race); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } @@ -69,8 +77,7 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache on swapper_space, * but sets SwapCache flag and private instead of mapping and index. 
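Both the slab and slob versions of __alloc_percpu() become __alloc_percpu_mask() with an explicit gfp argument, so callers can pass a UBC-charged mask such as GFP_KERNEL_UBC instead of the hard-wired GFP_KERNEL; presumably a header wraps the old name around the new entry point, though that is outside this section. The shape of the change in a userspace analog (NCPU, GFP_ZERO and friends are stand-ins):

#include <stdlib.h>
#include <string.h>

#define NCPU 4			/* stand-in for the online-CPU set */

typedef unsigned int gfp_t;	/* stand-in flag type */
#define GFP_ZERO 0x1		/* illustrative flag, not a real gfp bit */

struct percpu_data {
	void *ptrs[NCPU];
};

/* New-style entry point: allocation behavior follows the caller's mask. */
static struct percpu_data *alloc_percpu_mask(size_t size, gfp_t gfp)
{
	struct percpu_data *pdata = malloc(sizeof(*pdata));
	int i;

	if (!pdata)
		return NULL;
	for (i = 0; i < NCPU; i++) {
		pdata->ptrs[i] = malloc(size);
		if (!pdata->ptrs[i])
			goto unwind_oom;	/* same unwind shape as the patch */
		if (gfp & GFP_ZERO)
			memset(pdata->ptrs[i], 0, size);
	}
	return pdata;

unwind_oom:
	while (--i >= 0)
		free(pdata->ptrs[i]);
	free(pdata);
	return NULL;
}

/* Old API preserved as a thin wrapper supplying the historical default. */
static struct percpu_data *alloc_percpu(size_t size)
{
	return alloc_percpu_mask(size, GFP_ZERO);
}

int main(void)
{
	struct percpu_data *pd = alloc_percpu(64);
	int i;

	if (pd) {
		for (i = 0; i < NCPU; i++)
			free(pd->ptrs[i]);
		free(pd);
	}
	return 0;
}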
*/ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask) +int __add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; @@ -95,7 +102,9 @@ static int __add_to_swap_cache(struct pa return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +EXPORT_SYMBOL(__add_to_swap_cache); + +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -117,6 +126,8 @@ static int add_to_swap_cache(struct page return 0; } +EXPORT_SYMBOL(add_to_swap_cache); + /* * This must be called only on pages that have * been verified to be in the swap cache. @@ -151,7 +162,14 @@ int add_to_swap(struct page * page, gfp_ BUG_ON(!PageLocked(page)); for (;;) { - entry = get_swap_page(); + struct user_beancounter *ub; + + ub = pb_grab_page_ub(page); + if (IS_ERR(ub)) + return 0; + + entry = get_swap_page(ub); + put_beancounter(ub); if (!entry.val) return 0; @@ -237,6 +255,7 @@ int move_from_swap_cache(struct page *pa delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); + ub_io_release_debug(page); set_page_dirty(page); } return err; @@ -252,10 +271,13 @@ int move_from_swap_cache(struct page *pa */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (!PageSwapCache(page)) + return; + if (!TestSetPageLocked(page)) { remove_exclusive_swap_page(page); unlock_page(page); - } + } else + INC_CACHE_INFO(remove_race); } /* @@ -364,3 +386,5 @@ struct page *read_swap_cache_async(swp_e page_cache_release(new_page); return found_page; } + +EXPORT_SYMBOL(read_swap_cache_async); diff -uprN linux-2.6.18/mm/swapfile.c linux-2.6.18.ovz/mm/swapfile.c --- linux-2.6.18/mm/swapfile.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/swapfile.c 2007-06-13 06:55:07.000000000 -0400 @@ -32,6 +32,8 @@ #include #include +#include + DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; long total_swap_pages; @@ -43,8 +45,12 @@ static const char Bad_offset[] = "Bad sw static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; +struct swap_info_struct swap_info[MAX_SWAPFILES]; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +EXPORT_SYMBOL(total_swap_pages); +EXPORT_SYMBOL(swap_lock); +EXPORT_SYMBOL(swap_list); +EXPORT_SYMBOL(swap_info); static DEFINE_MUTEX(swapon_mutex); @@ -171,7 +177,7 @@ no_page: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct user_beancounter *ub) { struct swap_info_struct *si; pgoff_t offset; @@ -192,6 +198,8 @@ swp_entry_t get_swap_page(void) wrapped++; } + if (si->flags & SWP_READONLY) + continue; if (!si->highest_bit) continue; if (!(si->flags & SWP_WRITEOK)) @@ -201,6 +209,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(si); if (offset) { spin_unlock(&swap_lock); + ub_swapentry_inc(si, offset, ub); return swp_entry(type, offset); } next = swap_list.next; @@ -212,6 +221,8 @@ noswap: return (swp_entry_t) {0}; } +EXPORT_SYMBOL(get_swap_page); + swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -219,7 +230,7 @@ swp_entry_t get_swap_page_of_type(int ty spin_lock(&swap_lock); si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) { nr_swap_pages--; offset = scan_swap_map(si); if (offset) { @@ -276,6 +287,7 @@ static int swap_entry_free(struct swap_i count--; p->swap_map[offset] = count; if (!count) { + 
ub_swapentry_dec(p, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -304,6 +316,8 @@ void swap_free(swp_entry_t entry) } } +EXPORT_SYMBOL(swap_free); + /* * How many references to page are currently swapped out? */ @@ -385,6 +399,55 @@ int remove_exclusive_swap_page(struct pa return retval; } +int try_to_remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_count(page) != 2) /* 2: us + cache */ + return 0; + + entry.val = page->private; + p = swap_info_get(entry); + if (!p) + return 0; + + if (!vm_swap_full() && + (p->flags & (SWP_ACTIVE|SWP_READONLY)) == SWP_ACTIVE) { + spin_unlock(&swap_lock); + return 0; + } + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + write_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == 2) && !PageWriteback(page)) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + write_unlock_irq(&swapper_space.tree_lock); + } + spin_unlock(&swap_lock); + + if (retval) { + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. @@ -425,6 +488,8 @@ void free_swap_and_cache(swp_entry_t ent } } +EXPORT_SYMBOL(free_swap_and_cache); + #ifdef CONFIG_SOFTWARE_SUSPEND /* * Find the swap type that corresponds to given device (if any) @@ -487,11 +552,17 @@ unsigned int count_swap_pages(int type, * force COW, vm_page_prot omits write permission from any private vma. */ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { - inc_mm_counter(vma->vm_mm, anon_rss); + struct mm_struct *mm; + + mm = vma->vm_mm; + inc_mm_counter(mm, anon_rss); + ub_unused_privvm_dec(mm, vma); + pb_add_ref(page, mm, pb); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at(mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); swap_free(entry); @@ -504,7 +575,8 @@ static void unuse_pte(struct vm_area_str static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; @@ -518,7 +590,7 @@ static int unuse_pte_range(struct vm_are * Test inline before going to call unuse_pte. 
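try_to_remove_exclusive_swap_page() above is an optimistic check-then-recheck pattern: the cheap unlocked tests (page count of exactly two, no writeback) run first, and the decisive test is repeated under swapper_space.tree_lock before the page is actually torn out of the swap cache. The same discipline in a compact pthread sketch:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount = 2;	/* "us + cache", as in the kernel comment */

/* Returns 1 if the cache reference was dropped, 0 otherwise. */
static int try_remove(void)
{
	int ret = 0;

	if (refcount != 2)		/* cheap unlocked test first */
		return 0;

	pthread_mutex_lock(&lock);
	if (refcount == 2) {		/* recheck under the lock */
		refcount--;		/* drop the cache's reference */
		ret = 1;
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	int first = try_remove();
	int second = try_remove();

	printf("first try: %d, second try: %d\n", first, second);
	return 0;
}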
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); + unuse_pte(vma, pte++, addr, entry, page, pb); found = 1; break; } @@ -529,7 +601,8 @@ static int unuse_pte_range(struct vm_are static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pmd_t *pmd; unsigned long next; @@ -539,7 +612,7 @@ static inline int unuse_pmd_range(struct next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (unuse_pte_range(vma, pmd, addr, next, entry, page)) + if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb)) return 1; } while (pmd++, addr = next, addr != end); return 0; @@ -547,7 +620,8 @@ static inline int unuse_pmd_range(struct static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pud_t *pud; unsigned long next; @@ -557,14 +631,15 @@ static inline int unuse_pud_range(struct next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (unuse_pmd_range(vma, pud, addr, next, entry, page)) + if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb)) return 1; } while (pud++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pgd_t *pgd; unsigned long addr, end, next; @@ -585,14 +660,15 @@ static int unuse_vma(struct vm_area_stru next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (unuse_pud_range(vma, pgd, addr, next, entry, page)) + if (unuse_pud_range(vma, pgd, addr, next, entry, page, pb)) return 1; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { struct vm_area_struct *vma; @@ -607,7 +683,7 @@ static int unuse_mm(struct mm_struct *mm lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && unuse_vma(vma, entry, page)) + if (vma->anon_vma && unuse_vma(vma, entry, page, pb)) break; } up_read(&mm->mmap_sem); @@ -673,6 +749,7 @@ static int try_to_unuse(unsigned int typ int retval = 0; int reset_overflow = 0; int shmem; + struct page_beancounter *pb; /* * When searching mms for an entry, a good strategy is to @@ -724,6 +801,13 @@ static int try_to_unuse(unsigned int typ break; } + pb = NULL; + if (pb_alloc_all(&pb)) { + page_cache_release(page); + retval = -ENOMEM; + break; + } + /* * Don't hold on to start_mm if it looks like exiting. */ @@ -746,6 +830,20 @@ static int try_to_unuse(unsigned int typ lock_page(page); wait_on_page_writeback(page); + /* If read failed we cannot map not-uptodate page to + * user space. Actually, we are in serious troubles, + * we do not even know what process to kill. So, the only + * variant remains: to stop swapoff() and allow someone + * to kill processes to zap invalid pages. + */ + if (unlikely(!PageUptodate(page))) { + pb_free_list(&pb); + unlock_page(page); + page_cache_release(page); + retval = -EIO; + break; + } + /* * Remove all references to entry. 
* Whenever we reach init_mm, there's no address space @@ -757,7 +855,7 @@ static int try_to_unuse(unsigned int typ if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else - retval = unuse_mm(start_mm, entry, page); + retval = unuse_mm(start_mm, entry, page, &pb); } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -787,7 +885,7 @@ static int try_to_unuse(unsigned int typ set_start_mm = 1; shmem = shmem_unuse(entry, page); } else - retval = unuse_mm(mm, entry, page); + retval = unuse_mm(mm, entry, page, &pb); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); @@ -801,6 +899,8 @@ static int try_to_unuse(unsigned int typ mmput(start_mm); start_mm = new_start_mm; } + + pb_free_list(&pb); if (retval) { unlock_page(page); page_cache_release(page); @@ -1146,6 +1246,10 @@ asmlinkage long sys_swapoff(const char _ int i, type, prev; int err; + /* VE admin check is just to be on the safe side, the admin may affect + * swaps only if he has access to special, i.e. if he has been granted + * access to the block device or if the swap file is in the area + * visible to him. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1245,6 +1349,7 @@ asmlinkage long sys_swapoff(const char _ spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + ub_swap_fini(p); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); @@ -1264,6 +1369,8 @@ out: return err; } +EXPORT_SYMBOL(sys_swapoff); + #ifdef CONFIG_PROC_FS /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) @@ -1590,9 +1697,16 @@ asmlinkage long sys_swapon(const char __ goto bad_swap; } + if (ub_swap_init(p, maxpages)) { + error = -ENOMEM; + goto bad_swap; + } + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); p->flags = SWP_ACTIVE; + if (swap_flags & SWAP_FLAG_READONLY) + p->flags |= SWP_READONLY; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1652,6 +1766,8 @@ out: return error; } +EXPORT_SYMBOL(sys_swapon); + void si_swapinfo(struct sysinfo *val) { unsigned int i; @@ -1711,6 +1827,8 @@ bad_file: goto out; } +EXPORT_SYMBOL(swap_duplicate); + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff -uprN linux-2.6.18/mm/truncate.c linux-2.6.18.ovz/mm/truncate.c --- linux-2.6.18/mm/truncate.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/truncate.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,6 +12,7 @@ #include #include #include +#include #include /* grr. try_to_release_page, do_invalidatepage */ @@ -42,7 +43,8 @@ truncate_complete_page(struct address_sp if (PagePrivate(page)) do_invalidatepage(page, 0); - clear_page_dirty(page); + if (test_clear_page_dirty(page)) + task_io_account_cancelled_write(PAGE_CACHE_SIZE); ClearPageUptodate(page); ClearPageMappedToDisk(page); remove_from_page_cache(page); @@ -270,9 +272,48 @@ unsigned long invalidate_inode_pages(str { return invalidate_mapping_pages(mapping, 0, ~0UL); } - EXPORT_SYMBOL(invalidate_inode_pages); +/* + * This is like invalidate_complete_page(), except it ignores the page's + * refcount. We do this because invalidate_inode_pages2() needs stronger + * invalidation guarantees, and cannot afford to leave pages behind because + * shrink_list() has a temp ref on them, or because they're transiently sitting + * in the lru_cache_add() pagevecs. 
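The comment above introduces invalidate_complete_page2(); together with do_launder_page() just below it, invalidate_inode_pages2_range() now writes a dirty page back through the filesystem's launder_page op before attempting removal, and restores the dirty bit if removal still fails. A userspace sketch of that ordering (the page struct and helpers are simplified stand-ins):

#include <stdio.h>
#include <errno.h>

struct upage { int dirty; int cached; };

/* Stand-in for do_launder_page(): write a dirty page back through the
 * filesystem, clearing the dirty state on success. */
static int launder(struct upage *p)
{
	if (!p->dirty)
		return 0;
	p->dirty = 0;		/* pretend writeback succeeded */
	return 0;
}

/* Stand-in for invalidate_complete_page2(): refuses dirty pages but,
 * unlike the plain variant, ignores transient reference counts. */
static int invalidate2(struct upage *p)
{
	if (p->dirty)
		return 0;
	p->cached = 0;		/* removed from the page cache */
	return 1;
}

static int invalidate_one(struct upage *p)
{
	int was_dirty = p->dirty;
	int ret = launder(p);	/* launder first... */

	if (ret == 0 && !invalidate2(p)) {	/* ...then invalidate */
		if (was_dirty)
			p->dirty = 1;	/* restore state on failure */
		ret = -EIO;
	}
	return ret;
}

int main(void)
{
	struct upage p = { 1, 1 };

	printf("result %d, still cached %d\n", invalidate_one(&p), p.cached);
	return 0;
}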
+ */ +static int +invalidate_complete_page2(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + + BUG_ON(PagePrivate(page)); + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -339,7 +380,8 @@ int invalidate_inode_pages2_range(struct } } was_dirty = test_clear_page_dirty(page); - if (!invalidate_complete_page(mapping, page)) { + ret = do_launder_page(mapping, page); + if (ret == 0 && !invalidate_complete_page2(mapping, page)) { if (was_dirty) set_page_dirty(page); ret = -EIO; diff -uprN linux-2.6.18/mm/vmalloc.c linux-2.6.18.ovz/mm/vmalloc.c --- linux-2.6.18/mm/vmalloc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/vmalloc.c 2007-06-13 06:55:07.000000000 -0400 @@ -20,6 +20,9 @@ #include #include +#include +#include + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -269,6 +272,70 @@ static struct vm_struct *__find_vm_area( return tmp; } +struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) +{ + unsigned long addr, best_addr, delta, best_delta; + struct vm_struct **p, **best_p, *tmp, *area; + + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + size += PAGE_SIZE; /* one-page gap at the end */ + addr = VMALLOC_START; + best_addr = 0UL; + best_p = NULL; + best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) && + (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END)); + p = &tmp->next) { + if ((unsigned long)tmp->addr < addr) + continue; + if ((size + addr) < addr) + break; + delta = (unsigned long) tmp->addr - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + addr = tmp->size + (unsigned long) tmp->addr; + if (addr > VMALLOC_END-size) + break; + } + + if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { + /* check free area after list end */ + delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + } + if (best_addr) { + area->flags = flags; + /* allocate at the end of this area */ + area->addr = (void *)(best_addr + best_delta); + area->size = size; + area->next = *best_p; + area->pages = NULL; + area->nr_pages = 0; + area->phys_addr = 0; + *best_p = area; + /* check like in __vunmap */ + WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); + } else { + kfree(area); + area = NULL; + } + write_unlock(&vmlist_lock); + + return area; +} + /* Caller must hold vmlist_lock */ struct vm_struct *__remove_vm_area(void *addr) { @@ -309,7 +376,7 @@ struct vm_struct *remove_vm_area(void *a return v; } -void __vunmap(void *addr, int deallocate_pages) +void __vunmap(void *addr, int deallocate_pages, int uncharge) { struct vm_struct *area; @@ -335,6 +402,8 @@ void 
__vunmap(void *addr, int deallocate if (deallocate_pages) { int i; + if (uncharge) + dec_vmalloc_charged(area); for (i = 0; i < area->nr_pages; i++) { BUG_ON(!area->pages[i]); __free_page(area->pages[i]); @@ -364,7 +433,7 @@ void __vunmap(void *addr, int deallocate void vfree(void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 1); + __vunmap(addr, 1, 1); } EXPORT_SYMBOL(vfree); @@ -381,7 +450,7 @@ EXPORT_SYMBOL(vfree); void vunmap(void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 0); + __vunmap(addr, 0, 0); } EXPORT_SYMBOL(vunmap); @@ -454,10 +523,12 @@ void *__vmalloc_area_node(struct vm_stru if (map_vm_area(area, prot, &pages)) goto fail; + + inc_vmalloc_charged(area, gfp_mask); return area->addr; fail: - vfree(area->addr); + __vunmap(area->addr, 1, 0); return NULL; } @@ -501,6 +572,21 @@ void *__vmalloc(unsigned long size, gfp_ } EXPORT_SYMBOL(__vmalloc); +static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + + area = get_vm_area_best(size, VM_ALLOC); + if (!area) + return NULL; + + return __vmalloc_area_node(area, mask, prot, -1); +} + /** * vmalloc - allocate virtually contiguous memory * @@ -518,6 +604,26 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +void *ub_vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(ub_vmalloc); + +void *vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} + +EXPORT_SYMBOL(vmalloc_best); + +void *ub_vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} + +EXPORT_SYMBOL(ub_vmalloc_best); + /** * vmalloc_user - allocate virtually contiguous memory which has * been zeroed so it can be mapped to userspace without @@ -558,6 +664,12 @@ void *vmalloc_node(unsigned long size, i } EXPORT_SYMBOL(vmalloc_node); +void *ub_vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, node); +} +EXPORT_SYMBOL(ub_vmalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -751,3 +863,36 @@ out_einval_locked: } EXPORT_SYMBOL(remap_vmalloc_range); +void vprintstat(void) +{ + struct vm_struct *p, *last_p = NULL; + unsigned long addr, size, free_size, max_free_size; + int num; + + addr = VMALLOC_START; + size = max_free_size = 0; + num = 0; + + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + free_size = (unsigned long)p->addr - addr; + if (free_size > max_free_size) + max_free_size = free_size; + addr = (unsigned long)p->addr + p->size; + size += p->size; + ++num; + last_p = p; + } + if (last_p) { + free_size = VMALLOC_END - + ((unsigned long)last_p->addr + last_p->size); + if (free_size > max_free_size) + max_free_size = free_size; + } + read_unlock(&vmlist_lock); + + printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" + " Max_Free: %luKB Start: %lx End: %lx\n", + size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, + max_free_size/1024, VMALLOC_START, VMALLOC_END); +} diff -uprN linux-2.6.18/mm/vmscan.c linux-2.6.18.ovz/mm/vmscan.c --- linux-2.6.18/mm/vmscan.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/vmscan.c 2007-06-13 06:55:07.000000000 -0400 @@ -36,6 +36,9 @@ #include #include +#include +#include + #include #include @@ -333,6 +336,7 @@ static pageout_t pageout(struct page *pa */ if 
(PagePrivate(page)) { if (try_to_free_buffers(page)) { + ub_io_release_context(page, 0); ClearPageDirty(page); printk("%s: orphaned page\n", __FUNCTION__); return PAGE_CLEAN; @@ -696,6 +700,20 @@ done: } /* + * We are about to scan this zone at a certain priority level. If that priority + * level is smaller (ie: more urgent) than the previous priority, then note + * that priority level within the zone. This is done so that when the next + * process comes in to scan this zone, it will immediately start out at this + * priority level rather than having to build up its own scanning priority. + * Here, this priority affects only the reclaim-mapped threshold. + */ +static inline void note_zone_scanning_priority(struct zone *zone, int priority) +{ + if (priority < zone->prev_priority) + zone->prev_priority = priority; +} + +/* * This moves pages from the active list to the inactive list. * * We move them the other way if the page is referenced by one or more @@ -713,7 +731,7 @@ done: * But we had to alter page->flags anyway. */ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, int priority) { unsigned long pgmoved; int pgdeactivate = 0; @@ -734,7 +752,7 @@ static void shrink_active_list(unsigned * `distress' is a measure of how much trouble we're having * reclaiming pages. 0 -> no problems. 100 -> great trouble. */ - distress = 100 >> zone->prev_priority; + distress = 100 >> min(zone->prev_priority, priority); /* * The point of this algorithm is to decide when to start @@ -768,6 +786,7 @@ static void shrink_active_list(unsigned reclaim_mapped = 1; } + {KSTAT_PERF_ENTER(refill_inact) lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, @@ -847,6 +866,7 @@ static void shrink_active_list(unsigned spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); + KSTAT_PERF_LEAVE(refill_inact)} } /* @@ -885,7 +905,7 @@ static unsigned long shrink_zone(int pri nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, priority); } if (nr_inactive) { @@ -934,9 +954,7 @@ static unsigned long shrink_zones(int pr if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + note_zone_scanning_priority(zone, priority); if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ @@ -976,15 +994,16 @@ unsigned long try_to_free_pages(struct z .swappiness = vm_swappiness, }; + KSTAT_PERF_ENTER(ttfp); count_vm_event(ALLOCSTALL); + ub_oom_start(); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; lru_pages += zone->nr_active + zone->nr_inactive; } @@ -1022,14 +1041,24 @@ unsigned long try_to_free_pages(struct z blk_congestion_wait(WRITE, HZ/10); } out: + /* + * Now that we've scanned all the zones at this priority level, note + * that level within the zone so that the next thread which performs + * scanning of this zone will immediately start out at this priority + * level. This affects only the decision whether or not to bring + * mapped pages onto the inactive list. 
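Stepping back to the vmalloc.c hunk above: get_vm_area_best() makes one pass over the address-sorted vmlist, measuring every free gap (including the tail before VMALLOC_END), keeping the gap with the least slack for the requested size, and placing the new area at the end of that gap; the kernel version also pads the request by a one-page guard gap, omitted here. The same best-fit scan over sorted [start, end) regions:

#include <stdio.h>

struct region { unsigned long start, end; };	/* sorted, non-overlapping */

/* Best-fit hole of at least `size` inside [lo, hi): returns the chosen
 * address (placed at the end of the winning gap) or 0 if nothing fits. */
static unsigned long best_fit(const struct region *r, int n,
			      unsigned long lo, unsigned long hi,
			      unsigned long size)
{
	unsigned long addr = lo, best_addr = 0, best_slack = hi - lo;
	int i;

	for (i = 0; i <= n; i++) {	/* n gaps between regions + tail gap */
		unsigned long gap_end = (i < n) ? r[i].start : hi;

		if (gap_end >= addr && gap_end - addr >= size) {
			unsigned long slack = gap_end - addr - size;

			if (slack < best_slack) {
				best_slack = slack;
				best_addr = addr + slack;  /* end of gap */
			}
		}
		if (i < n)
			addr = r[i].end;
	}
	return best_addr;
}

int main(void)
{
	const struct region map[] = { { 110, 200 }, { 260, 300 } };

	/* [100,110) too small; [200,260) fits 40 with slack 20;
	 * tail [300,400) has slack 60: best fit places at 220. */
	printf("placed at %lu\n", best_fit(map, 2, 100, 400, 40));
	return 0;
}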
+ */ + if (priority < 0) + priority = 0; for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = priority; } + KSTAT_PERF_LEAVE(ttfp); return ret; } @@ -1068,6 +1097,11 @@ static unsigned long balance_pgdat(pg_da .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, }; + /* + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + */ + int temp_priority[MAX_NR_ZONES]; loop_again: total_scanned = 0; @@ -1075,11 +1109,8 @@ loop_again: sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } + for (i = 0; i < pgdat->nr_zones; i++) + temp_priority[i] = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -1140,10 +1171,9 @@ scan: if (!zone_watermark_ok(zone, order, zone->pages_high, end_zone, 0)) all_zones_ok = 0; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + temp_priority[i] = priority; sc.nr_scanned = 0; + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1183,10 +1213,15 @@ scan: break; } out: + /* + * Note within each zone the priority level at which this zone was + * brought into a happy state. So that the next thread which scans this + * zone will start out at that priority level. + */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = temp_priority[i]; } if (!all_zones_ok) { cond_resched(); @@ -1315,7 +1350,7 @@ static unsigned long shrink_all_zones(un if (zone->nr_scan_active >= nr_pages || pass > 3) { zone->nr_scan_active = 0; nr_to_scan = min(nr_pages, zone->nr_active); - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, prio); } } @@ -1510,7 +1545,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ -#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -1526,6 +1560,12 @@ int zone_reclaim_mode __read_mostly; int sysctl_min_unmapped_ratio = 1; /* + * If the number of slab pages in a zone grows beyond this percentage then + * slab reclaim needs to occur. + */ +int sysctl_min_slab_ratio = 5; + +/* * Try to free up some pages from this zone through reclaim. */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -1556,29 +1596,38 @@ static int __zone_reclaim(struct zone *z reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - /* - * Free memory by calling shrink zone with increasing priorities - * until we have enough memory freed. 
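The prev_priority rework in these vmscan.c hunks follows one rule: a zone remembers the most urgent (numerically smallest) priority it was recently scanned at, note_zone_scanning_priority() only ever lowers the remembered value, and shrink_active_list() derives distress = 100 >> min(prev_priority, priority) from it to decide how aggressively to go after mapped pages. In miniature:

#include <stdio.h>

#define DEF_PRIORITY 12

static int prev_priority = DEF_PRIORITY;

/* note_zone_scanning_priority(): only ever record a more urgent
 * (numerically smaller) priority. */
static void note_scanning_priority(int priority)
{
	if (priority < prev_priority)
		prev_priority = priority;
}

/* shrink_active_list()'s measure of trouble: 0 means no problems,
 * 100 means great trouble. */
static int distress(int priority)
{
	int p = prev_priority < priority ? prev_priority : priority;

	return 100 >> p;
}

int main(void)
{
	int p;

	for (p = DEF_PRIORITY; p >= 0; p -= 4) {
		note_scanning_priority(p);
		printf("priority %2d -> distress %3d\n", p, distress(p));
	}
	return 0;
}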
- */ - priority = ZONE_RECLAIM_PRIORITY; - do { - nr_reclaimed += shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) > + zone->min_unmapped_ratio) { + /* + * Free memory by calling shrink zone with increasing + * priorities until we have enough memory freed. + */ + priority = ZONE_RECLAIM_PRIORITY; + do { + note_zone_scanning_priority(zone, priority); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); + } - if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (zone_page_state(zone, NR_SLAB) > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we just shake the slab - * a bit and then go off node for this particular allocation - * despite possibly having freed enough memory to allocate in - * this zone. If we freed local memory then the next - * allocations will be local again. + * many pages were freed in this zone. So we take the current + * number of slab pages and shake the slab until it is reduced + * by the same nr_pages that we used for reclaiming unmapped + * pages. * - * shrink_slab will free memory on all zones and may take - * a long time. + * Note that shrink_slab will free memory on all zones and may + * take a long time. */ - shrink_slab(sc.nr_scanned, gfp_mask, order); + unsigned long limit = zone_page_state(zone, + NR_SLAB) - nr_pages; + + while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + zone_page_state(zone, NR_SLAB) > limit) + ; } p->reclaim_state = NULL; @@ -1592,7 +1641,8 @@ int zone_reclaim(struct zone *zone, gfp_ int node_id; /* - * Zone reclaim reclaims unmapped file backed pages. + * Zone reclaim reclaims unmapped file backed pages and + * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately @@ -1601,7 +1651,9 @@ int zone_reclaim(struct zone *zone, gfp_ * unmapped file backed pages. */ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio + && zone_page_state(zone, NR_SLAB) + <= zone->min_slab_pages) return 0; /* diff -uprN linux-2.6.18/mm/vmstat.c linux-2.6.18.ovz/mm/vmstat.c --- linux-2.6.18/mm/vmstat.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/mm/vmstat.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,8 @@ #include #include +#include + void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) { @@ -73,6 +75,20 @@ static void sum_vm_events(unsigned long } } +unsigned long vm_events(enum vm_event_item i) +{ + int cpu; + unsigned long sum; + struct vm_event_state *st; + + sum = 0; + for_each_online_cpu(cpu) { + st = &per_cpu(vm_event_states, cpu); + sum += st->event[i]; + } + + return (sum < 0 ? 0 : sum); +} /* * Accumulate the vm event counters across all CPUs. 
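__zone_reclaim() above now bounds slab shrinking by observed progress rather than by the shrinker's opaque return value: it samples the zone's slab-page counter, sets a target nr_pages below the current value, and keeps calling shrink_slab() until the shrinker reports no progress or the counter reaches the target. The control structure, with a toy shrinker standing in for shrink_slab():

#include <stdio.h>

static long slab_pages = 1000;	/* stands in for zone_page_state(, NR_SLAB) */

/* Toy shrinker: frees up to 64 pages and returns how many it freed;
 * 0 means no further progress is possible. */
static long shrink_step(void)
{
	long freed = slab_pages > 64 ? 64 : slab_pages;

	slab_pages -= freed;
	return freed;
}

int main(void)
{
	const long nr_pages = 200;
	const long limit = slab_pages - nr_pages;

	/* Stop on either condition: no progress, or target reached.
	 * Like the kernel loop, this may overshoot by one step. */
	while (shrink_step() && slab_pages > limit)
		;

	printf("slab_pages now %ld (target was %ld)\n", slab_pages, limit);
	return 0;
}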
 * The result is unavoidably approximate - it can change
@@ -586,11 +602,9 @@ static int zoneinfo_show(struct seq_file
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
-		   "\n  temp_priority:     %i"
 		   "\n  start_pfn:         %lu",
 			   zone->all_unreclaimable,
 			   zone->prev_priority,
-			   zone->temp_priority,
 			   zone->zone_start_pfn);
 	spin_unlock_irqrestore(&zone->lock, flags);
 	seq_putc(m, '\n');
@@ -611,30 +625,41 @@ static void *vmstat_start(struct seq_fil
 	unsigned long *v;
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	unsigned long *e;
+#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \
+			sizeof(struct vm_event_state))
+#else
+#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long))
 #endif
 	int i;
 
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 
-#ifdef CONFIG_VM_EVENT_COUNTERS
-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
-			+ sizeof(struct vm_event_state), GFP_KERNEL);
-#else
-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
-			GFP_KERNEL);
-#endif
+	v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL);
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		v[i] = global_page_state(i);
+
+	if (ve_is_super(get_exec_env())) {
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			v[i] = global_page_state(i);
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
-	e = v + NR_VM_ZONE_STAT_ITEMS;
-	all_vm_events(e);
-	e[PGPGIN] /= 2;		/* sectors -> kbytes */
-	e[PGPGOUT] /= 2;
+		e = v + NR_VM_ZONE_STAT_ITEMS;
+		all_vm_events(e);
+		e[PGPGIN] /= 2;		/* sectors -> kbytes */
+		e[PGPGOUT] /= 2;
 #endif
+	} else {
+		memset(v, 0, VMSTAT_BUFSIZE);
+		if (virtinfo_notifier_call(VITYPE_GENERAL,
+					VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
+			kfree(v);
+			m->private = NULL;
+			return ERR_PTR(-ENOMSG);
+		}
+	}
+
 	return v + *pos;
 }
diff -uprN linux-2.6.18/net/8021q/vlan.c linux-2.6.18.ovz/net/8021q/vlan.c
--- linux-2.6.18/net/8021q/vlan.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/net/8021q/vlan.c	2007-06-13 06:55:07.000000000 -0400
@@ -31,6 +31,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
 #include "vlan.h"
@@ -67,6 +69,44 @@ static struct packet_type vlan_packet_ty
 	.func = vlan_skb_recv, /* VLAN receive method */
 };
 
+#ifdef CONFIG_VE
+static int vlan_start(void *data)
+{
+	int err;
+
+	err = vlan_proc_init();
+	if (err < 0)
+		goto out_proc;
+
+	__module_get(THIS_MODULE);
+	return 0;
+
+out_proc:
+	return err;
+}
+
+static void vlan_stop(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	if (ve->_proc_vlan_dir == NULL)
+		return;
+
+	vlan_proc_cleanup();
+	ve->_proc_vlan_conf = NULL;
+	ve->_proc_vlan_dir = NULL;
+	module_put(THIS_MODULE);
+}
+
+static struct ve_hook vlan_ve_hook = {
+	.init = vlan_start,
+	.fini = vlan_stop,
+	.owner = THIS_MODULE,
+	.priority = HOOK_PRIO_NET_POST,
+};
+#endif
+
 /* End of global variables definitions.
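The CONFIG_VE block above registers a vlan_start/vlan_stop pair on VE_SS_CHAIN, so each container start builds a private /proc/net/vlan tree and each stop tears it down, with __module_get()/module_put() pinning the module in between. A minimal init/fini hook chain of the same shape (all names hypothetical):

#include <stdio.h>

struct hook {
	int (*init)(void *env);
	void (*fini)(void *env);
	struct hook *next;
};

static struct hook *chain;

static void hook_register(struct hook *h)
{
	h->next = chain;
	chain = h;
}

/* Start a new environment: run every init hook, unwinding the ones
 * already run if a later one fails. */
static int env_start(void *env)
{
	struct hook *h, *f;

	for (h = chain; h; h = h->next) {
		if (h->init && h->init(env) != 0) {
			for (f = chain; f != h; f = f->next)
				if (f->fini)
					f->fini(env);
			return -1;
		}
	}
	return 0;
}

static void env_stop(void *env)
{
	struct hook *h;

	for (h = chain; h; h = h->next)
		if (h->fini)
			h->fini(env);
}

static int vlan_init(void *env)
{
	printf("per-env vlan proc created for %p\n", env);
	return 0;
}

static void vlan_fini(void *env)
{
	printf("per-env vlan proc removed for %p\n", env);
}

static struct hook vlan_hook = { vlan_init, vlan_fini, NULL };

int main(void)
{
	int env;	/* any token standing in for a container */

	hook_register(&vlan_hook);
	if (env_start(&env) == 0)
		env_stop(&env);
	return 0;
}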
*/ /* @@ -104,6 +144,7 @@ static int __init vlan_proto_init(void) } vlan_ioctl_set(vlan_ioctl_handler); + ve_hook_register(VE_SS_CHAIN, &vlan_ve_hook); return 0; } @@ -116,6 +157,8 @@ static void __exit vlan_cleanup_devices( { struct net_device *dev, *nxt; + ve_hook_unregister(&vlan_ve_hook); + rtnl_lock(); for (dev = dev_base; dev; dev = nxt) { nxt = dev->next; @@ -160,14 +203,16 @@ module_init(vlan_proto_init); module_exit(vlan_cleanup_module); /* Must be invoked with RCU read lock (no preempt) */ -static struct vlan_group *__vlan_find_group(int real_dev_ifindex) +static struct vlan_group *__vlan_find_group(int real_dev_ifindex, + struct ve_struct *ve) { struct vlan_group *grp; struct hlist_node *n; int hash = vlan_grp_hashfn(real_dev_ifindex); hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) { - if (grp->real_dev_ifindex == real_dev_ifindex) + if (grp->real_dev_ifindex == real_dev_ifindex && + ve_accessible_strict(ve, grp->owner)) return grp; } @@ -181,7 +226,8 @@ static struct vlan_group *__vlan_find_gr struct net_device *__find_vlan_dev(struct net_device *real_dev, unsigned short VID) { - struct vlan_group *grp = __vlan_find_group(real_dev->ifindex); + struct vlan_group *grp = __vlan_find_group(real_dev->ifindex, + real_dev->owner_env); if (grp) return grp->vlan_devices[VID]; @@ -218,7 +264,7 @@ static int unregister_vlan_dev(struct ne return -EINVAL; ASSERT_RTNL(); - grp = __vlan_find_group(real_dev_ifindex); + grp = __vlan_find_group(real_dev_ifindex, real_dev->owner_env); ret = 0; @@ -260,6 +306,9 @@ static int unregister_vlan_dev(struct ne hlist_del_rcu(&grp->hlist); + put_ve(grp->owner); + grp->owner = NULL; + /* Free the group, after all cpu's are done. */ call_rcu(&grp->rcu, vlan_rcu_free); @@ -338,6 +387,8 @@ static void vlan_setup(struct net_device new_dev->set_multicast_list = vlan_dev_set_multicast_list; new_dev->destructor = free_netdev; new_dev->do_ioctl = vlan_dev_ioctl; + if (!ve_is_super(get_exec_env())) + new_dev->features |= NETIF_F_VIRTUAL; } static void vlan_transfer_operstate(const struct net_device *dev, struct net_device *vlandev) @@ -534,18 +585,19 @@ static struct net_device *register_vlan_ /* So, got the sucker initialized, now lets place * it into our local structure. */ - grp = __vlan_find_group(real_dev->ifindex); + grp = __vlan_find_group(real_dev->ifindex, real_dev->owner_env); /* Note, we are running under the RTNL semaphore * so it cannot "appear" on us. */ if (!grp) { /* need to add a new group */ - grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); + grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC); if (!grp) goto out_free_unregister; /* printk(KERN_ALERT "VLAN REGISTER: Allocated new group.\n"); */ grp->real_dev_ifindex = real_dev->ifindex; + grp->owner = get_ve(real_dev->owner_env); hlist_add_head_rcu(&grp->hlist, &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]); @@ -591,10 +643,12 @@ out_ret_null: static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct vlan_group *grp = __vlan_find_group(dev->ifindex); + struct vlan_group *grp; int i, flgs; struct net_device *vlandev; + struct ve_struct *env; + grp = __vlan_find_group(dev->ifindex, dev->owner_env); if (!grp) goto out; @@ -656,7 +710,9 @@ static int vlan_device_event(struct noti ret = unregister_vlan_dev(dev, VLAN_DEV_INFO(vlandev)->vlan_id); + env = set_exec_env(vlandev->owner_env); unregister_netdevice(vlandev); + set_exec_env(env); /* Group was destroyed? 
*/ if (ret == 1) @@ -669,6 +725,15 @@ out: return NOTIFY_DONE; } +static inline int vlan_check_caps(void) +{ + return capable(CAP_NET_ADMIN) +#ifdef CONFIG_VE + || capable(CAP_VE_NET_ADMIN) +#endif + ; +} + /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver @@ -693,7 +758,7 @@ static int vlan_ioctl_handler(void __use switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; err = vlan_dev_set_ingress_priority(args.device1, args.u.skb_priority, @@ -701,7 +766,7 @@ static int vlan_ioctl_handler(void __use break; case SET_VLAN_EGRESS_PRIORITY_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; err = vlan_dev_set_egress_priority(args.device1, args.u.skb_priority, @@ -709,7 +774,7 @@ static int vlan_ioctl_handler(void __use break; case SET_VLAN_FLAG_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; err = vlan_dev_set_vlan_flag(args.device1, args.u.flag, @@ -717,7 +782,7 @@ static int vlan_ioctl_handler(void __use break; case SET_VLAN_NAME_TYPE_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; if ((args.u.name_type >= 0) && (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { @@ -729,7 +794,7 @@ static int vlan_ioctl_handler(void __use break; case ADD_VLAN_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; /* we have been given the name of the Ethernet Device we want to * talk to: args.dev1 We also have the @@ -743,7 +808,7 @@ static int vlan_ioctl_handler(void __use break; case DEL_VLAN_CMD: - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) return -EPERM; /* Here, the args.dev1 is the actual VLAN we want * to get rid of. diff -uprN linux-2.6.18/net/8021q/vlan_dev.c linux-2.6.18.ovz/net/8021q/vlan_dev.c --- linux-2.6.18/net/8021q/vlan_dev.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/8021q/vlan_dev.c 2007-06-13 06:55:07.000000000 -0400 @@ -436,6 +436,7 @@ int vlan_dev_hard_header(struct sk_buff int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = vlan_dev_get_stats(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -489,13 +490,17 @@ int vlan_dev_hard_start_xmit(struct sk_b stats->tx_bytes += skb->len; skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return 0; } int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = vlan_dev_get_stats(dev); unsigned short veth_TCI; @@ -513,7 +518,10 @@ int vlan_dev_hwaccel_hard_start_xmit(str stats->tx_bytes += skb->len; skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return 0; } diff -uprN linux-2.6.18/net/8021q/vlanproc.c linux-2.6.18.ovz/net/8021q/vlanproc.c --- linux-2.6.18/net/8021q/vlanproc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/8021q/vlanproc.c 2007-06-13 06:55:07.000000000 -0400 @@ -114,13 +114,21 @@ static struct file_operations vlandev_fo * /proc/net/vlan */ +#ifdef CONFIG_VE +#define proc_vlan_dir (get_exec_env()->_proc_vlan_dir) +#else static struct proc_dir_entry *proc_vlan_dir; +#endif /* * /proc/net/vlan/config */ +#ifdef CONFIG_VE +#define proc_vlan_conf (get_exec_env()->_proc_vlan_conf) 
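vlan_dev_hard_start_xmit() above stamps the skb with the real device's owner VE and brackets dev_queue_xmit() with set_exec_env(), restoring the saved environment immediately afterwards; the vlanproc macros below resolve the formerly global proc entries through the current environment in the same spirit. The save-switch-restore idiom on its own:

#include <stdio.h>

struct env { const char *name; };

static struct env host = { "host" };
static struct env *exec_env = &host;	/* the "current" environment */

/* Mirrors set_exec_env(): install a new environment, return the old
 * one so the caller can restore it. */
static struct env *set_env(struct env *new_env)
{
	struct env *old = exec_env;

	exec_env = new_env;
	return old;
}

static void transmit(void)
{
	printf("transmitting in '%s'\n", exec_env->name);
}

int main(void)
{
	struct env container = { "ve101" };
	struct env *saved;

	saved = set_env(&container);	/* switch... */
	transmit();			/* ...work in the device's VE... */
	set_env(saved);			/* ...and always restore */

	printf("back in '%s'\n", exec_env->name);
	return 0;
}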
+#else static struct proc_dir_entry *proc_vlan_conf; +#endif /* Strings */ static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { @@ -154,7 +162,7 @@ void vlan_proc_cleanup(void) * Create /proc/net/vlan entries */ -int __init vlan_proc_init(void) +int vlan_proc_init(void) { proc_vlan_dir = proc_mkdir(name_root, proc_net); if (proc_vlan_dir) { diff -uprN linux-2.6.18/net/bluetooth/cmtp/capi.c linux-2.6.18.ovz/net/bluetooth/cmtp/capi.c --- linux-2.6.18/net/bluetooth/cmtp/capi.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bluetooth/cmtp/capi.c 2007-06-13 06:55:07.000000000 -0400 @@ -196,6 +196,9 @@ static void cmtp_recv_interopmsg(struct switch (CAPIMSG_SUBCOMMAND(skb->data)) { case CAPI_CONF: + if (skb->len < CAPI_MSG_BASELEN + 10) + break; + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5); info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8); @@ -226,6 +229,9 @@ static void cmtp_recv_interopmsg(struct break; case CAPI_FUNCTION_GET_PROFILE: + if (skb->len < CAPI_MSG_BASELEN + 11 + sizeof(capi_profile)) + break; + controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11); msgnum = CAPIMSG_MSGID(skb->data); @@ -246,17 +252,26 @@ static void cmtp_recv_interopmsg(struct break; case CAPI_FUNCTION_GET_MANUFACTURER: + if (skb->len < CAPI_MSG_BASELEN + 15) + break; + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 10); if (!info && ctrl) { + int len = min_t(uint, CAPI_MANUFACTURER_LEN, + skb->data[CAPI_MSG_BASELEN + 14]); + + memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN); strncpy(ctrl->manu, - skb->data + CAPI_MSG_BASELEN + 15, - skb->data[CAPI_MSG_BASELEN + 14]); + skb->data + CAPI_MSG_BASELEN + 15, len); } break; case CAPI_FUNCTION_GET_VERSION: + if (skb->len < CAPI_MSG_BASELEN + 32) + break; + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); if (!info && ctrl) { @@ -269,13 +284,18 @@ static void cmtp_recv_interopmsg(struct break; case CAPI_FUNCTION_GET_SERIAL_NUMBER: + if (skb->len < CAPI_MSG_BASELEN + 17) + break; + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); if (!info && ctrl) { + int len = min_t(uint, CAPI_SERIAL_LEN, + skb->data[CAPI_MSG_BASELEN + 16]); + memset(ctrl->serial, 0, CAPI_SERIAL_LEN); strncpy(ctrl->serial, - skb->data + CAPI_MSG_BASELEN + 17, - skb->data[CAPI_MSG_BASELEN + 16]); + skb->data + CAPI_MSG_BASELEN + 17, len); } break; @@ -284,14 +304,18 @@ static void cmtp_recv_interopmsg(struct break; case CAPI_IND: + if (skb->len < CAPI_MSG_BASELEN + 6) + break; + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3); if (func == CAPI_FUNCTION_LOOPBACK) { + int len = min_t(uint, skb->len - CAPI_MSG_BASELEN - 6, + skb->data[CAPI_MSG_BASELEN + 5]); appl = CAPIMSG_APPID(skb->data); msgnum = CAPIMSG_MSGID(skb->data); cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func, - skb->data + CAPI_MSG_BASELEN + 6, - skb->data[CAPI_MSG_BASELEN + 5]); + skb->data + CAPI_MSG_BASELEN + 6, len); } break; @@ -309,6 +333,9 @@ void cmtp_recv_capimsg(struct cmtp_sessi BT_DBG("session %p skb %p len %d", session, skb, skb->len); + if (skb->len < CAPI_MSG_BASELEN) + return; + if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) { cmtp_recv_interopmsg(session, skb); return; diff -uprN linux-2.6.18/net/bluetooth/hci_sock.c linux-2.6.18.ovz/net/bluetooth/hci_sock.c --- linux-2.6.18/net/bluetooth/hci_sock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bluetooth/hci_sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -120,10 +120,13 @@ void hci_send_to_sock(struct hci_dev *hd if (!hci_test_bit(evt, &flt->event_mask)) continue; - 
if (flt->opcode && ((evt == HCI_EV_CMD_COMPLETE && - flt->opcode != *(__u16 *)(skb->data + 3)) || - (evt == HCI_EV_CMD_STATUS && - flt->opcode != *(__u16 *)(skb->data + 4)))) + if (flt->opcode && + ((evt == HCI_EV_CMD_COMPLETE && + flt->opcode != + get_unaligned((__u16 *)(skb->data + 3))) || + (evt == HCI_EV_CMD_STATUS && + flt->opcode != + get_unaligned((__u16 *)(skb->data + 4))))) continue; } diff -uprN linux-2.6.18/net/bluetooth/rfcomm/tty.c linux-2.6.18.ovz/net/bluetooth/rfcomm/tty.c --- linux-2.6.18/net/bluetooth/rfcomm/tty.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bluetooth/rfcomm/tty.c 2007-06-13 06:55:07.000000000 -0400 @@ -748,6 +748,9 @@ static void rfcomm_tty_set_termios(struc BT_DBG("tty %p termios %p", tty, old); + if (!dev) + return; + /* Handle turning off CRTSCTS */ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) BT_DBG("Turning off CRTSCTS unsupported"); diff -uprN linux-2.6.18/net/bridge/br_ioctl.c linux-2.6.18.ovz/net/bridge/br_ioctl.c --- linux-2.6.18/net/bridge/br_ioctl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bridge/br_ioctl.c 2007-06-13 06:55:07.000000000 -0400 @@ -58,12 +58,13 @@ static int get_fdb_entries(struct net_br { int num; void *buf; - size_t size = maxnum * sizeof(struct __fdb_entry); + size_t size; - if (size > PAGE_SIZE) { - size = PAGE_SIZE; + /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */ + if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry)) maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); - } + + size = maxnum * sizeof(struct __fdb_entry); buf = kmalloc(size, GFP_USER); if (!buf) diff -uprN linux-2.6.18/net/bridge/br_stp_if.c linux-2.6.18.ovz/net/bridge/br_stp_if.c --- linux-2.6.18/net/bridge/br_stp_if.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bridge/br_stp_if.c 2007-06-13 06:55:07.000000000 -0400 @@ -124,7 +124,9 @@ void br_stp_disable_port(struct net_brid /* called under bridge lock */ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { - unsigned char oldaddr[6]; + /* should be aligned on 2 bytes for compare_ether_addr() */ + unsigned short oldaddr_aligned[ETH_ALEN >> 1]; + unsigned char *oldaddr = (unsigned char *)oldaddr_aligned; struct net_bridge_port *p; int wasroot; @@ -149,11 +151,14 @@ void br_stp_change_bridge_id(struct net_ br_become_root_bridge(br); } -static const unsigned char br_mac_zero[6]; +/* should be aligned on 2 bytes for compare_ether_addr() */ +static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ void br_stp_recalculate_bridge_id(struct net_bridge *br) { + const unsigned char *br_mac_zero = + (const unsigned char *)br_mac_zero_aligned; const unsigned char *addr = br_mac_zero; struct net_bridge_port *p; diff -uprN linux-2.6.18/net/bridge/netfilter/ebtables.c linux-2.6.18.ovz/net/bridge/netfilter/ebtables.c --- linux-2.6.18/net/bridge/netfilter/ebtables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/bridge/netfilter/ebtables.c 2007-06-13 06:55:07.000000000 -0400 @@ -360,10 +360,11 @@ ebt_check_match(struct ebt_entry_match * const char *name, unsigned int hookmask, unsigned int *cnt) { struct ebt_match *match; + size_t left = ((char *)e + e->watchers_offset) - (char *)m; int ret; - if (((char *)m) + m->match_size + sizeof(struct ebt_entry_match) > - ((char *)e) + e->watchers_offset) + if (left < sizeof(struct ebt_entry_match) || + left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; match = find_match_lock(m->u.name, &ret, &ebt_mutex); if 
(!match) @@ -389,10 +390,11 @@ ebt_check_watcher(struct ebt_entry_watch const char *name, unsigned int hookmask, unsigned int *cnt) { struct ebt_watcher *watcher; + size_t left = ((char *)e + e->target_offset) - (char *)w; int ret; - if (((char *)w) + w->watcher_size + sizeof(struct ebt_entry_watcher) > - ((char *)e) + e->target_offset) + if (left < sizeof(struct ebt_entry_watcher) || + left - sizeof(struct ebt_entry_watcher) < w->watcher_size) return -EINVAL; watcher = find_watcher_lock(w->u.name, &ret, &ebt_mutex); if (!watcher) @@ -423,19 +425,23 @@ ebt_check_entry_size_and_hooks(struct eb struct ebt_entries **hook_entries, unsigned int *n, unsigned int *cnt, unsigned int *totalcnt, unsigned int *udc_cnt, unsigned int valid_hooks) { + unsigned int offset = (char *)e - newinfo->entries; + size_t left = (limit - base) - offset; int i; + if (left < sizeof(unsigned int)) + goto Esmall; + for (i = 0; i < NF_BR_NUMHOOKS; i++) { if ((valid_hooks & (1 << i)) == 0) continue; - if ( (char *)hook_entries[i] - base == - (char *)e - newinfo->entries) + if ((char *)hook_entries[i] == base + offset) break; } /* beginning of a new chain if i == NF_BR_NUMHOOKS it must be a user defined chain */ if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) { - if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) != 0) { + if (e->bitmask != 0) { /* we make userspace set this right, so there is no misunderstanding */ BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set " @@ -450,11 +456,8 @@ ebt_check_entry_size_and_hooks(struct eb return -EINVAL; } /* before we look at the struct, be sure it is not too big */ - if ((char *)hook_entries[i] + sizeof(struct ebt_entries) - > limit) { - BUGPRINT("entries_size too small\n"); - return -EINVAL; - } + if (left < sizeof(struct ebt_entries)) + goto Esmall; if (((struct ebt_entries *)e)->policy != EBT_DROP && ((struct ebt_entries *)e)->policy != EBT_ACCEPT) { /* only RETURN from udc */ @@ -477,6 +480,8 @@ ebt_check_entry_size_and_hooks(struct eb return 0; } /* a plain old entry, heh */ + if (left < sizeof(struct ebt_entry)) + goto Esmall; if (sizeof(struct ebt_entry) > e->watchers_offset || e->watchers_offset > e->target_offset || e->target_offset >= e->next_offset) { @@ -488,10 +493,16 @@ ebt_check_entry_size_and_hooks(struct eb BUGPRINT("target size too small\n"); return -EINVAL; } + if (left < e->next_offset) + goto Esmall; (*cnt)++; (*totalcnt)++; return 0; + +Esmall: + BUGPRINT("entries_size too small\n"); + return -EINVAL; } struct ebt_cl_stack @@ -513,7 +524,7 @@ ebt_get_udc_positions(struct ebt_entry * int i; /* we're only interested in chain starts */ - if (e->bitmask & EBT_ENTRY_OR_ENTRIES) + if (e->bitmask) return 0; for (i = 0; i < NF_BR_NUMHOOKS; i++) { if ((valid_hooks & (1 << i)) == 0) @@ -563,7 +574,7 @@ ebt_cleanup_entry(struct ebt_entry *e, u { struct ebt_entry_target *t; - if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + if (e->bitmask == 0) return 0; /* we're done */ if (cnt && (*cnt)-- == 0) @@ -586,10 +597,11 @@ ebt_check_entry(struct ebt_entry *e, str struct ebt_entry_target *t; struct ebt_target *target; unsigned int i, j, hook = 0, hookmask = 0; + size_t gap = e->next_offset - e->target_offset; int ret; /* don't mess with the struct ebt_entries */ - if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + if (e->bitmask == 0) return 0; if (e->bitmask & ~EBT_F_MASK) { @@ -647,8 +659,7 @@ ebt_check_entry(struct ebt_entry *e, str t->u.target = target; if (t->u.target == &ebt_standard_target) { - if (e->target_offset + sizeof(struct ebt_standard_target) > - 
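The ebt_check_match()/ebt_check_watcher() rewrites above replace additive bounds tests of the form "p + user_size + header > limit", where the sum itself can wrap, with subtraction on quantities already known to be in bounds: compute the space left after the current record, then compare sizes. The same idiom with made-up structures:

#include <stddef.h>

struct rec_hdr { unsigned int payload_size; };

/* buf..buf+len has been validated; p points somewhere inside it */
static int check_record(const char *buf, size_t len, const char *p)
{
	size_t left = len - (size_t)(p - buf);	/* cannot overflow */

	if (left < sizeof(struct rec_hdr))
		return -1;			/* header truncated */
	if (((const struct rec_hdr *)p)->payload_size >
	    left - sizeof(struct rec_hdr))
		return -1;			/* payload past the end */
	return 0;
}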
e->next_offset) { + if (gap < sizeof(struct ebt_standard_target)) { BUGPRINT("Standard target size too big\n"); ret = -EFAULT; goto cleanup_watchers; @@ -659,8 +670,7 @@ ebt_check_entry(struct ebt_entry *e, str ret = -EFAULT; goto cleanup_watchers; } - } else if ((e->target_offset + t->target_size + - sizeof(struct ebt_entry_target) > e->next_offset) || + } else if (t->target_size > gap - sizeof(struct ebt_entry_target) || (t->u.target->check && t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0)){ module_put(t->u.target->me); @@ -730,7 +740,9 @@ static int check_chainloops(struct ebt_e BUGPRINT("loop\n"); return -1; } - /* this can't be 0, so the above test is correct */ + if (cl_s[i].hookmask & (1 << hooknr)) + goto letscontinue; + /* this can't be 0, so the loop test is correct */ cl_s[i].cs.n = pos + 1; pos = 0; cl_s[i].cs.e = ((void *)e + e->next_offset); @@ -1307,7 +1319,7 @@ static inline int ebt_make_names(struct char *hlp; struct ebt_entry_target *t; - if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + if (e->bitmask == 0) return 0; hlp = ubase - base + (char *)e + e->target_offset; diff -uprN linux-2.6.18/net/core/datagram.c linux-2.6.18.ovz/net/core/datagram.c --- linux-2.6.18/net/core/datagram.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/datagram.c 2007-06-13 06:55:07.000000000 -0400 @@ -56,6 +56,8 @@ #include #include +#include + /* * Is a socket 'connection oriented' ? */ @@ -493,6 +495,7 @@ unsigned int datagram_poll(struct file * { struct sock *sk = sock->sk; unsigned int mask; + int no_ubc_space; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -502,8 +505,14 @@ unsigned int datagram_poll(struct file * mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { + no_ubc_space = 0; mask |= POLLHUP; + } else { + no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ubc_space) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + } /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || @@ -520,7 +529,7 @@ unsigned int datagram_poll(struct file * } /* writable? */ - if (sock_writeable(sk)) + if (!no_ubc_space && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff -uprN linux-2.6.18/net/core/dev.c linux-2.6.18.ovz/net/core/dev.c --- linux-2.6.18/net/core/dev.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/dev.c 2007-06-13 06:55:07.000000000 -0400 @@ -118,6 +118,9 @@ #include #include +#include +#include + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -175,25 +178,40 @@ static spinlock_t net_dma_event_lock; * unregister_netdevice(), which must be called with the rtnl * semaphore held. 
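check_chainloops() above gains a memoization step: once a user-defined chain has been validated from a given hook, its hookmask bit is set, and reaching it again from that hook returns immediately instead of re-walking the whole chain. A reduced model, collapsing the per-hook mask to a single "done" bit and giving each chain at most one jump:

#define NCHAINS 8

static unsigned int done_mask;		/* bit c: chain c already validated */

/* jump[c] is the chain that c jumps to, or -1 at the end */
static int walk(int chain, const int jump[NCHAINS])
{
	unsigned int path = 0;		/* chains on the current walk */

	while (chain >= 0) {
		if (path & (1u << chain))
			return -1;	/* genuine loop */
		if (done_mask & (1u << chain))
			return 0;	/* memoized: checked earlier */
		path |= 1u << chain;
		done_mask |= 1u << chain;
		chain = jump[chain];
	}
	return 0;
}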
*/ +#ifdef CONFIG_VE +#define dev_tail (get_exec_env()->_net_dev_tail) +#else struct net_device *dev_base; static struct net_device **dev_tail = &dev_base; +EXPORT_SYMBOL(dev_base); +#endif DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base); EXPORT_SYMBOL(dev_base_lock); +#ifdef CONFIG_VE +#define MAX_UNMOVABLE_NETDEVICES (8*4096) +static uint8_t unmovable_ifindex_list[MAX_UNMOVABLE_NETDEVICES/8]; +static LIST_HEAD(dev_global_list); +#endif + #define NETDEV_HASHBITS 8 static struct hlist_head dev_name_head[1<name, name, IFNAMSIZ)) @@ -510,6 +528,32 @@ struct net_device *dev_get_by_name(const } /** + * __dev_global_get_by_name - find a device by its name in dev_global_list + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +#ifdef CONFIG_VE +struct net_device *__dev_global_get_by_name(const char *name) +{ + struct net_device *dev; + /* It's called relatively rarely */ + list_for_each_entry(dev, &dev_global_list, dev_global_list_entry) { + if (strncmp(dev->name, name, IFNAMSIZ) == 0) + return dev; + } + return NULL; +} +#else /* CONFIG_VE */ +#define __dev_global_get_by_name(name) __dev_get_by_name(name) +#endif /* CONFIG_VE */ + +/** * __dev_get_by_index - find a device by its ifindex * @ifindex: index of device * @@ -524,7 +568,7 @@ struct net_device *__dev_get_by_index(in { struct hlist_node *p; - hlist_for_each(p, dev_index_hash(ifindex)) { + hlist_for_each(p, dev_index_hash(ifindex, get_exec_env())) { struct net_device *dev = hlist_entry(p, struct net_device, index_hlist); if (dev->ifindex == ifindex) @@ -651,6 +695,23 @@ int dev_valid_name(const char *name) return 1; } +static inline void __dev_check_name(const char *dev_name, const char *name, + long *inuse, const int max_netdevices) +{ + int i = 0; + char buf[IFNAMSIZ]; + + if (!sscanf(dev_name, name, &i)) + return; + if (i < 0 || i >= max_netdevices) + return; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, sizeof(buf), name, i); + if (!strncmp(buf, dev_name, IFNAMSIZ)) + set_bit(i, inuse); +} + /** * dev_alloc_name - allocate a name for a device * @dev: device @@ -689,16 +750,20 @@ int dev_alloc_name(struct net_device *de if (!inuse) return -ENOMEM; - for (d = dev_base; d; d = d->next) { - if (!sscanf(d->name, name, &i)) - continue; - if (i < 0 || i >= max_netdevices) - continue; - - /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, sizeof(buf), name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) { + list_for_each_entry(d, &dev_global_list, + dev_global_list_entry) { + __dev_check_name(d->name, name, inuse, + max_netdevices); + } + } else +#endif + { + for (d = dev_base; d; d = d->next) { + __dev_check_name(d->name, name, inuse, + max_netdevices); + } } i = find_first_zero_bit(inuse, max_netdevices); @@ -706,7 +771,11 @@ int dev_alloc_name(struct net_device *de } snprintf(buf, sizeof(buf), name, i); - if (!__dev_get_by_name(buf)) { + if (ve_is_super(get_exec_env())) + d = __dev_global_get_by_name(buf); + else + d = __dev_get_by_name(buf); + if (d == NULL) { strlcpy(dev->name, buf, IFNAMSIZ); return i; } @@ -739,13 +808,14 @@ int dev_change_name(struct net_device *d if (!dev_valid_name(newname)) return -EINVAL; + /* Rename of devices in VE is 
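dev_alloc_name() above factors its duplicate detection into __dev_check_name(), which counts an index as taken only when printing the parsed number back through the template reproduces the existing name exactly; that is what the "sscanf is not exact inverse of printf" comment means, and it keeps a name like "eth007" from reserving slot 7. Userspace model:

#include <stdio.h>
#include <string.h>

#define MAX_DEVS (8 * (int)sizeof(unsigned long))

static void check_name(const char *existing, const char *fmt,
		       unsigned long *inuse)
{
	int i = 0;
	char buf[16];

	if (sscanf(existing, fmt, &i) != 1 || i < 0 || i >= MAX_DEVS)
		return;
	snprintf(buf, sizeof(buf), fmt, i);
	if (strcmp(buf, existing) == 0)	/* exact printf inverse only */
		*inuse |= 1UL << i;
}

int main(void)
{
	unsigned long inuse = 0;

	check_name("eth0", "eth%d", &inuse);
	check_name("eth007", "eth%d", &inuse);
	printf("%lx\n", inuse);		/* prints 1: only eth0 counted */
	return 0;
}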
prohibited by CAP_NET_ADMIN */ if (strchr(newname, '%')) { err = dev_alloc_name(dev, newname); if (err < 0) return err; strcpy(newname, dev->name); } - else if (__dev_get_by_name(newname)) + else if (__dev_global_get_by_name(newname)) return -EEXIST; else strlcpy(dev->name, newname, IFNAMSIZ); @@ -753,7 +823,8 @@ int dev_change_name(struct net_device *d err = class_device_rename(&dev->class_dev, dev->name); if (!err) { hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); + hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, + get_exec_env())); raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); } @@ -1020,6 +1091,8 @@ int call_netdevice_notifiers(unsigned lo return raw_notifier_call_chain(&netdev_chain, val, v); } +EXPORT_SYMBOL(call_netdevice_notifiers); + /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); @@ -1476,16 +1549,46 @@ gso: skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); #endif if (q->enqueue) { - /* Grab device queue */ - spin_lock(&dev->queue_lock); + /* + * XXX this code is broken: + * 1) it is activated for normal devices in VE0, + * 2) it doesn't use API functions like ub_skb_set_charge, + * 3) it isn't allowed to charge skb as UB_OTHERSOCKBUF + * if its socket is TCP. + */ +#if 0 + struct user_beancounter *ub; - rc = q->enqueue(skb, q); + ub = netdev_bc(dev)->exec_ub; + /* the skb CAN be already charged if it transmitted via + * something like bonding device */ + if (ub && (skb_bc(skb)->resource == 0)) { + unsigned long chargesize; + chargesize = skb_charge_fullsize(skb); + if (charge_beancounter(ub, UB_OTHERSOCKBUF, + chargesize, UB_SOFT)) { + rcu_read_unlock(); + rc = -ENOMEM; + goto out_kfree_skb; + } + skb_bc(skb)->ub = ub; + skb_bc(skb)->charged = chargesize; + skb_bc(skb)->resource = UB_OTHERSOCKBUF; + } +#endif - qdisc_run(dev); + /* Grab device queue */ + spin_lock(&dev->queue_lock); + q = dev->qdisc; + if (q->enqueue) { + rc = q->enqueue(skb, q); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } spin_unlock(&dev->queue_lock); - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; } /* The device has no queue. Common case for software devices: @@ -1764,6 +1867,7 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *orig_dev; int ret = NET_RX_DROP; unsigned short type; + struct ve_struct *old_env; /* if we've gotten here through NAPI, check netpoll */ if (skb->dev->poll && netpoll_rx(skb)) @@ -1785,6 +1889,17 @@ int netif_receive_skb(struct sk_buff *sk skb->h.raw = skb->nh.raw = skb->data; skb->mac_len = skb->nh.raw - skb->mac.raw; +#ifdef CONFIG_VE + /* + * Skb might be alloced in another VE context, than its device works. + * So, set the correct owner_env. 
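The dev_queue_xmit() rework above re-reads dev->qdisc after taking queue_lock and repeats the q->enqueue test on the locked value: the qdisc observed before the lock may have been swapped in the meantime, so the unlocked check is only a hint. The check-lock-recheck shape, modeled with pthreads:

#include <pthread.h>

struct txq {
	pthread_mutex_t lock;
	int (*enqueue)(void *pkt);	/* NULL when no queue attached */
};

static int xmit(struct txq *q, void *pkt)
{
	int rc = -1;

	if (q->enqueue) {		/* unlocked read: a hint only */
		pthread_mutex_lock(&q->lock);
		if (q->enqueue)		/* recheck now that it is stable */
			rc = q->enqueue(pkt);
		pthread_mutex_unlock(&q->lock);
	}
	return rc;
}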
+ */ + skb->owner_env = skb->dev->owner_env; + BUG_ON(skb->owner_env == NULL); +#endif + + old_env = set_exec_env(skb->owner_env); + pt_prev = NULL; rcu_read_lock(); @@ -1850,6 +1965,7 @@ ncls: out: rcu_read_unlock(); + (void)set_exec_env(old_env); return ret; } @@ -2237,7 +2353,7 @@ static int __init dev_proc_init(void) { int rc = -ENOMEM; - if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) + if (!proc_glob_fops_create("net/dev", S_IRUGO, &dev_seq_fops)) goto out; if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) goto out_dev; @@ -2249,7 +2365,7 @@ out: out_softnet: proc_net_remove("softnet_stat"); out_dev: - proc_net_remove("dev"); + remove_proc_glob_entry("net/dev", NULL); goto out; } #else @@ -2314,9 +2430,12 @@ void dev_set_promiscuity(struct net_devi dev->flags &= ~IFF_PROMISC; else dev->flags |= IFF_PROMISC; + /* Promiscous mode on these devices does not mean anything */ + if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + return; if (dev->flags != old_flags) { dev_mc_upload(dev); - printk(KERN_INFO "device %s %s promiscuous mode\n", + ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); audit_log(current->audit_context, GFP_ATOMIC, @@ -2744,11 +2863,20 @@ int dev_ioctl(unsigned int cmd, void __u * - require strict serialization. * - do not return a value */ + case SIOCSIFMTU: + case SIOCSIFHWADDR: + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCSIFFLAGS: case SIOCSIFMETRIC: - case SIOCSIFMTU: case SIOCSIFMAP: - case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: @@ -2829,20 +2957,73 @@ int dev_ioctl(unsigned int cmd, void __u * dev_new_index - allocate an ifindex * * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the + * number. The caller must hold the rtnl semaphore or the * dev_base_lock to be sure it remains unique. 
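netif_receive_skb() above stamps the skb with its device's owner_env and then runs the protocol handlers inside that environment, restoring the previous one on the way out. A sketch of the save/restore discipline, using a thread-local as a stand-in for the kernel's per-task context pointer:

static __thread void *exec_env;		/* current container context */

static void *set_exec_env(void *new_env)
{
	void *old = exec_env;

	exec_env = new_env;
	return old;
}

static void receive(void *skb_owner_env)
{
	void *old = set_exec_env(skb_owner_env);

	/* ... deliver the packet in the owner's context ... */

	set_exec_env(old);		/* restore on every exit path */
}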
+ * + * Note: dev->name must be valid on entrance */ -static int dev_new_index(void) +static int dev_ve_new_index(void) { - static int ifindex; +#ifdef CONFIG_VE + int *ifindex = &get_exec_env()->ifindex; + int delta = 2; +#else + static int s_ifindex; + int *ifindex = &s_ifindex; + int delta = 1; +#endif for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(ifindex)) - return ifindex; + *ifindex += delta; + if (*ifindex <= 0) + *ifindex = 1; + if (!__dev_get_by_index(*ifindex)) + return *ifindex; } } +#ifdef CONFIG_VE +static int dev_glb_new_index(void) +{ + int i; + + i = find_first_zero_bit((long*)unmovable_ifindex_list, + MAX_UNMOVABLE_NETDEVICES); + + if (i == MAX_UNMOVABLE_NETDEVICES) + return -EMFILE; + + __set_bit(i, (long*)unmovable_ifindex_list); + return (i + 1) * 2; +} +#endif + +static void dev_glb_free_index(struct net_device *dev) +{ +#ifdef CONFIG_VE + int bit; + + bit = dev->ifindex / 2 - 1; + BUG_ON(bit >= MAX_UNMOVABLE_NETDEVICES); + __clear_bit(bit, (long*)unmovable_ifindex_list); +#endif +} + +static int dev_new_index(struct net_device *dev) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) + return dev_glb_new_index(); +#endif + + return dev_ve_new_index(); +} + +static void dev_free_index(struct net_device *dev) +{ + if ((dev->ifindex % 2) == 0) + dev_glb_free_index(dev); +} + static int dev_boot_phase = 1; /* Delayed registration/unregisteration */ @@ -2887,6 +3068,10 @@ int register_netdevice(struct net_device /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + ret = -EPERM; + if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) + goto out; + spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1; @@ -2906,27 +3091,32 @@ int register_netdevice(struct net_device if (ret) { if (ret > 0) ret = -EIO; - goto out_err; + goto out_free_div; } } if (!dev_valid_name(dev->name)) { ret = -EINVAL; - goto out_err; + goto out_free_div; + } + + dev->ifindex = dev_new_index(dev); + if (dev->ifindex < 0) { + ret = dev->ifindex; + goto out_free_div; } - dev->ifindex = dev_new_index(); if (dev->iflink == -1) dev->iflink = dev->ifindex; /* Check for existence of name */ - head = dev_name_hash(dev->name); + head = dev_name_hash(dev->name, get_exec_env()); hlist_for_each(p, head) { struct net_device *d = hlist_entry(p, struct net_device, name_hlist); if (!strncmp(d->name, dev->name, IFNAMSIZ)) { ret = -EEXIST; - goto out_err; + goto out_free_ind; } } @@ -2970,7 +3160,7 @@ int register_netdevice(struct net_device ret = netdev_register_sysfs(dev); if (ret) - goto out_err; + goto out_free_ind; dev->reg_state = NETREG_REGISTERED; /* @@ -2981,12 +3171,20 @@ int register_netdevice(struct net_device set_bit(__LINK_STATE_PRESENT, &dev->state); dev->next = NULL; + dev->owner_env = get_exec_env(); + netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); + netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); dev_init_scheduler(dev); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + list_add_tail(&dev->dev_global_list_entry, &dev_global_list); +#endif write_lock_bh(&dev_base_lock); *dev_tail = dev; dev_tail = &dev->next; hlist_add_head(&dev->name_hlist, head); - hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); + hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, + get_exec_env())); dev_hold(dev); write_unlock_bh(&dev_base_lock); @@ -2997,7 +3195,9 @@ int register_netdevice(struct net_device out: return 
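The ifindex scheme above splits the number space by parity: global "unmovable" devices draw even indexes, (i + 1) * 2, from a bitmap, while per-VE devices advance a private counter in steps of two; dev_free_index() then routes an index back by testing ifindex % 2. A compact model, assuming each VE counter starts at an odd value so the two pools stay disjoint:

#include <limits.h>

#define MAX_GLOBAL 64
static unsigned char used[MAX_GLOBAL / CHAR_BIT];

static int alloc_global_index(void)	/* even: 2, 4, 6, ... */
{
	int i;

	for (i = 0; i < MAX_GLOBAL; i++)
		if (!(used[i / CHAR_BIT] & (1 << (i % CHAR_BIT)))) {
			used[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
			return (i + 1) * 2;
		}
	return -1;			/* -EMFILE in the patch */
}

static int alloc_ve_index(int *last)	/* odd, one counter per VE */
{
	*last += 2;			/* steps of two keep the parity */
	if (*last <= 0)
		*last = 1;		/* wrap back to the first odd index */
	return *last;
}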
ret; -out_err: +out_free_ind: + dev_free_index(dev); +out_free_div: free_divert_blk(dev); goto out; } @@ -3124,6 +3324,7 @@ static DEFINE_MUTEX(net_todo_run_mutex); void netdev_run_todo(void) { struct list_head list; + struct ve_struct *current_env; /* Need to guard against multiple cpu's getting out of order. */ mutex_lock(&net_todo_run_mutex); @@ -3141,6 +3342,7 @@ void netdev_run_todo(void) list_replace_init(&net_todo_list, &list); spin_unlock(&net_todo_list_lock); + current_env = get_exec_env(); while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); @@ -3153,6 +3355,7 @@ void netdev_run_todo(void) continue; } + (void)set_exec_env(dev->owner_env); netdev_unregister_sysfs(dev); dev->reg_state = NETREG_UNREGISTERED; @@ -3164,12 +3367,18 @@ void netdev_run_todo(void) BUG_TRAP(!dev->ip6_ptr); BUG_TRAP(!dev->dn_ptr); + put_beancounter(netdev_bc(dev)->exec_ub); + put_beancounter(netdev_bc(dev)->owner_ub); + netdev_bc(dev)->exec_ub = NULL; + netdev_bc(dev)->owner_ub = NULL; + /* It must be the very last action, * after this 'dev' may point to freed up memory. */ if (dev->destructor) dev->destructor(dev); } + (void)set_exec_env(current_env); out: mutex_unlock(&net_todo_run_mutex); @@ -3195,7 +3404,7 @@ struct net_device *alloc_netdev(int size alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; - p = kzalloc(alloc_size, GFP_KERNEL); + p = ub_kzalloc(alloc_size, GFP_KERNEL); if (!p) { printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); return NULL; @@ -3290,6 +3499,10 @@ int unregister_netdevice(struct net_devi dev_tail = dp; *dp = d->next; write_unlock_bh(&dev_base_lock); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + list_del(&dev->dev_global_list_entry); +#endif break; } } @@ -3323,6 +3536,8 @@ int unregister_netdevice(struct net_devi /* Notifier chain MUST detach us from master device. 
*/ BUG_TRAP(!dev->master); + dev_free_index(dev); + free_divert_blk(dev); /* Finish processing unregister after unlock */ @@ -3558,6 +3773,8 @@ EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(dev_name_hash); +EXPORT_SYMBOL(dev_index_hash); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); diff -uprN linux-2.6.18/net/core/dev_mcast.c linux-2.6.18.ovz/net/core/dev_mcast.c --- linux-2.6.18/net/core/dev_mcast.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/dev_mcast.c 2007-06-13 06:55:07.000000000 -0400 @@ -290,9 +290,10 @@ static struct file_operations dev_mc_seq void __init dev_mcast_init(void) { - proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); + proc_glob_fops_create("net/dev_mcast", 0, &dev_mc_seq_fops); } EXPORT_SYMBOL(dev_mc_add); EXPORT_SYMBOL(dev_mc_delete); EXPORT_SYMBOL(dev_mc_upload); +EXPORT_SYMBOL(dev_mc_discard); diff -uprN linux-2.6.18/net/core/dst.c linux-2.6.18.ovz/net/core/dst.c --- linux-2.6.18/net/core/dst.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/dst.c 2007-06-13 06:55:07.000000000 -0400 @@ -259,11 +259,14 @@ static int dst_dev_event(struct notifier switch (event) { case NETDEV_UNREGISTER: case NETDEV_DOWN: - spin_lock_bh(&dst_lock); + local_bh_disable(); + dst_run_gc(0); + spin_lock(&dst_lock); for (dst = dst_garbage_list; dst; dst = dst->next) { dst_ifdown(dst, dev, event != NETDEV_DOWN); } - spin_unlock_bh(&dst_lock); + spin_unlock(&dst_lock); + local_bh_enable(); break; } return NOTIFY_DONE; diff -uprN linux-2.6.18/net/core/dv.c linux-2.6.18.ovz/net/core/dv.c --- linux-2.6.18/net/core/dv.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/dv.c 2007-06-13 06:55:07.000000000 -0400 @@ -544,3 +544,5 @@ void divert_frame(struct sk_buff *skb) break; } } + +EXPORT_SYMBOL(free_divert_blk); diff -uprN linux-2.6.18/net/core/filter.c linux-2.6.18.ovz/net/core/filter.c --- linux-2.6.18/net/core/filter.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/filter.c 2007-06-13 06:55:07.000000000 -0400 @@ -407,7 +407,7 @@ int sk_attach_filter(struct sock_fprog * if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { diff -uprN linux-2.6.18/net/core/neighbour.c linux-2.6.18.ovz/net/core/neighbour.c --- linux-2.6.18/net/core/neighbour.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/neighbour.c 2007-06-13 06:55:07.000000000 -0400 @@ -33,6 +33,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -242,6 +243,7 @@ static struct neighbour *neigh_alloc(str int entries; entries = atomic_inc_return(&tbl->entries) - 1; + n = ERR_PTR(-ENOBUFS); if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -252,7 +254,7 @@ static struct neighbour *neigh_alloc(str n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); if (!n) - goto out_entries; + goto out_nomem; memset(n, 0, tbl->entry_size); @@ -273,6 +275,8 @@ static struct neighbour *neigh_alloc(str out: return n; +out_nomem: + n = ERR_PTR(-ENOMEM); out_entries: atomic_dec(&tbl->entries); goto out; @@ -385,12 +389,11 @@ struct neighbour *neigh_create(struct ne u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct 
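neigh_alloc() above stops collapsing every failure into NULL and instead encodes the reason in the returned pointer, so neigh_create() can distinguish the table being over its GC thresholds (-ENOBUFS) from a plain allocation failure (-ENOMEM). The ERR_PTR encoding relies on the top page of the address space never holding a valid object; a caller tests IS_ERR(n) and propagates PTR_ERR(n) unchanged:

#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *alloc_entry(int table_full, int oom)
{
	static int object;		/* stands in for a real allocation */

	if (table_full)
		return ERR_PTR(-ENOBUFS);	/* over the GC threshold */
	if (oom)
		return ERR_PTR(-ENOMEM);	/* allocator came up empty */
	return &object;
}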
neighbour *n1, *rc, *n; - if (!n) { - rc = ERR_PTR(-ENOBUFS); + rc = n = neigh_alloc(tbl); + if (IS_ERR(n)) goto out; - } memcpy(n->primary_key, pkey, key_len); n->dev = dev; @@ -636,6 +639,8 @@ static void neigh_periodic_timer(unsigne struct neigh_table *tbl = (struct neigh_table *)arg; struct neighbour *n, **np; unsigned long expire, now = jiffies; + struct ve_struct *env = set_exec_env(tbl->owner_env); + struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); @@ -697,6 +702,8 @@ next_elt: mod_timer(&tbl->gc_timer, now + expire); write_unlock(&tbl->lock); + set_exec_ub(ub); + set_exec_env(env); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -724,6 +731,11 @@ static void neigh_timer_handler(unsigned struct neighbour *neigh = (struct neighbour *)arg; unsigned state; int notify = 0; + struct ve_struct *env; + struct user_beancounter *ub; + + env = set_exec_env(neigh->dev->owner_env); + ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); write_lock(&neigh->lock); @@ -830,6 +842,8 @@ out: neigh_app_notify(neigh); #endif neigh_release(neigh); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) @@ -1207,6 +1221,9 @@ static void neigh_proxy_process(unsigned unsigned long now = jiffies; struct sk_buff *skb; + struct ve_struct *env = set_exec_env(tbl->owner_env); + struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); + spin_lock(&tbl->proxy_queue.lock); skb = tbl->proxy_queue.next; @@ -1218,6 +1235,7 @@ static void neigh_proxy_process(unsigned skb = skb->next; if (tdif <= 0) { struct net_device *dev = back->dev; + __skb_unlink(back, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) tbl->proxy_redo(back); @@ -1225,6 +1243,7 @@ static void neigh_proxy_process(unsigned kfree_skb(back); dev_put(dev); + } else if (!sched_next || tdif < sched_next) sched_next = tdif; } @@ -1232,6 +1251,8 @@ static void neigh_proxy_process(unsigned if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, @@ -1327,12 +1348,17 @@ void neigh_parms_destroy(struct neigh_pa kfree(parms); } -void neigh_table_init_no_netlink(struct neigh_table *tbl) +struct lock_class_key neigh_table_proxy_queue_class; + +int neigh_table_init_no_netlink(struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + atomic_set(&tbl->entries, 0); + tbl->hash_chain_gc = 0; atomic_set(&tbl->parms.refcnt, 1); + tbl->parms.next = NULL; INIT_RCU_HEAD(&tbl->parms.rcu_head); tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); @@ -1340,22 +1366,30 @@ void neigh_table_init_no_netlink(struct if (!tbl->kmem_cachep) tbl->kmem_cachep = kmem_cache_create(tbl->id, tbl->entry_size, - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); if (!tbl->kmem_cachep) - panic("cannot create neighbour cache"); + return -ENOMEM; + + tbl->owner_env = get_ve(get_exec_env()); + tbl->owner_ub = get_beancounter(get_exec_ub()); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) - panic("cannot create neighbour cache statistics"); + goto out; #ifdef CONFIG_PROC_FS - tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); - if (!tbl->pde) - panic("cannot create neighbour proc dir entry"); - tbl->pde->proc_fops = &neigh_stat_seq_fops; - tbl->pde->data = tbl; + if 
(ve_is_super(get_exec_env())) { + char name[strlen(tbl->id) + sizeof("net/stat/")]; + strcpy(name, "net/stat/"); + strcat(name, tbl->id); + tbl->pde = create_proc_glob_entry(name, S_IRUGO, NULL); + if (tbl->pde) { + tbl->pde->proc_fops = &neigh_stat_seq_fops; + tbl->pde->data = tbl; + } + } #endif tbl->hash_mask = 1; @@ -1365,7 +1399,7 @@ void neigh_table_init_no_netlink(struct tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->hash_buckets || !tbl->phash_buckets) - panic("cannot allocate neighbour cache hashes"); + goto nomem; get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); @@ -1379,19 +1413,44 @@ void neigh_table_init_no_netlink(struct init_timer(&tbl->proxy_timer); tbl->proxy_timer.data = (unsigned long)tbl; tbl->proxy_timer.function = neigh_proxy_process; - skb_queue_head_init(&tbl->proxy_queue); + skb_queue_head_init_class(&tbl->proxy_queue, + &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; + return 0; + +nomem: + if (tbl->hash_buckets) { + neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); + tbl->hash_buckets = NULL; + } + if (tbl->phash_buckets) { + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + } + if (tbl->stats) { + free_percpu(tbl->stats); + tbl->stats = NULL; + } +out: + put_beancounter(tbl->owner_ub); + put_ve(tbl->owner_env); + return -ENOMEM; } -void neigh_table_init(struct neigh_table *tbl) +int neigh_table_init(struct neigh_table *tbl) { struct neigh_table *tmp; + int err; - neigh_table_init_no_netlink(tbl); + err = neigh_table_init_no_netlink(tbl); + if (err) + return err; write_lock(&neigh_tbl_lock); for (tmp = neigh_tables; tmp; tmp = tmp->next) { + if (!ve_accessible_strict(tmp->owner_env, get_exec_env())) + continue; if (tmp->family == tbl->family) break; } @@ -1404,6 +1463,7 @@ void neigh_table_init(struct neigh_table "family %d\n", tbl->family); dump_stack(); } + return 0; } int neigh_table_clear(struct neigh_table *tbl) @@ -1417,6 +1477,15 @@ int neigh_table_clear(struct neigh_table neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) printk(KERN_CRIT "neighbour leakage\n"); +#ifdef CONFIG_PROC_FS + if (ve_is_super(get_exec_env())) { + char name[strlen(tbl->id) + sizeof("net/stat/")]; + strcpy(name, "net/stat/"); + strcat(name, tbl->id); + remove_proc_glob_entry(name, NULL); + } +#endif + write_lock(&neigh_tbl_lock); for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { if (*tp == tbl) { @@ -1435,6 +1504,9 @@ int neigh_table_clear(struct neigh_table free_percpu(tbl->stats); tbl->stats = NULL; + put_beancounter(tbl->owner_ub); + put_ve(tbl->owner_env); + return 0; } @@ -1457,6 +1529,8 @@ int neigh_delete(struct sk_buff *skb, st if (tbl->family != ndm->ndm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; read_unlock(&neigh_tbl_lock); err = -EINVAL; @@ -1510,6 +1584,8 @@ int neigh_add(struct sk_buff *skb, struc if (tbl->family != ndm->ndm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; read_unlock(&neigh_tbl_lock); err = -EINVAL; @@ -1742,6 +1818,9 @@ int neightbl_set(struct sk_buff *skb, st if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; + if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id)) break; } @@ -1963,6 +2042,8 @@ int neigh_dump_info(struct sk_buff *skb, s_t = cb->args[0]; for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + if (!ve_accessible_strict(tbl->owner_env, 
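neigh_table_init_no_netlink() above is converted from panicking on any allocation failure to returning -ENOMEM, and its nomem path both frees every partially constructed piece and NULLs the pointers, so a later neigh_table_clear() cannot free them twice. The shape of that conversion:

#include <stdlib.h>
#include <errno.h>

struct table {
	void *hash;
	void *stats;
};

static int table_init(struct table *t)
{
	t->hash = calloc(1, 64);
	t->stats = calloc(1, 64);
	if (!t->hash || !t->stats)
		goto nomem;
	return 0;			/* was: panic() on failure */

nomem:
	free(t->hash);
	t->hash = NULL;			/* a later clear sees clean state */
	free(t->stats);
	t->stats = NULL;
	return -ENOMEM;
}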
get_exec_env())) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) @@ -2552,11 +2633,12 @@ int neigh_sysctl_register(struct net_dev int p_id, int pdev_id, char *p_name, proc_handler *handler, ctl_handler *strategy) { - struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + struct neigh_sysctl_table *t; const char *dev_name_source = NULL; char *dev_name = NULL; int err = 0; + t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) return -ENOBUFS; memcpy(t, &neigh_sysctl_template, sizeof(*t)); diff -uprN linux-2.6.18/net/core/net-sysfs.c linux-2.6.18.ovz/net/core/net-sysfs.c --- linux-2.6.18/net/core/net-sysfs.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/net-sysfs.c 2007-06-13 06:55:07.000000000 -0400 @@ -433,7 +433,7 @@ static void netdev_release(struct class_ kfree((char *)dev - dev->padded); } -static struct class net_class = { +struct class net_class = { .name = "net", .release = netdev_release, .class_dev_attrs = net_class_attributes, @@ -441,6 +441,13 @@ static struct class net_class = { .uevent = netdev_uevent, #endif }; +EXPORT_SYMBOL(net_class); + +#ifndef CONFIG_VE +#define visible_net_class net_class +#else +#define visible_net_class (*get_exec_env()->net_class) +#endif void netdev_unregister_sysfs(struct net_device * net) { @@ -454,7 +461,7 @@ int netdev_register_sysfs(struct net_dev struct attribute_group **groups = net->sysfs_groups; class_device_initialize(class_dev); - class_dev->class = &net_class; + class_dev->class = &visible_net_class; class_dev->class_data = net; class_dev->groups = groups; @@ -473,7 +480,15 @@ int netdev_register_sysfs(struct net_dev return class_device_add(class_dev); } +void prepare_sysfs_netdev(void) +{ +#ifdef CONFIG_VE + get_ve0()->net_class = &net_class; +#endif +} + int netdev_sysfs_init(void) { + prepare_sysfs_netdev(); return class_register(&net_class); } diff -uprN linux-2.6.18/net/core/rtnetlink.c linux-2.6.18.ovz/net/core/rtnetlink.c --- linux-2.6.18/net/core/rtnetlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/rtnetlink.c 2007-06-13 06:55:07.000000000 -0400 @@ -595,6 +595,8 @@ static int rtnetlink_dump_all(struct sk_ if (rtnetlink_links[idx] == NULL || rtnetlink_links[idx][type].dumpit == NULL) continue; + if (vz_security_proto_check(idx, 0, 0)) + continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); if (rtnetlink_links[idx][type].dumpit(skb, cb)) @@ -662,7 +664,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s return 0; family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) { + if (family >= NPROTO || vz_security_proto_check(family, 0, 0)) { *errp = -EAFNOSUPPORT; return -1; } @@ -675,7 +677,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) { + if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN)) { *errp = -EPERM; return -1; } diff -uprN linux-2.6.18/net/core/scm.c linux-2.6.18.ovz/net/core/scm.c --- linux-2.6.18/net/core/scm.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/scm.c 2007-06-13 06:55:07.000000000 -0400 @@ -34,6 +34,7 @@ #include #include +#include /* * Only allow a user to send credentials, that they could set with @@ -42,7 +43,9 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == virt_tgid(current) || + creds->pid == current->tgid || + capable(CAP_VE_SYS_ADMIN)) && ((creds->uid == 
current->uid || creds->uid == current->euid || creds->uid == current->suid) || capable(CAP_SETUID)) && ((creds->gid == current->gid || creds->gid == current->egid || @@ -69,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *c if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -275,7 +278,7 @@ struct scm_fp_list *scm_fp_dup(struct sc if (!fpl) return NULL; - new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + new_fpl = ub_kmalloc(sizeof(*fpl), GFP_KERNEL); if (new_fpl) { for (i=fpl->count-1; i>=0; i--) get_file(fpl->fp[i]); diff -uprN linux-2.6.18/net/core/skbuff.c linux-2.6.18.ovz/net/core/skbuff.c --- linux-2.6.18/net/core/skbuff.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/skbuff.c 2007-06-13 06:55:07.000000000 -0400 @@ -47,6 +47,7 @@ #include #include #include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -67,6 +68,8 @@ #include #include +#include + static kmem_cache_t *skbuff_head_cache __read_mostly; static kmem_cache_t *skbuff_fclone_cache __read_mostly; @@ -154,6 +157,9 @@ struct sk_buff *__alloc_skb(unsigned int if (!skb) goto out; + if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) + goto nobc; + /* Get the DATA. Size must match skb_add_mtu(). */ size = SKB_DATA_ALIGN(size); data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); @@ -167,6 +173,7 @@ struct sk_buff *__alloc_skb(unsigned int skb->data = data; skb->tail = data; skb->end = data + size; + skb->owner_env = get_exec_env(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -189,6 +196,8 @@ struct sk_buff *__alloc_skb(unsigned int out: return skb; nodata: + ub_skb_free_bc(skb); +nobc: kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -221,6 +230,9 @@ struct sk_buff *alloc_skb_from_cache(kme if (!skb) goto out; + if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) + goto nobc; + /* Get the DATA. 
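__alloc_skb() above charges the skb against its beancounter before allocating the data area, and the error labels unwind in exact reverse order: a failed data allocation releases the charge (nodata falls through ub_skb_free_bc), while a failed charge frees only the head (nobc). The same two-step acquire/unwind in miniature, with a toy counter standing in for the beancounter calls:

#include <stdlib.h>

struct counter { long held, limit; };
struct obj { void *data; };

static int charge(struct counter *c)	/* model of ub_skb_alloc_bc() */
{
	if (c->held >= c->limit)
		return -1;
	c->held++;
	return 0;
}

static void uncharge(struct counter *c) { c->held--; }

static struct obj *alloc_charged(struct counter *c, size_t size)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;
	if (charge(c))
		goto nobc;		/* nothing charged yet */
	o->data = malloc(size);
	if (!o->data)
		goto nodata;
	return o;

nodata:
	uncharge(c);			/* undo strictly in reverse order */
nobc:
	free(o);
	return NULL;
}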
*/ size = SKB_DATA_ALIGN(size); data = kmem_cache_alloc(cp, gfp_mask); @@ -234,6 +246,7 @@ struct sk_buff *alloc_skb_from_cache(kme skb->data = data; skb->tail = data; skb->end = data + size; + skb->owner_env = get_exec_env(); atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; @@ -244,6 +257,8 @@ struct sk_buff *alloc_skb_from_cache(kme out: return skb; nodata: + ub_skb_free_bc(skb); +nobc: kmem_cache_free(skbuff_head_cache, skb); skb = NULL; goto out; @@ -328,6 +343,7 @@ void kfree_skbmem(struct sk_buff *skb) atomic_t *fclone_ref; skb_release_data(skb); + ub_skb_free_bc(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); @@ -369,6 +385,7 @@ void __kfree_skb(struct sk_buff *skb) #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif + ub_skb_uncharge(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -442,6 +459,11 @@ struct sk_buff *skb_clone(struct sk_buff n->fclone = SKB_FCLONE_UNAVAILABLE; } + if (ub_skb_alloc_bc(n, gfp_mask)) { + kmem_cache_free(skbuff_head_cache, n); + return NULL; + } + #define C(x) n->x = skb->x n->next = n->prev = NULL; @@ -471,6 +493,7 @@ struct sk_buff *skb_clone(struct sk_buff C(ipvs_property); #endif C(protocol); + C(owner_env); n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); @@ -638,6 +661,7 @@ struct sk_buff *pskb_copy(struct sk_buff n->csum = skb->csum; n->ip_summed = skb->ip_summed; + n->truesize += skb->data_len; n->data_len = skb->data_len; n->len = skb->len; @@ -1945,7 +1969,7 @@ struct sk_buff *skb_segment(struct sk_bu do { struct sk_buff *nskb; skb_frag_t *frag; - int hsize, nsize; + int hsize; int k; int size; @@ -1956,11 +1980,10 @@ struct sk_buff *skb_segment(struct sk_bu hsize = skb_headlen(skb) - offset; if (hsize < 0) hsize = 0; - nsize = hsize + doffset; - if (nsize > len + doffset || !sg) - nsize = len + doffset; + if (hsize > len || !sg) + hsize = len; - nskb = alloc_skb(nsize + headroom, GFP_ATOMIC); + nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC); if (unlikely(!nskb)) goto err; diff -uprN linux-2.6.18/net/core/sock.c linux-2.6.18.ovz/net/core/sock.c --- linux-2.6.18/net/core/sock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,9 @@ #include #include +#include +#include + #include #ifdef CONFIG_INET @@ -203,7 +207,20 @@ static int sock_set_timeout(long *timeo_ return -EINVAL; if (copy_from_user(&tv, optval, sizeof(tv))) return -EFAULT; + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) + return -EDOM; + + if (tv.tv_sec < 0) { + static int warned; + *timeo_p = 0; + if (warned < 10 && net_ratelimit()) + warned++; + ve_printk(VE_LOG, KERN_INFO "sock_set_timeout: " + "`%s' (pid %d) tries to set negative timeout\n", + current->comm, current->pid); + return 0; + } *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; @@ -218,7 +235,7 @@ static void sock_warn_obsolete_bsdism(co static char warncomm[TASK_COMM_LEN]; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING "process `%s' is using obsolete " + ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " "%s SO_BSDCOMPAT\n", warncomm, name); warned++; } @@ -247,6 +264,10 @@ int sock_queue_rcv_skb(struct sock *sk, goto out; } + err = ub_sockrcvbuf_charge(sk, skb); + if (err < 0) + goto out; + /* It would be deadlock, if sock_queue_rcv_skb is 
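sock_set_timeout() above gains input validation: a tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM, and a negative tv_sec is treated as an immediate timeout, with a rate-limited warning, rather than being misread as an enormous one. Reduced to its checks:

#include <sys/time.h>
#include <errno.h>

#define USEC_PER_SEC 1000000L

static int set_timeout(long *timeo, const struct timeval *tv)
{
	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
		return -EDOM;		/* nonsense microseconds */
	if (tv->tv_sec < 0) {
		*timeo = 0;		/* expire at once, not "never" */
		return 0;		/* the kernel also logs, ratelimited */
	}
	/* normal seconds-to-ticks conversion elided */
	return 0;
}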
used with socket lock! We assume that users of this function are lock free. @@ -858,6 +879,7 @@ struct sock *sk_alloc(int family, gfp_t */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); + sk->owner_env = get_exec_env(); } if (security_sk_alloc(sk, family, priority)) @@ -897,6 +919,7 @@ void sk_free(struct sock *sk) __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); security_sk_free(sk); + ub_sock_uncharge(sk); if (sk->sk_prot_creator->slab != NULL) kmem_cache_free(sk->sk_prot_creator->slab, sk); else @@ -946,14 +969,11 @@ struct sock *sk_clone(const struct sock if (filter != NULL) sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - newsk = NULL; - goto out; - } + if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) + goto out_err; + + if (unlikely(xfrm_sk_clone_policy(newsk))) + goto out_err; newsk->sk_err = 0; newsk->sk_priority = 0; @@ -977,8 +997,15 @@ struct sock *sk_clone(const struct sock if (newsk->sk_prot->sockets_allocated) atomic_inc(newsk->sk_prot->sockets_allocated); } -out: return newsk; + +out_err: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sock_reset_flag(newsk, SOCK_TIMESTAMP); + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; } EXPORT_SYMBOL_GPL(sk_clone); @@ -1138,11 +1165,9 @@ static long sock_wait_for_wmem(struct so /* * Generic send/receive buffer handlers */ - -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned long data_len, - int noblock, int *errcode) +struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, + unsigned long size2, int noblock, + int *errcode) { struct sk_buff *skb; gfp_t gfp_mask; @@ -1163,46 +1188,35 @@ static struct sk_buff *sock_alloc_send_p if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, sk->sk_allocation); - if (skb) { - int npages; - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - skb_frag_t *frag; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); - data_len -= PAGE_SIZE; - } + if (ub_sock_getwres_other(sk, skb_charge_size(size))) { + if (size2 < size) { + size = size2; + continue; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = ub_sock_wait_for_space(sk, timeo, + skb_charge_size(size)); + continue; + } + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(size, gfp_mask); + if (skb) /* Full success... 
*/ break; - } + ub_sock_retwres_other(sk, skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); err = -ENOBUFS; goto failure; } + ub_sock_retwres_other(sk, + skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; @@ -1213,6 +1227,7 @@ static struct sk_buff *sock_alloc_send_p timeo = sock_wait_for_wmem(sk, timeo); } + ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); skb_set_owner_w(skb, sk); return skb; @@ -1223,10 +1238,12 @@ failure: return NULL; } +EXPORT_SYMBOL(sock_alloc_send_skb2); + struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_skb2(sk, size, size, noblock, errcode); } static void __lock_sock(struct sock *sk) @@ -1709,7 +1726,8 @@ int proto_register(struct proto *prot, i if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", @@ -1725,9 +1743,11 @@ int proto_register(struct proto *prot, i goto out_free_sock_slab; sprintf(request_sock_slab_name, mask, prot->name); - prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, - prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + prot->rsk_prot->slab = + kmem_cache_create(request_sock_slab_name, + prot->rsk_prot->obj_size, 0, + SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", @@ -1748,7 +1768,7 @@ int proto_register(struct proto *prot, i prot->twsk_prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, prot->twsk_prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; diff -uprN linux-2.6.18/net/core/stream.c linux-2.6.18.ovz/net/core/stream.c --- linux-2.6.18/net/core/stream.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/core/stream.c 2007-06-13 06:55:07.000000000 -0400 @@ -111,8 +111,10 @@ EXPORT_SYMBOL(sk_stream_wait_close); * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long + * @amount - amount of memory to wait for (in UB space!) 
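sock_alloc_send_skb2() above accepts a preferred and a minimum size: when the beancounter cannot cover the preferred charge, the request is retried once at the minimum before the caller blocks or takes -EAGAIN. The decision loop, with the accounting calls reduced to stubs:

#include <stddef.h>

static int reserve(size_t size) { (void)size; return 0; }	/* stub */
static int wait_for_space(void) { return -1; }			/* stub */

static int get_send_space(size_t size, size_t min_size)
{
	for (;;) {
		if (reserve(size) == 0)
			return 0;	/* charged: go allocate the skb */
		if (min_size < size) {
			size = min_size;	/* shrink once, retry now */
			continue;
		}
		if (wait_for_space() < 0)
			return -1;	/* timeout, signal, or -EAGAIN */
	}
}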
*/ -int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount) { int err = 0; long vm_wait = 0; @@ -134,8 +136,11 @@ int sk_stream_wait_memory(struct sock *s if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - if (sk_stream_memory_free(sk) && !vm_wait) - break; + if (amount == 0) { + if (sk_stream_memory_free(sk) && !vm_wait) + break; + } else + ub_sock_sndqueueadd_tcp(sk, amount); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; @@ -144,6 +149,8 @@ int sk_stream_wait_memory(struct sock *s sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; + if (amount > 0) + ub_sock_sndqueuedel(sk); if (vm_wait) { vm_wait -= current_timeo; @@ -170,6 +177,10 @@ do_interrupted: goto out; } +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + return __sk_stream_wait_memory(sk, timeo_p, 0); +} EXPORT_SYMBOL(sk_stream_wait_memory); void sk_stream_rfree(struct sk_buff *skb) diff -uprN linux-2.6.18/net/dccp/ipv6.c linux-2.6.18.ovz/net/dccp/ipv6.c --- linux-2.6.18/net/dccp/ipv6.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/dccp/ipv6.c 2007-06-13 06:55:07.000000000 -0400 @@ -276,7 +276,7 @@ static void dccp_v6_err(struct sk_buff * __u64 seq; sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport, - &hdr->saddr, dh->dccph_sport, skb->dev->ifindex); + &hdr->saddr, dh->dccph_sport, inet6_iif(skb)); if (sk == NULL) { ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -866,6 +866,8 @@ static struct sock *dccp_v6_request_recv __ip6_dst_store(newsk, dst, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + if (!sysctl_tcp_use_sg) + newsk->sk_route_caps &= ~NETIF_F_SG; newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; diff -uprN linux-2.6.18/net/dccp/minisocks.c linux-2.6.18.ovz/net/dccp/minisocks.c --- linux-2.6.18/net/dccp/minisocks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/dccp/minisocks.c 2007-06-13 06:55:07.000000000 -0400 @@ -18,6 +18,8 @@ #include #include +#include + #include "ackvec.h" #include "ccid.h" #include "dccp.h" @@ -46,7 +48,8 @@ void dccp_time_wait(struct sock *sk, int { struct inet_timewait_sock *tw = NULL; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &dccp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { diff -uprN linux-2.6.18/net/decnet/af_decnet.c linux-2.6.18.ovz/net/decnet/af_decnet.c --- linux-2.6.18/net/decnet/af_decnet.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/decnet/af_decnet.c 2007-06-13 06:55:07.000000000 -0400 @@ -1177,8 +1177,10 @@ static int dn_getname(struct socket *soc if (peer) { if ((sock->state != SS_CONNECTED && sock->state != SS_CONNECTING) && - scp->accept_mode == ACC_IMMED) + scp->accept_mode == ACC_IMMED) { + release_sock(sk); return -ENOTCONN; + } memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); } else { diff -uprN linux-2.6.18/net/decnet/netfilter/dn_rtmsg.c linux-2.6.18.ovz/net/decnet/netfilter/dn_rtmsg.c --- linux-2.6.18/net/decnet/netfilter/dn_rtmsg.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/decnet/netfilter/dn_rtmsg.c 2007-06-13 06:55:07.000000000 -0400 @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_sk if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 
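The dn_getname() fix above cures a lock-balance bug: the ACC_IMMED error path returned -ENOTCONN while still holding the socket lock taken on entry. Every early exit must release what the prologue acquired; the canonical shape, with a pthread mutex standing in for lock_sock()/release_sock():

#include <pthread.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
static int connected;

static int getname(void)
{
	int err = 0;

	pthread_mutex_lock(&sock_lock);
	if (!connected) {
		err = -1;		/* -ENOTCONN in the kernel */
		goto out;		/* never a bare return while locked */
	}
	/* ... copy the peer address ... */
out:
	pthread_mutex_unlock(&sock_lock);
	return err;
}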
return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff -uprN linux-2.6.18/net/ieee80211/softmac/ieee80211softmac_io.c linux-2.6.18.ovz/net/ieee80211/softmac/ieee80211softmac_io.c --- linux-2.6.18/net/ieee80211/softmac/ieee80211softmac_io.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ieee80211/softmac/ieee80211softmac_io.c 2007-06-13 06:55:07.000000000 -0400 @@ -304,7 +304,7 @@ ieee80211softmac_auth(struct ieee80211_a 2 + /* Auth Transaction Seq */ 2 + /* Status Code */ /* Challenge Text IE */ - is_shared_response ? 0 : 1 + 1 + net->challenge_len + (is_shared_response ? 1 + 1 + net->challenge_len : 0) ); if (unlikely((*pkt) == NULL)) return 0; diff -uprN linux-2.6.18/net/ieee80211/softmac/ieee80211softmac_scan.c linux-2.6.18.ovz/net/ieee80211/softmac/ieee80211softmac_scan.c --- linux-2.6.18/net/ieee80211/softmac/ieee80211softmac_scan.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ieee80211/softmac/ieee80211softmac_scan.c 2007-06-13 06:55:07.000000000 -0400 @@ -47,7 +47,6 @@ ieee80211softmac_start_scan(struct ieee8 sm->scanning = 1; spin_unlock_irqrestore(&sm->lock, flags); - netif_tx_disable(sm->ieee->dev); ret = sm->start_scan(sm->dev); if (ret) { spin_lock_irqsave(&sm->lock, flags); @@ -248,7 +247,6 @@ void ieee80211softmac_scan_finished(stru if (net) sm->set_channel(sm->dev, net->channel); } - netif_wake_queue(sm->ieee->dev); ieee80211softmac_call_events(sm, IEEE80211SOFTMAC_EVENT_SCAN_FINISHED, NULL); } EXPORT_SYMBOL_GPL(ieee80211softmac_scan_finished); diff -uprN linux-2.6.18/net/ipv4/af_inet.c linux-2.6.18.ovz/net/ipv4/af_inet.c --- linux-2.6.18/net/ipv4/af_inet.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/af_inet.c 2007-06-13 06:55:07.000000000 -0400 @@ -115,6 +115,7 @@ #ifdef CONFIG_IP_MROUTE #include #endif +#include DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; @@ -299,6 +300,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. 
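The ieee80211softmac_auth() fix above is pure operator precedence: ?: binds more loosely than +, so "base + cond ? a : b" parses as "(base + cond) ? a : b" and the challenge-text length never entered the sum (the rewrite also puts the challenge bytes on the correct branch). A demonstration:

#include <stdio.h>

int main(void)
{
	int shared = 1, challenge_len = 128;

	int wrong = 2 + shared ? 0 : 1 + 1 + challenge_len;
	int right = 2 + (shared ? 1 + 1 + challenge_len : 0);

	/* wrong is 0: (2 + 1) is truthy, so the 0 branch is taken;
	 * right is 132, the intended frame length */
	printf("wrong=%d right=%d\n", wrong, right);
	return 0;
}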
otherwise sk will be uncharged to wrong resource + */ + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) @@ -356,6 +364,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -370,6 +381,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + struct ve_struct *saved_env; + + saved_env = set_exec_env(sk->owner_env); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -387,6 +401,8 @@ int inet_release(struct socket *sock) timeout = sk->sk_lingertime; sock->sk = NULL; sk->sk_prot->close(sk, timeout); + + (void)set_exec_env(saved_env); } return 0; } @@ -1213,20 +1229,20 @@ static struct net_protocol icmp_protocol static int __init init_ipv4_mibs(void) { - net_statistics[0] = alloc_percpu(struct linux_mib); - net_statistics[1] = alloc_percpu(struct linux_mib); - ip_statistics[0] = alloc_percpu(struct ipstats_mib); - ip_statistics[1] = alloc_percpu(struct ipstats_mib); - icmp_statistics[0] = alloc_percpu(struct icmp_mib); - icmp_statistics[1] = alloc_percpu(struct icmp_mib); - tcp_statistics[0] = alloc_percpu(struct tcp_mib); - tcp_statistics[1] = alloc_percpu(struct tcp_mib); - udp_statistics[0] = alloc_percpu(struct udp_mib); - udp_statistics[1] = alloc_percpu(struct udp_mib); + ve_net_statistics[0] = alloc_percpu(struct linux_mib); + ve_net_statistics[1] = alloc_percpu(struct linux_mib); + ve_ip_statistics[0] = alloc_percpu(struct ipstats_mib); + ve_ip_statistics[1] = alloc_percpu(struct ipstats_mib); + ve_icmp_statistics[0] = alloc_percpu(struct icmp_mib); + ve_icmp_statistics[1] = alloc_percpu(struct icmp_mib); + ve_tcp_statistics[0] = alloc_percpu(struct tcp_mib); + ve_tcp_statistics[1] = alloc_percpu(struct tcp_mib); + ve_udp_statistics[0] = alloc_percpu(struct udp_mib); + ve_udp_statistics[1] = alloc_percpu(struct udp_mib); if (! 
- (net_statistics[0] && net_statistics[1] && ip_statistics[0] - && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] - && udp_statistics[0] && udp_statistics[1])) + (ve_net_statistics[0] && ve_net_statistics[1] && ve_ip_statistics[0] + && ve_ip_statistics[1] && ve_tcp_statistics[0] && ve_tcp_statistics[1] + && ve_udp_statistics[0] && ve_udp_statistics[1])) return -ENOMEM; (void) tcp_mib_init(); diff -uprN linux-2.6.18/net/ipv4/arp.c linux-2.6.18.ovz/net/ipv4/arp.c --- linux-2.6.18/net/ipv4/arp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/arp.c 2007-06-13 06:55:07.000000000 -0400 @@ -174,7 +174,7 @@ struct neigh_ops arp_broken_ops = { .queue_xmit = dev_queue_xmit, }; -struct neigh_table arp_tbl = { +struct neigh_table global_arp_tbl = { .family = AF_INET, .entry_size = sizeof(struct neighbour) + 4, .key_len = 4, @@ -183,7 +183,7 @@ struct neigh_table arp_tbl = { .proxy_redo = parp_redo, .id = "arp_cache", .parms = { - .tbl = &arp_tbl, + .tbl = &global_arp_tbl, .base_reachable_time = 30 * HZ, .retrans_time = 1 * HZ, .gc_staletime = 60 * HZ, @@ -919,6 +919,9 @@ out: static void parp_redo(struct sk_buff *skb) { +#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETFILTER_DEBUG) + skb->nf_debug = 0; +#endif arp_process(skb); } @@ -988,7 +991,7 @@ static int arp_req_set(struct arpreq *r, return 0; } if (dev == NULL) { - ipv4_devconf.proxy_arp = 1; + ve_ipv4_devconf.proxy_arp = 1; return 0; } if (__in_dev_get_rtnl(dev)) { @@ -1094,7 +1097,7 @@ static int arp_req_delete(struct arpreq return pneigh_delete(&arp_tbl, &ip, dev); if (mask == 0) { if (dev == NULL) { - ipv4_devconf.proxy_arp = 0; + ve_ipv4_devconf.proxy_arp = 0; return 0; } if (__in_dev_get_rtnl(dev)) { @@ -1142,7 +1145,8 @@ int arp_ioctl(unsigned int cmd, void __u switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); @@ -1240,7 +1244,9 @@ static int arp_proc_init(void); void __init arp_init(void) { - neigh_table_init(&arp_tbl); + get_ve0()->ve_arp_tbl = &global_arp_tbl; + if (neigh_table_init(&arp_tbl)) + panic("cannot initialize ARP tables\n"); dev_add_pack(&arp_packet_type); arp_proc_init(); @@ -1372,8 +1378,9 @@ static int arp_seq_open(struct inode *in { struct seq_file *seq; int rc = -ENOMEM; - struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - + struct neigh_seq_state *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) goto out; @@ -1400,7 +1407,7 @@ static struct file_operations arp_seq_fo static int __init arp_proc_init(void) { - if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) + if (!proc_glob_fops_create("net/arp", S_IRUGO, &arp_seq_fops)) return -ENOMEM; return 0; } @@ -1419,8 +1426,55 @@ EXPORT_SYMBOL(arp_find); EXPORT_SYMBOL(arp_create); EXPORT_SYMBOL(arp_xmit); EXPORT_SYMBOL(arp_send); -EXPORT_SYMBOL(arp_tbl); +EXPORT_SYMBOL(global_arp_tbl); #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) EXPORT_SYMBOL(clip_tbl_hook); #endif + +#ifdef CONFIG_VE +int ve_arp_init(struct ve_struct *ve) +{ + struct ve_struct *old_env; + int err; + + ve->ve_arp_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL); + if (ve->ve_arp_tbl == NULL) + return -ENOMEM; + + *(ve->ve_arp_tbl) = global_arp_tbl; + ve->ve_arp_tbl->parms.tbl = ve->ve_arp_tbl; + old_env = set_exec_env(ve); + err = neigh_table_init(ve->ve_arp_tbl); + if (err) + goto out_free; +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, 
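ve_arp_init() above builds a per-VE ARP table by struct-copying the global template and then re-aiming the self-referential parms.tbl pointer at the copy before initializing it in the new VE's context; without that fixup the clone's parameters would still point at the shared global table. The copy-and-fixup step in isolation:

#include <stdlib.h>

struct table;
struct parms { struct table *tbl; };	/* self-reference, as in arp_tbl */
struct table { struct parms parms; };

static struct table template_tbl = { { &template_tbl } };

static struct table *clone_table(void)
{
	struct table *t = malloc(sizeof(*t));

	if (!t)
		return NULL;
	*t = template_tbl;		/* struct copy of the template */
	t->parms.tbl = t;		/* re-aim the back-pointer */
	return t;
}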
NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); +#endif + err = 0; + +out: + set_exec_env(old_env); + return err; + +out_free: + kfree(ve->ve_arp_tbl); + ve->ve_arp_tbl = NULL; + goto out; +} +EXPORT_SYMBOL(ve_arp_init); + +void ve_arp_fini(struct ve_struct *ve) +{ + if (ve->ve_arp_tbl) { +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&ve->ve_arp_tbl->parms); +#endif + neigh_table_clear(ve->ve_arp_tbl); + kfree(ve->ve_arp_tbl); + ve->ve_arp_tbl = NULL; + } +} +EXPORT_SYMBOL(ve_arp_fini); +#endif /* CONFIG_VE */ diff -uprN linux-2.6.18/net/ipv4/devinet.c linux-2.6.18.ovz/net/ipv4/devinet.c --- linux-2.6.18/net/ipv4/devinet.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/devinet.c 2007-06-13 06:55:07.000000000 -0400 @@ -70,7 +70,7 @@ struct ipv4_devconf ipv4_devconf = { .shared_media = 1, }; -static struct ipv4_devconf ipv4_devconf_dflt = { +struct ipv4_devconf ipv4_devconf_dflt = { .accept_redirects = 1, .send_redirects = 1, .secure_redirects = 1, @@ -78,10 +78,16 @@ static struct ipv4_devconf ipv4_devconf_ .accept_source_route = 1, }; +#ifdef CONFIG_VE +#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt)) +#else +#define ve_ipv4_devconf_dflt ipv4_devconf_dflt +#endif + static void rtmsg_ifa(int event, struct in_ifaddr *); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL static void devinet_sysctl_register(struct in_device *in_dev, @@ -91,7 +97,7 @@ static void devinet_sysctl_unregister(st /* Locks all the inet devices. */ -static struct in_ifaddr *inet_alloc_ifa(void) +struct in_ifaddr *inet_alloc_ifa(void) { struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); @@ -101,6 +107,7 @@ static struct in_ifaddr *inet_alloc_ifa( return ifa; } +EXPORT_SYMBOL_GPL(inet_alloc_ifa); static void inet_rcu_free_ifa(struct rcu_head *head) { @@ -157,12 +164,12 @@ struct in_device *inetdev_init(struct ne /* Account for reference dev->ip_ptr */ in_dev_hold(in_dev); - rcu_assign_pointer(dev->ip_ptr, in_dev); #ifdef CONFIG_SYSCTL devinet_sysctl_register(in_dev, &in_dev->cnf); #endif ip_mc_init_dev(in_dev); + rcu_assign_pointer(dev->ip_ptr, in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); out: @@ -172,6 +179,7 @@ out_kfree: in_dev = NULL; goto out; } +EXPORT_SYMBOL_GPL(inetdev_init); static void in_dev_rcu_put(struct rcu_head *head) { @@ -187,7 +195,7 @@ static void inetdev_destroy(struct in_de ASSERT_RTNL(); dev = in_dev->dev; - if (dev == &loopback_dev) + if (dev == &ve0_loopback) return; in_dev->dead = 1; @@ -229,7 +237,7 @@ int inet_addr_onlink(struct in_device *i return 0; } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *promote = NULL; @@ -319,7 +327,7 @@ static void inet_del_ifa(struct in_devic } } -static int inet_insert_ifa(struct in_ifaddr *ifa) +int inet_insert_ifa(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -369,6 +377,7 @@ static int inet_insert_ifa(struct in_ifa return 0; } +EXPORT_SYMBOL_GPL(inet_insert_ifa); static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { @@ -577,7 +586,7 @@ int devinet_ioctl(unsigned int cmd, void case SIOCSIFFLAGS: ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; break; case SIOCSIFADDR: 
/* Set interface address (and family) */ @@ -585,7 +594,7 @@ int devinet_ioctl(unsigned int cmd, void case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) @@ -1162,10 +1171,10 @@ static struct rtnetlink_link inet_rtnetl void inet_forward_change(void) { struct net_device *dev; - int on = ipv4_devconf.forwarding; + int on = ve_ipv4_devconf.forwarding; - ipv4_devconf.accept_redirects = !on; - ipv4_devconf_dflt.forwarding = on; + ve_ipv4_devconf.accept_redirects = !on; + ve_ipv4_devconf_dflt.forwarding = on; read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { @@ -1190,9 +1199,9 @@ static int devinet_sysctl_forward(ctl_ta int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *valp != val) { - if (valp == &ipv4_devconf.forwarding) + if (valp == &ve_ipv4_devconf.forwarding) inet_forward_change(); - else if (valp != &ipv4_devconf_dflt.forwarding) + else if (valp != &ve_ipv4_devconf_dflt.forwarding) rt_cache_flush(0); } @@ -1471,28 +1480,21 @@ static struct devinet_sysctl_table { }, }; -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p) +static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name, + int ifindex, struct ipv4_devconf *p) { int i; - struct net_device *dev = in_dev ? in_dev->dev : NULL; - struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); - char *dev_name = NULL; + struct devinet_sysctl_table *t; + t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) - return; + goto out; + memcpy(t, &devinet_sysctl, sizeof(*t)); for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].de = NULL; - } - - if (dev) { - dev_name = dev->name; - t->devinet_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + t->devinet_vars[i].owner_env = get_exec_env(); } /* @@ -1502,8 +1504,9 @@ static void devinet_sysctl_register(stru */ dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) - goto free; + goto out_free_table; + t->devinet_dev[0].ctl_name = ifindex; t->devinet_dev[0].procname = dev_name; t->devinet_dev[0].child = t->devinet_vars; t->devinet_dev[0].de = NULL; @@ -1516,17 +1519,38 @@ static void devinet_sysctl_register(stru t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); if (!t->sysctl_header) - goto free_procname; + goto out_free_procname; - p->sysctl = t; - return; + return t; /* error path */ - free_procname: +out_free_procname: kfree(dev_name); - free: +out_free_table: kfree(t); - return; +out: + printk(KERN_DEBUG "Can't register net/ipv4/conf sysctls.\n"); + return NULL; +} + +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p) +{ + struct net_device *dev; + char *dev_name; + int ifindex; + + dev = in_dev ? 
in_dev->dev : NULL; + + if (dev) { + dev_name = dev->name; + ifindex = dev->ifindex; + } else { + dev_name = "default"; + ifindex = NET_PROTO_CONF_DEFAULT; + } + + p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p); } static void devinet_sysctl_unregister(struct ipv4_devconf *p) @@ -1539,8 +1563,176 @@ static void devinet_sysctl_unregister(st kfree(t); } } + +#ifdef CONFIG_VE +static ctl_table net_sysctl_tables[] = { + /* 0: net */ + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = &net_sysctl_tables[2], + }, + { .ctl_name = 0, }, + /* 2: net/ipv4 */ + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = &net_sysctl_tables[4], + }, + { .ctl_name = 0, }, + /* 4, 5: net/ipv4/[vars] */ + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_forward, + .strategy = &ipv4_sysctl_forward_strategy, + }, + { + .ctl_name = NET_IPV4_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = &net_sysctl_tables[7], + }, + { .ctl_name = 0 }, + /* 7: net/ipv4/route/flush */ + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .data = NULL, /* setuped below */ + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { .ctl_name = 0 }, +}; + +static int ip_forward_sysctl_register(struct ve_struct *ve, + struct ipv4_devconf *p) +{ + struct ctl_table_header *hdr; + ctl_table *root, *ipv4_table, *route_table; + + root = clone_sysctl_template(net_sysctl_tables); + if (root == NULL) + goto out; + + ipv4_table = root->child->child; + ipv4_table[0].data = &p->forwarding; + + route_table = ipv4_table[1].child; + route_table[0].data = &ipv4_flush_delay; + + hdr = register_sysctl_table(root, 1); + if (hdr == NULL) + goto out_free; + + ve->forward_header = hdr; + ve->forward_table = root; + return 0; + +out_free: + free_sysctl_clone(root); +out: + return -ENOMEM; +} + +static inline void ip_forward_sysctl_unregister(struct ve_struct *ve) +{ + unregister_sysctl_table(ve->forward_header); + ve->forward_header = NULL; +} + +static inline void ip_forward_sysctl_free(struct ve_struct *ve) +{ + if (ve->forward_table == NULL) + return; + + free_sysctl_clone(ve->forward_table); + ve->forward_table = NULL; +} +#endif #endif +int devinet_sysctl_init(struct ve_struct *ve) +{ + int err = 0; +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + struct ipv4_devconf *conf, *conf_def; + + err = -ENOMEM; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + goto err1; + + memcpy(conf, &ipv4_devconf, sizeof(*conf)); + conf->sysctl = __devinet_sysctl_register("all", + NET_PROTO_CONF_ALL, conf); + if (!conf->sysctl) + goto err2; + + conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); + if (!conf_def) + goto err3; + + memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def)); + conf_def->sysctl = __devinet_sysctl_register("default", + NET_PROTO_CONF_DEFAULT, conf_def); + if (!conf_def->sysctl) + goto err4; + + err = ip_forward_sysctl_register(ve, conf); + if (err) + goto err5; + + ve->_ipv4_devconf = conf; + ve->_ipv4_devconf_dflt = conf_def; + return 0; + +err5: + devinet_sysctl_unregister(conf_def); +err4: + kfree(conf_def); +err3: + devinet_sysctl_unregister(conf); +err2: + kfree(conf); +err1: +#endif +#endif + return err; +} + +void devinet_sysctl_fini(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + ip_forward_sysctl_unregister(ve); + 
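
The error handling in devinet_sysctl_init() above is this patch's standard unwind shape: each completed step (allocate conf, register the "all" sysctls, allocate conf_def, register the "default" sysctls, register ip_forward) gets a matching errN label, and a failure at step N falls through the labels in reverse order so only work that actually succeeded is undone. A minimal user-space sketch of the same clone-a-template-then-unwind idea; tbl_entry, conf_init and the two-entry template are illustrative, not the kernel's ctl_table API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tbl_entry { const char *name; int *data; };

/* Static template; .data is patched to point into each instance. */
static const struct tbl_entry template[2] = {
	{ "forwarding", NULL },
	{ "proxy_arp",  NULL },
};

static int conf_init(struct tbl_entry **all, struct tbl_entry **dflt,
		     int *priv)
{
	*all = malloc(sizeof(template));
	if (*all == NULL)
		goto err1;
	memcpy(*all, template, sizeof(template));
	(*all)[0].data = &priv[0];	/* repoint at this instance's data */
	(*all)[1].data = &priv[1];

	*dflt = malloc(sizeof(template));
	if (*dflt == NULL)
		goto err2;		/* unwind only the completed steps */
	memcpy(*dflt, template, sizeof(template));
	(*dflt)[0].data = &priv[0];
	(*dflt)[1].data = &priv[1];
	return 0;

err2:
	free(*all);
err1:
	return -1;
}

int main(void)
{
	int conf[2] = { 1, 0 };
	struct tbl_entry *all, *dflt;

	if (conf_init(&all, &dflt, conf))
		return 1;
	printf("%s=%d\n", all[0].name, *all[0].data);
	free(dflt);
	free(all);
	return 0;
}
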
devinet_sysctl_unregister(ve->_ipv4_devconf); + devinet_sysctl_unregister(ve->_ipv4_devconf_dflt); +#endif +#endif +} + +void devinet_sysctl_free(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + ip_forward_sysctl_free(ve); + kfree(ve->_ipv4_devconf); + kfree(ve->_ipv4_devconf_dflt); +#endif +#endif +} + void __init devinet_init(void) { register_gifconf(PF_INET, inet_gifconf); @@ -1549,7 +1741,8 @@ void __init devinet_init(void) #ifdef CONFIG_SYSCTL devinet_sysctl.sysctl_header = register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); - devinet_sysctl_register(NULL, &ipv4_devconf_dflt); + __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT, + &ipv4_devconf_dflt); #endif } @@ -1558,3 +1751,7 @@ EXPORT_SYMBOL(inet_select_addr); EXPORT_SYMBOL(inetdev_by_index); EXPORT_SYMBOL(register_inetaddr_notifier); EXPORT_SYMBOL(unregister_inetaddr_notifier); +EXPORT_SYMBOL(inet_del_ifa); +EXPORT_SYMBOL(devinet_sysctl_init); +EXPORT_SYMBOL(devinet_sysctl_fini); +EXPORT_SYMBOL(devinet_sysctl_free); diff -uprN linux-2.6.18/net/ipv4/fib_frontend.c linux-2.6.18.ovz/net/ipv4/fib_frontend.c --- linux-2.6.18/net/ipv4/fib_frontend.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/fib_frontend.c 2007-06-13 06:55:07.000000000 -0400 @@ -52,14 +52,46 @@ #define RT_TABLE_MIN RT_TABLE_MAIN +#undef ip_fib_local_table +#undef ip_fib_main_table struct fib_table *ip_fib_local_table; struct fib_table *ip_fib_main_table; +void prepare_fib_tables(void) +{ +#ifdef CONFIG_VE + get_ve0()->_local_table = ip_fib_local_table; + ip_fib_local_table = (struct fib_table *)0x12345678; + get_ve0()->_main_table = ip_fib_main_table; + ip_fib_main_table = (struct fib_table *)0x12345678; +#endif +} +#ifdef CONFIG_VE +#define ip_fib_local_table get_exec_env()->_local_table +#define ip_fib_main_table get_exec_env()->_main_table +#endif #else #define RT_TABLE_MIN 1 +#undef fib_tables struct fib_table *fib_tables[RT_TABLE_MAX+1]; +void prepare_fib_tables(void) +{ +#ifdef CONFIG_VE + int i; + + BUG_ON(sizeof(fib_tables) != + sizeof(((struct ve_struct *)0)->_fib_tables)); + memcpy(get_ve0()->_fib_tables, fib_tables, sizeof(fib_tables)); + for (i = 0; i <= RT_TABLE_MAX; i++) + fib_tables[i] = (void *)0x12366678; +#endif +} + +#ifdef CONFIG_VE +#define fib_tables get_exec_env()->_fib_tables +#endif struct fib_table *__fib_new_table(int id) { @@ -186,7 +218,8 @@ int fib_validate_source(u32 src, u32 dst if (fib_lookup(&fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) + if (res.type != RTN_UNICAST && + (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL)) goto e_inval_res; *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); @@ -249,7 +282,7 @@ int ip_rt_ioctl(unsigned int cmd, void _ switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&r, arg, sizeof(struct rtentry))) return -EFAULT; @@ -524,6 +557,12 @@ static void nl_fib_lookup(struct fib_res .fwmark = frn->fl_fwmark, .tos = frn->fl_tos, .scope = frn->fl_scope } } }; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + frn->err = -ENOENT; if (tb) { local_bh_disable(); @@ -535,6 +574,7 @@ static void nl_fib_lookup(struct fib_res frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; + fib_res_put(&res); } local_bh_enable(); } @@ -547,21 +587,24 @@ static void nl_fib_input(struct sock *sk struct fib_result_nl *frn; u32 pid; struct fib_table *tb; - + skb = 
skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) + return; + nlh = (struct nlmsghdr *)skb->data; if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { kfree_skb(skb); return; } - + frn = (struct fib_result_nl *) NLMSG_DATA(nlh); tb = fib_get_table(frn->tb_id_in); nl_fib_lookup(frn, tb); - - pid = nlh->nlmsg_pid; /*pid of sending process */ + + pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; NETLINK_CB(skb).dst_group = 0; /* unicast */ @@ -652,6 +695,7 @@ static struct notifier_block fib_netdev_ void __init ip_fib_init(void) { + prepare_fib_tables(); #ifndef CONFIG_IP_MULTIPLE_TABLES ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); diff -uprN linux-2.6.18/net/ipv4/fib_hash.c linux-2.6.18.ovz/net/ipv4/fib_hash.c --- linux-2.6.18/net/ipv4/fib_hash.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/fib_hash.c 2007-06-13 06:55:07.000000000 -0400 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -72,11 +73,6 @@ struct fn_zone { * can be cheaper than memory lookup, so that FZ_* macros are used. */ -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone *fn_zone_list; -}; - static inline u32 fn_hash(u32 key, struct fn_zone *fz) { u32 h = ntohl(key)>>(32 - fz->fz_order); @@ -621,7 +617,7 @@ fn_hash_delete(struct fib_table *tb, str return -ESRCH; } -static int fn_flush_list(struct fn_zone *fz, int idx) +static int fn_flush_list(struct fn_zone *fz, int idx, int destroy) { struct hlist_head *head = &fz->fz_hash[idx]; struct hlist_node *node, *n; @@ -636,7 +632,9 @@ static int fn_flush_list(struct fn_zone list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { struct fib_info *fi = fa->fa_info; - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + if (fi == NULL) + continue; + if (destroy || (fi->fib_flags&RTNH_F_DEAD)) { write_lock_bh(&fib_hash_lock); list_del(&fa->fa_list); if (list_empty(&f->fn_alias)) { @@ -658,7 +656,7 @@ static int fn_flush_list(struct fn_zone return found; } -static int fn_hash_flush(struct fib_table *tb) +static int __fn_hash_flush(struct fib_table *tb, int destroy) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; @@ -668,11 +666,99 @@ static int fn_hash_flush(struct fib_tabl int i; for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); + found += fn_flush_list(fz, i, destroy); } return found; } +static int fn_hash_flush(struct fib_table *tb) +{ + return __fn_hash_flush(tb, 0); +} + +#ifdef CONFIG_VE +static void fn_free_zones(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; + + while ((fz = table->fn_zone_list) != NULL) { + table->fn_zone_list = fz->fz_next; + fz_hash_free(fz->fz_hash, fz->fz_divisor); + kfree(fz); + } +} + +void fib_hash_destroy(struct fib_table *tb) +{ + __fn_hash_flush(tb, 1); + fn_free_zones(tb); + kfree(tb); +} + +/* + * Initialization of virtualized networking subsystem. 
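
Just above, fn_hash_flush() is refactored into __fn_hash_flush(tb, destroy): the walk over the zone buckets is unchanged, and the destroy flag decides whether only dead (RTNH_F_DEAD) aliases are dropped, the old flush semantics, or every entry is torn down, which is what the new fib_hash_destroy() needs for per-VE cleanup. A stand-alone sketch of that flush-versus-destroy idiom on a plain singly linked list (node/flush names are illustrative, not the FIB structures):

#include <stdio.h>
#include <stdlib.h>

struct node { int dead; struct node *next; };

static int flush(struct node **head, int destroy)
{
	struct node **pp = head;
	int found = 0;

	while (*pp) {
		struct node *n = *pp;

		if (destroy || n->dead) {
			*pp = n->next;	/* unlink, then free */
			free(n);
			found++;
		} else {
			pp = &n->next;
		}
	}
	return found;
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 0; i < 4; i++) {
		struct node *n = calloc(1, sizeof(*n));
		n->dead = i & 1;	/* mark every other entry dead */
		n->next = head;
		head = n;
	}
	printf("flushed dead: %d\n", flush(&head, 0));
	printf("destroyed:    %d\n", flush(&head, 1));
	return 0;
}
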
+ */
+int init_ve_route(struct ve_struct *ve)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	if (fib_rules_create())
+		return -ENOMEM;
+	ve->_fib_tables[RT_TABLE_LOCAL] = fib_hash_init(RT_TABLE_LOCAL);
+	if (!ve->_fib_tables[RT_TABLE_LOCAL])
+		goto out_destroy;
+	ve->_fib_tables[RT_TABLE_MAIN] = fib_hash_init(RT_TABLE_MAIN);
+	if (!ve->_fib_tables[RT_TABLE_MAIN])
+		goto out_destroy_local;
+
+	return 0;
+
+out_destroy_local:
+	fib_hash_destroy(ve->_fib_tables[RT_TABLE_LOCAL]);
+out_destroy:
+	fib_rules_destroy();
+	ve->_local_rule = NULL;
+	return -ENOMEM;
+#else
+	ve->_local_table = fib_hash_init(RT_TABLE_LOCAL);
+	if (!ve->_local_table)
+		return -ENOMEM;
+	ve->_main_table = fib_hash_init(RT_TABLE_MAIN);
+	if (!ve->_main_table) {
+		fib_hash_destroy(ve->_local_table);
+		return -ENOMEM;
+	}
+	return 0;
+#endif
+}
+
+void fini_ve_route(struct ve_struct *ve)
+{
+	unsigned int bytes;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	int i;
+	for (i = 0; i <= RT_TABLE_MAX; i++) {
+		if (!ve->_fib_tables[i])
+			continue;
+		fib_hash_destroy(ve->_fib_tables[i]);
+	}
+	fib_rules_destroy();
+	ve->_local_rule = NULL;
+#else
+	fib_hash_destroy(ve->_local_table);
+	fib_hash_destroy(ve->_main_table);
+#endif
+	bytes = ve->_fib_hash_size * sizeof(struct hlist_head *);
+	fib_hash_free(ve->_fib_info_hash, bytes);
+	fib_hash_free(ve->_fib_info_laddrhash, bytes);
+	ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL;
+}
+
+EXPORT_SYMBOL(init_ve_route);
+EXPORT_SYMBOL(fini_ve_route);
+#endif
+
 static inline int fn_hash_dump_bucket(struct sk_buff *skb,
 				      struct netlink_callback *cb,
@@ -764,7 +850,7 @@ static int fn_hash_dump(struct fib_table
 	return skb->len;
 }
 
-#ifdef CONFIG_IP_MULTIPLE_TABLES
+#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE)
 struct fib_table * fib_hash_init(int id)
 #else
 struct fib_table * __init fib_hash_init(int id)
@@ -1073,13 +1159,13 @@ static struct file_operations fib_seq_fo
 
 int __init fib_proc_init(void)
 {
-	if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
+	if (!proc_glob_fops_create("net/route", S_IRUGO, &fib_seq_fops))
 		return -ENOMEM;
 	return 0;
 }
 
 void __init fib_proc_exit(void)
 {
-	proc_net_remove("route");
+	remove_proc_glob_entry("net/route", NULL);
 }
 #endif /* CONFIG_PROC_FS */
diff -uprN linux-2.6.18/net/ipv4/fib_lookup.h linux-2.6.18.ovz/net/ipv4/fib_lookup.h
--- linux-2.6.18/net/ipv4/fib_lookup.h	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/net/ipv4/fib_lookup.h	2007-06-13 06:55:07.000000000 -0400
@@ -41,5 +41,6 @@ extern struct fib_alias *fib_find_alias(
 extern int fib_detect_death(struct fib_info *fi, int order,
 			    struct fib_info **last_resort, int *last_idx, int *dflt);
+void fib_hash_free(struct hlist_head *hash, int bytes);
 
 #endif /* _FIB_LOOKUP_H */
diff -uprN linux-2.6.18/net/ipv4/fib_rules.c linux-2.6.18.ovz/net/ipv4/fib_rules.c
--- linux-2.6.18/net/ipv4/fib_rules.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/net/ipv4/fib_rules.c	2007-06-13 06:55:07.000000000 -0400
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -93,13 +94,91 @@ static struct fib_rule main_rule = {
 	.r_action =	RTN_UNICAST,
 };
 
-static struct fib_rule local_rule = {
+static struct fib_rule loc_rule = {
 	.r_clntref =	ATOMIC_INIT(2),
 	.r_table =	RT_TABLE_LOCAL,
 	.r_action =	RTN_UNICAST,
 };
 
+#ifdef CONFIG_VE
+#define local_rule	(*(get_exec_env()->_local_rule))
+#define fib_rules	(get_exec_env()->_fib_rules)
+#else
+#define local_rule	loc_rule
 static struct hlist_head fib_rules;
+#endif
+
+#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE)
+#ifdef CONFIG_VE
+static inline void
init_rule_struct(struct fib_rule *r, + u32 pref, unsigned char table, unsigned char action) +{ + memset(r, 0, sizeof(struct fib_rule)); + atomic_set(&r->r_clntref, 1); + r->r_preference = pref; + r->r_table = table; + r->r_action = action; +} +#endif + +int fib_rules_create(void) +{ +#ifdef CONFIG_VE + struct fib_rule *default_rule, *main_rule, *loc_rule; + + default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); + if (default_rule == NULL) + goto out_def; + + main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); + if (main_rule == NULL) + goto out_main; + + loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); + if (loc_rule == NULL) + goto out_loc; + + init_rule_struct(default_rule, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST); + init_rule_struct(main_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST); + init_rule_struct(loc_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST); + + INIT_HLIST_HEAD(&fib_rules); + hlist_add_head(&loc_rule->hlist, &fib_rules); + hlist_add_after(&loc_rule->hlist, &main_rule->hlist); + hlist_add_after(&main_rule->hlist, &default_rule->hlist); + get_exec_env()->_local_rule = loc_rule; + + return 0; + +out_loc: + kfree(main_rule); +out_main: + kfree(default_rule); +out_def: + return -1; +#else + return 0; +#endif +} + +void fib_rules_destroy(void) +{ +#ifdef CONFIG_VE + struct fib_rule *r; + struct hlist_node *pos, *tmp; + + rtnl_lock(); + hlist_for_each_safe (pos, tmp, &fib_rules) { + r = hlist_entry(pos, struct fib_rule, hlist); + + hlist_del_rcu(pos); + r->r_dead = 1; + fib_rule_put(r); + } + rtnl_unlock(); +#endif +} +#endif /* writer func called from netlink -- rtnl_sem hold*/ @@ -474,8 +553,9 @@ next: void __init fib_rules_init(void) { INIT_HLIST_HEAD(&fib_rules); - hlist_add_head(&local_rule.hlist, &fib_rules); - hlist_add_after(&local_rule.hlist, &main_rule.hlist); + hlist_add_head(&loc_rule.hlist, &fib_rules); + hlist_add_after(&loc_rule.hlist, &main_rule.hlist); hlist_add_after(&main_rule.hlist, &default_rule.hlist); + register_netdevice_notifier(&fib_rules_notifier); } diff -uprN linux-2.6.18/net/ipv4/fib_semantics.c linux-2.6.18.ovz/net/ipv4/fib_semantics.c --- linux-2.6.18/net/ipv4/fib_semantics.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/fib_semantics.c 2007-06-13 06:55:07.000000000 -0400 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,24 @@ static struct hlist_head *fib_info_laddr static unsigned int fib_hash_size; static unsigned int fib_info_cnt; +void prepare_fib_info(void) +{ +#ifdef CONFIG_VE + get_ve0()->_fib_info_hash = fib_info_hash; + get_ve0()->_fib_info_laddrhash = fib_info_laddrhash; + get_ve0()->_fib_hash_size = fib_hash_size; + get_ve0()->_fib_info_cnt = fib_info_cnt; +#endif +} + +#ifdef CONFIG_VE +#define fib_info_hash (get_exec_env()->_fib_info_hash) +#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash) +#define fib_hash_size (get_exec_env()->_fib_hash_size) +#define fib_info_cnt (get_exec_env()->_fib_info_cnt) +#endif + + #define DEVINDEX_HASHBITS 8 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; @@ -234,13 +253,15 @@ static struct fib_info *fib_find_info(co return NULL; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) +static inline unsigned int fib_devindex_hashfn(unsigned int val, + envid_t veid) { unsigned int mask = DEVINDEX_HASHSIZE - 1; return (val ^ (val >> DEVINDEX_HASHBITS) ^ - (val >> (DEVINDEX_HASHBITS * 2))) & mask; + (val >> (DEVINDEX_HASHBITS * 2)) ^ + (veid ^ (veid 
>> 16))) & mask; } /* Check, that the gateway is already configured. @@ -256,7 +277,7 @@ int ip_fib_check_default(u32 gw, struct read_lock(&fib_info_lock); - hash = fib_devindex_hashfn(dev->ifindex); + hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); head = &fib_info_devhash[hash]; hlist_for_each_entry(nh, node, head, nh_hash) { if (nh->nh_dev == dev && @@ -579,7 +600,7 @@ static struct hlist_head *fib_hash_alloc __get_free_pages(GFP_KERNEL, get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +void fib_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -835,7 +856,8 @@ link_it: if (!nh->nh_dev) continue; - hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nh->nh_dev->ifindex, + VEID(nh->nh_dev->owner_env)); head = &fib_info_devhash[hash]; hlist_add_head(&nh->nh_hash, head); } endfor_nexthops(fi) @@ -1186,7 +1208,8 @@ int fib_sync_down(u32 local, struct net_ if (dev) { struct fib_info *prev_fi = NULL; - unsigned int hash = fib_devindex_hashfn(dev->ifindex); + unsigned int hash = fib_devindex_hashfn(dev->ifindex, + VEID(dev->owner_env)); struct hlist_head *head = &fib_info_devhash[hash]; struct hlist_node *node; struct fib_nh *nh; @@ -1251,7 +1274,7 @@ int fib_sync_up(struct net_device *dev) return 0; prev_fi = NULL; - hash = fib_devindex_hashfn(dev->ifindex); + hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); head = &fib_info_devhash[hash]; ret = 0; diff -uprN linux-2.6.18/net/ipv4/igmp.c linux-2.6.18.ovz/net/ipv4/igmp.c --- linux-2.6.18/net/ipv4/igmp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/igmp.c 2007-06-13 06:55:07.000000000 -0400 @@ -455,6 +455,8 @@ static struct sk_buff *add_grec(struct s skb = add_grhead(skb, pmc, type, &pgr); first = 0; } + if (!skb) + return NULL; psrc = (u32 *)skb_put(skb, sizeof(u32)); *psrc = psf->sf_inaddr; scount++; stotal++; @@ -2265,6 +2267,8 @@ static inline struct ip_mc_list *igmp_mc state->dev; state->dev = state->dev->next) { struct in_device *in_dev; + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; in_dev = in_dev_get(state->dev); if (!in_dev) continue; @@ -2294,6 +2298,8 @@ static struct ip_mc_list *igmp_mc_get_ne state->in_dev = NULL; break; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->in_dev = in_dev_get(state->dev); if (!state->in_dev) continue; @@ -2427,6 +2433,8 @@ static inline struct ip_sf_list *igmp_mc state->dev; state->dev = state->dev->next) { struct in_device *idev; + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) continue; @@ -2466,6 +2474,8 @@ static struct ip_sf_list *igmp_mcf_get_n state->idev = NULL; goto out; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->idev = in_dev_get(state->dev); if (!state->idev) continue; @@ -2585,8 +2595,8 @@ static struct file_operations igmp_mcf_s int __init igmp_mc_proc_init(void) { - proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops); - proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + proc_glob_fops_create("net/igmp", S_IRUGO, &igmp_mc_seq_fops); + proc_glob_fops_create("net/mcfilter", S_IRUGO, &igmp_mcf_seq_fops); return 0; } #endif diff -uprN linux-2.6.18/net/ipv4/inet_connection_sock.c linux-2.6.18.ovz/net/ipv4/inet_connection_sock.c --- linux-2.6.18/net/ipv4/inet_connection_sock.c 2006-09-19 
23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/inet_connection_sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -24,6 +24,9 @@ #include #include +#include +#include + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -47,6 +50,7 @@ int inet_csk_bind_conflict(const struct sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && + ve_accessible_strict(sk->owner_env, sk2->owner_env) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -76,7 +80,9 @@ int inet_csk_get_port(struct inet_hashin struct hlist_node *node; struct inet_bind_bucket *tb; int ret; + struct ve_struct *env; + env = sk->owner_env; local_bh_disable(); if (!snum) { int low = sysctl_local_port_range[0]; @@ -84,12 +90,22 @@ int inet_csk_get_port(struct inet_hashin int remaining = (high - low) + 1; int rover = net_random() % (high - low) + low; + /* Below we treat low > high as high == low. So do here. Den */ + if (remaining < 1) { + remaining = 1; + rover = low; + } + do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(rover, + hashinfo->bhash_size, VEID(env))]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (!ve_accessible_strict(tb->owner_env, env)) + continue; if (tb->port == rover) goto next; + } break; next: spin_unlock(&head->lock); @@ -112,11 +128,15 @@ int inet_csk_get_port(struct inet_hashin */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(snum, + hashinfo->bhash_size, VEID(env))]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (!ve_accessible_strict(tb->owner_env, env)) + continue; if (tb->port == snum) goto tb_found; + } } tb = NULL; goto tb_not_found; @@ -135,7 +155,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -540,7 +560,7 @@ void inet_csk_destroy_sock(struct sock * sk_refcnt_debug_release(sk); - atomic_dec(sk->sk_prot->orphan_count); + ub_dec_orphan_count(sk); sock_put(sk); } @@ -620,7 +640,7 @@ void inet_csk_listen_stop(struct sock *s sock_orphan(child); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); inet_csk_destroy_sock(child); diff -uprN linux-2.6.18/net/ipv4/inet_diag.c linux-2.6.18.ovz/net/ipv4/inet_diag.c --- linux-2.6.18/net/ipv4/inet_diag.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/inet_diag.c 2007-06-13 06:55:07.000000000 -0400 @@ -672,7 +672,9 @@ static int inet_diag_dump(struct sk_buff struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + struct ve_struct *ve; + ve = get_exec_env(); handler = inet_diag_table[cb->nlh->nlmsg_type]; BUG_ON(handler == NULL); hashinfo = handler->idiag_hashinfo; @@ -693,6 +695,8 @@ static int inet_diag_dump(struct sk_buff sk_for_each(sk, node, &hashinfo->listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) { num++; 
continue; @@ -753,6 +757,8 @@ skip_listen_ht: sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -777,6 +783,8 @@ next_normal: inet_twsk_for_each(tw, node, &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (num < s_num) goto next_dying; if (r->id.idiag_sport != tw->tw_sport && diff -uprN linux-2.6.18/net/ipv4/inet_hashtables.c linux-2.6.18.ovz/net/ipv4/inet_hashtables.c --- linux-2.6.18/net/ipv4/inet_hashtables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/inet_hashtables.c 2007-06-13 06:55:07.000000000 -0400 @@ -29,7 +29,8 @@ */ struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + struct ve_struct *ve) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); @@ -37,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucke tb->port = snum; tb->fastreuse = 0; INIT_HLIST_HEAD(&tb->owners); + tb->owner_env = ve; hlist_add_head(&tb->node, &head->chain); } return tb; @@ -66,10 +68,13 @@ void inet_bind_hash(struct sock *sk, str */ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) { - const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); - struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + int bhash; + struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; + bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, + VEID(sk->owner_env)); + head = &hashinfo->bhash[bhash]; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -125,7 +130,8 @@ EXPORT_SYMBOL(inet_listen_wlock); * wildcarded during the search since they can never be otherwise. */ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, - const unsigned short hnum, const int dif) + const unsigned short hnum, const int dif, + struct ve_struct *env) { struct sock *result = NULL, *sk; const struct hlist_node *node; @@ -134,6 +140,8 @@ struct sock *__inet_lookup_listener(cons sk_for_each(sk, node, head) { const struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible_strict(sk->owner_env, env)) + continue; if (inet->num == hnum && !ipv6_only_sock(sk)) { const __u32 rcv_saddr = inet->rcv_saddr; int score = sk->sk_family == PF_INET ? 
1 : 0; @@ -164,7 +172,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_listener /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + struct ve_struct *ve) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -173,12 +182,15 @@ static int __inet_check_established(stru int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + unsigned int hash; + struct inet_ehash_bucket *head; struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; + hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve)); + head = inet_ehash_bucket(hinfo, hash); + prefetch(head->chain.first); write_lock(&head->lock); @@ -186,7 +198,8 @@ static int __inet_check_established(stru sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { tw = inet_twsk(sk2); - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, + ports, dif, ve)) { if (twsk_unique(sk, sk2, twp)) goto unique; else @@ -197,7 +210,8 @@ static int __inet_check_established(stru /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, + ports, dif, ve)) goto not_unique; } @@ -248,7 +262,9 @@ int inet_hash_connect(struct inet_timewa struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; + struct ve_struct *ve; + ve = sk->owner_env; if (!snum) { int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; @@ -263,7 +279,8 @@ int inet_hash_connect(struct inet_timewa local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(port, + hinfo->bhash_size, VEID(ve))]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -271,19 +288,21 @@ int inet_hash_connect(struct inet_timewa * unique enough. */ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { + if (tb->port == port && + ve_accessible_strict(tb->owner_env, ve)) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; if (!__inet_check_established(death_row, sk, port, - &tw)) + &tw, ve)) goto ok; goto next_port; } } - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port, ve); if (!tb) { spin_unlock(&head->lock); break; @@ -318,7 +337,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -328,7 +347,7 @@ ok: } else { spin_unlock(&head->lock); /* No definite answer... 
Walk to established hash table */ - ret = __inet_check_established(death_row, sk, snum, NULL); + ret = __inet_check_established(death_row, sk, snum, NULL, ve); out: local_bh_enable(); return ret; diff -uprN linux-2.6.18/net/ipv4/inet_timewait_sock.c linux-2.6.18.ovz/net/ipv4/inet_timewait_sock.c --- linux-2.6.18/net/ipv4/inet_timewait_sock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/inet_timewait_sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -13,6 +13,8 @@ #include #include +#include + /* Must be called with locally disabled BHs. */ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { @@ -31,7 +33,8 @@ void __inet_twsk_kill(struct inet_timewa write_unlock(&ehead->lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, + hashinfo->bhash_size, tw->tw_owner_env)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); @@ -65,7 +68,8 @@ void __inet_twsk_hashdance(struct inet_t Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, + hashinfo->bhash_size, tw->tw_owner_env)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; BUG_TRAP(icsk->icsk_bind_hash); @@ -89,9 +93,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - SLAB_ATOMIC); + struct user_beancounter *ub; + struct inet_timewait_sock *tw; + + ub = set_exec_ub(sock_bc(sk)->ub); + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); + (void)set_exec_ub(ub); + if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); @@ -139,6 +148,7 @@ static int inet_twdr_do_twkill_work(stru rescan: inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); + ub_timewait_dec(tw, twdr); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); @@ -237,6 +247,7 @@ void inet_twsk_deschedule(struct inet_ti { spin_lock(&twdr->death_lock); if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); inet_twsk_put(tw); if (--twdr->tw_count == 0) del_timer(&twdr->tw_timer); @@ -283,9 +294,10 @@ void inet_twsk_schedule(struct inet_time spin_lock(&twdr->death_lock); /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); twdr->tw_count--; - else + } else atomic_inc(&tw->tw_refcnt); if (slot >= INET_TWDR_RECYCLE_SLOTS) { @@ -321,6 +333,7 @@ void inet_twsk_schedule(struct inet_time hlist_add_head(&tw->tw_death_node, list); + ub_timewait_inc(tw, twdr); if (twdr->tw_count++ == 0) mod_timer(&twdr->tw_timer, jiffies + twdr->period); spin_unlock(&twdr->death_lock); @@ -355,6 +368,7 @@ void inet_twdr_twcal_tick(unsigned long &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); + ub_timewait_dec(tw, twdr); inet_twsk_put(tw); killed++; } diff -uprN linux-2.6.18/net/ipv4/ip_forward.c linux-2.6.18.ovz/net/ipv4/ip_forward.c --- linux-2.6.18/net/ipv4/ip_forward.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ip_forward.c 2007-06-13 06:55:07.000000000 -0400 @@ -86,6 +86,24 @@ int ip_forward(struct sk_buff 
*skb)
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto sr_failed;
 
+	/*
+	 * We try to optimize forwarding of VE packets:
+	 * do not decrement TTL (and so save skb_cow)
+	 * during forwarding of outgoing pkts from VE.
+	 * For incoming pkts we still do ttl decr,
+	 * since such skb is not cloned and does not require
+	 * actual cow. So, there is at least one place
+	 * in pkts path with mandatory ttl decr, that is
+	 * sufficient to prevent routing loops.
+	 */
+	iph = skb->nh.iph;
+	if (
+#ifdef CONFIG_IP_ROUTE_NAT
+	    (rt->rt_flags & RTCF_NAT) == 0 &&	/* no NAT mangling expected */
+#endif						/* and */
+	    (skb->dev->features & NETIF_F_VENET))	/* src is VENET device */
+		goto no_ttl_decr;
+
 	/* We are about to mangle packet. Copy it! */
 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
 		goto drop;
@@ -94,6 +112,8 @@ int ip_forward(struct sk_buff *skb)
 	/* Decrease ttl after skb cow done */
 	ip_decrease_ttl(iph);
 
+no_ttl_decr:
+
 	/*
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
diff -uprN linux-2.6.18/net/ipv4/ip_fragment.c linux-2.6.18.ovz/net/ipv4/ip_fragment.c
--- linux-2.6.18/net/ipv4/ip_fragment.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18.ovz/net/ipv4/ip_fragment.c	2007-06-13 06:55:07.000000000 -0400
@@ -96,6 +96,7 @@ struct ipq {
 	int		iif;
 	unsigned int	rid;
 	struct inet_peer *peer;
+	struct ve_struct *owner_env;
 };
 
 /* Hash table. */
@@ -181,7 +182,8 @@ static __inline__ void frag_free_queue(s
 
 static __inline__ struct ipq *frag_alloc_queue(void)
 {
-	struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
+	struct ipq *qp = kmalloc(sizeof(struct ipq) + sizeof(void *),
+			GFP_ATOMIC);
 
 	if(!qp)
 		return NULL;
@@ -277,6 +279,9 @@ static void ip_evictor(void)
 static void ip_expire(unsigned long arg)
 {
 	struct ipq *qp = (struct ipq *) arg;
+	struct ve_struct *envid;
+
+	envid = set_exec_env(qp->owner_env);
 
 	spin_lock(&qp->lock);
 
@@ -299,6 +304,8 @@ static void ip_expire(unsigned long arg)
 out:
 	spin_unlock(&qp->lock);
 	ipq_put(qp, NULL);
+
+	(void)set_exec_env(envid);
 }
 
 /* Creation primitives. */
@@ -324,7 +331,8 @@ static struct ipq *ip_frag_intern(struct
 		    qp->saddr == qp_in->saddr &&
 		    qp->daddr == qp_in->daddr &&
 		    qp->protocol == qp_in->protocol &&
-		    qp->user == qp_in->user) {
+		    qp->user == qp_in->user &&
+		    qp->owner_env == get_exec_env()) {
 			atomic_inc(&qp->refcnt);
 			write_unlock(&ipfrag_lock);
 			qp_in->last_in |= COMPLETE;
@@ -373,6 +381,7 @@ static struct ipq *ip_frag_create(struct
 	qp->timer.function = ip_expire;		/* expire function	*/
 	spin_lock_init(&qp->lock);
 	atomic_set(&qp->refcnt, 1);
+	qp->owner_env = get_exec_env();
 
 	return ip_frag_intern(qp);
 
@@ -401,7 +410,8 @@ static inline struct ipq *ip_find(struct
 		    qp->saddr == saddr &&
 		    qp->daddr == daddr &&
 		    qp->protocol == protocol &&
-		    qp->user == user) {
+		    qp->user == user &&
+		    qp->owner_env == get_exec_env()) {
 			atomic_inc(&qp->refcnt);
 			read_unlock(&ipfrag_lock);
 			return qp;
@@ -723,6 +733,9 @@ struct sk_buff *ip_defrag(struct sk_buff
 	    qp->meat == qp->len)
 		ret = ip_frag_reasm(qp, dev);
 
+	if (ret)
+		ret->owner_env = skb->owner_env;
+
 	spin_unlock(&qp->lock);
 	ipq_put(qp, NULL);
 	return ret;
@@ -733,6 +746,49 @@ struct sk_buff *ip_defrag(struct sk_buff
 	return NULL;
 }
 
+#ifdef CONFIG_VE
+/* XXX */
+void ip_fragment_cleanup(struct ve_struct *envid)
+{
+	int i, progress;
+
+	/* All operations with fragment queues are performed from NET_RX/TX
+	 * soft interrupts or from timer context.
--Den */ + local_bh_disable(); + do { + progress = 0; + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *qp; + struct hlist_node *p, *n; + + if (hlist_empty(&ipq_hash[i])) + continue; +inner_restart: + read_lock(&ipfrag_lock); + hlist_for_each_entry_safe(qp, p, n, + &ipq_hash[i], list) { + if (!ve_accessible_strict(qp->owner_env, envid)) + continue; + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (!(qp->last_in&COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp, NULL); + progress = 1; + goto inner_restart; + } + read_unlock(&ipfrag_lock); + } + } while(progress); + local_bh_enable(); +} +EXPORT_SYMBOL(ip_fragment_cleanup); +#endif + void ipfrag_init(void) { ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ diff -uprN linux-2.6.18/net/ipv4/ip_output.c linux-2.6.18.ovz/net/ipv4/ip_output.c --- linux-2.6.18/net/ipv4/ip_output.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ip_output.c 2007-06-13 06:55:07.000000000 -0400 @@ -1340,12 +1340,13 @@ void ip_send_reply(struct sock *sk, stru char data[40]; } replyopts; struct ipcm_cookie ipc; - u32 daddr; + u32 saddr, daddr; struct rtable *rt = (struct rtable*)skb->dst; if (ip_options_echo(&replyopts.opt, skb)) return; + saddr = skb->nh.iph->daddr; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; @@ -1359,7 +1360,7 @@ void ip_send_reply(struct sock *sk, stru { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = rt->rt_spec_dst, + .saddr = saddr, .tos = RT_TOS(skb->nh.iph->tos) } }, /* Not quite clean, but right. */ .uli_u = { .ports = diff -uprN linux-2.6.18/net/ipv4/ip_sockglue.c linux-2.6.18.ovz/net/ipv4/ip_sockglue.c --- linux-2.6.18/net/ipv4/ip_sockglue.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ip_sockglue.c 2007-06-13 06:55:07.000000000 -0400 @@ -512,7 +512,8 @@ static int do_ip_setsockopt(struct sock val |= inet->tos & 3; } if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && - !capable(CAP_NET_ADMIN)) { + !capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) { err = -EPERM; break; } diff -uprN linux-2.6.18/net/ipv4/ipconfig.c linux-2.6.18.ovz/net/ipv4/ipconfig.c --- linux-2.6.18/net/ipv4/ipconfig.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ipconfig.c 2007-06-13 06:55:07.000000000 -0400 @@ -367,7 +367,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); if (root_server_addr == INADDR_NONE) root_server_addr = ic_servaddr; @@ -806,7 +806,7 @@ static void __init ic_do_bootp_ext(u8 *e } break; case 12: /* Host name */ - ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); ic_host_name_set = 1; break; case 15: /* Domain name (DNS) */ @@ -817,7 +817,7 @@ static void __init ic_do_bootp_ext(u8 *e ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; case 40: /* NIS Domain name (_not_ DNS) */ - ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; } } @@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void) printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); printk(",\n host=%s, domain=%s, nis-domain=%s", - system_utsname.nodename, ic_domain, system_utsname.domainname); + utsname()->nodename, 
ic_domain, utsname()->domainname); printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); printk(", rootpath=%s", root_server_path); @@ -1479,11 +1479,11 @@ static int __init ip_auto_config_setup(c case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(system_utsname.domainname, dp, - sizeof(system_utsname.domainname)); + strlcpy(utsname()->domainname, dp, + sizeof(utsname()->domainname)); } - strlcpy(system_utsname.nodename, ip, - sizeof(system_utsname.nodename)); + strlcpy(utsname()->nodename, ip, + sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: diff -uprN linux-2.6.18/net/ipv4/ipmr.c linux-2.6.18.ovz/net/ipv4/ipmr.c --- linux-2.6.18/net/ipv4/ipmr.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ipmr.c 2007-06-13 06:55:07.000000000 -0400 @@ -836,7 +836,7 @@ static void mrtsock_destruct(struct sock { rtnl_lock(); if (sk == mroute_socket) { - ipv4_devconf.mc_forwarding--; + ve_ipv4_devconf.mc_forwarding--; write_lock_bh(&mrt_lock); mroute_socket=NULL; @@ -887,7 +887,7 @@ int ip_mroute_setsockopt(struct sock *sk mroute_socket=sk; write_unlock_bh(&mrt_lock); - ipv4_devconf.mc_forwarding++; + ve_ipv4_devconf.mc_forwarding++; } rtnl_unlock(); return ret; diff -uprN linux-2.6.18/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.18.ovz/net/ipv4/ipvs/ip_vs_conn.c --- linux-2.6.18/net/ipv4/ipvs/ip_vs_conn.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ipvs/ip_vs_conn.c 2007-06-13 06:55:07.000000000 -0400 @@ -902,7 +902,8 @@ int ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; diff -uprN linux-2.6.18/net/ipv4/ipvs/ip_vs_core.c linux-2.6.18.ovz/net/ipv4/ipvs/ip_vs_core.c --- linux-2.6.18/net/ipv4/ipvs/ip_vs_core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/ipvs/ip_vs_core.c 2007-06-13 06:55:07.000000000 -0400 @@ -813,6 +813,16 @@ ip_vs_out(unsigned int hooknum, struct s skb->nh.iph->saddr = cp->vaddr; ip_send_check(skb->nh.iph); + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_route_me_harder(pskb, RTN_LOCAL) != 0) + goto drop; + skb = *pskb; + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); @@ -952,6 +962,10 @@ ip_vs_in(unsigned int hooknum, struct sk * Big tappo: only PACKET_HOST (neither loopback nor mcasts) * ... don't know why 1st test DOES NOT include 2nd (?) */ + /* + * VZ: the question above is right. + * The second test is superfluous. 
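
Throughout these hunks the permission gates are widened from a bare capable(CAP_NET_ADMIN) to one that also accepts CAP_VE_NET_ADMIN (arp_ioctl, devinet_ioctl, ip_rt_ioctl, do_ip_setsockopt above), so a container's root may administer its own virtualized stack without holding the host-wide capability. A compact sketch of that two-capability gate; the bitmask capable() harness below is an illustrative stand-in, not the kernel's implementation:

#include <stdio.h>

enum { CAP_NET_ADMIN, CAP_VE_NET_ADMIN };

static int capable(unsigned caps, int cap)
{
	return (caps >> cap) & 1;
}

static int may_admin_net(unsigned caps)
{
	/* Same shape as the patched checks: deny only when the caller
	 * holds neither the host-wide nor the per-VE capability. */
	if (!capable(caps, CAP_NET_ADMIN) &&
	    !capable(caps, CAP_VE_NET_ADMIN))
		return -1;	/* -EPERM in the kernel */
	return 0;
}

int main(void)
{
	printf("host admin:      %d\n", may_admin_net(1u << CAP_NET_ADMIN));
	printf("container admin: %d\n", may_admin_net(1u << CAP_VE_NET_ADMIN));
	printf("plain user:      %d\n", may_admin_net(0));
	return 0;
}
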
+ */ if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", diff -uprN linux-2.6.18/net/ipv4/netfilter/arp_tables.c linux-2.6.18.ovz/net/ipv4/netfilter/arp_tables.c --- linux-2.6.18/net/ipv4/netfilter/arp_tables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/arp_tables.c 2007-06-13 06:55:07.000000000 -0400 @@ -380,6 +380,13 @@ static int mark_source_chains(struct xt_ && unconditional(&e->arp)) { unsigned int oldpos, size; + if (t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + /* Return: backtrack through the last * big jump. */ @@ -409,6 +416,14 @@ static int mark_source_chains(struct xt_ if (strcmp(t->target.u.user.name, ARPT_STANDARD_TARGET) == 0 && newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct arpt_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -431,8 +446,6 @@ static int mark_source_chains(struct xt_ static inline int standard_check(const struct arpt_entry_target *t, unsigned int max_offset) { - struct arpt_standard_target *targ = (void *)t; - /* Check standard info. */ if (t->u.target_size != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { @@ -442,18 +455,6 @@ static inline int standard_check(const s return 0; } - if (targ->verdict >= 0 - && targ->verdict > max_offset - sizeof(struct arpt_entry)) { - duprintf("arpt_standard_check: bad verdict (%i)\n", - targ->verdict); - return 0; - } - - if (targ->verdict < -NF_MAX_VERDICT - 1) { - duprintf("arpt_standard_check: bad negative verdict (%i)\n", - targ->verdict); - return 0; - } return 1; } @@ -471,7 +472,13 @@ static inline int check_entry(struct arp return -EINVAL; } + if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) + return -EINVAL; + t = arpt_get_target(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); @@ -641,7 +648,7 @@ static int translate_table(const char *n if (ret != 0) { ARPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, &i); return ret; } @@ -1204,6 +1211,8 @@ err1: static void __exit arp_tables_fini(void) { nf_unregister_sockopt(&arpt_sockopts); + xt_unregister_target(&arpt_error_target); + xt_unregister_target(&arpt_standard_target); xt_proto_fini(NF_ARP); } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_core.c 2007-06-13 06:55:07.000000000 -0400 @@ -48,6 +48,7 @@ #include #include #include +#include #define IP_CONNTRACK_VERSION "2.4" @@ -59,22 +60,41 @@ DEFINE_RWLOCK(ip_conntrack_lock); -/* ip_conntrack_standalone needs this */ -atomic_t ip_conntrack_count = ATOMIC_INIT(0); +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_conntrack_helpers \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_helpers) +#define ve_ip_conntrack_max \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_max) +#define ve_ip_conntrack_count \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_count) +#define ve_ip_conntrack_unconfirmed \ + 
(get_exec_env()->_ip_conntrack->_ip_conntrack_unconfirmed) +#else void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; LIST_HEAD(ip_conntrack_expect_list); struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; static LIST_HEAD(helpers); +struct list_head *ip_conntrack_hash; +static LIST_HEAD(unconfirmed); +#define ve_ip_conntrack_count ip_conntrack_count +#define ve_ip_conntrack_helpers helpers +#define ve_ip_conntrack_max ip_conntrack_max +#define ve_ip_conntrack_unconfirmed unconfirmed +#endif + +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); + unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; -struct list_head *ip_conntrack_hash; static kmem_cache_t *ip_conntrack_cachep __read_mostly; static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; unsigned int ip_ct_log_invalid; -static LIST_HEAD(unconfirmed); +#ifndef CONFIG_VE_IPTABLES static int ip_conntrack_vmalloc; +#endif static unsigned int ip_conntrack_next_id; static unsigned int ip_conntrack_expect_next_id; @@ -104,6 +124,9 @@ void ip_ct_deliver_cached_events(const s { struct ip_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(ip_conntrack_ecache); if (ecache->ct == ct) @@ -132,6 +155,9 @@ static void ip_ct_event_cache_flush(void struct ip_conntrack_ecache *ecache; int cpu; + if (!ve_is_super(get_exec_env())) + return; + for_each_possible_cpu(cpu) { ecache = &per_cpu(ip_conntrack_ecache, cpu); if (ecache->ct) @@ -225,7 +251,7 @@ __ip_conntrack_expect_find(const struct { struct ip_conntrack_expect *i; - list_for_each_entry(i, &ip_conntrack_expect_list, list) { + list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { atomic_inc(&i->use); return i; @@ -254,7 +280,7 @@ find_expectation(const struct ip_conntra { struct ip_conntrack_expect *i; - list_for_each_entry(i, &ip_conntrack_expect_list, list) { + list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? 
Hence these are not the droids you are looking for (if @@ -283,7 +309,7 @@ void ip_ct_remove_expectations(struct ip if (ct->expecting == 0) return; - list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + list_for_each_entry_safe(i, tmp, &ve_ip_conntrack_expect_list, list) { if (i->master == ct && del_timer(&i->timeout)) { ip_ct_unlink_expect(i); ip_conntrack_expect_put(i); @@ -301,8 +327,10 @@ clean_from_lists(struct ip_conntrack *ct ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + LIST_DELETE(&ct->ct_owner_env->_ip_conntrack->_ip_conntrack_hash[ho], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ct->ct_owner_env->_ip_conntrack->_ip_conntrack_hash[hr], + &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ ip_ct_remove_expectations(ct); @@ -313,7 +341,11 @@ destroy_conntrack(struct nf_conntrack *n { struct ip_conntrack *ct = (struct ip_conntrack *)nfct; struct ip_conntrack_protocol *proto; +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *old; + old = set_exec_env(ct->ct_owner_env); +#endif DEBUGP("destroy_conntrack(%p)\n", ct); IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); @@ -328,8 +360,8 @@ destroy_conntrack(struct nf_conntrack *n if (proto && proto->destroy) proto->destroy(ct); - if (ip_conntrack_destroyed) - ip_conntrack_destroyed(ct); + if (ve_ip_conntrack_destroyed) + ve_ip_conntrack_destroyed(ct); write_lock_bh(&ip_conntrack_lock); /* Expectations will have been removed in clean_from_lists, @@ -352,6 +384,9 @@ destroy_conntrack(struct nf_conntrack *n DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); ip_conntrack_free(ct); +#ifdef CONFIG_VE_IPTABLES + (void)set_exec_env(old); +#endif } static void death_by_timeout(unsigned long ul_conntrack) @@ -385,7 +420,7 @@ __ip_conntrack_find(const struct ip_conn unsigned int hash = hash_conntrack(tuple); ASSERT_READ_LOCK(&ip_conntrack_lock); - list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + list_for_each_entry(h, &ve_ip_conntrack_hash[hash], list) { if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { CONNTRACK_STAT_INC(found); return h; @@ -417,9 +452,9 @@ static void __ip_conntrack_hash_insert(s unsigned int repl_hash) { ct->id = ++ip_conntrack_next_id; - list_prepend(&ip_conntrack_hash[hash], + list_prepend(&ve_ip_conntrack_hash[hash], &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&ip_conntrack_hash[repl_hash], + list_prepend(&ve_ip_conntrack_hash[repl_hash], &ct->tuplehash[IP_CT_DIR_REPLY].list); } @@ -470,11 +505,11 @@ __ip_conntrack_confirm(struct sk_buff ** /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. 
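
ip_conntrack_alloc() above briefly switches beancounter context around the slab allocation, set_exec_ub(ub) ... set_exec_ub(old_ub), so the new conntrack is charged to the socket's or device's owner rather than to whichever context happened to trigger the allocation; destroy_conntrack(), inet_release() and ip_expire() use the same bracket with set_exec_env(). A stand-alone sketch of that save/switch/restore idiom (exec_ctx and set_exec_ctx are illustrative stand-ins for the kernel helpers):

#include <stdio.h>

static int exec_ctx = 0;		/* current context id */

static int set_exec_ctx(int new_ctx)	/* returns the previous context */
{
	int old = exec_ctx;

	exec_ctx = new_ctx;
	return old;
}

static void do_work_in(int owner_ctx)
{
	int saved = set_exec_ctx(owner_ctx);

	printf("working in ctx %d\n", exec_ctx);
	(void)set_exec_ctx(saved);	/* always restore on the way out */
}

int main(void)
{
	do_work_in(101);
	printf("back in ctx %d\n", exec_ctx);
	return 0;
}
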
*/ - if (!LIST_FIND(&ip_conntrack_hash[hash], + if (!LIST_FIND(&ve_ip_conntrack_hash[hash], conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) - && !LIST_FIND(&ip_conntrack_hash[repl_hash], + && !LIST_FIND(&ve_ip_conntrack_hash[repl_hash], conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { @@ -568,7 +603,7 @@ static inline int helper_cmp(const struc static struct ip_conntrack_helper * __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { - return LIST_FIND(&helpers, helper_cmp, + return LIST_FIND(&ve_ip_conntrack_helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } @@ -604,7 +639,7 @@ void ip_conntrack_helper_put(struct ip_c struct ip_conntrack_protocol * __ip_conntrack_proto_find(u_int8_t protocol) { - return ip_ct_protos[protocol]; + return ve_ip_ct_protos[protocol]; } /* this is guaranteed to always return a valid protocol helper, since @@ -631,29 +666,32 @@ void ip_conntrack_proto_put(struct ip_co } struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, - struct ip_conntrack_tuple *repl) + struct ip_conntrack_tuple *repl, struct user_beancounter *ub) { struct ip_conntrack *conntrack; + struct user_beancounter *old_ub; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - if (ip_conntrack_max - && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + if (ve_ip_conntrack_max + && atomic_read(&ve_ip_conntrack_count) >= ve_ip_conntrack_max) { unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ - if (!early_drop(&ip_conntrack_hash[hash])) { + if (!early_drop(&ve_ip_conntrack_hash[hash])) { if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: table full, dropping" - " packet.\n"); + ve_printk(VE_LOG_BOTH, KERN_WARNING + "ip_conntrack: VE %d: table full, dropping" + " packet.\n", VEID(get_exec_env())); return ERR_PTR(-ENOMEM); } } + old_ub = set_exec_ub(ub); conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); + (void)set_exec_ub(old_ub); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); return ERR_PTR(-ENOMEM); @@ -668,8 +706,11 @@ struct ip_conntrack *ip_conntrack_alloc( init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; +#ifdef CONFIG_VE_IPTABLES + conntrack->ct_owner_env = get_exec_env(); +#endif - atomic_inc(&ip_conntrack_count); + atomic_inc(&ve_ip_conntrack_count); return conntrack; } @@ -677,7 +718,7 @@ struct ip_conntrack *ip_conntrack_alloc( void ip_conntrack_free(struct ip_conntrack *conntrack) { - atomic_dec(&ip_conntrack_count); + atomic_dec(&ve_ip_conntrack_count); kmem_cache_free(ip_conntrack_cachep, conntrack); } @@ -691,13 +732,22 @@ init_conntrack(struct ip_conntrack_tuple struct ip_conntrack *conntrack; struct ip_conntrack_tuple repl_tuple; struct ip_conntrack_expect *exp; + struct user_beancounter *ub; if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { DEBUGP("Can't invert tuple.\n"); return NULL; } - conntrack = ip_conntrack_alloc(tuple, &repl_tuple); +#ifdef CONFIG_USER_RESOURCE + if (skb->dev != NULL) /* received skb */ + ub = netdev_bc(skb->dev)->exec_ub; + else if (skb->sk != NULL) /* sent skb */ + ub = sock_bc(skb->sk)->ub; + else +#endif + ub = NULL; + conntrack = ip_conntrack_alloc(tuple, &repl_tuple, ub); if (conntrack == NULL || IS_ERR(conntrack)) return (struct ip_conntrack_tuple_hash *)conntrack; @@ -735,7 
+785,8 @@ init_conntrack(struct ip_conntrack_tuple } /* Overload tuple linked list to put us in unconfirmed list. */ - list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, + &ve_ip_conntrack_unconfirmed); write_unlock_bh(&ip_conntrack_lock); @@ -927,7 +978,7 @@ void ip_conntrack_unexpect_related(struc write_lock_bh(&ip_conntrack_lock); /* choose the the oldest expectation to evict */ - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { if (expect_matches(i, exp) && del_timer(&i->timeout)) { ip_ct_unlink_expect(i); write_unlock_bh(&ip_conntrack_lock); @@ -961,11 +1012,11 @@ void ip_conntrack_expect_put(struct ip_c kmem_cache_free(ip_conntrack_expect_cachep, exp); } -static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) +void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) { atomic_inc(&exp->use); exp->master->expecting++; - list_add(&exp->list, &ip_conntrack_expect_list); + list_add(&exp->list, &ve_ip_conntrack_expect_list); init_timer(&exp->timeout); exp->timeout.data = (unsigned long)exp; @@ -977,13 +1028,14 @@ static void ip_conntrack_expect_insert(s atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } +EXPORT_SYMBOL_GPL(ip_conntrack_expect_insert); /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct ip_conntrack *master) { struct ip_conntrack_expect *i; - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { if (i->master == master) { if (del_timer(&i->timeout)) { ip_ct_unlink_expect(i); @@ -1014,7 +1066,7 @@ int ip_conntrack_expect_related(struct i DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); write_lock_bh(&ip_conntrack_lock); - list_for_each_entry(i, &ip_conntrack_expect_list, list) { + list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { @@ -1062,18 +1114,48 @@ int ip_conntrack_helper_register(struct { BUG_ON(me->timeout == 0); write_lock_bh(&ip_conntrack_lock); - list_prepend(&helpers, me); + list_prepend(&ve_ip_conntrack_helpers, me); write_unlock_bh(&ip_conntrack_lock); return 0; } +int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *me) +{ + int ret; + struct module *mod = me->me; + + if (!ve_is_super(get_exec_env())) { + struct ip_conntrack_helper *tmp; + __module_get(mod); + ret = -ENOMEM; + tmp = kmalloc(sizeof(struct ip_conntrack_helper), GFP_KERNEL); + if (!tmp) + goto nomem; + memcpy(tmp, me, sizeof(struct ip_conntrack_helper)); + me = tmp; + } + + ret = ip_conntrack_helper_register(me); + if (ret) + goto out; + + return 0; +out: + if (!ve_is_super(get_exec_env())){ + kfree(me); +nomem: + module_put(mod); + } + return ret; +} + struct ip_conntrack_helper * __ip_conntrack_helper_find_byname(const char *name) { struct ip_conntrack_helper *h; - list_for_each_entry(h, &helpers, list) { + list_for_each_entry(h, &ve_ip_conntrack_helpers, list) { if (!strcmp(h->name, name)) return h; } @@ -1098,19 +1180,20 @@ void ip_conntrack_helper_unregister(stru /* Need write lock here, to delete helper. 
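
[ovz review note] ip_conntrack_alloc() now takes a struct user_beancounter, and init_conntrack() chooses it from the packet's origin: netdev_bc(skb->dev)->exec_ub for received skbs, sock_bc(skb->sk)->ub for locally generated ones, NULL otherwise. The allocation itself is bracketed by set_exec_ub(), so the SLAB_UBC cache charges the object to the flow's owner rather than to whichever task happened to trigger it. A sketch of the pattern with stand-in types and a malloc-based "cache" (not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct user_beancounter { const char *name; long charged; };

    static struct user_beancounter ub0 = { "ub0", 0 };
    static struct user_beancounter *exec_ub = &ub0;   /* per-task in the kernel */

    static struct user_beancounter *set_exec_ub(struct user_beancounter *ub)
    {
        struct user_beancounter *old = exec_ub;
        exec_ub = ub;
        return old;
    }

    static void *ub_charged_alloc(size_t sz)
    {
        void *p = malloc(sz);
        if (p)
            exec_ub->charged += sz;   /* a SLAB_UBC cache charges per object */
        return p;
    }

    /* ip_conntrack_alloc() analogue: charge the flow's owner, then restore */
    static void *conntrack_alloc(struct user_beancounter *owner)
    {
        struct user_beancounter *old = set_exec_ub(owner ? owner : exec_ub);
        void *ct = ub_charged_alloc(128);
        (void)set_exec_ub(old);
        return ct;
    }

    int main(void)
    {
        struct user_beancounter ub_dev = { "eth0 owner", 0 };
        free(conntrack_alloc(&ub_dev));   /* received skb: the device's ub pays */
        printf("%s charged %ld bytes\n", ub_dev.name, ub_dev.charged);
        return 0;
    }
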
*/ write_lock_bh(&ip_conntrack_lock); - LIST_DELETE(&helpers, me); + LIST_DELETE(&ve_ip_conntrack_helpers, me); /* Get rid of expectations */ - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { if (exp->master->helper == me && del_timer(&exp->timeout)) { ip_ct_unlink_expect(exp); ip_conntrack_expect_put(exp); } } /* Get rid of expecteds, set helpers to NULL. */ - LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); + LIST_FIND_W(&ve_ip_conntrack_unconfirmed, unhelp, + struct ip_conntrack_tuple_hash*, me); for (i = 0; i < ip_conntrack_htable_size; i++) - LIST_FIND_W(&ip_conntrack_hash[i], unhelp, + LIST_FIND_W(&ve_ip_conntrack_hash[i], unhelp, struct ip_conntrack_tuple_hash *, me); write_unlock_bh(&ip_conntrack_lock); @@ -1118,6 +1201,25 @@ void ip_conntrack_helper_unregister(stru synchronize_net(); } +void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) +{ + + if (!ve_is_super(get_exec_env())) { + read_lock_bh(&ip_conntrack_lock); + me = list_named_find(&ve_ip_conntrack_helpers, me->name); + read_unlock_bh(&ip_conntrack_lock); + if (!me) + return; + } + + ip_conntrack_helper_unregister(me); + + if (!ve_is_super(get_exec_env())) { + module_put(me->me); + kfree(me); + } +} + /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __ip_ct_refresh_acct(struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, @@ -1254,13 +1356,13 @@ get_next_corpse(int (*iter)(struct ip_co write_lock_bh(&ip_conntrack_lock); for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { - h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + h = LIST_FIND_W(&ve_ip_conntrack_hash[*bucket], do_iter, struct ip_conntrack_tuple_hash *, iter, data); if (h) break; } if (!h) - h = LIST_FIND_W(&unconfirmed, do_iter, + h = LIST_FIND_W(&ve_ip_conntrack_unconfirmed, do_iter, struct ip_conntrack_tuple_hash *, iter, data); if (h) atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); @@ -1297,6 +1399,11 @@ getorigdst(struct sock *sk, int optval, struct ip_conntrack_tuple_hash *h; struct ip_conntrack_tuple tuple; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_ip_conntrack) + return -ENOPROTOOPT; +#endif + IP_CT_TUPLE_U_BLANK(&tuple); tuple.src.ip = inet->rcv_saddr; tuple.src.u.tcp.port = inet->sport; @@ -1368,12 +1475,17 @@ static void free_conntrack_hash(struct l get_order(sizeof(struct list_head) * size)); } +static void ip_conntrack_cache_free(void) +{ + kmem_cache_destroy(ip_conntrack_expect_cachep); + kmem_cache_destroy(ip_conntrack_cachep); + nf_unregister_sockopt(&so_getorigdst); +} + /* Mishearing the voices in his head, our hero wonders how he's supposed to kill the mall. */ void ip_conntrack_cleanup(void) { - ip_ct_attach = NULL; - /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... 
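
[ovz review note] virt_ip_conntrack_helper_register()/..._unregister() above implement per-VE helper registration: a container inserts a private copy of the helper descriptor (the template belongs to the module and stays on VE0's list) and pins the owning module with __module_get() for the copy's lifetime; unregistration finds the copy by name, unlinks it, drops the module reference, and frees it. A user-space skeleton of the copy-on-register scheme (list handling simplified, error codes are placeholders):

    #include <stdlib.h>
    #include <string.h>

    struct helper {
        const char *name;
        struct helper *next;
    };

    struct env {
        int is_super;
        struct helper *helpers;   /* per-environment list head */
    };

    static int virt_helper_register(struct env *env, struct helper *tmpl)
    {
        struct helper *h = tmpl;

        if (!env->is_super) {
            /* containers insert a private copy: the template is module
               data and stays on VE0's list */
            h = malloc(sizeof(*h));
            if (!h)
                return -1;
            memcpy(h, tmpl, sizeof(*h));
        }
        h->next = env->helpers;
        env->helpers = h;
        return 0;
    }

    static void virt_helper_unregister(struct env *env, const char *name)
    {
        struct helper **p = &env->helpers;

        while (*p && strcmp((*p)->name, name) != 0)
            p = &(*p)->next;
        if (*p) {
            struct helper *h = *p;
            *p = h->next;
            if (!env->is_super)
                free(h);   /* only the per-container copies are freed */
        }
    }

    int main(void)
    {
        static struct helper ftp_tmpl = { "ftp", NULL };
        struct env ve101 = { 0, NULL };

        if (virt_helper_register(&ve101, &ftp_tmpl) == 0)
            virt_helper_unregister(&ve101, "ftp");   /* found by name, freed */
        return 0;
    }
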
*/ @@ -1382,19 +1494,32 @@ void ip_conntrack_cleanup(void) ip_ct_event_cache_flush(); i_see_dead_people: ip_conntrack_flush(); - if (atomic_read(&ip_conntrack_count) != 0) { + if (atomic_read(&ve_ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; } - /* wait until all references to ip_conntrack_untracked are dropped */ - while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) - schedule(); - - kmem_cache_destroy(ip_conntrack_cachep); - kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + if (ve_is_super(get_exec_env())) { + /* wait until all references to ip_conntrack_untracked are + * dropped */ + while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) + schedule(); + ip_ct_attach = NULL; + ip_conntrack_cache_free(); + } + free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, ip_conntrack_htable_size); - nf_unregister_sockopt(&so_getorigdst); + ve_ip_conntrack_hash = NULL; + INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); + INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); + INIT_LIST_HEAD(&ve_ip_conntrack_helpers); + atomic_set(&ve_ip_conntrack_count, 0); + ve_ip_conntrack_max = 0; +#ifdef CONFIG_VE_IPTABLES + kfree(ve_ip_ct_protos); + ve_ip_ct_protos = NULL; + kfree(get_exec_env()->_ip_conntrack); + get_exec_env()->_ip_conntrack = NULL; +#endif } static struct list_head *alloc_hashtable(int size, int *vmalloced) @@ -1403,13 +1528,13 @@ static struct list_head *alloc_hashtable unsigned int i; *vmalloced = 0; - hash = (void*)__get_free_pages(GFP_KERNEL, + hash = (void*)__get_free_pages(GFP_KERNEL_UBC, get_order(sizeof(struct list_head) * size)); if (!hash) { *vmalloced = 1; printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct list_head) * size); + hash = ub_vmalloc(sizeof(struct list_head) * size); } if (hash) @@ -1445,8 +1570,8 @@ static int set_hashsize(const char *val, write_lock_bh(&ip_conntrack_lock); for (i = 0; i < ip_conntrack_htable_size; i++) { - while (!list_empty(&ip_conntrack_hash[i])) { - h = list_entry(ip_conntrack_hash[i].next, + while (!list_empty(&ve_ip_conntrack_hash[i])) { + h = list_entry(ve_ip_conntrack_hash[i].next, struct ip_conntrack_tuple_hash, list); list_del(&h->list); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); @@ -1454,12 +1579,12 @@ static int set_hashsize(const char *val, } } old_size = ip_conntrack_htable_size; - old_vmalloced = ip_conntrack_vmalloc; - old_hash = ip_conntrack_hash; + old_vmalloced = ve_ip_conntrack_vmalloc; + old_hash = ve_ip_conntrack_hash; ip_conntrack_htable_size = hashsize; - ip_conntrack_vmalloc = vmalloced; - ip_conntrack_hash = hash; + ve_ip_conntrack_vmalloc = vmalloced; + ve_ip_conntrack_hash = hash; ip_conntrack_hash_rnd = rnd; write_unlock_bh(&ip_conntrack_lock); @@ -1470,9 +1595,8 @@ static int set_hashsize(const char *val, module_param_call(hashsize, set_hashsize, param_get_uint, &ip_conntrack_htable_size, 0600); -int __init ip_conntrack_init(void) +static int ip_conntrack_cache_create(void) { - unsigned int i; int ret; /* Idea from tcp.c: use 1/16384 of memory. 
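
[ovz review note] In alloc_hashtable() the allocations switch to the beancounter-charged variants, GFP_KERNEL_UBC for the page allocation and ub_vmalloc() for the fallback, so each VE's hash table is billed against its own memory limits. The vmalloced flag has to travel with the table because the free path differs. The shape of the fallback, with stand-in allocators:

    #include <stdio.h>
    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    /* stand-ins for __get_free_pages(GFP_KERNEL_UBC, ...) and ub_vmalloc() */
    static void *contig_alloc(size_t sz) { return sz > (1u << 20) ? NULL : malloc(sz); }
    static void *virt_alloc(size_t sz)   { return malloc(sz); }

    static void *alloc_hashtable(size_t buckets, int *vmalloced)
    {
        size_t sz = buckets * sizeof(struct list_head);
        void *hash;

        *vmalloced = 0;
        hash = contig_alloc(sz);
        if (!hash) {
            *vmalloced = 1;       /* the free path differs, so record it */
            hash = virt_alloc(sz);
        }
        return hash;
    }

    int main(void)
    {
        int vm;
        void *h = alloc_hashtable(1 << 18, &vm);   /* large: takes the fallback */
        printf("vmalloced=%d\n", vm);
        free(h);
        return 0;
    }
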
On i386: 32MB @@ -1486,70 +1610,127 @@ int __init ip_conntrack_init(void) if (ip_conntrack_htable_size < 16) ip_conntrack_htable_size = 16; } - ip_conntrack_max = 8 * ip_conntrack_htable_size; + ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; printk("ip_conntrack version %s (%u buckets, %d max)" " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, - ip_conntrack_htable_size, ip_conntrack_max, + ip_conntrack_htable_size, ve_ip_conntrack_max, sizeof(struct ip_conntrack)); ret = nf_register_sockopt(&so_getorigdst); if (ret != 0) { printk(KERN_ERR "Unable to register netfilter socket option\n"); - return ret; - } - - ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, - &ip_conntrack_vmalloc); - if (!ip_conntrack_hash) { - printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); - goto err_unreg_sockopt; + goto out_sockopt; } + ret = -ENOMEM; ip_conntrack_cachep = kmem_cache_create("ip_conntrack", sizeof(struct ip_conntrack), 0, - 0, NULL, NULL); + SLAB_UBC, NULL, NULL); if (!ip_conntrack_cachep) { printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); - goto err_free_hash; + goto err_unreg_sockopt; } ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", sizeof(struct ip_conntrack_expect), - 0, 0, NULL, NULL); + 0, SLAB_UBC, NULL, NULL); if (!ip_conntrack_expect_cachep) { printk(KERN_ERR "Unable to create ip_expect slab cache\n"); goto err_free_conntrack_slab; } + return 0; + +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); +err_unreg_sockopt: + nf_unregister_sockopt(&so_getorigdst); +out_sockopt: + return ret; +} + +int ip_conntrack_init(void) +{ + struct ve_struct *env; + unsigned int i; + int ret; + + env = get_exec_env(); +#ifdef CONFIG_VE_IPTABLES + ret = -ENOMEM; + env->_ip_conntrack = + kmalloc(sizeof(struct ve_ip_conntrack), GFP_KERNEL); + if (!env->_ip_conntrack) + goto out; + memset(env->_ip_conntrack, 0, sizeof(struct ve_ip_conntrack)); + if (ve_is_super(env)) { + ret = ip_conntrack_cache_create(); + if (ret) + goto cache_fail; + } else + ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; +#else /* CONFIG_VE_IPTABLES */ + ret = ip_conntrack_cache_create(); + if (ret) + goto out; +#endif + + ret = -ENOMEM; + ve_ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, + &ve_ip_conntrack_vmalloc); + if (!ve_ip_conntrack_hash) { + printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); + goto err_free_cache; + } + +#ifdef CONFIG_VE_IPTABLES + ve_ip_ct_protos = (struct ip_conntrack_protocol **) + ub_kmalloc(sizeof(void *)*MAX_IP_CT_PROTO, GFP_KERNEL); + if (!ve_ip_ct_protos) + goto err_free_hash; +#endif /* Don't NEED lock here, but good form anyway. */ write_lock_bh(&ip_conntrack_lock); for (i = 0; i < MAX_IP_CT_PROTO; i++) - ip_ct_protos[i] = &ip_conntrack_generic_protocol; + ve_ip_ct_protos[i] = &ip_conntrack_generic_protocol; /* Sew in builtin protocols. 
*/ - ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; - ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; - ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + ve_ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; + ve_ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; + ve_ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; write_unlock_bh(&ip_conntrack_lock); - /* For use by ipt_REJECT */ - ip_ct_attach = ip_conntrack_attach; - - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&ip_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); + INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); + INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); + INIT_LIST_HEAD(&ve_ip_conntrack_helpers); + + if (ve_is_super(env)) { + /* For use by ipt_REJECT */ + ip_ct_attach = ip_conntrack_attach; + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&ip_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); + } - return ret; + return 0; -err_free_conntrack_slab: - kmem_cache_destroy(ip_conntrack_cachep); +#ifdef CONFIG_VE_IPTABLES err_free_hash: - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, +#endif + free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, ip_conntrack_htable_size); -err_unreg_sockopt: - nf_unregister_sockopt(&so_getorigdst); - - return -ENOMEM; + ve_ip_conntrack_hash = NULL; +err_free_cache: + if (ve_is_super(env)) + ip_conntrack_cache_free(); +#ifdef CONFIG_VE_IPTABLES +cache_fail: + kfree(env->_ip_conntrack); + env->_ip_conntrack = NULL; +#endif +out: + return ret; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_ftp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_ftp.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_ftp.c 2007-06-13 06:55:07.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -433,8 +434,8 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ - if (ip_nat_ftp_hook) - ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, + if (ve_ip_nat_ftp_hook) + ret = ve_ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. 
*/ @@ -461,15 +462,40 @@ static struct ip_conntrack_helper ftp[MA static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; /* Not __exit: called from init() */ -static void ip_conntrack_ftp_fini(void) +void fini_ip_ct_ftp(void) { int i; for (i = 0; i < ports_c; i++) { DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", ports[i]); - ip_conntrack_helper_unregister(&ftp[i]); + virt_ip_conntrack_helper_unregister(&ftp[i]); } +} + +int init_ip_ct_ftp(void) +{ + int i, ret; + + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: registering helper for port %d\n", + ports[i]); + ret = virt_ip_conntrack_helper_register(&ftp[i]); + if (ret) { + fini_ip_ct_ftp(); + return ret; + } + } + return 0; +} + +/* Not __exit: called from init() */ +static void ip_conntrack_ftp_fini(void) +{ + KSYMMODUNRESOLVE(ip_conntrack_ftp); + KSYMUNRESOLVE(init_ip_ct_ftp); + KSYMUNRESOLVE(fini_ip_ct_ftp); + fini_ip_ct_ftp(); kfree(ftp_buffer); } @@ -504,13 +530,17 @@ static int __init ip_conntrack_ftp_init( DEBUGP("ip_ct_ftp: registering helper for port %d\n", ports[i]); - ret = ip_conntrack_helper_register(&ftp[i]); + ret = virt_ip_conntrack_helper_register(&ftp[i]); if (ret) { ip_conntrack_ftp_fini(); return ret; } } + + KSYMRESOLVE(init_ip_ct_ftp); + KSYMRESOLVE(fini_ip_ct_ftp); + KSYMMODRESOLVE(ip_conntrack_ftp); return 0; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_helper_h323.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_helper_h323.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_helper_h323.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_helper_h323.c 2007-06-13 06:55:07.000000000 -0400 @@ -1417,7 +1417,7 @@ static int process_rcf(struct sk_buff ** DEBUGP ("ip_ct_ras: set RAS connection timeout to %u seconds\n", info->timeout); - ip_ct_refresh_acct(ct, ctinfo, NULL, info->timeout * HZ); + ip_ct_refresh(ct, *pskb, info->timeout * HZ); /* Set expect timeout */ read_lock_bh(&ip_conntrack_lock); @@ -1465,7 +1465,7 @@ static int process_urq(struct sk_buff ** info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - ip_ct_refresh_acct(ct, ctinfo, NULL, 30 * HZ); + ip_ct_refresh(ct, *pskb, 30 * HZ); return 0; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_irc.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_irc.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_irc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_irc.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -243,6 +244,33 @@ static char irc_names[MAX_PORTS][sizeof( static void ip_conntrack_irc_fini(void); +void fini_ip_ct_irc(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering port %d\n", + ports[i]); + virt_ip_conntrack_helper_unregister(&irc_helpers[i]); + } +} + +int init_ip_ct_irc(void) +{ + int i, ret; + + for (i = 0; i < ports_c; i++) { + DEBUGP("port #%d: %d\n", i, ports[i]); + ret = virt_ip_conntrack_helper_register(&irc_helpers[i]); + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", + ports[i]); + fini_ip_ct_irc(); + return -EBUSY; + } + } + return 0; +} + static int __init ip_conntrack_irc_init(void) { int i, ret; @@ -282,7 +310,7 @@ static int __init ip_conntrack_irc_init( DEBUGP("port #%d: %d\n", i, ports[i]); - ret = ip_conntrack_helper_register(hlpr); + ret = virt_ip_conntrack_helper_register(hlpr); if (ret) { printk("ip_conntrack_irc: ERROR registering port %d\n", @@ -291,6 +319,10 @@ static int __init 
ip_conntrack_irc_init( return -EBUSY; } } + + KSYMRESOLVE(init_ip_ct_irc); + KSYMRESOLVE(fini_ip_ct_irc); + KSYMMODRESOLVE(ip_conntrack_irc); return 0; } @@ -298,12 +330,10 @@ static int __init ip_conntrack_irc_init( * it is needed by the init function */ static void ip_conntrack_irc_fini(void) { - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("unregistering port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&irc_helpers[i]); - } + KSYMMODUNRESOLVE(ip_conntrack_irc); + KSYMUNRESOLVE(init_ip_ct_irc); + KSYMUNRESOLVE(fini_ip_ct_irc); + fini_ip_ct_irc(); kfree(irc_buffer); } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_netlink.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_netlink.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_netlink.c 2007-06-13 06:55:07.000000000 -0400 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,8 @@ #include #include +#include +#include MODULE_LICENSE("GPL"); @@ -418,7 +421,7 @@ ctnetlink_dump_table(struct sk_buff *skb last = (struct ip_conntrack *)cb->args[1]; for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { restart: - list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -465,7 +468,7 @@ ctnetlink_dump_table_w(struct sk_buff *s write_lock_bh(&ip_conntrack_lock); for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { - list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -1019,14 +1022,15 @@ ctnetlink_change_conntrack(struct ip_con static int ctnetlink_create_conntrack(struct nfattr *cda[], struct ip_conntrack_tuple *otuple, - struct ip_conntrack_tuple *rtuple) + struct ip_conntrack_tuple *rtuple, + struct user_beancounter *ub) { struct ip_conntrack *ct; int err = -EINVAL; DEBUGP("entered %s\n", __FUNCTION__); - ct = ip_conntrack_alloc(otuple, rtuple); + ct = ip_conntrack_alloc(otuple, rtuple, ub); if (ct == NULL || IS_ERR(ct)) return -ENOMEM; @@ -1103,8 +1107,16 @@ ctnetlink_new_conntrack(struct sock *ctn write_unlock_bh(&ip_conntrack_lock); DEBUGP("no such conntrack, create new\n"); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + if (nlh->nlmsg_flags & NLM_F_CREATE) { +#ifdef CONFIG_USER_RESOURCE + if (skb->sk) + err = ctnetlink_create_conntrack(cda, &otuple, + &rtuple, sock_bc(skb->sk)->ub); + else +#endif + err = ctnetlink_create_conntrack(cda, + &otuple, &rtuple, NULL); + } return err; } /* implicit 'else' */ @@ -1292,7 +1304,7 @@ ctnetlink_exp_dump_table(struct sk_buff DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); read_lock_bh(&ip_conntrack_lock); - list_for_each_prev(i, &ip_conntrack_expect_list) { + list_for_each_prev(i, &ve_ip_conntrack_expect_list) { exp = (struct ip_conntrack_expect *) i; if (exp->id <= *id) continue; @@ -1438,7 +1450,7 @@ ctnetlink_del_expect(struct sock *ctnl, write_unlock_bh(&ip_conntrack_lock); return -EINVAL; } - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { if (exp->master->helper == h && del_timer(&exp->timeout)) { @@ -1450,7 +1462,7 @@ 
ctnetlink_del_expect(struct sock *ctnl, } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { if (del_timer(&exp->timeout)) { ip_ct_unlink_expect(exp); diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_generic.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2007-06-13 06:55:07.000000000 -0400 @@ -52,7 +52,7 @@ static int packet(struct ip_conntrack *c const struct sk_buff *skb, enum ip_conntrack_info ctinfo) { - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_generic_timeout); return NF_ACCEPT; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_icmp.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2007-06-13 06:55:07.000000000 -0400 @@ -104,7 +104,7 @@ static int icmp_packet(struct ip_conntra } else { atomic_inc(&ct->proto.icmp.count); ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); + ip_ct_refresh_acct(ct, ctinfo, skb, ve_ip_ct_icmp_timeout); } return NF_ACCEPT; diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_sctp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_sctp.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2007-06-13 06:55:07.000000000 -0400 @@ -461,7 +461,8 @@ static int sctp_new(struct ip_conntrack SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_MAX) { + if (newconntrack == SCTP_CONNTRACK_NONE || + newconntrack == SCTP_CONNTRACK_MAX) { DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); return 0; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_tcp.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2007-06-13 06:55:07.000000000 -0400 @@ -97,7 +97,7 @@ unsigned int ip_ct_tcp_timeout_close = to ~13-30min depending on RTO. 
*/ unsigned int ip_ct_tcp_timeout_max_retrans = 5 MINS; -static const unsigned int * tcp_timeouts[] +const unsigned int * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -761,7 +761,7 @@ static int tcp_in_window(struct ip_ct_tc : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); - res = ip_ct_tcp_be_liberal; + res = ve_ip_ct_tcp_be_liberal; } DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " @@ -1029,9 +1029,11 @@ static int tcp_packet(struct ip_conntrac && (new_state == TCP_CONNTRACK_FIN_WAIT || new_state == TCP_CONNTRACK_CLOSE)) conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans - ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + timeout = conntrack->proto.tcp.retrans >= ve_ip_ct_tcp_max_retrans && + ve_ip_ct_tcp_timeouts[new_state] > + ve_ip_ct_tcp_timeout_max_retrans + ? ve_ip_ct_tcp_timeout_max_retrans : + ve_ip_ct_tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); @@ -1106,7 +1108,7 @@ static int tcp_new(struct ip_conntrack * conntrack->proto.tcp.seen[1].flags = 0; conntrack->proto.tcp.seen[0].loose = conntrack->proto.tcp.seen[1].loose = 0; - } else if (ip_ct_tcp_loose == 0) { + } else if (ve_ip_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ return 0; } else { @@ -1130,7 +1132,7 @@ static int tcp_new(struct ip_conntrack * conntrack->proto.tcp.seen[0].flags = conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; conntrack->proto.tcp.seen[0].loose = - conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; + conntrack->proto.tcp.seen[1].loose = ve_ip_ct_tcp_loose; } conntrack->proto.tcp.seen[1].td_end = 0; diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_udp.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2007-06-13 06:55:07.000000000 -0400 @@ -71,12 +71,12 @@ static int udp_packet(struct ip_conntrac stream. Extend timeout. 
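
[ovz review note] The tcp_packet() hunk above only re-points the lookup at the per-VE table, but the clamping expression is dense enough to restate in isolation: once a connection has accumulated ve_ip_ct_tcp_max_retrans retransmissions, its per-state timeout is capped by ve_ip_ct_tcp_timeout_max_retrans. As a self-contained function (the values in main are illustrative):

    #include <stdio.h>

    /* the clamp from tcp_packet(), isolated */
    static unsigned int tcp_timeout(unsigned int retrans, unsigned int max_retrans,
                                    unsigned int state_timeout,
                                    unsigned int max_retrans_timeout)
    {
        return (retrans >= max_retrans && state_timeout > max_retrans_timeout)
                ? max_retrans_timeout : state_timeout;
    }

    int main(void)
    {
        /* 3 retransmissions, established timeout 5 days, cap 5 minutes */
        printf("%u\n", tcp_timeout(3, 3, 5 * 24 * 3600, 300));   /* 300 */
        printf("%u\n", tcp_timeout(0, 3, 5 * 24 * 3600, 300));   /* 432000 */
        return 0;
    }
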
*/ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { ip_ct_refresh_acct(conntrack, ctinfo, skb, - ip_ct_udp_timeout_stream); + ve_ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) ip_conntrack_event_cache(IPCT_STATUS, skb); } else - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_udp_timeout); return NF_ACCEPT; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_standalone.c --- linux-2.6.18/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_conntrack_standalone.c 2007-06-13 06:55:07.000000000 -0400 @@ -27,6 +27,7 @@ #include #include #include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) @@ -45,9 +46,31 @@ MODULE_LICENSE("GPL"); +int ip_conntrack_disable_ve0 = 0; +module_param(ip_conntrack_disable_ve0, int, 0440); + extern atomic_t ip_conntrack_count; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_ip_conntrack_count \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_count) +#else +#define ve_ip_conntrack_count ip_conntrack_count +#endif DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); +/* Prior to 2.6.15, we had a ip_conntrack_enable_ve0 param. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": parameter ip_conntrack_enable_ve0 is obsoleted. In ovzkernel" + " >= 2.6.15 connection tracking on hardware node is enabled by " + "default, use ip_conntrack_disable_ve0=1 parameter to " + "disable.\n"); + return 0; +} +module_param_call(ip_conntrack_enable_ve0, warn_set, NULL, NULL, 0); + static int kill_proto(struct ip_conntrack *i, void *data) { return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == @@ -88,8 +111,8 @@ static struct list_head *ct_get_first(st for (st->bucket = 0; st->bucket < ip_conntrack_htable_size; st->bucket++) { - if (!list_empty(&ip_conntrack_hash[st->bucket])) - return ip_conntrack_hash[st->bucket].next; + if (!list_empty(&ve_ip_conntrack_hash[st->bucket])) + return ve_ip_conntrack_hash[st->bucket].next; } return NULL; } @@ -99,10 +122,10 @@ static struct list_head *ct_get_next(str struct ct_iter_state *st = seq->private; head = head->next; - while (head == &ip_conntrack_hash[st->bucket]) { + while (head == &ve_ip_conntrack_hash[st->bucket]) { if (++st->bucket >= ip_conntrack_htable_size) return NULL; - head = ip_conntrack_hash[st->bucket].next; + head = ve_ip_conntrack_hash[st->bucket].next; } return head; } @@ -238,7 +261,7 @@ static struct file_operations ct_file_op /* expects */ static void *exp_seq_start(struct seq_file *s, loff_t *pos) { - struct list_head *e = &ip_conntrack_expect_list; + struct list_head *e = &ve_ip_conntrack_expect_list; loff_t i; /* strange seq_file api calls stop even if we fail, @@ -250,7 +273,7 @@ static void *exp_seq_start(struct seq_fi for (i = 0; i <= *pos; i++) { e = e->next; - if (e == &ip_conntrack_expect_list) + if (e == &ve_ip_conntrack_expect_list) return NULL; } return e; @@ -263,7 +286,7 @@ static void *exp_seq_next(struct seq_fil ++*pos; e = e->next; - if (e == &ip_conntrack_expect_list) + if (e == &ve_ip_conntrack_expect_list) return NULL; return e; @@ -348,7 +371,7 @@ static void ct_cpu_seq_stop(struct seq_f static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = 
atomic_read(&ip_conntrack_count); + unsigned int nr_conntracks = atomic_read(&ve_ip_conntrack_count); struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -540,6 +563,28 @@ int ip_conntrack_checksum = 1; /* From ip_conntrack_core.c */ extern int ip_conntrack_max; +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_conntrack_max \ + (get_exec_env()->_ip_conntrack->_ip_conntrack_max) +#define ve_ip_ct_sysctl_header \ + (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_header) +#define ve_ip_ct_net_table \ + (get_exec_env()->_ip_conntrack->_ip_ct_net_table) +#define ve_ip_ct_ipv4_table \ + (get_exec_env()->_ip_conntrack->_ip_ct_ipv4_table) +#define ve_ip_ct_netfilter_table \ + (get_exec_env()->_ip_conntrack->_ip_ct_netfilter_table) +#define ve_ip_ct_sysctl_table \ + (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_table) +#else +#define ve_ip_conntrack_max ip_conntrack_max +static struct ctl_table_header *ip_ct_sysctl_header; +#define ve_ip_ct_sysctl_header ip_ct_sysctl_header +#define ve_ip_ct_net_table ip_ct_net_table +#define ve_ip_ct_ipv4_table ip_ct_ipv4_table +#define ve_ip_ct_netfilter_table ip_ct_netfilter_table +#define ve_ip_ct_sysctl_table ip_ct_sysctl_table +#endif extern unsigned int ip_conntrack_htable_size; /* From ip_conntrack_proto_tcp.c */ @@ -570,8 +615,6 @@ extern unsigned int ip_ct_generic_timeou static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -static struct ctl_table_header *ip_ct_sysctl_header; - static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, @@ -788,6 +831,78 @@ static ctl_table ip_ct_net_table[] = { }; EXPORT_SYMBOL(ip_ct_log_invalid); + +#ifdef CONFIG_VE_IPTABLES +static void ip_conntrack_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) + free_sysctl_clone(ve_ip_ct_net_table); + + ve_ip_ct_net_table = NULL; + ve_ip_ct_ipv4_table = NULL; + ve_ip_ct_netfilter_table = NULL; + ve_ip_ct_sysctl_table = NULL; +} + +static int ip_conntrack_sysctl_init(void) +{ + if (ve_is_super(get_exec_env())) { + ve_ip_ct_net_table = ip_ct_net_table; + ve_ip_ct_ipv4_table = ip_ct_ipv4_table; + ve_ip_ct_netfilter_table = ip_ct_netfilter_table; + ve_ip_ct_sysctl_table = ip_ct_sysctl_table; + } else { + ve_ip_ct_net_table = clone_sysctl_template(ip_ct_net_table); + if (ve_ip_ct_net_table == NULL) + return -ENOMEM; + + ve_ip_ct_ipv4_table = ve_ip_ct_net_table[0].child; + ve_ip_ct_netfilter_table = ve_ip_ct_ipv4_table[0].child; + ve_ip_ct_sysctl_table = ve_ip_ct_netfilter_table[0].child; + } + + ve_ip_ct_sysctl_table[0].data = &ve_ip_conntrack_max; + ve_ip_ct_netfilter_table[1].data = &ve_ip_conntrack_max; + ve_ip_ct_sysctl_table[1].data = &ve_ip_conntrack_count; + /* skip ve_ip_ct_sysctl_table[2].data as it is read-only and common + * for all environments */ + ve_ip_ct_tcp_timeouts[1] = ip_ct_tcp_timeout_syn_sent; + ve_ip_ct_sysctl_table[3].data = &ve_ip_ct_tcp_timeouts[1]; + ve_ip_ct_tcp_timeouts[2] = ip_ct_tcp_timeout_syn_recv; + ve_ip_ct_sysctl_table[4].data = &ve_ip_ct_tcp_timeouts[2]; + ve_ip_ct_tcp_timeouts[3] = ip_ct_tcp_timeout_established; + ve_ip_ct_sysctl_table[5].data = &ve_ip_ct_tcp_timeouts[3]; + ve_ip_ct_tcp_timeouts[4] = ip_ct_tcp_timeout_fin_wait; + ve_ip_ct_sysctl_table[6].data = &ve_ip_ct_tcp_timeouts[4]; + ve_ip_ct_tcp_timeouts[5] = ip_ct_tcp_timeout_close_wait; + ve_ip_ct_sysctl_table[7].data = &ve_ip_ct_tcp_timeouts[5]; + ve_ip_ct_tcp_timeouts[6] = ip_ct_tcp_timeout_last_ack; + ve_ip_ct_sysctl_table[8].data = &ve_ip_ct_tcp_timeouts[6]; + ve_ip_ct_tcp_timeouts[7] = ip_ct_tcp_timeout_time_wait; + 
ve_ip_ct_sysctl_table[9].data = &ve_ip_ct_tcp_timeouts[7]; + ve_ip_ct_tcp_timeouts[8] = ip_ct_tcp_timeout_close; + ve_ip_ct_sysctl_table[10].data = &ve_ip_ct_tcp_timeouts[8]; + ve_ip_ct_udp_timeout = ip_ct_udp_timeout; + ve_ip_ct_sysctl_table[11].data = &ve_ip_ct_udp_timeout; + ve_ip_ct_udp_timeout_stream = ip_ct_udp_timeout_stream; + ve_ip_ct_sysctl_table[12].data = &ve_ip_ct_udp_timeout_stream; + ve_ip_ct_icmp_timeout = ip_ct_icmp_timeout; + ve_ip_ct_sysctl_table[13].data = &ve_ip_ct_icmp_timeout; + ve_ip_ct_generic_timeout = ip_ct_generic_timeout; + ve_ip_ct_sysctl_table[14].data = &ve_ip_ct_generic_timeout; + ve_ip_ct_log_invalid = ip_ct_log_invalid; + ve_ip_ct_sysctl_table[15].data = &ve_ip_ct_log_invalid; + ve_ip_ct_tcp_timeout_max_retrans = ip_ct_tcp_timeout_max_retrans; + ve_ip_ct_sysctl_table[16].data = &ve_ip_ct_tcp_timeout_max_retrans; + ve_ip_ct_tcp_loose = ip_ct_tcp_loose; + ve_ip_ct_sysctl_table[17].data = &ve_ip_ct_tcp_loose; + ve_ip_ct_tcp_be_liberal = ip_ct_tcp_be_liberal; + ve_ip_ct_sysctl_table[18].data = &ve_ip_ct_tcp_be_liberal; + ve_ip_ct_tcp_max_retrans = ip_ct_tcp_max_retrans; + ve_ip_ct_sysctl_table[19].data = &ve_ip_ct_tcp_max_retrans; + return 0; +} +#endif /*CONFIG_VE*/ #endif /* CONFIG_SYSCTL */ /* FIXME: Allow NULL functions and sub in pointers to generic for @@ -797,11 +912,11 @@ int ip_conntrack_protocol_register(struc int ret = 0; write_lock_bh(&ip_conntrack_lock); - if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + if (ve_ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { ret = -EBUSY; goto out; } - ip_ct_protos[proto->proto] = proto; + ve_ip_ct_protos[proto->proto] = proto; out: write_unlock_bh(&ip_conntrack_lock); return ret; @@ -810,7 +925,7 @@ int ip_conntrack_protocol_register(struc void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) { write_lock_bh(&ip_conntrack_lock); - ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + ve_ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; write_unlock_bh(&ip_conntrack_lock); /* Somebody could be still looking at the proto in bh. 
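
[ovz review note] ip_conntrack_sysctl_init() above is the fragile part of the sysctl virtualization: clone_sysctl_template() duplicates the ctl_table tree, and the .data pointers are then re-wired to per-VE storage strictly by numeric index (sysctl_table[3] through [19]), so any reordering of ip_ct_sysctl_table silently breaks a container's tunables; a table generated from a single macro list would remove the positional coupling. A reduced model of clone-and-repoint (one entry, stand-in types):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct ctl_entry { const char *name; void *data; };

    /* template: .data points at the global tunable */
    static unsigned int g_max = 8192;
    static struct ctl_entry template_tbl[] = {
        { "ip_conntrack_max", &g_max },
        { NULL, NULL },
    };

    struct env { unsigned int max; struct ctl_entry *tbl; };

    static int clone_sysctl(struct env *env)
    {
        /* clone_sysctl_template() analogue: duplicate, then repoint .data
           at the per-environment copies; the rewiring is positional */
        env->tbl = malloc(sizeof(template_tbl));
        if (!env->tbl)
            return -1;
        memcpy(env->tbl, template_tbl, sizeof(template_tbl));
        env->max = g_max;               /* inherit the current default */
        env->tbl[0].data = &env->max;   /* index must match the template */
        return 0;
    }

    int main(void)
    {
        struct env e;

        if (clone_sysctl(&e) == 0) {
            *(unsigned int *)e.tbl[0].data = 4096;   /* a container write */
            printf("ve max=%u global max=%u\n", e.max, g_max);  /* 4096 / 8192 */
            free(e.tbl);
        }
        return 0;
    }
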
*/ @@ -820,16 +935,22 @@ void ip_conntrack_protocol_unregister(st ip_ct_iterate_cleanup(kill_proto, &proto->proto); } -static int __init ip_conntrack_standalone_init(void) +int init_iptable_conntrack(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc, *proc_exp, *proc_stat; #endif int ret = 0; + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); + ret = ip_conntrack_init(); if (ret < 0) - return ret; + goto cleanup_unget; + + if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) + return 0; #ifdef CONFIG_PROC_FS ret = -ENOMEM; @@ -840,12 +961,14 @@ static int __init ip_conntrack_standalon &exp_file_ops); if (!proc_exp) goto cleanup_proc; - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); - if (!proc_stat) - goto cleanup_proc_exp; + if (ve_is_super(get_exec_env())) { + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; + } #endif ret = nf_register_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); @@ -854,22 +977,32 @@ static int __init ip_conntrack_standalon goto cleanup_proc_stat; } #ifdef CONFIG_SYSCTL - ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); - if (ip_ct_sysctl_header == NULL) { +#ifdef CONFIG_VE_IPTABLES + ret = ip_conntrack_sysctl_init(); + if (ret < 0) + goto cleanup_sysctl; +#endif + ret = -ENOMEM; + ve_ip_ct_sysctl_header = register_sysctl_table(ve_ip_ct_net_table, 0); + if (ve_ip_ct_sysctl_header == NULL) { printk("ip_conntrack: can't register to sysctl.\n"); - ret = -ENOMEM; - goto cleanup_hooks; + goto cleanup_sysctl2; } #endif - return ret; + return 0; #ifdef CONFIG_SYSCTL - cleanup_hooks: + cleanup_sysctl2: +#ifdef CONFIG_VE_IPTABLES + ip_conntrack_sysctl_cleanup(); + cleanup_sysctl: +#endif nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); #endif cleanup_proc_stat: #ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); + if (ve_is_super(get_exec_env())) + remove_proc_entry("ip_conntrack", proc_net_stat); cleanup_proc_exp: proc_net_remove("ip_conntrack_expect"); cleanup_proc: @@ -877,25 +1010,59 @@ static int __init ip_conntrack_standalon cleanup_init: #endif /* CONFIG_PROC_FS */ ip_conntrack_cleanup(); + cleanup_unget: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); return ret; } -static void __exit ip_conntrack_standalone_fini(void) +void fini_iptable_conntrack(void) { synchronize_net(); + if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) + goto cleanup; #ifdef CONFIG_SYSCTL - unregister_sysctl_table(ip_ct_sysctl_header); + unregister_sysctl_table(ve_ip_ct_sysctl_header); +#ifdef CONFIG_VE_IPTABLES + ip_conntrack_sysctl_cleanup(); +#endif #endif nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); #ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); + if (ve_is_super(get_exec_env())) + remove_proc_entry("ip_conntrack", proc_net_stat); proc_net_remove("ip_conntrack_expect"); proc_net_remove("ip_conntrack"); #endif /* CONFIG_PROC_FS */ +cleanup: ip_conntrack_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +} + +static int __init ip_conntrack_standalone_init(void) +{ + int err; + + err = init_iptable_conntrack(); + if (err < 0) + return err; + + KSYMRESOLVE(init_iptable_conntrack); + KSYMRESOLVE(fini_iptable_conntrack); + KSYMMODRESOLVE(ip_conntrack); + return 0; } 
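
[ovz review note] init_iptable_conntrack() is the per-VE entry point: a container pins the module, runs ip_conntrack_init() for its own state, and only VE0 registers the /proc/net/stat entry and the netfilter hooks. The KSYMRESOLVE/KSYMMODRESOLVE calls publish init/fini in a name-based registry so VE start/stop can invoke them without a hard symbol dependency on the module. A toy model of such a registry (fixed-size table; names and signatures are simplified):

    #include <stdio.h>
    #include <string.h>

    /* model of KSYMRESOLVE/KSYMUNRESOLVE: a name -> function registry that
       the VE start-up sequence can query without linking to the module */
    typedef int (*initfn_t)(void);

    struct ksym { const char *name; initfn_t fn; };
    static struct ksym table[8];

    static void ksym_resolve(const char *name, initfn_t fn)
    {
        for (int i = 0; i < 8; i++)
            if (!table[i].name) {
                table[i] = (struct ksym){ name, fn };
                return;
            }
    }

    static initfn_t ksym_lookup(const char *name)
    {
        for (int i = 0; i < 8; i++)
            if (table[i].name && !strcmp(table[i].name, name))
                return table[i].fn;
        return NULL;
    }

    static int init_iptable_conntrack(void) { puts("per-VE conntrack init"); return 0; }

    int main(void)
    {
        ksym_resolve("init_iptable_conntrack", init_iptable_conntrack);
        initfn_t fn = ksym_lookup("init_iptable_conntrack");  /* VE start path */
        return fn ? fn() : 0;
    }
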
-module_init(ip_conntrack_standalone_init); +static void __exit ip_conntrack_standalone_fini(void) +{ + KSYMMODUNRESOLVE(ip_conntrack); + KSYMUNRESOLVE(init_iptable_conntrack); + KSYMUNRESOLVE(fini_iptable_conntrack); + fini_iptable_conntrack(); +} + +subsys_initcall(ip_conntrack_standalone_init); module_exit(ip_conntrack_standalone_fini); /* Some modules need us, but don't depend directly on any symbol. @@ -912,15 +1079,20 @@ EXPORT_SYMBOL_GPL(ip_conntrack_unregiste EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); #endif +EXPORT_SYMBOL(ip_conntrack_disable_ve0); EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(ip_ct_get_tuple); EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); +#ifndef CONFIG_VE_IPTABLES EXPORT_SYMBOL(ip_conntrack_destroyed); +#endif EXPORT_SYMBOL(need_conntrack); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(virt_ip_conntrack_helper_register); +EXPORT_SYMBOL(virt_ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); EXPORT_SYMBOL(__ip_ct_refresh_acct); @@ -930,14 +1102,18 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_ EXPORT_SYMBOL_GPL(ip_conntrack_expect_find); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); +#ifndef CONFIG_VE_IPTABLES EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); +#endif EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); EXPORT_SYMBOL(ip_conntrack_lock); +#ifndef CONFIG_VE_IPTABLES EXPORT_SYMBOL(ip_conntrack_hash); +#endif EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); #ifdef CONFIG_IP_NF_NAT_NEEDED diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_nat_core.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_core.c --- linux-2.6.18/net/ipv4/netfilter/ip_nat_core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_core.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,8 @@ #include #include #include +#include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) @@ -46,15 +48,24 @@ DEFINE_RWLOCK(ip_nat_lock); /* Calculated at init based on memory size */ static unsigned int ip_nat_htable_size; -static struct list_head *bysource; - #define MAX_IP_NAT_PROTO 256 + +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_nat_bysource \ + (get_exec_env()->_ip_conntrack->_ip_nat_bysource) +#define ve_ip_nat_protos \ + (get_exec_env()->_ip_conntrack->_ip_nat_protos) +#else +static struct list_head *bysource; +#define ve_ip_nat_bysource bysource static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +#define ve_ip_nat_protos ip_nat_protos +#endif static inline struct ip_nat_protocol * __ip_nat_proto_find(u_int8_t protonum) { - return ip_nat_protos[protonum]; + return ve_ip_nat_protos[protonum]; } struct ip_nat_protocol * @@ -177,7 +188,7 @@ find_appropriate_src(const struct ip_con struct ip_conntrack *ct; read_lock_bh(&ip_nat_lock); - list_for_each_entry(ct, &bysource[h], nat.info.bysource) { + list_for_each_entry(ct, &ve_ip_nat_bysource[h], nat.info.bysource) { if (same_src(ct, tuple)) { /* Copy source part from reply tuple. 
*/ invert_tuplepr(result, @@ -291,13 +302,22 @@ get_unique_tuple(struct ip_conntrack_tup ip_nat_proto_put(proto); } +void ip_nat_hash_conntrack(struct ip_conntrack *conntrack) +{ + unsigned int srchash + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + write_lock_bh(&ip_nat_lock); + list_add(&conntrack->nat.info.bysource, &ve_ip_nat_bysource[srchash]); + write_unlock_bh(&ip_nat_lock); +} +EXPORT_SYMBOL_GPL(ip_nat_hash_conntrack); + unsigned int ip_nat_setup_info(struct ip_conntrack *conntrack, const struct ip_nat_range *range, unsigned int hooknum) { struct ip_conntrack_tuple curr_tuple, new_tuple; - struct ip_nat_info *info = &conntrack->nat.info; int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); @@ -332,14 +352,8 @@ ip_nat_setup_info(struct ip_conntrack *c } /* Place in source hash if this is the first time. */ - if (have_to_hash) { - unsigned int srchash - = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple); - write_lock_bh(&ip_nat_lock); - list_add(&info->bysource, &bysource[srchash]); - write_unlock_bh(&ip_nat_lock); - } + if (have_to_hash) + ip_nat_hash_conntrack(conntrack); /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) @@ -521,11 +535,11 @@ int ip_nat_protocol_register(struct ip_n int ret = 0; write_lock_bh(&ip_nat_lock); - if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { + if (ve_ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { ret = -EBUSY; goto out; } - ip_nat_protos[proto->protonum] = proto; + ve_ip_nat_protos[proto->protonum] = proto; out: write_unlock_bh(&ip_nat_lock); return ret; @@ -536,7 +550,7 @@ EXPORT_SYMBOL(ip_nat_protocol_register); void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) { write_lock_bh(&ip_nat_lock); - ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; + ve_ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; write_unlock_bh(&ip_nat_lock); /* Someone could be still looking at the proto in a bh. */ @@ -589,38 +603,55 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_ EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); #endif -static int __init ip_nat_init(void) +static int ip_nat_init(void) { size_t i; + int ret; - /* Leave them the same for the moment. */ - ip_nat_htable_size = ip_conntrack_htable_size; + if (ve_is_super(get_exec_env())) + ip_nat_htable_size = ip_conntrack_htable_size; /* One vmalloc for both hash tables */ - bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); - if (!bysource) - return -ENOMEM; + ret = -ENOMEM; + ve_ip_nat_bysource = + ub_vmalloc(sizeof(struct list_head)*ip_nat_htable_size*2); + if (!ve_ip_nat_bysource) + goto nomem; + +#ifdef CONFIG_VE_IPTABLES + ve_ip_nat_protos = + ub_kmalloc(sizeof(void *)*MAX_IP_NAT_PROTO, GFP_KERNEL); + if (!ve_ip_nat_protos) + goto nomem2; +#endif /* Sew in builtin protocols. 
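
[ovz review note] In ip_nat_core.c the bysource insertion is factored out of ip_nat_setup_info() into an exported ip_nat_hash_conntrack(), which lets other code paths (plausibly the checkpoint/restore machinery elsewhere in this patch set) re-hash an existing conntrack without redoing NAT setup. The refactor in miniature, with a trivial circular-list implementation:

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_add(struct list_head *new, struct list_head *head)
    {
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
    }

    #define HASH_SIZE 16
    static struct list_head bysource[HASH_SIZE];

    struct conntrack { struct list_head bysource_link; unsigned int src_key; };

    static unsigned int hash_by_src(unsigned int key) { return key % HASH_SIZE; }

    /* the extracted helper: usable from setup and from restore paths alike */
    static void nat_hash_conntrack(struct conntrack *ct)
    {
        list_add(&ct->bysource_link, &bysource[hash_by_src(ct->src_key)]);
    }

    int main(void)
    {
        for (int i = 0; i < HASH_SIZE; i++)
            bysource[i] = (struct list_head){ &bysource[i], &bysource[i] };

        struct conntrack ct = { .src_key = 42 };
        nat_hash_conntrack(&ct);
        printf("bucket %u populated\n", hash_by_src(ct.src_key));
        return 0;
    }
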
*/ write_lock_bh(&ip_nat_lock); for (i = 0; i < MAX_IP_NAT_PROTO; i++) - ip_nat_protos[i] = &ip_nat_unknown_protocol; - ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; - ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; - ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; + ve_ip_nat_protos[i] = &ip_nat_unknown_protocol; + ve_ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; + ve_ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; + ve_ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; write_unlock_bh(&ip_nat_lock); for (i = 0; i < ip_nat_htable_size; i++) { - INIT_LIST_HEAD(&bysource[i]); + INIT_LIST_HEAD(&ve_ip_nat_bysource[i]); } /* FIXME: Man, this is a hack. */ IP_NF_ASSERT(ip_conntrack_destroyed == NULL); - ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + ve_ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; - /* Initialize fake conntrack so that NAT will skip it */ - ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + if (ve_is_super(get_exec_env())) + /* Initialize fake conntrack so that NAT will skip it */ + ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; return 0; +#ifdef CONFIG_VE_IPTABLES +nomem2: +#endif + vfree(ve_ip_nat_bysource); +nomem: + return ret; } /* Clear NAT section of all conntracks, in case we're loaded again. */ @@ -631,14 +662,41 @@ static int clean_nat(struct ip_conntrack return 0; } -static void __exit ip_nat_cleanup(void) +static void ip_nat_cleanup(void) { ip_ct_iterate_cleanup(&clean_nat, NULL); - ip_conntrack_destroyed = NULL; - vfree(bysource); + ve_ip_conntrack_destroyed = NULL; + vfree(ve_ip_nat_bysource); + ve_ip_nat_bysource = NULL; +#ifdef CONFIG_VE_IPTABLES + kfree(ve_ip_nat_protos); + ve_ip_nat_protos = NULL; +#endif +} + +static int __init init(void) +{ + int err; + + err = ip_nat_init(); + if (err < 0) + return err; + + KSYMRESOLVE(ip_nat_init); + KSYMRESOLVE(ip_nat_cleanup); + KSYMMODRESOLVE(ip_nat); + return 0; +} + +static void __exit fini(void) +{ + KSYMMODUNRESOLVE(ip_nat); + KSYMUNRESOLVE(ip_nat_cleanup); + KSYMUNRESOLVE(ip_nat_init); + ip_nat_cleanup(); } MODULE_LICENSE("GPL"); -module_init(ip_nat_init); -module_exit(ip_nat_cleanup); +fs_initcall(init); +module_exit(fini); diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_nat_ftp.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_ftp.c --- linux-2.6.18/net/ipv4/netfilter/ip_nat_ftp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_ftp.c 2007-06-13 06:55:07.000000000 -0400 @@ -19,6 +19,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell "); @@ -154,18 +155,43 @@ static unsigned int ip_nat_ftp(struct sk return NF_ACCEPT; } -static void __exit ip_nat_ftp_fini(void) +#ifdef CONFIG_VE_IPTABLES +#undef ve_ip_nat_ftp_hook +#define ve_ip_nat_ftp_hook \ + (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook) +#endif +int init_iptable_nat_ftp(void) { - ip_nat_ftp_hook = NULL; + BUG_ON(ve_ip_nat_ftp_hook); +#ifdef CONFIG_VE_IPTABLES + ve_ip_nat_ftp_hook = (ip_nat_helper_func)ip_nat_ftp; +#else + ve_ip_nat_ftp_hook = ip_nat_ftp; +#endif + return 0; +} + +void fini_iptable_nat_ftp(void) +{ + ve_ip_nat_ftp_hook = NULL; /* Make sure noone calls it, meanwhile. 
*/ synchronize_net(); } +static void __exit ip_nat_ftp_fini(void) +{ + KSYMMODUNRESOLVE(ip_nat_ftp); + KSYMUNRESOLVE(init_iptable_nat_ftp); + KSYMUNRESOLVE(fini_iptable_nat_ftp); + fini_iptable_nat_ftp(); +} + static int __init ip_nat_ftp_init(void) { - BUG_ON(ip_nat_ftp_hook); - ip_nat_ftp_hook = ip_nat_ftp; - return 0; + KSYMRESOLVE(init_iptable_nat_ftp); + KSYMRESOLVE(fini_iptable_nat_ftp); + KSYMMODRESOLVE(ip_nat_ftp); + return init_iptable_nat_ftp(); } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_nat_irc.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_irc.c --- linux-2.6.18/net/ipv4/netfilter/ip_nat_irc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_irc.c 2007-06-13 06:55:07.000000000 -0400 @@ -23,6 +23,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -96,18 +97,44 @@ static unsigned int help(struct sk_buff return ret; } -static void __exit ip_nat_irc_fini(void) +#ifdef CONFIG_VE_IPTABLES +#undef ve_ip_nat_irc_hook +#define ve_ip_nat_irc_hook \ + (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook) +#endif + +int init_iptable_nat_irc(void) +{ + BUG_ON(ve_ip_nat_irc_hook); +#ifdef CONFIG_VE_IPTABLES + ve_ip_nat_irc_hook = (ip_nat_helper_func)help; +#else + ve_ip_nat_irc_hook = help; +#endif + return 0; +} + +void fini_iptable_nat_irc(void) { - ip_nat_irc_hook = NULL; + ve_ip_nat_irc_hook = NULL; /* Make sure noone calls it, meanwhile. */ synchronize_net(); } +static void __exit ip_nat_irc_fini(void) +{ + KSYMMODUNRESOLVE(ip_nat_irc); + KSYMUNRESOLVE(init_iptable_nat_irc); + KSYMUNRESOLVE(fini_iptable_nat_irc); + fini_iptable_nat_irc(); +} + static int __init ip_nat_irc_init(void) { - BUG_ON(ip_nat_irc_hook); - ip_nat_irc_hook = help; - return 0; + KSYMRESOLVE(init_iptable_nat_irc); + KSYMRESOLVE(fini_iptable_nat_irc); + KSYMMODRESOLVE(ip_nat_irc); + return init_iptable_nat_irc(); } /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_nat_rule.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_rule.c --- linux-2.6.18/net/ipv4/netfilter/ip_nat_rule.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_nat_rule.c 2007-06-13 06:55:07.000000000 -0400 @@ -34,6 +34,13 @@ #define DEBUGP(format, args...) #endif +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_nat_table \ + (get_exec_env()->_ip_conntrack->_ip_nat_table) +#else +#define ve_ip_nat_table &nat_table +#endif + #define NAT_VALID_HOOKS ((1< #include #include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) @@ -110,12 +111,17 @@ ip_nat_fn(unsigned int hooknum, IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Don't try to NAT if this packet is not conntracked */ + if (ct == &ip_conntrack_untracked) + return NF_ACCEPT; + /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(*pskb, (out == NULL))) return NF_DROP; - ct = ip_conntrack_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would have dropped it. 
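
[ovz review note] In ip_nat_fn() the untracked test is hoisted above the hardware-checksum fixup: previously an untracked skb could have its CHECKSUM_HW state destroyed by skb_checksum_help() even though NAT was going to pass the packet through untouched. The reordering, reduced to control flow (struct skb here is a two-field stub):

    #include <stdio.h>

    enum verdict { NF_ACCEPT, NF_DROP };

    struct skb { int untracked; int hw_csum; };

    static enum verdict nat_fn(struct skb *skb)
    {
        if (skb->untracked)
            return NF_ACCEPT;     /* moved up: skip the csum fixup as well */
        if (skb->hw_csum) {
            /* skb_checksum_help() analogue: hardware csum is invalidated */
            skb->hw_csum = 0;
        }
        /* ... NAT proper ... */
        return NF_ACCEPT;
    }

    int main(void)
    {
        struct skb s = { .untracked = 1, .hw_csum = 1 };
        nat_fn(&s);
        printf("hw csum preserved: %d\n", s.hw_csum);   /* 1: fixup skipped */
        return 0;
    }
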
Hence it's the user's responsibilty to packet filter it out, or implement conntrack/NAT for that @@ -137,10 +143,6 @@ ip_nat_fn(unsigned int hooknum, return NF_ACCEPT; } - /* Don't try to NAT if this packet is not conntracked */ - if (ct == &ip_conntrack_untracked) - return NF_ACCEPT; - switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -274,7 +276,8 @@ ip_nat_local_fn(unsigned int hooknum, ct->tuplehash[!dir].tuple.src.u.all #endif ) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; } return ret; } @@ -351,21 +354,19 @@ static struct nf_hook_ops ip_nat_ops[] = }, }; -static int __init ip_nat_standalone_init(void) +int init_iptable_nat(void) { int ret = 0; - need_conntrack(); + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); -#ifdef CONFIG_XFRM - BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; -#endif ret = ip_nat_rule_init(); if (ret < 0) { printk("ip_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; + goto out_modput; } + ret = nf_register_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); if (ret < 0) { printk("ip_nat_init: can't register hooks.\n"); @@ -375,25 +376,59 @@ static int __init ip_nat_standalone_init cleanup_rule_init: ip_nat_rule_cleanup(); - cleanup_decode_session: -#ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; - synchronize_net(); -#endif + out_modput: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); return ret; } -static void __exit ip_nat_standalone_fini(void) +void fini_iptable_nat(void) { nf_unregister_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); ip_nat_rule_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +} + +static int __init ip_nat_standalone_init(void) +{ + int err; + + need_conntrack(); + +#ifdef CONFIG_XFRM + BUG_ON(ip_nat_decode_session != NULL); + ip_nat_decode_session = nat_decode_session; +#endif + if (!ip_conntrack_disable_ve0 && + (err = init_iptable_nat()) < 0) { +#ifdef CONFIG_XFRM + ip_nat_decode_session = NULL; + synchronize_net(); +#endif + return err; + } + + KSYMRESOLVE(init_iptable_nat); + KSYMRESOLVE(fini_iptable_nat); + KSYMMODRESOLVE(iptable_nat); + return 0; +} + +static void __exit ip_nat_standalone_fini(void) +{ + KSYMMODUNRESOLVE(iptable_nat); + KSYMUNRESOLVE(init_iptable_nat); + KSYMUNRESOLVE(fini_iptable_nat); + if (!ip_conntrack_disable_ve0) + fini_iptable_nat(); #ifdef CONFIG_XFRM ip_nat_decode_session = NULL; synchronize_net(); #endif } -module_init(ip_nat_standalone_init); +fs_initcall(ip_nat_standalone_init); module_exit(ip_nat_standalone_fini); MODULE_LICENSE("GPL"); diff -uprN linux-2.6.18/net/ipv4/netfilter/ip_queue.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_queue.c --- linux-2.6.18/net/ipv4/netfilter/ip_queue.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_queue.c 2007-06-13 06:55:07.000000000 -0400 @@ -515,7 +515,7 @@ ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -547,12 +547,15 @@ ipq_rcv_sk(struct sock *sk, int len) { struct sk_buff *skb; unsigned int qlen; + struct ve_struct *env; mutex_lock(&ipqnl_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); + env = set_exec_env(skb->owner_env); ipq_rcv_skb(skb); + (void)set_exec_env(env); kfree_skb(skb); } diff -uprN 
linux-2.6.18/net/ipv4/netfilter/ip_tables.c linux-2.6.18.ovz/net/ipv4/netfilter/ip_tables.c --- linux-2.6.18/net/ipv4/netfilter/ip_tables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ip_tables.c 2007-06-13 06:55:07.000000000 -0400 @@ -29,9 +29,11 @@ #include #include #include +#include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -389,8 +391,8 @@ mark_source_chains(struct xt_table_info = (void *)ipt_get_target(e); if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + ve_printk(VE_LOG, "iptables: loop hook %u pos " + "%u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom @@ -404,6 +406,13 @@ mark_source_chains(struct xt_table_info && unconditional(&e->ip)) { unsigned int oldpos, size; + if (t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + /* Return: backtrack through the last big jump. */ do { @@ -441,6 +450,14 @@ mark_source_chains(struct xt_table_info if (strcmp(t->target.u.user.name, IPT_STANDARD_TARGET) == 0 && newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct ipt_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -473,29 +490,29 @@ cleanup_match(struct ipt_entry_match *m, return 0; } -static inline int -standard_check(const struct ipt_entry_target *t, - unsigned int max_offset) +static inline int check_match(struct ipt_entry_match *m, const char *name, + const struct ipt_ip *ip, unsigned int hookmask) { - struct ipt_standard_target *targ = (void *)t; + struct ipt_match *match; + int ret; - /* Check standard info. */ - if (targ->verdict >= 0 - && targ->verdict > max_offset - sizeof(struct ipt_entry)) { - duprintf("ipt_standard_check: bad verdict (%i)\n", - targ->verdict); - return 0; - } - if (targ->verdict < -NF_MAX_VERDICT - 1) { - duprintf("ipt_standard_check: bad negative verdict (%i)\n", - targ->verdict); - return 0; + match = m->u.kernel.match; + ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), + name, hookmask, ip->proto, + ip->invflags & IPT_INV_PROTO); + if (!ret && m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ip, match, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + ret = -EINVAL; } - return 1; + return ret; } static inline int -check_match(struct ipt_entry_match *m, +find_check_match(struct ipt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, @@ -508,27 +525,15 @@ check_match(struct ipt_entry_match *m, m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { - duprintf("check_match: `%s' not found\n", m->u.user.name); + duprintf("find_check_match: `%s' not found\n", m->u.user.name); return match ? 
PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; - ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), - name, hookmask, ip->proto, - ip->invflags & IPT_INV_PROTO); + ret = check_match(m, name, ip, hookmask); if (ret) goto err; - if (m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ip, match, m->data, - m->u.match_size - sizeof(*m), - hookmask)) { - duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); - ret = -EINVAL; - goto err; - } - (*i)++; return 0; err: @@ -536,10 +541,52 @@ err: return ret; } -static struct ipt_target ipt_standard_target; +static inline int check_target(struct ipt_entry *e, const char *name) +{ + struct ipt_entry_target *t; + struct ipt_target *target; + int ret; + + t = ipt_get_target(e); + target = t->u.kernel.target; + ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), + name, e->comefrom, e->ip.proto, + e->ip.invflags & IPT_INV_PROTO); + if (!ret && t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, target, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + duprintf("check_target: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + } + return ret; + +} + +static inline int check_entry(struct ipt_entry *e, const char *name) +{ + struct ipt_entry_target *t; + + if (!ip_checkentry(&e->ip)) { + duprintf("check_entry: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct ipt_entry_target) > + e->next_offset) + return -EINVAL; + + t = ipt_get_target(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} static inline int -check_entry(struct ipt_entry *e, const char *name, unsigned int size, +find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, unsigned int *i) { struct ipt_entry_target *t; @@ -547,13 +594,13 @@ check_entry(struct ipt_entry *e, const c int ret; unsigned int j; - if (!ip_checkentry(&e->ip)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); - return -EINVAL; - } + ret = check_entry(e, name); + if (ret != 0) + return ret; j = 0; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, e->comefrom, + &j); if (ret != 0) goto cleanup_matches; @@ -569,28 +616,10 @@ check_entry(struct ipt_entry *e, const c } t->u.kernel.target = target; - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ip.proto, - e->ip.invflags & IPT_INV_PROTO); + ret = check_target(e, name); if (ret) goto err; - if (t->u.kernel.target == &ipt_standard_target) { - if (!standard_check(t, size)) { - ret = -EINVAL; - goto cleanup_matches; - } - } else if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - t->u.target_size - - sizeof(*t), - e->comefrom)) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - ret = -EINVAL; - goto err; - } - (*i)++; return 0; err: @@ -726,8 +755,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry, name, size, &i); - + find_check_entry, name, size, &i); if (ret != 0) { IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); @@ -810,7 +838,7 @@ static inline struct xt_counters * alloc (other than comefrom, which userspace doesn't care about). 
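The refactor being applied here separates extension lookup from validation: find_check_match()/find_check_entry() resolve the match or target module and then delegate to plain check_match()/check_target(), which carry only the xt_check_*() and ->checkentry() logic. The point of the split shows up further down, where the compat path re-validates translated entries through the same helpers instead of duplicating them. A rough sketch of the shape, with invented types:

#include <stddef.h>

struct ext {
	int (*checkentry)(const void *data);  /* optional validator */
};

/* phase 2: pure validation, reusable on already-resolved entries
 * (this is what the compat path re-runs after translation) */
static int check_ext(const struct ext *e, const void *data)
{
	if (e->checkentry && !e->checkentry(data))
		return -22;                   /* -EINVAL in the kernel */
	return 0;
}

/* phase 1 + 2: resolve the module, then validate it */
static int find_check_ext(const char *name, const void *data,
			  struct ext *(*find_ext)(const char *))
{
	struct ext *e = find_ext(name);
	if (e == NULL)
		return -2;                    /* -ENOENT in the kernel */
	return check_ext(e, data);
}
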
*/ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = ub_vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -901,13 +929,13 @@ copy_entries_to_user(unsigned int total_ #ifdef CONFIG_COMPAT struct compat_delta { struct compat_delta *next; - u_int16_t offset; + unsigned int offset; short delta; }; static struct compat_delta *compat_offsets = NULL; -static int compat_add_offset(u_int16_t offset, short delta) +static int compat_add_offset(unsigned int offset, short delta) { struct compat_delta *tmp; @@ -939,7 +967,7 @@ static void compat_flush_offsets(void) } } -static short compat_calc_jump(u_int16_t offset) +static short compat_calc_jump(unsigned int offset) { struct compat_delta *tmp; short delta; @@ -1024,7 +1052,7 @@ static int compat_calc_entry(struct ipt_ void *base, struct xt_table_info *newinfo) { struct ipt_entry_target *t; - u_int16_t entry_offset; + unsigned int entry_offset; int off, i, ret; off = 0; @@ -1187,7 +1215,7 @@ __do_replace(const char *name, unsigned void *loc_cpu_old_entry; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1366,7 +1394,7 @@ do_add_counters(void __user *user, unsig if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = ub_vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1509,7 +1537,7 @@ check_compat_entry_size_and_hooks(struct { struct ipt_entry_target *t; struct ipt_target *target; - u_int16_t entry_offset; + unsigned int entry_offset; int ret, off, h, j; duprintf("check_compat_entry_size_and_hooks %p\n", e); @@ -1526,10 +1554,9 @@ check_compat_entry_size_and_hooks(struct return -EINVAL; } - if (!ip_checkentry(&e->ip)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); - return -EINVAL; - } + ret = check_entry(e, name); + if (ret) + return ret; off = 0; entry_offset = (void *)e - (void *)base; @@ -1537,7 +1564,7 @@ check_compat_entry_size_and_hooks(struct ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip, e->comefrom, &off, &j); if (ret != 0) - goto out; + goto cleanup_matches; t = ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, @@ -1545,9 +1572,10 @@ check_compat_entry_size_and_hooks(struct t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { - duprintf("check_entry: `%s' not found\n", t->u.user.name); + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); ret = target ? 
PTR_ERR(target) : -ENOENT; - goto out; + goto cleanup_matches; } t->u.kernel.target = target; @@ -1575,39 +1603,22 @@ check_compat_entry_size_and_hooks(struct (*i)++; return 0; out: + module_put(t->u.kernel.target->me); +cleanup_matches: IPT_MATCH_ITERATE(e, cleanup_match, &j); return ret; } static inline int compat_copy_match_from_user(struct ipt_entry_match *m, - void **dstptr, compat_uint_t *size, const char *name, - const struct ipt_ip *ip, unsigned int hookmask) + void **dstptr, compat_uint_t *size) { - struct ipt_entry_match *dm; struct ipt_match *match; - int ret; - dm = (struct ipt_entry_match *)*dstptr; match = m->u.kernel.match; if (match->compat) match->compat(m, dstptr, size, COMPAT_FROM_USER); else xt_compat_match(m, dstptr, size, COMPAT_FROM_USER); - - ret = xt_check_match(match, AF_INET, dm->u.match_size - sizeof(*dm), - name, hookmask, ip->proto, - ip->invflags & IPT_INV_PROTO); - if (ret) - return ret; - - if (m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ip, match, dm->data, - dm->u.match_size - sizeof(*dm), - hookmask)) { - duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); - return -EINVAL; - } return 0; } @@ -1627,10 +1638,9 @@ static int compat_copy_entry_from_user(s memcpy(de, e, sizeof(struct ipt_entry)); *dstptr += sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size, - name, &de->ip, de->comefrom); + ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size); if (ret) - goto out; + return ret; de->target_offset = e->target_offset - (origsize - *size); t = ipt_get_target(e); target = t->u.kernel.target; @@ -1646,30 +1656,18 @@ static int compat_copy_entry_from_user(s if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } + return ret; +} - t = ipt_get_target(de); - target = t->u.kernel.target; - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ip.proto, - e->ip.invflags & IPT_INV_PROTO); - if (ret) - goto out; +static inline int check_entry_data(struct ipt_entry *e, const char *name) +{ + int ret; - ret = -EINVAL; - if (t->u.kernel.target == &ipt_standard_target) { - if (!standard_check(t, *size)) - goto out; - } else if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, de, target, - t->data, t->u.target_size - sizeof(*t), - de->comefrom)) { - duprintf("ip_tables: compat: check failed for `%s'.\n", - t->u.kernel.target->name); - goto out; - } - ret = 0; -out: - return ret; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom); + if (ret != 0) + return ret; + + return check_target(e, name); } static int @@ -1682,7 +1680,7 @@ translate_compat_table(const char *name, unsigned int *hook_entries, unsigned int *underflows) { - unsigned int i; + unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; unsigned int size; @@ -1719,18 +1717,18 @@ translate_compat_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (j = 0; j < NF_IP_NUMHOOKS; j++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(valid_hooks & (1 << j))) continue; - if (info->hook_entry[i] == 0xFFFFFFFF) { + if (info->hook_entry[j] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + j, hook_entries[j]); goto out_unlock; } - if (info->underflow[i] == 0xFFFFFFFF) { + if (info->underflow[j] == 0xFFFFFFFF) { duprintf("Invalid underflow %u 
%u\n", - i, underflows[i]); + j, underflows[j]); goto out_unlock; } } @@ -1741,9 +1739,9 @@ translate_compat_table(const char *name, goto out_unlock; newinfo->number = number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; + for (j = 0; j < NF_IP_NUMHOOKS; j++) { + newinfo->hook_entry[j] = info->hook_entry[j]; + newinfo->underflow[j] = info->underflow[j]; } entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; @@ -1760,6 +1758,10 @@ translate_compat_table(const char *name, if (!mark_source_chains(newinfo, valid_hooks, entry1)) goto free_newinfo; + ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, check_entry_data, name); + if (ret) + goto free_newinfo; + /* And one copy for every other CPU */ for_each_possible_cpu(i) if (newinfo->entries[i] && newinfo->entries[i] != entry1) @@ -1773,8 +1775,10 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: + IPT_ENTRY_ITERATE(entry0, total_size, cleanup_entry, &i); return ret; out_unlock: + compat_flush_offsets(); xt_compat_unlock(AF_INET); goto out; } @@ -1835,15 +1839,22 @@ compat_do_replace(void __user *user, uns return ret; } +static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int); + static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_SET_REPLACE: ret = compat_do_replace(user, len); @@ -1854,8 +1865,7 @@ compat_do_ipt_set_ctl(struct sock *sk, i break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); - ret = -EINVAL; + ret = do_ipt_set_ctl(sk, cmd, user, len); } return ret; @@ -1989,11 +1999,21 @@ compat_get_entries(struct compat_ipt_get return ret; } +static int do_ipt_get_ctl(struct sock *, int, void __user *, int *); + static int compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(user, len, 1); @@ -2002,8 +2022,7 @@ compat_do_ipt_get_ctl(struct sock *sk, i ret = compat_get_entries(user, len); break; default: - duprintf("compat_do_ipt_get_ctl: unknown request %i\n", cmd); - ret = -EINVAL; + ret = do_ipt_get_ctl(sk, cmd, user, len); } return ret; } @@ -2014,9 +2033,14 @@ do_ipt_set_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_SET_REPLACE: ret = do_replace(user, len); @@ -2039,9 +2063,14 @@ do_ipt_get_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(user, len, 0); @@ -2085,17 +2114,18 @@ do_ipt_get_ctl(struct sock *sk, int cmd, return ret; } -int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) +struct xt_table 
*ipt_register_table(struct xt_table *table, + const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo; static struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* choose the copy on our node/cpu * but dont care of preemption @@ -2110,28 +2140,30 @@ int ipt_register_table(struct xt_table * repl->underflow); if (ret != 0) { xt_free_table_info(newinfo); - return ret; + return ERR_PTR(ret); } - ret = xt_register_table(table, &bootstrap, newinfo); - if (ret != 0) { + table = virt_xt_register_table(table, &bootstrap, newinfo); + if (IS_ERR(table)) xt_free_table_info(newinfo); - return ret; - } - return 0; + return table; } void ipt_unregister_table(struct ipt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; + struct module *me; - private = xt_unregister_table(table); + me = table->me; + private = virt_xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + if (private->number > private->initial_entries) + module_put(me); xt_free_table_info(private); } @@ -2236,12 +2268,30 @@ static struct ipt_match icmp_matchstruct .checkentry = icmp_checkentry, }; +static int init_iptables(void) +{ +#ifdef CONFIG_VE_IPTABLES + if (get_exec_env()->_xt_tables[AF_INET].next != NULL) + return -EEXIST; +#endif + + return xt_proto_init(AF_INET); +} + +static void fini_iptables(void) +{ +#ifdef CONFIG_VE_IPTABLES + get_exec_env()->_xt_tables[AF_INET].next = NULL; +#endif + xt_proto_fini(AF_INET); +} + static int __init ip_tables_init(void) { int ret; - ret = xt_proto_init(AF_INET); - if (ret < 0) + ret = init_iptables(); + if (ret) goto err1; /* Noone else will be downing sem now, so we won't sleep */ @@ -2260,6 +2310,9 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; + KSYMRESOLVE(init_iptables); + KSYMRESOLVE(fini_iptables); + KSYMMODRESOLVE(ip_tables); printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; @@ -2270,24 +2323,25 @@ err4: err3: xt_unregister_target(&ipt_standard_target); err2: - xt_proto_fini(AF_INET); + fini_iptables(); err1: return ret; } static void __exit ip_tables_fini(void) { + KSYMMODUNRESOLVE(ip_tables); + KSYMUNRESOLVE(init_iptables); + KSYMUNRESOLVE(fini_iptables); nf_unregister_sockopt(&ipt_sockopts); - xt_unregister_match(&icmp_matchstruct); xt_unregister_target(&ipt_error_target); xt_unregister_target(&ipt_standard_target); - - xt_proto_fini(AF_INET); + fini_iptables(); } EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table); EXPORT_SYMBOL(ipt_do_table); -module_init(ip_tables_init); +subsys_initcall(ip_tables_init); module_exit(ip_tables_fini); diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_LOG.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_LOG.c --- linux-2.6.18/net/ipv4/netfilter/ipt_LOG.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_LOG.c 2007-06-13 06:55:07.000000000 -0400 @@ -51,32 +51,32 @@ static void dump_packet(const struct nf_ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
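Note the interface change to ipt_register_table() above: instead of returning 0 or -errno and registering the caller's table in place, it now returns the table that actually got registered (virt_xt_register_table may hand back a per-VE copy), with failures encoded in the pointer via ERR_PTR(). Callers therefore switch from a return-code check to the standard IS_ERR()/PTR_ERR() idiom — the pattern the iptable_filter, iptable_mangle and iptable_raw hunks below adopt. Roughly, with example_init() as an illustrative caller:

#include <linux/err.h>

static int example_init(void)
{
	struct ipt_table *t;

	t = ipt_register_table(&packet_filter, &initial_table.repl);
	if (IS_ERR(t))              /* error encoded in the pointer */
		return PTR_ERR(t);  /* recover the negative errno */
	/* keep 't': it may be a per-VE instance rather than the
	 * template that was passed in */
	return 0;
}
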
*/ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + ve_printk(VE_LOG, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + ve_printk(VE_LOG, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + ve_printk(VE_LOG, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -87,15 +87,15 @@ static void dump_packet(const struct nf_ op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } switch (ih->protocol) { @@ -103,7 +103,7 @@ static void dump_packet(const struct nf_ struct tcphdr _tcph, *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + ve_printk(VE_LOG, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -112,41 +112,41 @@ static void dump_packet(const struct nf_ th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + ve_printk(VE_LOG, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + ve_printk(VE_LOG, "CWR "); if (th->ece) - printk("ECE "); + ve_printk(VE_LOG, "ECE "); if (th->urg) - printk("URG "); + ve_printk(VE_LOG, "URG "); if (th->ack) - printk("ACK "); + ve_printk(VE_LOG, "ACK "); if (th->psh) - printk("PSH "); + ve_printk(VE_LOG, "PSH "); if (th->rst) - printk("RST "); + ve_printk(VE_LOG, "RST "); if (th->syn) - printk("SYN "); + ve_printk(VE_LOG, "SYN "); if (th->fin) - printk("FIN "); + ve_printk(VE_LOG, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -159,15 +159,15 @@ static void dump_packet(const struct nf_ iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - 
printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } break; } @@ -175,7 +175,7 @@ static void dump_packet(const struct nf_ struct udphdr _udph, *uh; /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP "); + ve_printk(VE_LOG, "PROTO=UDP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -184,13 +184,13 @@ static void dump_packet(const struct nf_ uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -216,7 +216,7 @@ static void dump_packet(const struct nf_ [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + ve_printk(VE_LOG, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -225,19 +225,19 @@ static void dump_packet(const struct nf_ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -246,19 +246,19 @@ static void dump_packet(const struct nf_ case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + ve_printk(VE_LOG, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + ve_printk(VE_LOG, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%u.%u.%u.%u ", + ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ", NIPQUAD(ich->un.gateway)); /* Fall through */ case ICMP_DEST_UNREACH: @@ -266,16 +266,16 @@ static void dump_packet(const struct nf_ case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. 
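All of the packet-dump printk() calls in ipt_LOG.c become ve_printk(VE_LOG, ...): a LOG rule firing inside a container should write to that container's log stream, not to the host's kernel ring buffer. As a self-contained model of what the destination argument buys — struct env, cur and the FILE-based log are stand-ins, not the real VE API:

#include <stdarg.h>
#include <stdio.h>

#define VE_LOG 1

struct env { FILE *log; };           /* stand-in for a per-VE log buffer */
static struct env host = { NULL };   /* NULL: fall back to the host log */
static struct env *cur = &host;      /* stand-in for get_exec_env() */

static void ve_printk_sketch(int dst, const char *fmt, ...)
{
	va_list ap;
	FILE *out = (dst == VE_LOG && cur->log) ? cur->log : stderr;

	va_start(ap, fmt);
	vfprintf(out, fmt, ap);      /* container log, or host "ring" */
	va_end(ap);
}
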
*/ - printk("["); + ve_printk(VE_LOG, "["); dump_packet(info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + ve_printk(VE_LOG, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -287,26 +287,26 @@ static void dump_packet(const struct nf_ break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + ve_printk(VE_LOG, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { struct ip_esp_hdr _esph, *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + ve_printk(VE_LOG, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -315,25 +315,25 @@ static void dump_packet(const struct nf_ eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + ve_printk(VE_LOG, "UID=%u ", skb->sk->sk_socket->file->f_uid); read_unlock_bh(&skb->sk->sk_callback_lock); } @@ -374,7 +374,7 @@ ipt_log_packet(unsigned int pf, loginfo = &default_loginfo; spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -384,29 +384,29 @@ ipt_log_packet(unsigned int pf, struct net_device *physoutdev = skb->nf_bridge->physoutdev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); } #endif if (in && !out) { /* MAC logging for input chain only. */ - printk("MAC="); + ve_printk(VE_LOG, "MAC="); if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)skb->nh.iph) { int i; unsigned char *p = skb->mac.raw; for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, + ve_printk(VE_LOG, "%02x%c", *p, i==skb->dev->hard_header_len - 1 ? 
' ':':'); } else - printk(" "); + ve_printk(VE_LOG, " "); } dump_packet(loginfo, skb, 0); - printk("\n"); + ve_printk(VE_LOG, "\n"); spin_unlock_bh(&log_lock); } @@ -481,7 +481,7 @@ static int __init ipt_log_init(void) /* we cannot make module load fail here, since otherwise * iptables userspace would abort */ } - + return 0; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_MASQUERADE.c --- linux-2.6.18/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_MASQUERADE.c 2007-06-13 06:55:07.000000000 -0400 @@ -108,6 +108,7 @@ masquerade_target(struct sk_buff **pskb, return ip_nat_setup_info(ct, &newrange, hooknum); } +#if 0 static inline int device_cmp(struct ip_conntrack *i, void *ifindex) { @@ -163,6 +164,7 @@ static struct notifier_block masq_dev_no static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; +#endif static struct ipt_target masquerade = { .name = "MASQUERADE", @@ -180,12 +182,16 @@ static int __init ipt_masquerade_init(vo ret = ipt_register_target(&masquerade); +#if 0 +/* These notifiers are unnecessary and may + lead to an oops in virtual environments */ if (ret == 0) { /* Register for device down reports */ register_netdevice_notifier(&masq_dev_notifier); /* Register IP address change reports */ register_inetaddr_notifier(&masq_inet_notifier); } +#endif return ret; } @@ -193,8 +199,8 @@ static int __init ipt_masquerade_init(vo static void __exit ipt_masquerade_fini(void) { ipt_unregister_target(&masquerade); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); +/* unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); */ } module_init(ipt_masquerade_init); diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_REDIRECT.c --- linux-2.6.18/net/ipv4/netfilter/ipt_REDIRECT.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_REDIRECT.c 2007-06-13 06:55:07.000000000 -0400 @@ -84,8 +84,14 @@ redirect_target(struct sk_buff **pskb, rcu_read_lock(); indev = __in_dev_get_rcu((*pskb)->dev); - if (indev && (ifa = indev->ifa_list)) + if (indev && (ifa = indev->ifa_list)) { + /* because of venet device specifics, we should use + * the second ifa in the list */ + if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && + ifa->ifa_next) + ifa = ifa->ifa_next; newdst = ifa->ifa_local; + } rcu_read_unlock(); if (!newdst) diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_REJECT.c --- linux-2.6.18/net/ipv4/netfilter/ipt_REJECT.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_REJECT.c 2007-06-13 06:55:07.000000000 -0400 @@ -282,7 +282,7 @@ static int check(const char *tablename, const struct ipt_entry *e = e_void; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - printk("REJECT: ECHOREPLY no longer supported.\n"); + ve_printk(VE_LOG, "REJECT: ECHOREPLY no longer supported.\n"); return 0; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_TCPMSS.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_TCPMSS.c --- linux-2.6.18/net/ipv4/netfilter/ipt_TCPMSS.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_TCPMSS.c 2007-06-13 06:55:07.000000000 -0400 @@ -224,13 +224,14 @@ ipt_tcpmss_checkentry(const char *tablen ((hook_mask & ~((1
<< NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) != 0)) { - printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + ve_printk(VE_LOG, "TCPMSS: path-MTU clamping only supported in" + " FORWARD, OUTPUT and POSTROUTING hooks\n"); return 0; } if (IPT_MATCH_ITERATE(e, find_syn_match)) return 1; - printk("TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "TCPMSS: Only works on TCP SYN packets\n"); return 0; } diff -uprN linux-2.6.18/net/ipv4/netfilter/ipt_TOS.c linux-2.6.18.ovz/net/ipv4/netfilter/ipt_TOS.c --- linux-2.6.18/net/ipv4/netfilter/ipt_TOS.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/ipt_TOS.c 2007-06-13 06:55:07.000000000 -0400 @@ -66,7 +66,7 @@ checkentry(const char *tablename, && tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && tos != IPTOS_NORMALSVC) { - printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + ve_printk(VE_LOG, KERN_WARNING "TOS: bad tos value %#x\n", tos); return 0; } return 1; diff -uprN linux-2.6.18/net/ipv4/netfilter/iptable_filter.c linux-2.6.18.ovz/net/ipv4/netfilter/iptable_filter.c --- linux-2.6.18/net/ipv4/netfilter/iptable_filter.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/iptable_filter.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,12 +12,20 @@ #include #include +#include #include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("iptables filter table"); +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf) +#else +#define ve_packet_filter &packet_filter +#endif + #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) static struct @@ -25,7 +33,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[3]; struct ipt_error term; -} initial_table __initdata +} initial_table = { { "filter", FILTER_VALID_HOOKS, 4, sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), { [NF_IP_LOCAL_IN] = 0, @@ -90,7 +98,7 @@ ipt_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); + return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); } static unsigned int @@ -108,7 +116,7 @@ ipt_local_out_hook(unsigned int hook, return NF_ACCEPT; } - return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); + return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); } static struct nf_hook_ops ipt_ops[] = { @@ -139,22 +147,19 @@ static struct nf_hook_ops ipt_ops[] = { static int forward = NF_ACCEPT; module_param(forward, bool, 0000); -static int __init iptable_filter_init(void) +int init_iptable_filter(void) { int ret; - - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; + struct ipt_table *tmp_filter; /* Register table */ - ret = ipt_register_table(&packet_filter, &initial_table.repl); - if (ret < 0) - return ret; + tmp_filter = ipt_register_table(&packet_filter, + &initial_table.repl); + if (IS_ERR(tmp_filter)) + return PTR_ERR(tmp_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = tmp_filter; +#endif /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); @@ -164,14 +169,50 @@ static int __init iptable_filter_init(vo return ret; cleanup_table: - ipt_unregister_table(&packet_filter); + 
ipt_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif return ret; } -static void __exit iptable_filter_fini(void) +void fini_iptable_filter(void) { nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - ipt_unregister_table(&packet_filter); + ipt_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif +} + +static int __init iptable_filter_init(void) +{ + int err; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + err = init_iptable_filter(); + if (err < 0) + return err; + + KSYMRESOLVE(init_iptable_filter); + KSYMRESOLVE(fini_iptable_filter); + KSYMMODRESOLVE(iptable_filter); + return 0; +} + +static void __exit iptable_filter_fini(void) +{ + KSYMMODUNRESOLVE(iptable_filter); + KSYMUNRESOLVE(init_iptable_filter); + KSYMUNRESOLVE(fini_iptable_filter); + fini_iptable_filter(); } module_init(iptable_filter_init); diff -uprN linux-2.6.18/net/ipv4/netfilter/iptable_mangle.c linux-2.6.18.ovz/net/ipv4/netfilter/iptable_mangle.c --- linux-2.6.18/net/ipv4/netfilter/iptable_mangle.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/iptable_mangle.c 2007-06-13 06:55:07.000000000 -0400 @@ -16,6 +16,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -34,7 +35,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[5]; struct ipt_error term; -} initial_table __initdata +} initial_table = { { "mangle", MANGLE_VALID_HOOKS, 6, sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), { [NF_IP_PRE_ROUTING] = 0, @@ -111,6 +112,13 @@ static struct ipt_table packet_mangler = .af = AF_INET, }; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table) +#else +#define ve_packet_mangler &packet_mangler +#endif + /* The work comes in here from netfilter.c. */ static unsigned int ipt_route_hook(unsigned int hook, @@ -119,7 +127,7 @@ ipt_route_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); + return ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); } static unsigned int @@ -148,7 +156,8 @@ ipt_local_hook(unsigned int hook, daddr = (*pskb)->nh.iph->daddr; tos = (*pskb)->nh.iph->tos; - ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); + ret = ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); + /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE && ((*pskb)->nh.iph->saddr != saddr @@ -157,7 +166,8 @@ ipt_local_hook(unsigned int hook, || (*pskb)->nfmark != nfmark #endif || (*pskb)->nh.iph->tos != tos)) - return ip_route_me_harder(pskb) == 0 ? 
ret : NF_DROP; + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; return ret; } @@ -200,14 +210,19 @@ static struct nf_hook_ops ipt_ops[] = { }, }; -static int __init iptable_mangle_init(void) +int init_iptable_mangle(void) { int ret; + struct ipt_table *tmp_mangler; /* Register table */ - ret = ipt_register_table(&packet_mangler, &initial_table.repl); - if (ret < 0) - return ret; + tmp_mangler = ipt_register_table(&packet_mangler, + &initial_table.repl); + if (IS_ERR(tmp_mangler)) + return PTR_ERR(tmp_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = tmp_mangler; +#endif /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); @@ -217,14 +232,42 @@ static int __init iptable_mangle_init(vo return ret; cleanup_table: - ipt_unregister_table(&packet_mangler); + ipt_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif return ret; } -static void __exit iptable_mangle_fini(void) +void fini_iptable_mangle(void) { nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - ipt_unregister_table(&packet_mangler); + ipt_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif +} + +static int __init iptable_mangle_init(void) +{ + int err; + + err = init_iptable_mangle(); + if (err < 0) + return err; + + KSYMRESOLVE(init_iptable_mangle); + KSYMRESOLVE(fini_iptable_mangle); + KSYMMODRESOLVE(iptable_mangle); + return 0; +} + +static void __exit iptable_mangle_fini(void) +{ + KSYMMODUNRESOLVE(iptable_mangle); + KSYMUNRESOLVE(init_iptable_mangle); + KSYMUNRESOLVE(fini_iptable_mangle); + fini_iptable_mangle(); } module_init(iptable_mangle_init); diff -uprN linux-2.6.18/net/ipv4/netfilter/iptable_raw.c linux-2.6.18.ovz/net/ipv4/netfilter/iptable_raw.c --- linux-2.6.18/net/ipv4/netfilter/iptable_raw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter/iptable_raw.c 2007-06-13 06:55:07.000000000 -0400 @@ -118,12 +118,13 @@ static struct nf_hook_ops ipt_ops[] = { static int __init iptable_raw_init(void) { + struct ipt_table *tmp; int ret; /* Register table */ - ret = ipt_register_table(&packet_raw, &initial_table.repl); - if (ret < 0) - return ret; + tmp = ipt_register_table(&packet_raw, &initial_table.repl); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); diff -uprN linux-2.6.18/net/ipv4/netfilter.c linux-2.6.18.ovz/net/ipv4/netfilter.c --- linux-2.6.18/net/ipv4/netfilter.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/netfilter.c 2007-06-13 06:55:07.000000000 -0400 @@ -8,7 +8,7 @@ #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) +int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) { struct iphdr *iph = (*pskb)->nh.iph; struct rtable *rt; @@ -16,10 +16,13 @@ int ip_route_me_harder(struct sk_buff ** struct dst_entry *odst; unsigned int hh_len; + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(iph->saddr); + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. 
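ip_route_me_harder() grows an addr_type parameter here: a caller that already knows how the source address classifies can pass it in and skip the inet_addr_type() lookup, while the existing call sites (the NAT and mangle hunks earlier) pass RTN_UNSPEC to keep the old behaviour. Reduced to its control flow, under invented helper names:

/* Control-flow model of the new entry point; the classify/reroute
 * helpers are illustrative stubs, not kernel functions. */
enum { RTN_UNSPEC, RTN_UNICAST, RTN_LOCAL };

static int classify_saddr(unsigned int saddr)   /* ~ inet_addr_type() */
{
	return (saddr >> 24) == 127 ? RTN_LOCAL : RTN_UNICAST;
}

static int reroute_as_local(void) { return 0; }  /* output route lookup */
static int reroute_as_input(void) { return 0; }  /* input route lookup */

int route_me_harder_sketch(unsigned int saddr, int addr_type)
{
	if (addr_type == RTN_UNSPEC)
		addr_type = classify_saddr(saddr);  /* old, unconditional path */
	/* REJECT-style hacks can emit packets with a foreign saddr on
	 * LOCAL_OUT, which is why the classification matters at all */
	return addr_type == RTN_LOCAL ? reroute_as_local()
				      : reroute_as_input();
}
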
*/ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + if (addr_type == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; fl.nl_u.ip4_u.saddr = iph->saddr; fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); @@ -156,7 +159,7 @@ static int nf_ip_reroute(struct sk_buff if (!(iph->tos == rt_info->tos && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(pskb); + return ip_route_me_harder(pskb, RTN_UNSPEC); } return 0; } diff -uprN linux-2.6.18/net/ipv4/proc.c linux-2.6.18.ovz/net/ipv4/proc.c --- linux-2.6.18/net/ipv4/proc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/proc.c 2007-06-13 06:55:07.000000000 -0400 @@ -60,6 +60,9 @@ static int fold_prot_inuse(struct proto */ static int sockstat_seq_show(struct seq_file *seq, void *v) { + if (!ve_is_super(get_exec_env())) + return 0; + socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), @@ -258,11 +261,12 @@ static int snmp_seq_show(struct seq_file seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); + ve_ipv4_devconf.forwarding ? 1 : 2, + sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) ip_statistics, + fold_field((void **) ve_ip_statistics, snmp4_ipstats_list[i].entry)); seq_puts(seq, "\nIcmp:"); @@ -272,7 +276,7 @@ static int snmp_seq_show(struct seq_file seq_puts(seq, "\nIcmp:"); for (i = 0; snmp4_icmp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) icmp_statistics, + fold_field((void **) ve_icmp_statistics, snmp4_icmp_list[i].entry)); seq_puts(seq, "\nTcp:"); @@ -284,11 +288,11 @@ static int snmp_seq_show(struct seq_file /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - fold_field((void **) tcp_statistics, + fold_field((void **) ve_tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - fold_field((void **) tcp_statistics, + fold_field((void **) ve_tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -299,7 +303,7 @@ static int snmp_seq_show(struct seq_file seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) udp_statistics, + fold_field((void **) ve_udp_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -333,7 +337,7 @@ static int netstat_seq_show(struct seq_f seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) net_statistics, + fold_field((void **) ve_net_statistics, snmp4_net_list[i].entry)); seq_putc(seq, '\n'); @@ -357,20 +361,20 @@ int __init ip_misc_proc_init(void) { int rc = 0; - if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_glob_fops_create("net/netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; - if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_glob_fops_create("net/snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; - if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) + if (!proc_glob_fops_create("net/sockstat", S_IRUGO, &sockstat_seq_fops)) goto out_sockstat; out: return rc; out_sockstat: - proc_net_remove("snmp"); + remove_proc_glob_entry("net/snmp", NULL); out_snmp: - proc_net_remove("netstat"); + remove_proc_glob_entry("net/netstat", NULL); out_netstat: rc = -ENOMEM; goto out; diff -uprN 
linux-2.6.18/net/ipv4/raw.c linux-2.6.18.ovz/net/ipv4/raw.c --- linux-2.6.18/net/ipv4/raw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/raw.c 2007-06-13 06:55:07.000000000 -0400 @@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock if (inet->num == num && !(inet->daddr && inet->daddr != raddr) && !(inet->rcv_saddr && inet->rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) && + ve_accessible_strict(sk->owner_env, get_exec_env())) goto found; /* gotcha */ } sk = NULL; @@ -788,8 +789,12 @@ static struct sock *raw_get_first(struct struct hlist_node *node; sk_for_each(sk, node, &raw_v4_htable[state->bucket]) - if (sk->sk_family == PF_INET) + if (sk->sk_family == PF_INET) { + if (!ve_accessible(sk->owner_env, + get_exec_env())) + continue; goto found; + } } sk = NULL; found: @@ -803,8 +808,13 @@ static struct sock *raw_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != PF_INET); + if (!sk) + break; + if (sk->sk_family != PF_INET) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { sk = sk_head(&raw_v4_htable[state->bucket]); @@ -921,13 +931,13 @@ static struct file_operations raw_seq_fo int __init raw_proc_init(void) { - if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) + if (!proc_glob_fops_create("net/raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; return 0; } void __init raw_proc_exit(void) { - proc_net_remove("raw"); + remove_proc_glob_entry("net/raw", NULL); } #endif /* CONFIG_PROC_FS */ diff -uprN linux-2.6.18/net/ipv4/route.c linux-2.6.18.ovz/net/ipv4/route.c --- linux-2.6.18/net/ipv4/route.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/route.c 2007-06-13 06:55:07.000000000 -0400 @@ -116,6 +116,8 @@ #define RT_GC_TIMEOUT (300*HZ) +int ip_rt_src_check = 1; + static int ip_rt_min_delay = 2 * HZ; static int ip_rt_max_delay = 10 * HZ; static int ip_rt_max_size; @@ -261,11 +263,28 @@ static unsigned int rt_hash_code(u32 dad & rt_hash_mask); } +void prepare_rt_cache(void) +{ +#ifdef CONFIG_VE + struct rtable *r; + int i; + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(rt_hash_lock_addr(i)); + for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { + r->fl.owner_env = get_ve0(); + } + spin_unlock_bh(rt_hash_lock_addr(i)); + } +#endif +} + #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { int bucket; }; +static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r); static struct rtable *rt_cache_get_first(struct seq_file *seq) { struct rtable *r = NULL; @@ -278,6 +297,8 @@ static struct rtable *rt_cache_get_first break; rcu_read_unlock_bh(); } + if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) + r = rt_cache_get_next(seq, r); return r; } @@ -285,6 +306,7 @@ static struct rtable *rt_cache_get_next( { struct rt_cache_iter_state *st = rcu_dereference(seq->private); +loop: r = r->u.rt_next; while (!r) { rcu_read_unlock_bh(); @@ -293,6 +315,8 @@ static struct rtable *rt_cache_get_next( rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } + if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) + goto loop; return r; } @@ -564,7 +588,8 @@ static inline int compare_keys(struct fl { return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && fl1->oif == fl2->oif && - fl1->iif == fl2->iif; + fl1->iif == fl2->iif && + ve_accessible_strict(fl1->owner_env, 
fl2->owner_env); } #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED @@ -678,26 +703,105 @@ static void rt_check_expire(unsigned lon mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); } +typedef unsigned long rt_flush_gen_t; + +#ifdef CONFIG_VE + +static rt_flush_gen_t rt_flush_gen; + +/* called under rt_flush_lock */ +static void set_rt_flush_required(struct ve_struct *env) +{ + /* + * If the global generation rt_flush_gen is equal to G, then + * the pass considering entries labelled by G is yet to come. + */ + env->rt_flush_required = rt_flush_gen; +} + +static spinlock_t rt_flush_lock; +static rt_flush_gen_t reset_rt_flush_required(void) +{ + rt_flush_gen_t g; + + spin_lock_bh(&rt_flush_lock); + g = rt_flush_gen++; + spin_unlock_bh(&rt_flush_lock); + return g; +} + +static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen) +{ + /* can be checked without the lock */ + return env->rt_flush_required >= gen; +} + +#else + +static void set_rt_flush_required(struct ve_struct *env) +{ +} + +static rt_flush_gen_t reset_rt_flush_required(void) +{ + return 0; +} + +#endif + /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. */ static void rt_run_flush(unsigned long dummy) { int i; - struct rtable *rth, *next; + struct rtable * rth, * next; + struct rtable * tail; + rt_flush_gen_t gen; rt_deadline = 0; get_random_bytes(&rt_hash_rnd, 4); + gen = reset_rt_flush_required(); + for (i = rt_hash_mask; i >= 0; i--) { +#ifdef CONFIG_VE + struct rtable ** prev, * p; + + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + + /* defer releasing the head of the list after spin_unlock */ + for (tail = rth; tail; tail = tail->u.rt_next) + if (!check_rt_flush_required(tail->fl.owner_env, gen)) + break; + if (rth != tail) + rt_hash_table[i].chain = tail; + + /* call rt_free on entries after the tail requiring flush */ + prev = &rt_hash_table[i].chain; + for (p = *prev; p; p = next) { + next = p->u.rt_next; + if (!check_rt_flush_required(p->fl.owner_env, gen)) { + prev = &p->u.rt_next; + } else { + *prev = next; + rt_free(p); + } + } + +#else spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; + tail = NULL; + +#endif spin_unlock_bh(rt_hash_lock_addr(i)); - for (; rth; rth = next) { + for (; rth != tail; rth = next) { next = rth->u.rt_next; rt_free(rth); } @@ -736,6 +840,8 @@ void rt_cache_flush(int delay) delay = tmo; } + set_rt_flush_required(get_exec_env()); + if (delay <= 0) { spin_unlock_bh(&rt_flush_lock); rt_run_flush(0); @@ -751,9 +857,30 @@ void rt_cache_flush(int delay) static void rt_secret_rebuild(unsigned long dummy) { + int i; + struct rtable *rth, *next; unsigned long now = jiffies; - rt_cache_flush(0); + spin_lock_bh(&rt_flush_lock); + del_timer(&rt_flush_timer); + spin_unlock_bh(&rt_flush_lock); + + rt_deadline = 0; + get_random_bytes(&rt_hash_rnd, 4); + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; + spin_unlock_bh(rt_hash_lock_addr(i)); + + for (; rth; rth = next) { + next = rth->u.rt_next; + rt_free(rth); + } + } + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); } @@ -1127,6 +1254,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; + struct ve_struct *ve; + + ve = get_exec_env(); if (!in_dev) return; @@ -1159,6 +1289,10 @@ void ip_rt_redirect(u32 
old_gw, u32 dadd if (rth->fl.fl4_dst != daddr || rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || +#ifdef CONFIG_VE + !ve_accessible_strict(rth->fl.owner_env, + ve) || +#endif rth->fl.iif != 0) { rthp = &rth->u.rt_next; continue; @@ -1197,6 +1331,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; +#ifdef CONFIG_VE + rt->fl.owner_env = ve; +#endif rt->rt_flags |= RTCF_REDIRECTED; @@ -1638,6 +1775,9 @@ static int ip_route_input_mc(struct sk_b #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= skb->nfmark; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE @@ -1775,7 +1915,7 @@ static inline int __mkroute_input(struct #endif if (in_dev->cnf.no_policy) rth->u.dst.flags |= DST_NOPOLICY; - if (in_dev->cnf.no_xfrm) + if (out_dev->cnf.no_xfrm) rth->u.dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; @@ -1783,6 +1923,9 @@ static inline int __mkroute_input(struct #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= skb->nfmark; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; @@ -2028,6 +2171,9 @@ local_input: #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= skb->nfmark; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE @@ -2107,6 +2253,9 @@ int ip_route_input(struct sk_buff *skb, #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == skb->nfmark && #endif +#ifdef CONFIG_VE + rth->fl.owner_env == get_exec_env() && +#endif rth->fl.fl4_tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); @@ -2233,6 +2382,9 @@ static inline int __mkroute_output(struc #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= oldflp->fl4_fwmark; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_dst = fl->fl4_dst; rth->rt_src = fl->fl4_src; rth->rt_iif = oldflp->oif ? : dev_out->ifindex; @@ -2403,10 +2555,13 @@ static int ip_route_output_slow(struct r ZERONET(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) - goto out; + if (ip_rt_src_check) { + /* It is equivalent to + inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: @@ -2433,6 +2588,12 @@ static int ip_route_output_slow(struct r Luckily, this hack is good workaround. 
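The flush machinery earlier in this file is the subtle part of the route.c changes: rt_cache_flush() no longer empties the cache for everyone. Each VE that asks for a flush is stamped with the current flush generation (set_rt_flush_required()), and rt_run_flush() bumps the generation and frees only entries whose owner VE is stamped at or after that pass, leaving other containers' cache intact. The generation logic, extracted and simplified — locking elided; the real code serializes the counter with rt_flush_lock:

typedef unsigned long gen_t;

struct env_sketch { gen_t flush_required; };  /* per-VE stamp */

static gen_t flush_gen;                 /* global generation counter */

static void request_flush(struct env_sketch *ve)
{
	ve->flush_required = flush_gen;  /* "flush my entries on the pass
					    that handles this generation" */
}

static gen_t begin_flush_pass(void)
{
	return flush_gen++;              /* a pass with value P will free
					    entries stamped with P or later */
}

static int entry_needs_flush(const struct env_sketch *owner, gen_t pass)
{
	return owner->flush_required >= pass;
}
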
*/ + if (dev_out == NULL) { + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } + fl.oif = dev_out->ifindex; goto make_route; } @@ -2579,6 +2740,7 @@ int __ip_route_output_key(struct rtable #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == flp->fl4_fwmark && #endif + ve_accessible_strict(rth->fl.owner_env, get_exec_env()) && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { @@ -2709,7 +2871,7 @@ static int rt_fill_info(struct sk_buff * u32 dst = rt->rt_dst; if (MULTICAST(dst) && !LOCAL_MCAST(dst) && - ipv4_devconf.mc_forwarding) { + ve_ipv4_devconf.mc_forwarding) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { @@ -2860,22 +3022,22 @@ void ip_rt_multicast_event(struct in_dev } #ifdef CONFIG_SYSCTL -static int flush_delay; +int ipv4_flush_delay; -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + rt_cache_flush(ipv4_flush_delay); return 0; } return -EINVAL; } -static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, +int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, @@ -2897,7 +3059,7 @@ ctl_table ipv4_route_table[] = { { .ctl_name = NET_IPV4_ROUTE_FLUSH, .procname = "flush", - .data = &flush_delay, + .data = &ipv4_flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = &ipv4_sysctl_rtcache_flush, @@ -3191,15 +3353,18 @@ int __init ip_rt_init(void) #ifdef CONFIG_PROC_FS { struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || - !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - proc_net_stat))) { + + if (!proc_glob_fops_create("net/rt_cache", + S_IRUGO, &rt_cache_seq_fops)) + return -ENOMEM; + + if (!(rtstat_pde = create_proc_glob_entry("net/stat/rt_cache", + S_IRUGO, NULL))) return -ENOMEM; - } rtstat_pde->proc_fops = &rt_cpu_seq_fops; } #ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); + create_proc_read_entry("net/rt_acct", 0, NULL, ip_rt_acct_read, NULL); #endif #endif #ifdef CONFIG_XFRM diff -uprN linux-2.6.18/net/ipv4/sysctl_net_ipv4.c linux-2.6.18.ovz/net/ipv4/sysctl_net_ipv4.c --- linux-2.6.18/net/ipv4/sysctl_net_ipv4.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/sysctl_net_ipv4.c 2007-06-13 06:55:07.000000000 -0400 @@ -21,6 +21,9 @@ /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; +int sysctl_tcp_use_sg = 1; +EXPORT_SYMBOL(sysctl_tcp_use_sg); + #ifdef CONFIG_SYSCTL static int zero; static int tcp_retr1_max = 255; @@ -32,22 +35,21 @@ struct ipv4_config ipv4_config; #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int val = ipv4_devconf.forwarding; + int val = ve_ipv4_devconf.forwarding; int ret; ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (write && ipv4_devconf.forwarding != val) + if (write && ve_ipv4_devconf.forwarding != val) inet_forward_change(); return ret; } -static int ipv4_sysctl_forward_strategy(ctl_table *table, +int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, @@ -423,6 +425,14 @@ ctl_table 
ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_TCP_USE_SG, + .procname = "tcp_use_sg", + .data = &sysctl_tcp_use_sg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { @@ -616,6 +626,22 @@ ctl_table ipv4_table[] = { .extra1 = &zero }, { + .ctl_name = NET_TCP_MAX_TW_KMEM_FRACTION, + .procname = "tcp_max_tw_kmem_fraction", + .data = &sysctl_tcp_max_tw_kmem_fraction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_TW_BUCKETS_UB, + .procname = "tcp_max_tw_buckets_ub", + .data = &sysctl_tcp_max_tw_buckets_ub, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff -uprN linux-2.6.18/net/ipv4/tcp.c linux-2.6.18.ovz/net/ipv4/tcp.c --- linux-2.6.18/net/ipv4/tcp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp.c 2007-06-13 06:55:07.000000000 -0400 @@ -247,6 +247,7 @@ * TCP_CLOSE socket is finished */ +#include #include #include #include @@ -265,6 +266,10 @@ #include #include +#include +#include +#include + #include #include @@ -322,6 +327,7 @@ unsigned int tcp_poll(struct file *file, unsigned int mask; struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); + int check_send_space; poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) @@ -336,6 +342,21 @@ unsigned int tcp_poll(struct file *file, if (sk->sk_err) mask = POLLERR; + check_send_space = 1; +#ifdef CONFIG_USER_RESOURCE + if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { + unsigned long size; + size = MAX_TCP_HEADER + tp->mss_cache; + if (size > SOCK_MIN_UBCSPACE) + size = SOCK_MIN_UBCSPACE; + size = skb_charge_size(size); + if (ub_sock_makewres_tcp(sk, size)) { + check_send_space = 0; + ub_sock_sndqueueadd_tcp(sk, size); + } + } +#endif + /* * POLLHUP is certainly not done right. 
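The tcp_poll() change above stops a socket from advertising POLLOUT when its user beancounter cannot cover even one maximum-size segment; instead the socket is parked on the UB send queue (ub_sock_sndqueueadd_tcp) and woken when space returns. A sketch of the gate, with the ub_* helpers and the two constants stubbed since their real definitions live in the UBC headers:

    #define MAX_TCP_HEADER    320       /* illustrative, not the kernel value */
    #define SOCK_MIN_UBCSPACE 4096      /* illustrative */

    static long skb_charge_size(long s) /* stub: round up to a charge unit */
    {
            return (s + 255) & ~255L;
    }

    static int ub_sock_makewres_tcp(long size) /* stub: nonzero on failure */
    {
            return size > 8192;
    }

    static int may_report_pollout(int mss_cache)
    {
            long size = MAX_TCP_HEADER + mss_cache;

            if (size > SOCK_MIN_UBCSPACE)
                    size = SOCK_MIN_UBCSPACE;
            size = skb_charge_size(size);
            /* on failure the real code sets check_send_space = 0 and
             * queues the socket for a later wakeup */
            return !ub_sock_makewres_tcp(size);
    }

Capping the reservation at SOCK_MIN_UBCSPACE keeps the poll-time charge small instead of demanding a whole large segment up front just to report writability.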
But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -379,7 +400,7 @@ unsigned int tcp_poll(struct file *file, sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ @@ -529,16 +550,23 @@ static ssize_t do_tcp_sendpages(struct s int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); + unsigned long chargesize = 0; if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + chargesize = skb_charge_size(MAX_TCP_HEADER + + tp->mss_cache); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; skb = sk_stream_alloc_pskb(sk, 0, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); skb_entail(sk, tp, skb); copy = size_goal; @@ -594,10 +622,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -634,9 +667,6 @@ ssize_t tcp_sendpage(struct socket *sock return res; } -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) - static inline int select_size(struct sock *sk, struct tcp_sock *tp) { int tmp = tp->mss_cache; @@ -696,6 +726,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru while (--iovlen >= 0) { int seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; + unsigned long chargesize = 0; iov++; @@ -706,18 +737,26 @@ int tcp_sendmsg(struct kiocb *iocb, stru if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + unsigned long size; new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - - skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), - 0, sk->sk_allocation); + size = select_size(sk, tp); + chargesize = skb_charge_size(MAX_TCP_HEADER + + size); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; + skb = sk_stream_alloc_pskb(sk, size, 0, + sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, + UB_TCPSNDBUF); /* * Check whether we can use HW checksum. @@ -763,6 +802,7 @@ new_segment: } else if (page) { if (off == PAGE_SIZE) { put_page(page); + ub_sock_tcp_detachpage(sk); TCP_PAGE(sk) = page = NULL; off = 0; } @@ -776,6 +816,9 @@ new_segment: goto wait_for_memory; if (!page) { + chargesize = PAGE_SIZE; + if (ub_sock_tcp_chargepage(sk) < 0) + goto wait_for_ubspace; /* Allocate new cache page. 
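Both send paths above follow the same discipline: reserve beancounter space before allocating a segment, attach the charge to the skb once allocation succeeds, and hand the reservation back before sleeping, with three distinct resume points (wait_for_sndbuf, wait_for_memory, wait_for_ubspace). A compilable model of the ladder, using single-size ub_* stand-ins for the two-size kernel helpers:

    #include <stdlib.h>

    struct seg { long charged; };

    static int  ub_sock_getwres_tcp(long size) { (void)size; return 0; } /* stub */
    static void ub_sock_retwres_tcp(long size) { (void)size; }           /* stub */

    static struct seg *charge_then_alloc(long chargesize)
    {
            struct seg *skb;

            if (ub_sock_getwres_tcp(chargesize) < 0)
                    return NULL;                      /* -> wait_for_ubspace */
            skb = malloc(sizeof(*skb));
            if (!skb) {
                    ub_sock_retwres_tcp(chargesize);  /* -> wait_for_memory */
                    return NULL;
            }
            skb->charged = chargesize;                /* ub_skb_set_charge() */
            return skb;
    }

Charging first means an allocation failure never leaves the beancounter holding space for an skb that does not exist.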
*/ if (!(page = sk_stream_alloc_page(sk))) goto wait_for_memory; @@ -807,7 +850,8 @@ new_segment: } else if (off + copy < PAGE_SIZE) { get_page(page); TCP_PAGE(sk) = page; - } + } else + ub_sock_tcp_detachpage(sk); } TCP_OFF(sk) = off + copy; @@ -838,10 +882,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -939,7 +988,18 @@ void tcp_cleanup_rbuf(struct sock *sk, i #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { + printk("KERNEL: assertion: skb==NULL || " + "before(tp->copied_seq, skb->end_seq)\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, + tp->copied_seq, tp->rcv_nxt); + printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", + skb->len, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + } #endif if (inet_csk_ack_scheduled(sk)) { @@ -1175,7 +1235,23 @@ int tcp_recvmsg(struct kiocb *iocb, stru goto found_ok_skb; if (skb->h.th->fin) goto found_fin_ok; - BUG_TRAP(flags & MSG_PEEK); + if (!(flags & MSG_PEEK)) { + printk("KERNEL: assertion: flags&MSG_PEEK\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + printk("skb->len=%d, *seq=%d, skb->seq=%d, " + "skb->end_seq=%d, offset=%d\n", + skb->len, *seq, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + offset); + } skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -1238,8 +1314,19 @@ int tcp_recvmsg(struct kiocb *iocb, stru tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || - (flags & (MSG_PEEK | MSG_TRUNC))); + if (!(tp->copied_seq == tp->rcv_nxt || + (flags&(MSG_PEEK|MSG_TRUNC)))) { + printk("KERNEL: assertion: tp->copied_seq == " + "tp->rcv_nxt || ...\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + } /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1618,7 +1705,7 @@ adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); /* It is the last release_sock in its life. It will remove backlog. 
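Orphan accounting in tcp_close() above moves from the global tcp_orphan_count to the socket's beancounter (ub_inc_orphan_count), and the overflow test that follows replaces the host-wide sysctl_tcp_max_orphans / tcp_memory_allocated check with ub_too_many_orphans(), so one container cannot trip the limit for everybody. Shape of the combined predicate, with illustrative per-UB fields:

    #include <stdbool.h>

    struct ub_limits {
            int  orphans, orphan_max;
            long tcp_pages, tcp_pages_max;
    };

    static bool too_many_orphans(const struct ub_limits *ub, long wmem_queued)
    {
            if (ub->orphans >= ub->orphan_max)
                    return true;
            /* mirrors the old sk_wmem_queued / sysctl_tcp_mem[2] clause,
             * but against per-beancounter page counts */
            return wmem_queued > 0 && ub->tcp_pages > ub->tcp_pages_max;
    }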
*/ release_sock(sk); @@ -1669,9 +1756,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); @@ -1750,6 +1835,7 @@ int tcp_disconnect(struct sock *sk, int tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->advmss = 65535; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2229,6 +2315,7 @@ out: EXPORT_SYMBOL(tcp_tso_segment); extern void __skb_cb_too_small_for_tcp(int, int); +extern unsigned int nr_free_lowpages(void); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2246,6 +2333,7 @@ void __init tcp_init(void) struct sk_buff *skb = NULL; unsigned long limit; int order, i, max_share; + unsigned long goal; if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), @@ -2254,7 +2342,7 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); @@ -2269,7 +2357,7 @@ void __init tcp_init(void) thash_entries, (num_physpages >= 128 * 1024) ? 13 : 15, - HASH_HIGHMEM, + 0, &tcp_hashinfo.ehash_size, NULL, 0); @@ -2285,7 +2373,7 @@ void __init tcp_init(void) tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? 13 : 15, - HASH_HIGHMEM, + 0, &tcp_hashinfo.bhash_size, NULL, 64 * 1024); @@ -2315,10 +2403,19 @@ void __init tcp_init(void) sysctl_max_syn_backlog = 128; } + goal = nr_free_lowpages() / 6; + while (order >= 3 && (1536 << order) > goal) + order--; + sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; sysctl_tcp_mem[2] = 1536 << order; + if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096) + sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096; + if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096) + sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096; + limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); diff -uprN linux-2.6.18/net/ipv4/tcp_cubic.c linux-2.6.18.ovz/net/ipv4/tcp_cubic.c --- linux-2.6.18/net/ipv4/tcp_cubic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_cubic.c 2007-06-13 06:55:07.000000000 -0400 @@ -190,7 +190,7 @@ static inline void bictcp_update(struct */ /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) + t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ @@ -259,7 +259,7 @@ static inline void measure_delay(struct (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3; if (delay == 0) delay = 1; @@ -366,7 +366,7 @@ static int __init cubictcp_register(void beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); - cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ + cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 * so K = cubic_root( (wmax-cwnd)*rtt/c ) diff -uprN
linux-2.6.18/net/ipv4/tcp_input.c linux-2.6.18.ovz/net/ipv4/tcp_input.c --- linux-2.6.18/net/ipv4/tcp_input.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_input.c 2007-06-13 06:55:07.000000000 -0400 @@ -72,6 +72,8 @@ #include #include +#include + int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; int sysctl_tcp_sack = 1; @@ -252,7 +254,7 @@ static void tcp_grow_window(struct sock /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_memory_pressure) { + ub_tcp_rmem_allows_expand(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -321,6 +323,8 @@ static void tcp_init_buffer_space(struct tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; + + ub_tcp_update_maxadvmss(sk); } /* 5. Recalculate window clamp after socket hit its memory bounds. */ @@ -332,7 +336,7 @@ static void tcp_clamp_window(struct sock if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && + !ub_tcp_memory_pressure(sk) && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -2237,13 +2241,12 @@ static int tcp_tso_acked(struct sock *sk return acked; } -static u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(struct timeval *tv) { - struct timeval tv, now; + struct timeval now; do_gettimeofday(&now); - skb_get_timestamp(skb, &tv); - return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); + return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); } /* Remove acknowledged frames from the retransmission queue. */ @@ -2258,6 +2261,7 @@ static int tcp_clean_rtx_queue(struct so u32 pkts_acked = 0; void (*rtt_sample)(struct sock *sk, u32 usrtt) = icsk->icsk_ca_ops->rtt_sample; + struct timeval tv; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2306,8 +2310,7 @@ static int tcp_clean_rtx_queue(struct so seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (rtt_sample) - (*rtt_sample)(sk, tcp_usrtt(skb)); + skb_get_timestamp(skb, &tv); } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2320,8 +2323,7 @@ static int tcp_clean_rtx_queue(struct so } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (rtt_sample) - (*rtt_sample)(sk, tcp_usrtt(skb)); + skb_get_timestamp(skb, &tv); } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); @@ -2333,6 +2335,8 @@ static int tcp_clean_rtx_queue(struct so if (acked&FLAG_ACKED) { tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); + if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) + (*rtt_sample)(sk, tcp_usrtt(&tv)); if (icsk->icsk_ca_ops->pkts_acked) icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); @@ -3178,7 +3182,7 @@ queue_and_out: !sk_stream_rmem_schedule(sk, skb))) { if (tcp_prune_queue(sk) < 0 || !sk_stream_rmem_schedule(sk, skb)) - goto drop; + goto drop_part; } sk_stream_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -3222,6 +3226,12 @@ out_of_window: drop: __kfree_skb(skb); return; + +drop_part: + if (after(tp->copied_seq, tp->rcv_nxt)) + tp->rcv_nxt = tp->copied_seq; + __kfree_skb(skb); + return; } /* Out of window. F.e. zero window probe. 
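Two things happen in the tcp_input.c hunks above: receive-queue skbs are charged to UB_TCPRCVBUF (ub_tcprcvbuf_charge, with a _forced variant used in tcp_collapse), and a new drop_part exit handles skbs that must be dropped after part of their data was already consumed, i.e. when copied_seq has run ahead of rcv_nxt. Rewinding rcv_nxt to copied_seq makes the sender retransmit the dropped portion. The fixup in isolation:

    #include <stdbool.h>
    #include <stdint.h>

    /* after(a, b) in kernel terms: a is later than b in 32-bit seq space */
    static bool seq32_after(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) > 0;
    }

    static void drop_part_fixup(uint32_t *rcv_nxt, uint32_t copied_seq)
    {
            if (seq32_after(copied_seq, *rcv_nxt))
                    *rcv_nxt = copied_seq;
    }

The tcp_usrtt() change in the same file is independent: the timestamp is saved per cleaned skb and one RTT sample is taken per ACK, skipped entirely when retransmitted data was ACKed, which avoids feeding ambiguous samples to the congestion module.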
*/ @@ -3393,6 +3403,10 @@ tcp_collapse(struct sock *sk, struct sk_ nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { + kfree_skb(nskb); + return; + } skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); @@ -3489,7 +3503,7 @@ static int tcp_prune_queue(struct sock * if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk, tp); - else if (tcp_memory_pressure) + else if (ub_tcp_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -3566,7 +3580,7 @@ static int tcp_should_expand_sndbuf(stru return 0; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -4011,6 +4025,10 @@ int tcp_rcv_established(struct sock *sk, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; + /* This is OK not to try to free memory here. + * Do this below on slow path. Den */ + if (ub_tcprcvbuf_charge(sk, skb) < 0) + goto step5; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); diff -uprN linux-2.6.18/net/ipv4/tcp_ipv4.c linux-2.6.18.ovz/net/ipv4/tcp_ipv4.c --- linux-2.6.18/net/ipv4/tcp_ipv4.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_ipv4.c 2007-06-13 06:55:07.000000000 -0400 @@ -72,6 +72,8 @@ #include #include +#include + #include #include #include @@ -621,7 +623,8 @@ static void tcp_v4_timewait_ack(struct s const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); + tcptw->tw_rcv_wnd >> (tw->tw_rcv_wscale& TW_WSCALE_MASK), + tcptw->tw_ts_recent); inet_twsk_put(tw); } @@ -723,6 +726,7 @@ struct request_sock_ops tcp_request_sock .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, }; +EXPORT_SYMBOL_GPL(tcp_request_sock_ops); static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), @@ -998,12 +1002,15 @@ static int tcp_v4_checksum_init(struct s */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + struct user_beancounter *ub; + + ub = set_exec_ub(sock_bc(sk)->ub); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; TCP_CHECK_TIMER(sk); - return 0; + goto restore_context; } if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) @@ -1017,7 +1024,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc if (nsk != sk) { if (tcp_child_process(sk, nsk, skb)) goto reset; - return 0; + goto restore_context; } } @@ -1025,6 +1032,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; TCP_CHECK_TIMER(sk); + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1036,7 +1046,7 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. */ - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -1291,6 +1301,8 @@ static int tcp_v4_init_sock(struct sock tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; + tp->advmss = 65535; /* max value */ + tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1340,6 +1352,8 @@ int tcp_v4_destroy_sock(struct sock *sk) * If sendmsg cached page exists, toss it. 
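tcp_v4_do_rcv() above is bracketed by set_exec_ub(): packet processing runs charged to the socket owner's beancounter, and every exit, including the reset and discard paths, funnels through restore_context to put the old UB back. The pattern, stubbed to stand alone:

    struct user_beancounter { int id; };

    static struct user_beancounter *current_ub;

    static struct user_beancounter *set_exec_ub(struct user_beancounter *ub)
    {
            struct user_beancounter *old = current_ub;

            current_ub = ub;
            return old;
    }

    static int rcv_in_socket_context(struct user_beancounter *sock_ub)
    {
            struct user_beancounter *prev = set_exec_ub(sock_ub);
            int err = 0;

            /* ... fast path / state machine runs charged to sock_ub ... */

            (void)set_exec_ub(prev);    /* restored on every exit path */
            return err;
    }

Missing the restore on one path would leak the socket's UB into whatever softirq work runs next, which is why the patch rewrites the early "return 0" statements as "goto restore_context".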
*/ if (sk->sk_sndmsg_page) { + /* queue is empty, uncharge */ + ub_sock_tcp_detachpage(sk); __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } @@ -1354,16 +1368,34 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head, + envid_t veid) { - return hlist_empty(head) ? NULL : - list_entry(head->first, struct inet_timewait_sock, tw_node); + struct inet_timewait_sock *tw; + struct hlist_node *pos; + + if (hlist_empty(head)) + return NULL; + hlist_for_each_entry(tw, pos, head, tw_node) { + if (!ve_accessible_veid(tw->tw_owner_env, veid)) + continue; + return tw; + } + return NULL; } -static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) +static inline struct inet_timewait_sock * + tw_next(struct inet_timewait_sock *tw, envid_t veid) { - return tw->tw_node.next ? - hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; + while (1) { + if (tw->tw_node.next == NULL) + return NULL; + tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node); + if (!ve_accessible_veid(tw->tw_owner_env, veid)) + continue; + return tw; + } + return NULL; /* make compiler happy */ } static void *listening_get_next(struct seq_file *seq, void *cur) @@ -1372,7 +1404,9 @@ static void *listening_get_next(struct s struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; + struct ve_struct *ve; + ve = get_exec_env(); if (!sk) { st->bucket = 0; sk = sk_head(&tcp_hashinfo.listening_hash[0]); @@ -1412,6 +1446,8 @@ get_req: } get_sk: sk_for_each_from(sk, node) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family == st->family) { cur = sk; goto out; @@ -1452,7 +1488,9 @@ static void *established_get_first(struc { struct tcp_iter_state* st = seq->private; void *rc = NULL; + struct ve_struct *ve; + ve = get_exec_env(); for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; @@ -1463,6 +1501,8 @@ static void *established_get_first(struc read_lock(&tcp_hashinfo.ehash[st->bucket].lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family != st->family) { continue; } @@ -1472,6 +1512,8 @@ static void *established_get_first(struc st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (tw->tw_family != st->family) { continue; } @@ -1491,16 +1533,17 @@ static void *established_get_next(struct struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; + struct ve_struct *ve; + ve = get_exec_env(); ++st->num; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; - tw = tw_next(tw); + tw = tw_next(tw, VEID(ve)); get_tw: - while (tw && tw->tw_family != st->family) { - tw = tw_next(tw); - } + while (tw && tw->tw_family != st->family) + tw = tw_next(tw, VEID(ve)); if (tw) { cur = tw; goto out; @@ -1522,12 +1565,15 @@ get_tw: sk = sk_next(sk); sk_for_each_from(sk, node) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family == st->family) goto found; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket + + 
tcp_hashinfo.ehash_size].chain, VEID(ve)); goto get_tw; found: cur = sk; @@ -1672,7 +1718,7 @@ int tcp_proc_register(struct tcp_seq_afi afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -1684,7 +1730,8 @@ void tcp_proc_unregister(struct tcp_seq_ { if (!afinfo) return; - proc_net_remove(afinfo->name); + + remove_proc_glob_entry(afinfo->name, NULL); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } @@ -1815,7 +1862,7 @@ out: static struct file_operations tcp4_seq_fops; static struct tcp_seq_afinfo tcp4_seq_afinfo = { .owner = THIS_MODULE, - .name = "tcp", + .name = "net/tcp", .family = AF_INET, .seq_show = tcp4_seq_show, .seq_fops = &tcp4_seq_fops, @@ -1875,6 +1922,86 @@ void __init tcp_v4_init(struct net_proto panic("Failed to create the TCP control socket.\n"); } +#ifdef CONFIG_VE +static void tcp_kill_ve_onesk(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check the assumed state of the socket. */ + if (!sock_flag(sk, SOCK_DEAD)) { + static int printed; +invalid: + if (!printed) + printk(KERN_DEBUG "Killing sk: dead %d, state %d, " + "wrseq %u unseq %u, wrqu %d.\n", + sock_flag(sk, SOCK_DEAD), sk->sk_state, + tp->write_seq, tp->snd_una, + !skb_queue_empty(&sk->sk_write_queue)); + printed = 1; + return; + } + + tcp_send_active_reset(sk, GFP_ATOMIC); + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSING: + /* In these 2 states the peer may want us to retransmit + * some data and/or FIN. Entering "resetting mode" + * instead. + */ + tcp_time_wait(sk, TCP_CLOSE, 0); + break; + case TCP_FIN_WAIT2: + /* By some reason the socket may stay in this state + * without turning into a TW bucket. Fix it. + */ + tcp_time_wait(sk, TCP_FIN_WAIT2, 0); + break; + case TCP_LAST_ACK: + /* Just jump into CLOSED state. */ + tcp_done(sk); + break; + default: + /* The socket must be already close()d. 
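tcp_kill_ve_onesk() above deals with the sockets a dying VE leaves behind: after an active reset, each lingering TCP state gets a specific exit so no timer or retransmit machinery stays armed for a container that no longer exists. The mapping as a stand-alone table (enum values illustrative, actions quoted from the patch):

    enum lingering_state { FIN_WAIT1, CLOSING, FIN_WAIT2, LAST_ACK, OTHER };

    static const char *ve_kill_action(enum lingering_state s)
    {
            switch (s) {
            case FIN_WAIT1:
            case CLOSING:
                    /* peer may still want retransmits of data or FIN;
                     * park in timewait as CLOSE instead of waiting */
                    return "tcp_time_wait(sk, TCP_CLOSE, 0)";
            case FIN_WAIT2:
                    /* force the timewait transition that never happened */
                    return "tcp_time_wait(sk, TCP_FIN_WAIT2, 0)";
            case LAST_ACK:
                    return "tcp_done(sk)";
            default:
                    return "invalid: socket should already be close()d";
            }
    }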
*/ + goto invalid; + } +} + +void tcp_v4_kill_ve_sockets(struct ve_struct *envid) +{ + struct inet_ehash_bucket *head; + int i; + + /* alive */ + local_bh_disable(); + head = tcp_hashinfo.ehash; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_node *node; +more_work: + write_lock(&head[i].lock); + sk_for_each(sk, node, &head[i].chain) { + if (ve_accessible_strict(sk->owner_env, envid)) { + sock_hold(sk); + write_unlock(&head[i].lock); + + bh_lock_sock(sk); + /* sk might have disappeared from the hash before + * we got the lock */ + if (sk->sk_state != TCP_CLOSE) + tcp_kill_ve_onesk(sk); + bh_unlock_sock(sk); + sock_put(sk); + goto more_work; + } + } + write_unlock(&head[i].lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); +#endif + EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_prot); diff -uprN linux-2.6.18/net/ipv4/tcp_minisocks.c linux-2.6.18.ovz/net/ipv4/tcp_minisocks.c --- linux-2.6.18/net/ipv4/tcp_minisocks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_minisocks.c 2007-06-13 06:55:07.000000000 -0400 @@ -28,6 +28,9 @@ #include #include +#include +#include + #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else @@ -36,6 +39,11 @@ int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; +int sysctl_tcp_max_tw_kmem_fraction = 384; +int sysctl_tcp_max_tw_buckets_ub = 16536; + +EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction); +EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub); struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, @@ -52,6 +60,7 @@ struct inet_timewait_death_row tcp_death .twcal_hand = -1, .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), + .ub_managed = 1, }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -280,7 +289,8 @@ void tcp_time_wait(struct sock *sk, int if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &tcp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -293,6 +303,8 @@ void tcp_time_wait(struct sock *sk, int tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + if (sk->sk_user_data != NULL) + tw->tw_rcv_wscale |= TW_WSCALE_SPEC; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -306,6 +318,8 @@ void tcp_time_wait(struct sock *sk, int tw->tw_ipv6only = np->ipv6only; } #endif + tw->tw_owner_env = VEID(sk->owner_env); + /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); @@ -354,6 +368,8 @@ struct sock *tcp_create_openreq_child(st struct tcp_sock *newtp; /* Now setup tcp_sock */ + newsk->owner_env = sk->owner_env; + newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_nxt = treq->rcv_isn + 1; diff -uprN linux-2.6.18/net/ipv4/tcp_output.c linux-2.6.18.ovz/net/ipv4/tcp_output.c --- linux-2.6.18/net/ipv4/tcp_output.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_output.c 2007-06-13 06:55:07.000000000 -0400 @@ -42,6 +42,9 @@ #include #include +#include +#include + /* People can turn this off for buggy TCP's found in printers etc. 
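tcp_v4_kill_ve_sockets() above cannot hold the bucket lock while locking a socket, so after each hit it releases everything and rescans the same bucket from the top; the goto more_work restart is what keeps the walk correct while the chain mutates underneath it. The idiom in miniature:

    #include <stdbool.h>

    struct node {
            struct node *next;
            bool matches, killed;
    };

    static void kill_all_matching(struct node *head)
    {
    restart:
            for (struct node *n = head; n; n = n->next) {
                    if (n->matches && !n->killed) {
                            /* kernel: sock_hold(), drop bucket lock,
                             * bh_lock_sock(), re-check sk_state, kill */
                            n->killed = true;
                            goto restart;   /* chain may have changed */
                    }
            }
    }

The re-check of sk->sk_state after bh_lock_sock() matters because the socket can close on its own between dropping the bucket lock and acquiring the socket lock.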
*/ int sysctl_tcp_retrans_collapse = 1; @@ -339,6 +342,13 @@ static void tcp_syn_build_options(__u32 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); } +static int skb_header_size(struct sock *sk, int tcp_hlen) +{ + struct ip_options *opt = inet_sk(sk)->opt; + return tcp_hlen + sizeof(struct iphdr) + + (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */; +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -357,6 +367,7 @@ static int tcp_transmit_skb(struct sock struct tcp_sock *tp; struct tcp_skb_cb *tcb; int tcp_header_size; + int header_size; struct tcphdr *th; int sysctl_flags; int err; @@ -411,7 +422,21 @@ static int tcp_transmit_skb(struct sock (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - + + /* Unfortunately, we can have skb from outside world here + * with size insufficient for header. It is impossible to make + * guess when we queue skb, so the decision should be made + * here. Den + */ + header_size = skb_header_size(sk, tcp_header_size); + if (skb->data - header_size < skb->head) { + int delta = header_size - skb_headroom(skb); + err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta), + 0, GFP_ATOMIC); + if (err) + return err; + } + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -550,15 +575,23 @@ int tcp_fragment(struct sock *sk, struct if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; + if (skb_cloned(skb) && skb_is_nonlinear(skb)) { + unsigned long chargesize; + chargesize = skb_bc(skb)->charged; + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + ub_sock_tcp_unchargesend(sk, chargesize); + ub_tcpsndbuf_charge_forced(sk, skb); + } /* Get a new skb... force flag on. */ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } sk_charge_skb(sk, buff); nlen = skb->len - len - nsize; @@ -1045,6 +1078,11 @@ static int tso_fragment(struct sock *sk, if (unlikely(buff == NULL)) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } + sk_charge_skb(sk, buff); buff->truesize += nlen; skb->truesize -= nlen; @@ -1492,7 +1530,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < full_space/2) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (ub_tcp_shrink_rcvbuf(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); if (free_space < mss) @@ -1926,6 +1964,7 @@ void tcp_send_fin(struct sock *sk) break; yield(); } + ub_tcpsndbuf_charge_forced(sk, skb); /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); @@ -1997,6 +2036,10 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, skb) < 0) { + kfree_skb(nskb); + return -ENOMEM; + } __skb_unlink(skb, &sk->sk_write_queue); skb_header_release(nskb); __skb_queue_head(&sk->sk_write_queue, nskb); @@ -2091,6 +2134,7 @@ static void tcp_connect_init(struct sock struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + static int once = 0; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 
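The tcp_transmit_skb() hunk above guards against skbs queued from outside the normal send path with too little headroom for the TCP, IP, options and link headers; when short, it grows the head before building headers. A heap-based model of the check (the kernel uses pskb_expand_head() with SKB_DATA_ALIGN(delta); here the buffer is assumed malloc'd):

    #include <stdlib.h>
    #include <string.h>

    #define ETH_HLEN  14
    #define IPHDR_LEN 20

    struct buf { unsigned char *head, *data; size_t len; };

    static int ensure_headroom(struct buf *b, size_t tcp_hlen, size_t opt_len)
    {
            size_t need = tcp_hlen + IPHDR_LEN + opt_len + ETH_HLEN;
            size_t have = (size_t)(b->data - b->head);
            unsigned char *nh;

            if (have >= need)
                    return 0;
            nh = malloc(need + b->len);
            if (!nh)
                    return -1;
            memcpy(nh + need, b->data, b->len); /* keep payload, grow head */
            free(b->head);
            b->head = nh;
            b->data = nh + need;
            return 0;
    }

The same file also charges every fresh segment (tcp_fragment, tso_fragment, tcp_send_synack, tcp_connect) to UB_TCPSNDBUF before queueing it, freeing the skb whenever the charge fails.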
@@ -2105,9 +2149,23 @@ static void tcp_connect_init(struct sock tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); + if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) { + once = 1; + + printk("Oops in connect_init! dst->advmss=%d\n", + dst_metric(dst, RTAX_ADVMSS)); + printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU)); + printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, " + "advmss=%d, user_mss=%d\n", + sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss, + tp->mss_cache, tp->advmss, tp->rx_opt.user_mss); + } + if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->advmss == 0) + tp->advmss = 1460; tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2148,6 +2206,10 @@ int tcp_connect(struct sock *sk) buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOBUFS; + } /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); diff -uprN linux-2.6.18/net/ipv4/tcp_timer.c linux-2.6.18.ovz/net/ipv4/tcp_timer.c --- linux-2.6.18/net/ipv4/tcp_timer.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/tcp_timer.c 2007-06-13 06:55:07.000000000 -0400 @@ -22,6 +22,8 @@ #include #include +#include +#include int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; @@ -67,7 +69,7 @@ static void tcp_write_err(struct sock *s static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = atomic_read(&tcp_orphan_count); + int orphans = ub_get_orphan_count(sk); /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ @@ -78,9 +80,7 @@ static int tcp_out_of_resources(struct s if (sk->sk_err_soft) orphans <<= 1; - if (orphans >= sysctl_tcp_max_orphans || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (ub_too_many_orphans(sk, orphans)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); @@ -169,9 +169,12 @@ static int tcp_write_timeout(struct sock static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ @@ -220,11 +223,12 @@ static void tcp_delack_timer(unsigned lo TCP_CHECK_TIMER(sk); out: - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) sk_stream_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } static void tcp_probe_timer(struct sock *sk) @@ -276,11 +280,14 @@ static void tcp_probe_timer(struct sock * The TCP retransmit timer. 
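tcp_timer.c above applies one template to the delack timer, and to the retransmit, write and keepalive timers that follow: enter the socket's VE with set_exec_env(sk->owner_env) on the way in, restore the saved context on the way out, so everything a timer does from softirq is attributed to the owning container. The wrapper shape, with set_exec_env() stubbed:

    struct ve_struct { int veid; };

    static struct ve_struct *cur_env;

    static struct ve_struct *set_exec_env(struct ve_struct *ve)
    {
            struct ve_struct *old = cur_env;

            cur_env = ve;
            return old;
    }

    static void timer_in_ve(struct ve_struct *owner_env, void (*body)(void))
    {
            struct ve_struct *env = set_exec_env(owner_env);

            body();                     /* locked timer work runs here */
            (void)set_exec_env(env);
    }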
*/ -static void tcp_retransmit_timer(struct sock *sk) +static noinline void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + if (!tp->packets_out) goto out; @@ -377,15 +384,19 @@ out_reset_timer: if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); -out:; +out: + (void)set_exec_env(env); } static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); int event; + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ @@ -419,6 +430,7 @@ out: out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } /* @@ -446,10 +458,13 @@ void tcp_set_keepalive(struct sock *sk, static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; + env = set_exec_env(sk->owner_env); + /* Only process if socket is not in use. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -521,4 +536,5 @@ death: out: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } diff -uprN linux-2.6.18/net/ipv4/udp.c linux-2.6.18.ovz/net/ipv4/udp.c --- linux-2.6.18/net/ipv4/udp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/udp.c 2007-06-13 06:55:07.000000000 -0400 @@ -126,7 +126,9 @@ static int udp_v4_get_port(struct sock * struct hlist_node *node; struct sock *sk2; struct inet_sock *inet = inet_sk(sk); + struct ve_struct *env; + env = sk->owner_env; write_lock_bh(&udp_hash_lock); if (snum == 0) { int best_size_so_far, best, result, i; @@ -140,7 +142,7 @@ static int udp_v4_get_port(struct sock * struct hlist_head *list; int size; - list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + list = &udp_hash[udp_hashfn(result, VEID(env))]; if (hlist_empty(list)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -162,7 +164,7 @@ static int udp_v4_get_port(struct sock * result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (!udp_lport_inuse(result)) + if (!udp_lport_inuse(result, env)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -171,11 +173,12 @@ gotit: udp_port_rover = snum = result; } else { sk_for_each(sk2, node, - &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + &udp_hash[udp_hashfn(snum, VEID(env))]) { struct inet_sock *inet2 = inet_sk(sk2); if (inet2->num == snum && sk2 != sk && + ve_accessible_strict(sk2->owner_env, env) && !ipv6_only_sock(sk2) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || @@ -189,7 +192,7 @@ gotit: } inet->num = snum; if (sk_unhashed(sk)) { - struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; + struct hlist_head *h = &udp_hash[udp_hashfn(snum, VEID(env))]; sk_add_node(sk, h); sock_prot_inc_use(sk->sk_prot); @@ -227,11 +230,15 @@ static struct sock *udp_v4_lookup_longwa struct hlist_node *node; unsigned short hnum = ntohs(dport); int badness = -1; + struct ve_struct *env; - sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + env = get_exec_env(); + sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { struct inet_sock *inet = inet_sk(sk); - if (inet->num == hnum && !ipv6_only_sock(sk)) { + if (inet->num == hnum && + ve_accessible_strict(sk->owner_env, env) && + !ipv6_only_sock(sk)) 
{ int score = (sk->sk_family == PF_INET ? 1 : 0); if (inet->rcv_saddr) { if (inet->rcv_saddr != daddr) @@ -892,23 +899,32 @@ static int udp_encap_rcv(struct sock * s return 1; #else struct udp_sock *up = udp_sk(sk); - struct udphdr *uh = skb->h.uh; + struct udphdr *uh; struct iphdr *iph; int iphlen, len; - __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr); - __u32 *udpdata32 = (__u32 *)udpdata; + __u8 *udpdata; + __u32 *udpdata32; __u16 encap_type = up->encap_type; /* if we're overly short, let UDP handle it */ - if (udpdata > skb->tail) + len = skb->len - sizeof(struct udphdr); + if (len <= 0) return 1; /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; - len = skb->tail - udpdata; + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = skb->h.uh; + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__u32 *)udpdata; switch (encap_type) { default: @@ -1048,7 +1064,8 @@ static int udp_v4_mcast_deliver(struct s int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), + VEID(skb->owner_env))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1399,10 +1416,14 @@ static struct sock *udp_get_first(struct { struct sock *sk; struct udp_iter_state *state = seq->private; + struct ve_struct *env; + env = get_exec_env(); for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { struct hlist_node *node; sk_for_each(sk, node, &udp_hash[state->bucket]) { + if (!ve_accessible(sk->owner_env, env)) + continue; if (sk->sk_family == state->family) goto found; } @@ -1419,8 +1440,13 @@ static struct sock *udp_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != state->family); + if (!sk) + break; + if (sk->sk_family != state->family) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(&udp_hash[state->bucket]); @@ -1505,7 +1531,7 @@ int udp_proc_register(struct udp_seq_afi afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -1517,7 +1543,8 @@ void udp_proc_unregister(struct udp_seq_ { if (!afinfo) return; - proc_net_remove(afinfo->name); + + remove_proc_glob_entry(afinfo->name, NULL); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } @@ -1560,7 +1587,7 @@ static int udp4_seq_show(struct seq_file static struct file_operations udp4_seq_fops; static struct udp_seq_afinfo udp4_seq_afinfo = { .owner = THIS_MODULE, - .name = "udp", + .name = "net/udp", .family = AF_INET, .seq_show = udp4_seq_show, .seq_fops = &udp4_seq_fops, diff -uprN linux-2.6.18/net/ipv4/xfrm4_policy.c linux-2.6.18.ovz/net/ipv4/xfrm4_policy.c --- linux-2.6.18/net/ipv4/xfrm4_policy.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv4/xfrm4_policy.c 2007-06-13 06:55:07.000000000 -0400 @@ -252,6 +252,8 @@ static void xfrm4_dst_destroy(struct dst if (likely(xdst->u.rt.idev)) in_dev_put(xdst->u.rt.idev); + if (likely(xdst->u.rt.peer)) + inet_putpeer(xdst->u.rt.peer); xfrm_dst_destroy(xdst); } diff -uprN 
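The udp.c hunks just shown make the UDP hash VE-aware: bucket selection goes through udp_hashfn(port, VEID(env)) instead of a bare port mask, and every lookup or bind-conflict test additionally requires ve_accessible_strict() on the socket's owner. The exact mixing function lives in the patch's headers; a plausible shape, purely as an assumption:

    #include <stdint.h>

    #define UDP_HTABLE_SIZE 128

    /* hypothetical mix: the real udp_hashfn() is defined elsewhere
     * in the patch */
    static unsigned udp_hashfn(unsigned port, uint32_t veid)
    {
            return (port + veid) & (UDP_HTABLE_SIZE - 1);
    }

Mixing the VE id in usually lands "port 53 in VE 101" and "port 53 in VE 102" in different buckets, which shortens chain scans; the ve_accessible_strict() test is still what enforces the actual isolation when buckets collide.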
linux-2.6.18/net/ipv6/addrconf.c linux-2.6.18.ovz/net/ipv6/addrconf.c --- linux-2.6.18/net/ipv6/addrconf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/addrconf.c 2007-06-13 06:55:07.000000000 -0400 @@ -97,6 +97,7 @@ #define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) #ifdef CONFIG_SYSCTL +static struct addrconf_sysctl_table * __addrconf_sysctl_register(struct inet6_dev *idev, char *devname, int ifindex, struct ipv6_devconf *p); static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); static void addrconf_sysctl_unregister(struct ipv6_devconf *p); #endif @@ -128,8 +129,6 @@ static DEFINE_SPINLOCK(addrconf_verify_l static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); -static int addrconf_ifdown(struct net_device *dev, int how); - static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); @@ -144,7 +143,7 @@ static int ipv6_chk_same_addr(const stru static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); -struct ipv6_devconf ipv6_devconf = { +struct ipv6_devconf global_ipv6_devconf = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, .mtu6 = IPV6_MIN_MTU, @@ -175,7 +174,7 @@ struct ipv6_devconf ipv6_devconf = { #endif }; -static struct ipv6_devconf ipv6_devconf_dflt = { +struct ipv6_devconf global_ipv6_devconf_dflt = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, .mtu6 = IPV6_MIN_MTU, @@ -205,6 +204,12 @@ static struct ipv6_devconf ipv6_devconf_ #endif }; +#ifdef CONFIG_VE +#define ipv6_devconf_dflt (*(get_exec_env()->_ipv6_devconf_dflt)) +#else +#define ipv6_devconf_dflt global_ipv6_devconf_dflt +#endif + /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ #if 0 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; @@ -334,6 +339,7 @@ void in6_dev_finish_destroy(struct inet6 static struct inet6_dev * ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; + struct in6_addr maddr; ASSERT_RTNL(); @@ -391,9 +397,8 @@ static struct inet6_dev * ipv6_add_dev(s dev->type == ARPHRD_TUNNEL || dev->type == ARPHRD_NONE || dev->type == ARPHRD_SIT) { - printk(KERN_INFO - "%s: Disabled Privacy Extensions\n", - dev->name); + ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n", + dev->name)); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); @@ -404,10 +409,6 @@ static struct inet6_dev * ipv6_add_dev(s if (netif_carrier_ok(dev)) ndev->if_flags |= IF_READY; - write_lock_bh(&addrconf_lock); - dev->ip6_ptr = ndev; - write_unlock_bh(&addrconf_lock); - ipv6_mc_init_dev(ndev); ndev->tstamp = jiffies; #ifdef CONFIG_SYSCTL @@ -417,6 +418,14 @@ static struct inet6_dev * ipv6_add_dev(s NULL); addrconf_sysctl_register(ndev, &ndev->cnf); #endif + write_lock_bh(&addrconf_lock); + dev->ip6_ptr = ndev; + write_unlock_bh(&addrconf_lock); + + /* Join all-node multicast group */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(dev, &maddr); + return ndev; } @@ -473,8 +482,8 @@ static void addrconf_forward_change(void read_lock(&addrconf_lock); idev = __in6_dev_get(dev); if (idev) { - int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); - idev->cnf.forwarding = ipv6_devconf.forwarding; + int changed = (!idev->cnf.forwarding) ^ (!ve_ipv6_devconf.forwarding); + idev->cnf.forwarding = ve_ipv6_devconf.forwarding; if (changed) dev_forward_change(idev); } @@ -1185,9 +1194,10 @@ int ipv6_chk_addr(struct in6_addr *addr, 
read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { if (ipv6_addr_equal(&ifp->addr, addr) && - !(ifp->flags&IFA_F_TENTATIVE)) { + !(ifp->flags&IFA_F_TENTATIVE) && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) + !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) break; } } @@ -1203,7 +1213,9 @@ int ipv6_chk_same_addr(const struct in6_ for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { if (ipv6_addr_equal(&ifp->addr, addr)) { - if (dev == NULL || ifp->idev->dev == dev) + if ((dev == NULL && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) + || ifp->idev->dev == dev) break; } } @@ -1217,9 +1229,10 @@ struct inet6_ifaddr * ipv6_get_ifaddr(st read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ipv6_addr_equal(&ifp->addr, addr)) { + if (ipv6_addr_equal(&ifp->addr, addr) && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) { in6_ifa_hold(ifp); break; } @@ -1869,7 +1882,7 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, +int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; @@ -1923,6 +1936,7 @@ static int inet6_addr_add(int ifindex, s return PTR_ERR(ifp); } +EXPORT_SYMBOL_GPL(inet6_addr_add); static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) { @@ -1963,7 +1977,7 @@ int addrconf_add_ifaddr(void __user *arg struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -1981,7 +1995,7 @@ int addrconf_del_ifaddr(void __user *arg struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2322,7 +2336,7 @@ static struct notifier_block ipv6_dev_no .priority = 0 }; -static int addrconf_ifdown(struct net_device *dev, int how) +int addrconf_ifdown(struct net_device *dev, int how) { struct inet6_dev *idev; struct inet6_ifaddr *ifa, **bifa; @@ -2330,7 +2344,7 @@ static int addrconf_ifdown(struct net_de ASSERT_RTNL(); - if (dev == &loopback_dev && how == 1) + if (dev == get_ve0()->_loopback_dev && how == 1) how = 0; rt6_ifdown(dev); @@ -2438,10 +2452,12 @@ static int addrconf_ifdown(struct net_de } return 0; } +EXPORT_SYMBOL_GPL(addrconf_ifdown); static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env); if (ifp->idev->cnf.forwarding) goto out; @@ -2480,6 +2496,7 @@ static void addrconf_rs_timer(unsigned l out: in6_ifa_put(ifp); + set_exec_env(old_env); } /* @@ -2547,6 +2564,7 @@ static void addrconf_dad_timer(unsigned struct inet6_dev *idev = ifp->idev; struct in6_addr unspec; struct in6_addr mcaddr; + struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env); read_lock_bh(&idev->lock); if (idev->dead) { @@ -2579,6 +2597,7 @@ static void addrconf_dad_timer(unsigned ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); out: in6_ifa_put(ifp); + set_exec_env(old_env); } static void 
addrconf_dad_completed(struct inet6_ifaddr *ifp) @@ -2646,8 +2665,11 @@ static struct inet6_ifaddr *if6_get_firs for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { ifa = inet6_addr_lst[state->bucket]; - if (ifa) - break; + while (ifa) { + if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) + return ifa; + ifa = ifa->lst_next; + } } return ifa; } @@ -2658,6 +2680,11 @@ static struct inet6_ifaddr *if6_get_next ifa = ifa->lst_next; try_again: + while (ifa) { + if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) + break; + ifa = ifa->lst_next; + } if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { ifa = inet6_addr_lst[state->bucket]; goto try_again; @@ -2748,14 +2775,14 @@ static struct file_operations if6_fops = int __init if6_proc_init(void) { - if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops)) + if (!proc_glob_fops_create("net/if_inet6", S_IRUGO, &if6_fops)) return -ENOMEM; return 0; } void if6_proc_exit(void) { - proc_net_remove("if_inet6"); + remove_proc_glob_entry("net/if_inet6", NULL); } #endif /* CONFIG_PROC_FS */ @@ -2768,6 +2795,7 @@ static void addrconf_verify(unsigned lon struct inet6_ifaddr *ifp; unsigned long now, next; int i; + struct ve_struct *old_env; spin_lock_bh(&addrconf_verify_lock); now = jiffies; @@ -2788,6 +2816,8 @@ restart: if (ifp->flags & IFA_F_PERMANENT) continue; + old_env = set_exec_env(ifp->idev->dev->owner_env); + spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; @@ -2803,6 +2833,7 @@ restart: in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); + set_exec_env(old_env); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); @@ -2827,6 +2858,7 @@ restart: ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); + set_exec_env(old_env); goto restart; } #ifdef CONFIG_IPV6_PRIVACY @@ -2848,6 +2880,7 @@ restart: ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); + set_exec_env(old_env); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -2860,6 +2893,7 @@ restart: next = ifp->tstamp + ifp->prefered_lft * HZ; spin_unlock(&ifp->lock); } + set_exec_env(old_env); } read_unlock(&addrconf_hash_lock); } @@ -3572,7 +3606,7 @@ int addrconf_sysctl_forward(ctl_table *c ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { + if (valp != &ve_ipv6_devconf.forwarding) { if ((!*valp) ^ (!val)) { struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; if (idev == NULL) @@ -3580,7 +3614,7 @@ int addrconf_sysctl_forward(ctl_table *c dev_forward_change(idev); } } else { - ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; + ipv6_devconf_dflt.forwarding = ve_ipv6_devconf.forwarding; addrconf_forward_change(); } if (*valp) @@ -3623,7 +3657,7 @@ static int addrconf_sysctl_forward_strat } if (valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { + if (valp != &ve_ipv6_devconf.forwarding) { struct inet6_dev *idev = (struct inet6_dev *)table->extra1; int changed; if (unlikely(idev == NULL)) @@ -3659,7 +3693,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_FORWARDING, .procname = "forwarding", - .data = &ipv6_devconf.forwarding, + .data = &global_ipv6_devconf.forwarding, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &addrconf_sysctl_forward, @@ -3668,7 +3702,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_HOP_LIMIT, .procname = 
"hop_limit", - .data = &ipv6_devconf.hop_limit, + .data = &global_ipv6_devconf.hop_limit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -3676,7 +3710,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MTU, .procname = "mtu", - .data = &ipv6_devconf.mtu6, + .data = &global_ipv6_devconf.mtu6, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3684,7 +3718,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA, .procname = "accept_ra", - .data = &ipv6_devconf.accept_ra, + .data = &global_ipv6_devconf.accept_ra, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3692,7 +3726,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, .procname = "accept_redirects", - .data = &ipv6_devconf.accept_redirects, + .data = &global_ipv6_devconf.accept_redirects, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3700,7 +3734,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_AUTOCONF, .procname = "autoconf", - .data = &ipv6_devconf.autoconf, + .data = &global_ipv6_devconf.autoconf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3708,7 +3742,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_DAD_TRANSMITS, .procname = "dad_transmits", - .data = &ipv6_devconf.dad_transmits, + .data = &global_ipv6_devconf.dad_transmits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3716,7 +3750,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICITS, .procname = "router_solicitations", - .data = &ipv6_devconf.rtr_solicits, + .data = &global_ipv6_devconf.rtr_solicits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3724,7 +3758,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, .procname = "router_solicitation_interval", - .data = &ipv6_devconf.rtr_solicit_interval, + .data = &global_ipv6_devconf.rtr_solicit_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -3733,7 +3767,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, .procname = "router_solicitation_delay", - .data = &ipv6_devconf.rtr_solicit_delay, + .data = &global_ipv6_devconf.rtr_solicit_delay, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -3742,7 +3776,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_FORCE_MLD_VERSION, .procname = "force_mld_version", - .data = &ipv6_devconf.force_mld_version, + .data = &global_ipv6_devconf.force_mld_version, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3751,7 +3785,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_USE_TEMPADDR, .procname = "use_tempaddr", - .data = &ipv6_devconf.use_tempaddr, + .data = &global_ipv6_devconf.use_tempaddr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3759,7 +3793,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_TEMP_VALID_LFT, .procname = "temp_valid_lft", - .data = &ipv6_devconf.temp_valid_lft, + .data = &global_ipv6_devconf.temp_valid_lft, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3767,7 +3801,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, .procname = "temp_prefered_lft", - .data = &ipv6_devconf.temp_prefered_lft, + .data = &global_ipv6_devconf.temp_prefered_lft, .maxlen = sizeof(int), .mode = 0644, .proc_handler = 
&proc_dointvec, @@ -3775,7 +3809,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_REGEN_MAX_RETRY, .procname = "regen_max_retry", - .data = &ipv6_devconf.regen_max_retry, + .data = &global_ipv6_devconf.regen_max_retry, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3783,7 +3817,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, .procname = "max_desync_factor", - .data = &ipv6_devconf.max_desync_factor, + .data = &global_ipv6_devconf.max_desync_factor, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3792,7 +3826,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MAX_ADDRESSES, .procname = "max_addresses", - .data = &ipv6_devconf.max_addresses, + .data = &global_ipv6_devconf.max_addresses, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3800,7 +3834,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_DEFRTR, .procname = "accept_ra_defrtr", - .data = &ipv6_devconf.accept_ra_defrtr, + .data = &global_ipv6_devconf.accept_ra_defrtr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3808,7 +3842,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_PINFO, .procname = "accept_ra_pinfo", - .data = &ipv6_devconf.accept_ra_pinfo, + .data = &global_ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3817,7 +3851,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_RTR_PREF, .procname = "accept_ra_rtr_pref", - .data = &ipv6_devconf.accept_ra_rtr_pref, + .data = &global_ipv6_devconf.accept_ra_rtr_pref, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3825,7 +3859,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_PROBE_INTERVAL, .procname = "router_probe_interval", - .data = &ipv6_devconf.rtr_probe_interval, + .data = &global_ipv6_devconf.rtr_probe_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -3835,7 +3869,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, .procname = "accept_ra_rt_info_max_plen", - .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .data = &global_ipv6_devconf.accept_ra_rt_info_max_plen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3892,29 +3926,22 @@ static struct addrconf_sysctl_table }, }; -static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +static struct addrconf_sysctl_table * +__addrconf_sysctl_register(struct inet6_dev *idev, char *dev_name, int ifindex, struct ipv6_devconf *p) { int i; - struct net_device *dev = idev ? 
idev->dev : NULL; struct addrconf_sysctl_table *t; - char *dev_name = NULL; t = kmalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) - return; + return NULL; + memcpy(t, &addrconf_sysctl, sizeof(*t)); for (i=0; t->addrconf_vars[i].data; i++) { - t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + t->addrconf_vars[i].data += (char*)p - (char*)&global_ipv6_devconf; t->addrconf_vars[i].de = NULL; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ } - if (dev) { - dev_name = dev->name; - t->addrconf_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; - } /* * Make a copy of dev_name, because '.procname' is regarded as const @@ -3925,6 +3952,7 @@ static void addrconf_sysctl_register(str if (!dev_name) goto free; + t->addrconf_dev[0].ctl_name = ifindex; t->addrconf_dev[0].procname = dev_name; t->addrconf_dev[0].child = t->addrconf_vars; @@ -3939,9 +3967,7 @@ static void addrconf_sysctl_register(str t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); if (t->sysctl_header == NULL) goto free_procname; - else - p->sysctl = t; - return; + return t; /* error path */ free_procname: @@ -3949,7 +3975,26 @@ static void addrconf_sysctl_register(str free: kfree(t); - return; + return NULL; +} + +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +{ + struct net_device *dev; + char *dev_name; + int ifindex; + + dev = idev ? idev->dev : NULL; + + if (dev) { + dev_name = dev->name; + ifindex = dev->ifindex; + } else { + dev_name = "default"; + ifindex = NET_PROTO_CONF_DEFAULT; + } + + p->sysctl = __addrconf_sysctl_register(idev, dev_name, ifindex, p); } static void addrconf_sysctl_unregister(struct ipv6_devconf *p) @@ -3963,9 +4008,64 @@ static void addrconf_sysctl_unregister(s } } +#ifdef CONFIG_VE +int addrconf_sysctl_init(struct ve_struct *ve) +{ + int err = 0; + struct ipv6_devconf *conf, *conf_def; -#endif + err = -ENOMEM; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + goto err1; + + memcpy(conf, &global_ipv6_devconf, sizeof(*conf)); + conf->sysctl = __addrconf_sysctl_register(NULL, "all", + NET_PROTO_CONF_ALL, conf); + if (!conf->sysctl) + goto err2; + + conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); + if (!conf_def) + goto err3; + + memcpy(conf_def, &global_ipv6_devconf_dflt, sizeof(*conf_def)); + conf_def->sysctl = __addrconf_sysctl_register(NULL, "default", + NET_PROTO_CONF_DEFAULT, conf_def); + if (!conf_def->sysctl) + goto err4; + + ve->_ipv6_devconf = conf; + ve->_ipv6_devconf_dflt = conf_def; + return 0; + +err4: + kfree(conf_def); +err3: + addrconf_sysctl_unregister(conf); +err2: + kfree(conf); +err1: + return err; +} +EXPORT_SYMBOL(addrconf_sysctl_init); +void addrconf_sysctl_fini(struct ve_struct *ve) +{ + addrconf_sysctl_unregister(ve->_ipv6_devconf); + addrconf_sysctl_unregister(ve->_ipv6_devconf_dflt); +} +EXPORT_SYMBOL(addrconf_sysctl_fini); + +void addrconf_sysctl_free(struct ve_struct *ve) +{ + kfree(ve->_ipv6_devconf); + kfree(ve->_ipv6_devconf_dflt); +} +EXPORT_SYMBOL(addrconf_sysctl_free); +#endif /* CONFIG_VE */ +#endif /* CONFIG_SYSCTL */ /* * Device notifier */ @@ -3988,6 +4088,11 @@ int __init addrconf_init(void) { int err = 0; +#ifdef CONFIG_VE + get_ve0()->_ipv6_devconf = &global_ipv6_devconf; + get_ve0()->_ipv6_devconf_dflt = &global_ipv6_devconf_dflt; +#endif + /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup * before it can bring up and give link-local addresses 
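
The refactoring above is what makes per-VE "all"/"default" sysctl trees possible: __addrconf_sysctl_register() copies the template table and rebases every .data pointer by the distance between the caller's private ipv6_devconf and the global_ipv6_devconf template, so each registered tree reads and writes its own copy. A minimal userspace sketch of that pointer-rebasing trick, using invented stand-in types (struct devconf and struct ctl are not the kernel structures):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct devconf { int forwarding, hop_limit, mtu6; };
    struct ctl { const char *name; void *data; };

    static struct devconf global_conf = { 0, 64, 1280 };
    static struct ctl template[] = {
            { "forwarding", &global_conf.forwarding },
            { "hop_limit",  &global_conf.hop_limit  },
            { "mtu",        &global_conf.mtu6       },
            { NULL, NULL },
    };

    int main(void)
    {
            struct devconf *copy = malloc(sizeof(*copy));
            struct ctl t[sizeof(template) / sizeof(template[0])];

            memcpy(copy, &global_conf, sizeof(*copy));
            memcpy(t, template, sizeof(template));

            /* same trick as the patch: shift .data by the offset
             * between the private copy and the global template */
            for (int i = 0; t[i].data; i++)
                    t[i].data = (char *)t[i].data +
                                    ((char *)copy - (char *)&global_conf);

            *(int *)t[1].data = 1;  /* write hop_limit via the rebased pointer */
            printf("copy=%d global=%d\n", copy->hop_limit, global_conf.hop_limit);
            free(copy);
            return 0;
    }

This prints "copy=1 global=64": the clone is writable without touching the host's settings, which is exactly what addrconf_sysctl_init() relies on when it clones global_ipv6_devconf for a new VE.
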
@@ -4022,7 +4127,7 @@ int __init addrconf_init(void) #ifdef CONFIG_SYSCTL addrconf_sysctl.sysctl_header = register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0); - addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); + __addrconf_sysctl_register(NULL, "default", NET_PROTO_CONF_DEFAULT, &global_ipv6_devconf_dflt); #endif return 0; @@ -4039,8 +4144,8 @@ void __exit addrconf_cleanup(void) rtnetlink_links[PF_INET6] = NULL; #ifdef CONFIG_SYSCTL - addrconf_sysctl_unregister(&ipv6_devconf_dflt); - addrconf_sysctl_unregister(&ipv6_devconf); + addrconf_sysctl_unregister(&global_ipv6_devconf_dflt); + addrconf_sysctl_unregister(&global_ipv6_devconf); #endif rtnl_lock(); @@ -4080,6 +4185,6 @@ void __exit addrconf_cleanup(void) rtnl_unlock(); #ifdef CONFIG_PROC_FS - proc_net_remove("if_inet6"); + remove_proc_glob_entry("net/if_inet6", NULL); #endif } diff -uprN linux-2.6.18/net/ipv6/af_inet6.c linux-2.6.18.ovz/net/ipv6/af_inet6.c --- linux-2.6.18/net/ipv6/af_inet6.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/af_inet6.c 2007-06-13 06:55:07.000000000 -0400 @@ -59,6 +59,7 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#include #include #include @@ -159,6 +160,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET6, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + sock_init_data(sock, sk); err = 0; @@ -233,6 +241,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -724,21 +735,21 @@ snmp6_mib_free(void *ptr[2]) static int __init init_ipv6_mibs(void) { - if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), + if (snmp6_mib_init((void **)ve_ipv6_statistics, sizeof (struct ipstats_mib), __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), + if (snmp6_mib_init((void **)ve_icmpv6_statistics, sizeof (struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0) goto err_icmp_mib; - if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), + if (snmp6_mib_init((void **)ve_udp_stats_in6, sizeof (struct udp_mib), __alignof__(struct udp_mib)) < 0) goto err_udp_mib; return 0; err_udp_mib: - snmp6_mib_free((void **)icmpv6_statistics); + snmp6_mib_free((void **)ve_icmpv6_statistics); err_icmp_mib: - snmp6_mib_free((void **)ipv6_statistics); + snmp6_mib_free((void **)ve_ipv6_statistics); err_ip_mib: return -ENOMEM; @@ -746,9 +757,9 @@ err_ip_mib: static void cleanup_ipv6_mibs(void) { - snmp6_mib_free((void **)ipv6_statistics); - snmp6_mib_free((void **)icmpv6_statistics); - snmp6_mib_free((void **)udp_stats_in6); + snmp6_mib_free((void **)ve_ipv6_statistics); + snmp6_mib_free((void **)ve_icmpv6_statistics); + snmp6_mib_free((void **)ve_udp_stats_in6); } static int __init inet6_init(void) diff -uprN linux-2.6.18/net/ipv6/anycast.c linux-2.6.18.ovz/net/ipv6/anycast.c --- linux-2.6.18/net/ipv6/anycast.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/anycast.c 2007-06-13 06:55:07.000000000 -0400 @@ -82,7 +82,7 @@ int ipv6_sock_ac_join(struct sock *sk, i struct net_device *dev = NULL; struct inet6_dev *idev; struct ipv6_ac_socklist *pac; - int ishost = !ipv6_devconf.forwarding; + int ishost = !ve_ipv6_devconf.forwarding; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -452,6 +452,8 @@ static inline struct ifacaddr6 *ac6_get_ state->dev; state->dev = 
state->dev->next) { struct inet6_dev *idev; + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; idev = in6_dev_get(state->dev); if (!idev) continue; @@ -481,6 +483,8 @@ static struct ifacaddr6 *ac6_get_next(st state->idev = NULL; break; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->idev = in6_dev_get(state->dev); if (!state->idev) continue; @@ -575,7 +579,7 @@ static struct file_operations ac6_seq_fo int __init ac6_proc_init(void) { - if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops)) + if (!proc_glob_fops_create("net/anycast6", S_IRUGO, &ac6_seq_fops)) return -ENOMEM; return 0; @@ -583,7 +587,7 @@ int __init ac6_proc_init(void) void ac6_proc_exit(void) { - proc_net_remove("anycast6"); + remove_proc_glob_entry("net/anycast6", NULL); } #endif diff -uprN linux-2.6.18/net/ipv6/inet6_connection_sock.c linux-2.6.18.ovz/net/ipv6/inet6_connection_sock.c --- linux-2.6.18/net/ipv6/inet6_connection_sock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/inet6_connection_sock.c 2007-06-13 06:55:07.000000000 -0400 @@ -25,6 +25,8 @@ #include #include #include +#include +#include int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -35,6 +37,7 @@ int inet6_csk_bind_conflict(const struct /* We must walk the whole port owner list in this case. -DaveM */ sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && + ve_accessible_strict(sk->owner_env, sk2->owner_env) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && diff -uprN linux-2.6.18/net/ipv6/inet6_hashtables.c linux-2.6.18.ovz/net/ipv6/inet6_hashtables.c --- linux-2.6.18/net/ipv6/inet6_hashtables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/inet6_hashtables.c 2007-06-13 06:55:07.000000000 -0400 @@ -68,14 +68,15 @@ struct sock *__inet6_lookup_established( /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); + struct ve_struct *env = get_exec_env(); + unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env)); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); prefetch(head->chain.first); read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ - if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif, env)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. 
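
Note what the established-lookup hunk above changes: inet6_ehashfn() takes an extra VEID argument and INET6_MATCH() now carries the environment, so identical IPv6 4-tuples may legitimately coexist, one per container. A self-contained sketch of the idea; the mixing step here is invented for illustration and is not the kernel's actual inet6_ehashfn():

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t ehashfn(uint32_t daddr, uint16_t dport,
                            uint32_t saddr, uint16_t sport, uint32_t veid)
    {
            uint32_t h = daddr ^ saddr ^ ((uint32_t)dport << 16 | sport);
            return h ^ veid * 0x9e3779b9u;  /* VE ID as extra key material */
    }

    struct sock_stub { uint32_t veid; uint32_t hash; };

    /* equal hashes are only a fast filter; the owner comparison
     * (ve_accessible_strict in the patch) is what isolates the VEs */
    static int match(const struct sock_stub *sk, uint32_t hash, uint32_t veid)
    {
            return sk->hash == hash && sk->veid == veid;
    }

    int main(void)
    {
            struct sock_stub a = { 101, ehashfn(1, 80, 2, 4000, 101) };
            uint32_t probe = ehashfn(1, 80, 2, 4000, 202); /* same tuple, other VE */
            printf("cross-VE match: %d\n", match(&a, probe, 202));
            return 0;
    }
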
*/ @@ -88,6 +89,7 @@ struct sock *__inet6_lookup_established( if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + ve_accessible_strict(tw->tw_owner_env, VEID(env)) && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } @@ -110,9 +112,15 @@ struct sock *inet6_lookup_listener(struc const struct hlist_node *node; struct sock *result = NULL; int score, hiscore = 0; + struct ve_struct *env; + + env = get_exec_env(); read_lock(&hashinfo->lhash_lock); - sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { + sk_for_each(sk, node, &hashinfo->listening_hash[ + inet_lhashfn(hnum, VEID(env))]) { + if (!ve_accessible_strict(sk->owner_env, env)) + continue; if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); @@ -163,7 +171,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + struct ve_struct *ve) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -173,7 +182,7 @@ static int __inet6_check_established(str const int dif = sk->sk_bound_dev_if; const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, - inet->dport); + inet->dport, VEID(ve)); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct sock *sk2; const struct hlist_node *node; @@ -192,7 +201,8 @@ static int __inet6_check_established(str sk2->sk_family == PF_INET6 && ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + ve_accessible_strict(tw->tw_owner_env, VEID(ve))) { if (twsk_unique(sk, sk2, twp)) goto unique; else @@ -203,7 +213,7 @@ static int __inet6_check_established(str /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif, ve)) goto not_unique; } @@ -252,7 +262,9 @@ int inet6_hash_connect(struct inet_timew struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; + struct ve_struct *ve; + ve = sk->owner_env; if (snum == 0) { const int low = sysctl_local_port_range[0]; const int high = sysctl_local_port_range[1]; @@ -266,7 +278,8 @@ int inet6_hash_connect(struct inet_timew local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(port, + hinfo->bhash_size, VEID(ve))]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -274,20 +287,21 @@ int inet6_hash_connect(struct inet_timew * unique enough. 
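
The inet6_hash_connect() path above and just below applies the same discipline to local port selection: the bind hash is indexed with VEID(ve), a bucket only counts as a conflict when ve_accessible_strict() ties it to the connecting socket's VE, and freshly created buckets record their owner. A toy model of that walk (simplified types, not the kernel's inet_bind_bucket):

    #include <stdio.h>
    #include <stdlib.h>

    struct bucket { int port; int owner; struct bucket *next; };

    static struct bucket *bhash[8];

    static int port_in_use(int port, int owner)
    {
            struct bucket *tb;

            for (tb = bhash[port & 7]; tb; tb = tb->next)
                    if (tb->port == port && tb->owner == owner)
                            return 1;  /* ve_accessible_strict() analogue */
            return 0;
    }

    static void grab(int port, int owner)
    {
            struct bucket *tb = malloc(sizeof(*tb));

            tb->port = port;
            tb->owner = owner;
            tb->next = bhash[port & 7];
            bhash[port & 7] = tb;
    }

    int main(void)
    {
            grab(1025, 1);  /* VE 1 takes port 1025 */
            printf("VE1 sees 1025 busy: %d\n", port_in_use(1025, 1));
            printf("VE2 sees 1025 busy: %d\n", port_in_use(1025, 2));
            return 0;
    }

Each container therefore gets the whole ephemeral range to itself.
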
*/ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { + if (tb->port == port && + ve_accessible_strict(tb->owner_env, ve)) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; if (!__inet6_check_established(death_row, sk, port, - &tw)) + &tw, ve)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - head, port); + head, port, ve); if (!tb) { spin_unlock(&head->lock); break; @@ -322,7 +336,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); @@ -333,7 +347,7 @@ ok: } else { spin_unlock(&head->lock); /* No definite answer... Walk to established hash table */ - ret = __inet6_check_established(death_row, sk, snum, NULL); + ret = __inet6_check_established(death_row, sk, snum, NULL, ve); out: local_bh_enable(); return ret; diff -uprN linux-2.6.18/net/ipv6/ip6_fib.c linux-2.6.18.ovz/net/ipv6/ip6_fib.c --- linux-2.6.18/net/ipv6/ip6_fib.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/ip6_fib.c 2007-06-13 06:55:07.000000000 -0400 @@ -1126,8 +1126,12 @@ static int fib6_age(struct rt6_info *rt, static DEFINE_SPINLOCK(fib6_gc_lock); +LIST_HEAD(fib6_table_list); + void fib6_run_gc(unsigned long dummy) { + struct fib6_table *tbl; + if (dummy != ~0UL) { spin_lock_bh(&fib6_gc_lock); gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval; @@ -1145,7 +1149,11 @@ void fib6_run_gc(unsigned long dummy) write_lock_bh(&rt6_lock); ndisc_dst_gc(&gc_args.more); - fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); + list_for_each_entry(tbl, &fib6_table_list, list) { + struct ve_struct *old_env = set_exec_env(tbl->owner_env); + fib6_clean_tree(&tbl->root, fib6_age, 0, NULL); + set_exec_env(old_env); + } write_unlock_bh(&rt6_lock); if (gc_args.more) @@ -1161,7 +1169,7 @@ void __init fib6_init(void) { fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); if (!fib6_node_kmem) panic("cannot create fib6_nodes cache"); diff -uprN linux-2.6.18/net/ipv6/ip6_flowlabel.c linux-2.6.18.ovz/net/ipv6/ip6_flowlabel.c --- linux-2.6.18/net/ipv6/ip6_flowlabel.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/ip6_flowlabel.c 2007-06-13 06:55:07.000000000 -0400 @@ -415,6 +415,9 @@ int ipv6_flowlabel_opt(struct sock *sk, struct ipv6_fl_socklist *sfl, **sflp; struct ip6_flowlabel *fl; + if (!ve_is_super(get_exec_env())) + return -EPERM; + if (optlen < sizeof(freq)) return -EINVAL; @@ -587,6 +590,8 @@ static struct ip6_flowlabel *ip6fl_get_n while (!fl) { if (++state->bucket <= FL_HASH_MASK) fl = fl_ht[state->bucket]; + else + break; } return fl; } diff -uprN linux-2.6.18/net/ipv6/ip6_output.c linux-2.6.18.ovz/net/ipv6/ip6_output.c --- linux-2.6.18/net/ipv6/ip6_output.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/ip6_output.c 2007-06-13 06:55:07.000000000 -0400 @@ -319,7 +319,7 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = skb->nh.ipv6h; struct inet6_skb_parm *opt = IP6CB(skb); - if (ipv6_devconf.forwarding == 0) + if (ve_ipv6_devconf.forwarding == 0) goto error; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { @@ -408,6 +408,20 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of 
outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. + */ + hdr = skb->nh.ipv6h; + if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ + goto no_ttl_decr; + if (skb_cow(skb, dst->dev->hard_header_len)) { IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -419,6 +433,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; +no_ttl_decr: IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); diff -uprN linux-2.6.18/net/ipv6/ipv6_sockglue.c linux-2.6.18.ovz/net/ipv6/ipv6_sockglue.c --- linux-2.6.18/net/ipv6/ipv6_sockglue.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/ipv6_sockglue.c 2007-06-13 06:55:07.000000000 -0400 @@ -123,6 +123,9 @@ static struct sk_buff *ipv6_gso_segment( struct ipv6hdr *ipv6h; struct inet6_protocol *ops; + if (!(features & NETIF_F_HW_CSUM)) + features &= ~NETIF_F_SG; + if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | diff -uprN linux-2.6.18/net/ipv6/mcast.c linux-2.6.18.ovz/net/ipv6/mcast.c --- linux-2.6.18/net/ipv6/mcast.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/mcast.c 2007-06-13 06:55:07.000000000 -0400 @@ -155,7 +155,7 @@ static int ip6_mc_leave_src(struct sock #define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 -#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ +#define MLD_V1_SEEN(idev) (ve_ipv6_devconf.force_mld_version == 1 || \ (idev)->cnf.force_mld_version == 1 || \ ((idev)->mc_v1_seen && \ time_before(jiffies, (idev)->mc_v1_seen))) @@ -247,6 +247,7 @@ int ipv6_sock_mc_join(struct sock *sk, i return 0; } +EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); /* * socket leave on multicast group @@ -1582,6 +1583,8 @@ static struct sk_buff *add_grec(struct s skb = add_grhead(skb, pmc, type, &pgr); first = 0; } + if (!skb) + return NULL; psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; scount++; stotal++; @@ -2166,15 +2169,18 @@ static void igmp6_leave_group(struct ifm static void mld_gq_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); idev->mc_gq_running = 0; mld_send_report(idev, NULL); __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2183,6 +2189,7 @@ static void mld_ifc_timer_expire(unsigne mld_ifc_start_timer(idev, idev->mc_maxdelay); } __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_event(struct inet6_dev *idev) @@ -2197,6 +2204,7 @@ static void mld_ifc_event(struct inet6_d static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); if (MLD_V1_SEEN(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); @@ -2208,6 +2216,7 @@ static void igmp6_timer_handler(unsigned ma->mca_flags &= ~MAF_TIMER_RUNNING; spin_unlock(&ma->mca_lock); ma_put(ma); + set_exec_env(old_env); } /* Device going down */ @@ -2252,8 +2261,6 @@ void ipv6_mc_up(struct inet6_dev *idev) void ipv6_mc_init_dev(struct 
inet6_dev *idev) { - struct in6_addr maddr; - write_lock_bh(&idev->lock); rwlock_init(&idev->mc_lock); idev->mc_gq_running = 0; @@ -2269,10 +2276,6 @@ void ipv6_mc_init_dev(struct inet6_dev * idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); - - /* Add all-nodes address. */ - ipv6_addr_all_nodes(&maddr); - ipv6_dev_mc_inc(idev->dev, &maddr); } /* @@ -2331,6 +2334,8 @@ static inline struct ifmcaddr6 *igmp6_mc state->dev; state->dev = state->dev->next) { struct inet6_dev *idev; + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; idev = in6_dev_get(state->dev); if (!idev) continue; @@ -2361,6 +2366,8 @@ static struct ifmcaddr6 *igmp6_mc_get_ne state->idev = NULL; break; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->idev = in6_dev_get(state->dev); if (!state->idev) continue; @@ -2475,6 +2482,8 @@ static inline struct ip6_sf_list *igmp6_ state->dev; state->dev = state->dev->next) { struct inet6_dev *idev; + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; idev = in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; @@ -2514,6 +2523,8 @@ static struct ip6_sf_list *igmp6_mcf_get state->idev = NULL; goto out; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->idev = in6_dev_get(state->dev); if (!state->idev) continue; @@ -2655,8 +2666,8 @@ int __init igmp6_init(struct net_proto_f np->hop_limit = 1; #ifdef CONFIG_PROC_FS - proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops); - proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); + proc_glob_fops_create("net/igmp6", S_IRUGO, &igmp6_mc_seq_fops); + proc_glob_fops_create("net/mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); #endif return 0; @@ -2668,7 +2679,7 @@ void igmp6_cleanup(void) igmp6_socket = NULL; /* for safety */ #ifdef CONFIG_PROC_FS - proc_net_remove("mcfilter6"); - proc_net_remove("igmp6"); + remove_proc_glob_entry("net/mcfilter6", NULL); + remove_proc_glob_entry("net/igmp6", NULL); #endif } diff -uprN linux-2.6.18/net/ipv6/ndisc.c linux-2.6.18.ovz/net/ipv6/ndisc.c --- linux-2.6.18/net/ipv6/ndisc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/ndisc.c 2007-06-13 06:55:07.000000000 -0400 @@ -123,7 +123,7 @@ static struct neigh_ops ndisc_direct_ops .queue_xmit = dev_queue_xmit, }; -struct neigh_table nd_tbl = { +struct neigh_table global_nd_tbl = { .family = AF_INET6, .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), .key_len = sizeof(struct in6_addr), @@ -134,7 +134,7 @@ struct neigh_table nd_tbl = { .proxy_redo = pndisc_redo, .id = "ndisc_cache", .parms = { - .tbl = &nd_tbl, + .tbl = &global_nd_tbl, .base_reachable_time = 30 * HZ, .retrans_time = 1 * HZ, .gc_staletime = 60 * HZ, @@ -467,7 +467,9 @@ static void ndisc_send_na(struct net_dev inc_opt = 0; } - skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + skb = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_RESERVED_SPACE(dev)), 1, &err); if (skb == NULL) { @@ -555,7 +557,9 @@ void ndisc_send_ns(struct net_device *de if (send_llinfo) len += ndisc_opt_addr_space(dev); - skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + skb = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_RESERVED_SPACE(dev)), 1, &err); if (skb == NULL) { ND_PRINTK0(KERN_ERR @@ -629,7 +633,9 @@ void ndisc_send_rs(struct net_device *de if 
(dev->addr_len) len += ndisc_opt_addr_space(dev); - skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + skb = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_RESERVED_SPACE(dev)), 1, &err); if (skb == NULL) { ND_PRINTK0(KERN_ERR @@ -1419,7 +1425,9 @@ void ndisc_send_redirect(struct sk_buff rd_len &= ~0x7; len += rd_len; - buff = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + buff = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_RESERVED_SPACE(dev)), 1, &err); if (buff == NULL) { ND_PRINTK0(KERN_ERR @@ -1700,7 +1708,9 @@ int __init ndisc_init(struct net_proto_f * Initialize the neighbour table */ - neigh_table_init(&nd_tbl); + get_ve0()->ve_nd_tbl = &global_nd_tbl; + if (neigh_table_init(&nd_tbl)) + panic("cannot initialize IPv6 NDISC tables\n"); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, @@ -1722,3 +1732,52 @@ void ndisc_cleanup(void) sock_release(ndisc_socket); ndisc_socket = NULL; /* For safety. */ } + +#ifdef CONFIG_VE +int ve_ndisc_init(struct ve_struct *ve) +{ + struct ve_struct *old_env; + int err; + + ve->ve_nd_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL); + if (ve->ve_nd_tbl == NULL) + return -ENOMEM; + + *(ve->ve_nd_tbl) = global_nd_tbl; + ve->ve_nd_tbl->parms.tbl = ve->ve_nd_tbl; + old_env = set_exec_env(ve); + err = neigh_table_init(ve->ve_nd_tbl); + if (err) + goto out_free; +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, + "ipv6", + &ndisc_ifinfo_sysctl_change, + &ndisc_ifinfo_sysctl_strategy); +#endif + err = 0; + +out: + set_exec_env(old_env); + return err; + +out_free: + kfree(ve->ve_nd_tbl); + ve->ve_nd_tbl = NULL; + goto out; +} +EXPORT_SYMBOL(ve_ndisc_init); + +void ve_ndisc_fini(struct ve_struct *ve) +{ + if (ve->ve_nd_tbl) { +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&ve->ve_nd_tbl->parms); +#endif + neigh_table_clear(ve->ve_nd_tbl); + kfree(ve->ve_nd_tbl); + ve->ve_nd_tbl = NULL; + } +} +EXPORT_SYMBOL(ve_ndisc_fini); +#endif /* CONFIG_VE */ diff -uprN linux-2.6.18/net/ipv6/netfilter/ip6_queue.c linux-2.6.18.ovz/net/ipv6/netfilter/ip6_queue.c --- linux-2.6.18/net/ipv6/netfilter/ip6_queue.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/netfilter/ip6_queue.c 2007-06-13 06:55:07.000000000 -0400 @@ -505,7 +505,7 @@ ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -541,8 +541,11 @@ ipq_rcv_sk(struct sock *sk, int len) mutex_lock(&ipqnl_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { + struct ve_struct *env; skb = skb_dequeue(&sk->sk_receive_queue); + env = set_exec_env(skb->owner_env); ipq_rcv_skb(skb); + (void)set_exec_env(env); kfree_skb(skb); } diff -uprN linux-2.6.18/net/ipv6/netfilter/ip6_tables.c linux-2.6.18.ovz/net/ipv6/netfilter/ip6_tables.c --- linux-2.6.18/net/ipv6/netfilter/ip6_tables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/netfilter/ip6_tables.c 2007-06-13 06:55:07.000000000 -0400 @@ -32,9 +32,11 @@ #include #include #include +#include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -444,6 +446,13 @@ mark_source_chains(struct xt_table_info && unconditional(&e->ipv6)) { unsigned int oldpos, size; + if (t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " 
+ "negative verdict (%i)\n", + t->verdict); + return 0; + } + /* Return: backtrack through the last big jump. */ do { @@ -481,6 +490,13 @@ mark_source_chains(struct xt_table_info if (strcmp(t->target.u.user.name, IP6T_STANDARD_TARGET) == 0 && newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct ip6t_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -514,27 +530,6 @@ cleanup_match(struct ip6t_entry_match *m } static inline int -standard_check(const struct ip6t_entry_target *t, - unsigned int max_offset) -{ - struct ip6t_standard_target *targ = (void *)t; - - /* Check standard info. */ - if (targ->verdict >= 0 - && targ->verdict > max_offset - sizeof(struct ip6t_entry)) { - duprintf("ip6t_standard_check: bad verdict (%i)\n", - targ->verdict); - return 0; - } - if (targ->verdict < -NF_MAX_VERDICT - 1) { - duprintf("ip6t_standard_check: bad negative verdict (%i)\n", - targ->verdict); - return 0; - } - return 1; -} - -static inline int check_match(struct ip6t_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, @@ -592,12 +587,19 @@ check_entry(struct ip6t_entry *e, const return -EINVAL; } + if (e->target_offset + sizeof(struct ip6t_entry_target) > + e->next_offset) + return -EINVAL; + j = 0; ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j); if (ret != 0) goto cleanup_matches; t = ip6t_get_target(e); + ret = -EINVAL; + if (e->target_offset + t->u.target_size > e->next_offset) + goto cleanup_matches; target = try_then_request_module(xt_find_target(AF_INET6, t->u.user.name, t->u.user.revision), @@ -615,12 +617,7 @@ check_entry(struct ip6t_entry *e, const if (ret) goto err; - if (t->u.kernel.target == &ip6t_standard_target) { - if (!standard_check(t, size)) { - ret = -EINVAL; - goto cleanup_matches; - } - } else if (t->u.kernel.target->checkentry + if (t->u.kernel.target->checkentry && !t->u.kernel.target->checkentry(name, e, target, t->data, t->u.target_size - sizeof(*t), @@ -770,7 +767,7 @@ translate_table(const char *name, if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + cleanup_entry, &i); return ret; } @@ -780,7 +777,7 @@ translate_table(const char *name, memcpy(newinfo->entries[i], entry0, newinfo->size); } - return ret; + return 0; } /* Gets counters. 
*/ @@ -1131,9 +1128,14 @@ do_ip6t_set_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET6].next) + return -ENOENT; +#endif + switch (cmd) { case IP6T_SO_SET_REPLACE: ret = do_replace(user, len); @@ -1156,9 +1158,14 @@ do_ip6t_get_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET6].next) + return -ENOENT; +#endif + switch (cmd) { case IP6T_SO_GET_INFO: { char name[IP6T_TABLE_MAXNAMELEN]; @@ -1254,18 +1261,18 @@ do_ip6t_get_ctl(struct sock *sk, int cmd return ret; } -int ip6t_register_table(struct xt_table *table, +struct ip6t_table *ip6t_register_table(struct xt_table *table, const struct ip6t_replace *repl) { int ret; struct xt_table_info *newinfo; static struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* choose the copy on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; @@ -1278,28 +1285,29 @@ int ip6t_register_table(struct xt_table repl->underflow); if (ret != 0) { xt_free_table_info(newinfo); - return ret; + return ERR_PTR(ret); } - ret = xt_register_table(table, &bootstrap, newinfo); - if (ret != 0) { + table = virt_xt_register_table(table, &bootstrap, newinfo); + if (IS_ERR(table)) xt_free_table_info(newinfo); - return ret; - } - - return 0; + return table; } void ip6t_unregister_table(struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; + struct module *me; - private = xt_unregister_table(table); + me = table->me; + private = virt_xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + if (private->number > private->initial_entries) + module_put(me); xt_free_table_info(private); } @@ -1394,12 +1402,30 @@ static struct ip6t_match icmp6_matchstru .family = AF_INET6, }; +static int init_ip6tables(void) +{ +#ifdef CONFIG_VE_IPTABLES + if (get_exec_env()->_xt_tables[AF_INET6].next != NULL) + return -EEXIST; +#endif + + return xt_proto_init(AF_INET6); +} + +static void fini_ip6tables(void) +{ +#ifdef CONFIG_VE_IPTABLES + get_exec_env()->_xt_tables[AF_INET6].next = NULL; +#endif + xt_proto_fini(AF_INET6); +} + static int __init ip6_tables_init(void) { int ret; - ret = xt_proto_init(AF_INET6); - if (ret < 0) + ret = init_ip6tables(); + if (ret) goto err1; /* Noone else will be downing sem now, so we won't sleep */ @@ -1418,6 +1444,9 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; + KSYMRESOLVE(init_ip6tables); + KSYMRESOLVE(fini_ip6tables); + KSYMMODRESOLVE(ip6_tables); printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; @@ -1428,18 +1457,21 @@ err4: err3: xt_unregister_target(&ip6t_standard_target); err2: - xt_proto_fini(AF_INET6); + fini_ip6tables(); err1: return ret; } static void __exit ip6_tables_fini(void) { + KSYMMODUNRESOLVE(ip6_tables); + KSYMUNRESOLVE(init_ip6tables); + KSYMUNRESOLVE(fini_ip6tables); nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_match(&icmp6_matchstruct); xt_unregister_target(&ip6t_error_target); xt_unregister_target(&ip6t_standard_target); - xt_proto_fini(AF_INET6); + 
fini_ip6tables(); } /* @@ -1521,5 +1553,5 @@ EXPORT_SYMBOL(ip6t_do_table); EXPORT_SYMBOL(ip6t_ext_hdr); EXPORT_SYMBOL(ipv6_find_hdr); -module_init(ip6_tables_init); +subsys_initcall(ip6_tables_init); module_exit(ip6_tables_fini); diff -uprN linux-2.6.18/net/ipv6/netfilter/ip6table_filter.c linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_filter.c --- linux-2.6.18/net/ipv6/netfilter/ip6table_filter.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_filter.c 2007-06-13 06:55:07.000000000 -0400 @@ -11,12 +11,20 @@ #include #include +#include #include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("ip6tables filter table"); +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_filter (get_exec_env()->_ve_ip6t_filter_pf) +#else +#define ve_packet_filter &packet_filter +#endif + #define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) /* Standard entry. */ @@ -43,7 +51,7 @@ static struct struct ip6t_replace repl; struct ip6t_standard entries[3]; struct ip6t_error term; -} initial_table __initdata +} initial_table = { { "filter", FILTER_VALID_HOOKS, 4, sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), { [NF_IP6_LOCAL_IN] = 0, @@ -108,7 +116,7 @@ ip6t_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); + return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL); } static unsigned int @@ -128,7 +136,7 @@ ip6t_local_out_hook(unsigned int hook, } #endif - return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); + return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL); } static struct nf_hook_ops ip6t_ops[] = { @@ -159,22 +167,19 @@ static struct nf_hook_ops ip6t_ops[] = { static int forward = NF_ACCEPT; module_param(forward, bool, 0000); -static int __init ip6table_filter_init(void) +int init_ip6table_filter(void) { int ret; - - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; + struct ip6t_table *tmp_filter; /* Register table */ - ret = ip6t_register_table(&packet_filter, &initial_table.repl); - if (ret < 0) - return ret; + tmp_filter = ip6t_register_table(&packet_filter, + &initial_table.repl); + if (IS_ERR(tmp_filter)) + return PTR_ERR(tmp_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = tmp_filter; +#endif /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); @@ -184,14 +189,50 @@ static int __init ip6table_filter_init(v return ret; cleanup_table: - ip6t_unregister_table(&packet_filter); + ip6t_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif return ret; } -static void __exit ip6table_filter_fini(void) +void fini_ip6table_filter(void) { nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - ip6t_unregister_table(&packet_filter); + ip6t_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif +} + +static int __init ip6table_filter_init(void) +{ + int err; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + err = init_ip6table_filter(); + if (err < 0) + return err; + + KSYMRESOLVE(init_ip6table_filter); 
+ KSYMRESOLVE(fini_ip6table_filter); + KSYMMODRESOLVE(ip6table_filter); + return 0; +} + +static void __exit ip6table_filter_fini(void) +{ + KSYMMODUNRESOLVE(ip6table_filter); + KSYMUNRESOLVE(init_ip6table_filter); + KSYMUNRESOLVE(fini_ip6table_filter); + fini_ip6table_filter(); } module_init(ip6table_filter_init); diff -uprN linux-2.6.18/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_mangle.c --- linux-2.6.18/net/ipv6/netfilter/ip6table_mangle.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_mangle.c 2007-06-13 06:55:07.000000000 -0400 @@ -12,6 +12,7 @@ */ #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -53,7 +54,7 @@ static struct struct ip6t_replace repl; struct ip6t_standard entries[5]; struct ip6t_error term; -} initial_table __initdata +} initial_table = { { "mangle", MANGLE_VALID_HOOKS, 6, sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), { [NF_IP6_PRE_ROUTING] = 0, @@ -130,6 +131,13 @@ static struct ip6t_table packet_mangler .af = AF_INET6, }; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_mangler (get_exec_env()->_ip6t_mangle_table) +#else +#define ve_packet_mangler &packet_mangler +#endif + /* The work comes in here from netfilter.c. */ static unsigned int ip6t_route_hook(unsigned int hook, @@ -138,7 +146,7 @@ ip6t_route_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); + return ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); } static unsigned int @@ -174,7 +182,7 @@ ip6t_local_hook(unsigned int hook, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h); - ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); + ret = ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr)) @@ -228,14 +236,19 @@ static struct nf_hook_ops ip6t_ops[] = { }, }; -static int __init ip6table_mangle_init(void) +int init_ip6table_mangle(void) { int ret; + struct ip6t_table *tmp_mangler; /* Register table */ - ret = ip6t_register_table(&packet_mangler, &initial_table.repl); - if (ret < 0) - return ret; + tmp_mangler = ip6t_register_table(&packet_mangler, + &initial_table.repl); + if (IS_ERR(tmp_mangler)) + return PTR_ERR(tmp_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = tmp_mangler; +#endif /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); @@ -245,14 +258,42 @@ static int __init ip6table_mangle_init(v return ret; cleanup_table: - ip6t_unregister_table(&packet_mangler); + ip6t_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif return ret; } -static void __exit ip6table_mangle_fini(void) +void fini_ip6table_mangle(void) { nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - ip6t_unregister_table(&packet_mangler); + ip6t_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif +} + +static int __init ip6table_mangle_init(void) +{ + int err; + + err = init_ip6table_mangle(); + if (err < 0) + return err; + + KSYMRESOLVE(init_ip6table_mangle); + KSYMRESOLVE(fini_ip6table_mangle); + KSYMMODRESOLVE(ip6table_mangle); + return 0; +} + +static void __exit ip6table_mangle_fini(void) +{ + KSYMMODUNRESOLVE(ip6table_mangle); + 
KSYMUNRESOLVE(init_ip6table_mangle); + KSYMUNRESOLVE(fini_ip6table_mangle); + fini_ip6table_mangle(); } module_init(ip6table_mangle_init); diff -uprN linux-2.6.18/net/ipv6/netfilter/ip6table_raw.c linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_raw.c --- linux-2.6.18/net/ipv6/netfilter/ip6table_raw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/netfilter/ip6table_raw.c 2007-06-13 06:55:07.000000000 -0400 @@ -145,11 +145,12 @@ static struct nf_hook_ops ip6t_ops[] = { static int __init ip6table_raw_init(void) { int ret; + struct ip6t_table *tmp; /* Register table */ - ret = ip6t_register_table(&packet_raw, &initial_table.repl); - if (ret < 0) - return ret; + tmp = ip6t_register_table(&packet_raw, &initial_table.repl); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); diff -uprN linux-2.6.18/net/ipv6/proc.c linux-2.6.18.ovz/net/ipv6/proc.c --- linux-2.6.18/net/ipv6/proc.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/proc.c 2007-06-13 06:55:07.000000000 -0400 @@ -24,13 +24,18 @@ #include #include #include +#include #include #include #include #include #ifdef CONFIG_PROC_FS +#ifdef CONFIG_VE +#define proc_net_devsnmp6 (get_exec_env()->_proc_net_devsnmp6) +#else static struct proc_dir_entry *proc_net_devsnmp6; +#endif static int fold_prot_inuse(struct proto *proto) { @@ -163,9 +168,9 @@ static int snmp6_seq_show(struct seq_fil seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); } else { - snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); - snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); + snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list); + snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list); + snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list); } return 0; } @@ -228,15 +233,27 @@ int snmp6_unregister_dev(struct inet6_de return 0; } +int ve_snmp_proc_init(void) +{ + proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); + return proc_net_devsnmp6 == NULL ? 
-ENOMEM : 0; +} +EXPORT_SYMBOL(ve_snmp_proc_init); + +void ve_snmp_proc_fini(void) +{ + proc_net_remove("dev_snmp6"); +} +EXPORT_SYMBOL(ve_snmp_proc_fini); + int __init ipv6_misc_proc_init(void) { int rc = 0; - if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops)) + if (!proc_glob_fops_create("net/snmp6", S_IRUGO, &snmp6_seq_fops)) goto proc_snmp6_fail; - proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); - if (!proc_net_devsnmp6) + if (ve_snmp_proc_init()) goto proc_dev_snmp6_fail; if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops)) @@ -245,9 +262,9 @@ out: return rc; proc_sockstat6_fail: - proc_net_remove("dev_snmp6"); + ve_snmp_proc_fini(); proc_dev_snmp6_fail: - proc_net_remove("snmp6"); + remove_proc_glob_entry("net/snmp6", NULL); proc_snmp6_fail: rc = -ENOMEM; goto out; @@ -256,7 +273,7 @@ proc_snmp6_fail: void ipv6_misc_proc_exit(void) { proc_net_remove("sockstat6"); - proc_net_remove("dev_snmp6"); + ve_snmp_proc_fini(); proc_net_remove("snmp6"); } diff -uprN linux-2.6.18/net/ipv6/raw.c linux-2.6.18.ovz/net/ipv6/raw.c --- linux-2.6.18/net/ipv6/raw.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/raw.c 2007-06-13 06:55:07.000000000 -0400 @@ -99,6 +99,10 @@ struct sock *__raw_v6_lookup(struct sock if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; + if (!ve_accessible_strict(sk->owner_env, + get_exec_env())) + continue; + if (!ipv6_addr_any(&np->rcv_saddr)) { if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) goto found; @@ -1105,8 +1109,13 @@ static struct sock *raw6_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != PF_INET6); + if (!sk) + break; + if (sk->sk_family != PF_INET6) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { sk = sk_head(&raw_v6_htable[state->bucket]); @@ -1224,13 +1233,13 @@ static struct file_operations raw6_seq_f int __init raw6_proc_init(void) { - if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops)) + if (!proc_glob_fops_create("net/raw6", S_IRUGO, &raw6_seq_fops)) return -ENOMEM; return 0; } void raw6_proc_exit(void) { - proc_net_remove("raw6"); + remove_proc_glob_entry("net/raw6", NULL); } #endif /* CONFIG_PROC_FS */ diff -uprN linux-2.6.18/net/ipv6/reassembly.c linux-2.6.18.ovz/net/ipv6/reassembly.c --- linux-2.6.18/net/ipv6/reassembly.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/reassembly.c 2007-06-13 06:55:07.000000000 -0400 @@ -94,6 +94,7 @@ struct frag_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; + struct ve_struct *owner_env; }; /* Hash table. */ @@ -292,7 +293,9 @@ static void ip6_frag_expire(unsigned lon { struct frag_queue *fq = (struct frag_queue *) data; struct net_device *dev; + struct ve_struct *envid; + envid = set_exec_env(fq->owner_env); spin_lock(&fq->lock); if (fq->last_in & COMPLETE) @@ -322,6 +325,8 @@ static void ip6_frag_expire(unsigned lon out: spin_unlock(&fq->lock); fq_put(fq, NULL); + + (void)set_exec_env(envid); } /* Creation primitives. 
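
In reassembly.c, starting above, struct frag_queue grows an owner_env field: the expiry timer switches into the owning VE via set_exec_env() before acting on the queue, and (in the continuation) both ip6_frag_intern() and fq_find() include owner_env in the identity comparison. Fragments from different containers can therefore never be spliced into one packet even when their (id, saddr, daddr) keys collide; schematically (toy types):

    #include <stdio.h>

    struct fq { unsigned id, saddr, daddr; void *owner_env; };

    static int fq_match(const struct fq *fq, unsigned id,
                        unsigned saddr, unsigned daddr, void *env)
    {
            return fq->id == id && fq->saddr == saddr &&
                   fq->daddr == daddr &&
                   fq->owner_env == env;    /* the comparison the patch adds */
    }

    int main(void)
    {
            int ve1, ve2;                   /* stand-ins for two ve_structs */
            struct fq q = { 7, 0xa, 0xb, &ve1 };

            printf("same VE: %d, other VE: %d\n",
                   fq_match(&q, 7, 0xa, 0xb, &ve1),
                   fq_match(&q, 7, 0xa, 0xb, &ve2));
            return 0;
    }
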
*/ @@ -341,7 +346,8 @@ static struct frag_queue *ip6_frag_inter hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == fq_in->id && ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && - ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { + ipv6_addr_equal(&fq_in->daddr, &fq->daddr) && + fq->owner_env == get_exec_env()) { atomic_inc(&fq->refcnt); write_unlock(&ip6_frag_lock); fq_in->last_in |= COMPLETE; @@ -382,6 +388,7 @@ ip6_frag_create(u32 id, struct in6_addr fq->timer.data = (long) fq; spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); + fq->owner_env = get_exec_env(); return ip6_frag_intern(fq); @@ -402,7 +409,8 @@ fq_find(u32 id, struct in6_addr *src, st hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == id && ipv6_addr_equal(src, &fq->saddr) && - ipv6_addr_equal(dst, &fq->daddr)) { + ipv6_addr_equal(dst, &fq->daddr) && + fq->owner_env == get_exec_env()) { atomic_inc(&fq->refcnt); read_unlock(&ip6_frag_lock); return fq; @@ -731,6 +739,9 @@ static int ipv6_frag_rcv(struct sk_buff fq->meat == fq->len) ret = ip6_frag_reasm(fq, skbp, dev); + if (ret > 0) + (*skbp)->owner_env = skb->owner_env; + spin_unlock(&fq->lock); fq_put(fq, NULL); return ret; @@ -741,6 +752,48 @@ static int ipv6_frag_rcv(struct sk_buff return -1; } +#ifdef CONFIG_VE +/* XXX */ +void ip6_frag_cleanup(struct ve_struct *envid) +{ + int i, progress; + + local_bh_disable(); + do { + progress = 0; + for (i = 0; i < IP6Q_HASHSZ; i++) { + struct frag_queue *fq; + struct hlist_node *p, *n; + + if (hlist_empty(&ip6_frag_hash[i])) + continue; +inner_restart: + read_lock(&ip6_frag_lock); + hlist_for_each_entry_safe(fq, p, n, + &ip6_frag_hash[i], list) { + if (!ve_accessible_strict(fq->owner_env, envid)) + continue; + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq, NULL); + progress = 1; + goto inner_restart; + } + read_unlock(&ip6_frag_lock); + } + } while(progress); + local_bh_enable(); +} +EXPORT_SYMBOL(ip6_frag_cleanup); +#endif + + static struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, diff -uprN linux-2.6.18/net/ipv6/route.c linux-2.6.18.ovz/net/ipv6/route.c --- linux-2.6.18/net/ipv6/route.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/route.c 2007-06-13 06:55:07.000000000 -0400 @@ -51,7 +51,6 @@ #include #include #include -#include #include #include @@ -125,7 +124,6 @@ struct rt6_info ip6_null_entry = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .dev = &loopback_dev, .obsolete = -1, .error = -ENETUNREACH, .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, @@ -140,11 +138,19 @@ struct rt6_info ip6_null_entry = { .rt6i_ref = ATOMIC_INIT(1), }; -struct fib6_node ip6_routing_table = { - .leaf = &ip6_null_entry, - .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, +struct fib6_table global_fib6_table = { + .root = { + .leaf = &ip6_null_entry, + .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, + } }; +#ifdef CONFIG_VE +#define ip6_routing_table (get_exec_env()->_fib6_table->root) +#else +#define ip6_routing_table (global_ip6_routing_table.root) +#endif + /* Protects all the ip6 fib */ DEFINE_RWLOCK(rt6_lock); @@ -884,7 +890,7 @@ static int ipv6_get_mtu(struct net_devic int ipv6_get_hoplimit(struct net_device *dev) { - int hoplimit = ipv6_devconf.hop_limit; + int hoplimit = ve_ipv6_devconf.hop_limit; struct inet6_dev *idev; idev = in6_dev_get(dev); @@ -1579,10 +1585,12 @@ struct rt6_info *addrconf_dst_alloc(stru rt->rt6i_flags |= RTF_ANYCAST; 
else rt->rt6i_flags |= RTF_LOCAL; - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); + if (IS_ERR(rt->rt6i_nexthop)) { + void *err = rt->rt6i_nexthop; + rt->rt6i_nexthop = NULL; dst_free((struct dst_entry *) rt); - return ERR_PTR(-ENOMEM); + return err; } ipv6_addr_copy(&rt->rt6i_dst.addr, addr); @@ -1798,8 +1806,12 @@ static int rt6_fill_node(struct sk_buff goto rtattr_failure; if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); - if (rt->u.dst.dev) - RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex); + if (rt->u.dst.dev) { + struct net_device *odev = rt->rt6i_dev; + if (rt == &ip6_null_entry) + odev = &loopback_dev; + RTA_PUT(skb, RTA_OIF, sizeof(int), &odev->ifindex); + } RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric); ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); if (rt->rt6i_expires) @@ -2267,23 +2279,31 @@ void __init ip6_route_init(void) if (!ip6_dst_ops.kmem_cachep) panic("cannot create ip6_dst_cache"); +#ifdef CONFIG_VE + global_fib6_table.owner_env = get_ve0(); + get_ve0()->_fib6_table = &global_fib6_table; +#endif + list_add(&global_fib6_table.list, &fib6_table_list); fib6_init(); #ifdef CONFIG_PROC_FS - p = proc_net_create("ipv6_route", 0, rt6_proc_info); - if (p) + p = create_proc_glob_entry("net/ipv6_route", 0, NULL); + if (p) { p->owner = THIS_MODULE; + p->get_info = rt6_proc_info; + } proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops); #endif #ifdef CONFIG_XFRM xfrm6_init(); #endif + ip6_null_entry.u.dst.dev = &loopback_dev; } void ip6_route_cleanup(void) { #ifdef CONFIG_PROC_FS - proc_net_remove("ipv6_route"); + remove_proc_glob_entry("net/ipv6_route", NULL); proc_net_remove("rt6_stats"); #endif #ifdef CONFIG_XFRM @@ -2293,3 +2313,37 @@ void ip6_route_cleanup(void) fib6_gc_cleanup(); kmem_cache_destroy(ip6_dst_ops.kmem_cachep); } + +#ifdef CONFIG_VE +int init_ve_route6(struct ve_struct *ve) +{ + struct ve_struct *old_env = set_exec_env(ve); + ve->_fib6_table = kzalloc(sizeof(struct fib6_table), GFP_KERNEL_UBC); + if (ve->_fib6_table) { + ve->_fib6_table->owner_env = ve; + ve->_fib6_table->root.leaf = &ip6_null_entry; + ve->_fib6_table->root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + write_lock_bh(&rt6_lock); + list_add(&ve->_fib6_table->list, &fib6_table_list); + write_unlock_bh(&rt6_lock); + } + set_exec_env(old_env); + return ve->_fib6_table ? 
0 : -ENOMEM; +} +EXPORT_SYMBOL(init_ve_route6); + +void fini_ve_route6(struct ve_struct *ve) +{ + struct ve_struct *old_env = set_exec_env(ve); + + if (ve->_fib6_table) { + rt6_ifdown(NULL); + write_lock_bh(&rt6_lock); + list_del(&ve->_fib6_table->list); + write_unlock_bh(&rt6_lock); + kfree(ve->_fib6_table); + } + set_exec_env(old_env); +} +EXPORT_SYMBOL(fini_ve_route6); +#endif diff -uprN linux-2.6.18/net/ipv6/tcp_ipv6.c linux-2.6.18.ovz/net/ipv6/tcp_ipv6.c --- linux-2.6.18/net/ipv6/tcp_ipv6.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/tcp_ipv6.c 2007-06-13 06:55:07.000000000 -0400 @@ -61,6 +61,8 @@ #include #include +#include + #include #include @@ -76,7 +78,7 @@ static void tcp_v6_send_check(struct soc static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static struct inet_connection_sock_af_ops ipv6_mapped; +struct inet_connection_sock_af_ops ipv6_mapped; static struct inet_connection_sock_af_ops ipv6_specific; static int tcp_v6_get_port(struct sock *sk, unsigned short snum) @@ -1054,6 +1056,8 @@ static int tcp_v6_do_rcv(struct sock *sk struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; struct sk_buff *opt_skb = NULL; + struct user_beancounter *ub; + /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. @@ -1066,6 +1070,8 @@ static int tcp_v6_do_rcv(struct sock *sk if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + ub = set_exec_ub(sock_bc(sk)->ub); + if (sk_filter(sk, skb, 0)) goto discard; @@ -1097,7 +1103,7 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; - return 0; + goto restore_context; } if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) @@ -1118,7 +1124,7 @@ static int tcp_v6_do_rcv(struct sock *sk goto reset; if (opt_skb) __kfree_skb(opt_skb); - return 0; + goto restore_context; } } @@ -1128,6 +1134,9 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1136,7 +1145,7 @@ discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb(skb); - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(TCP_MIB_INERRS); goto discard; @@ -1168,7 +1177,7 @@ ipv6_pktoptions: if (opt_skb) kfree_skb(opt_skb); - return 0; + goto restore_context; } static int tcp_v6_rcv(struct sk_buff **pskb) @@ -1228,7 +1237,7 @@ process: skb->dev = NULL; - bh_lock_sock(sk); + bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { #ifdef CONFIG_NET_DMA @@ -1335,12 +1344,13 @@ static struct inet_connection_sock_af_op .compat_getsockopt = compat_ipv6_getsockopt, #endif }; +EXPORT_SYMBOL_GPL(ipv6_mapped); /* * TCP over IPv4 via INET6 API */ -static struct inet_connection_sock_af_ops ipv6_mapped = { +struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1562,7 +1572,7 @@ out: static struct file_operations tcp6_seq_fops; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .owner = THIS_MODULE, - .name = "tcp6", + .name = "net/tcp6", .family = AF_INET6, .seq_show = tcp6_seq_show, .seq_fops = &tcp6_seq_fops, diff -uprN linux-2.6.18/net/ipv6/udp.c linux-2.6.18.ovz/net/ipv6/udp.c --- linux-2.6.18/net/ipv6/udp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/ipv6/udp.c 2007-06-13 06:55:07.000000000 -0400 @@ -68,7 +68,9 @@ static int udp_v6_get_port(struct sock * { struct sock *sk2; struct hlist_node *node; + struct 
ve_struct *env; + env = sk->owner_env; write_lock_bh(&udp_hash_lock); if (snum == 0) { int best_size_so_far, best, result, i; @@ -82,7 +84,7 @@ static int udp_v6_get_port(struct sock * int size; struct hlist_head *list; - list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + list = &udp_hash[udp_hashfn(result, VEID(env))]; if (hlist_empty(list)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -104,7 +106,7 @@ static int udp_v6_get_port(struct sock * result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (!udp_lport_inuse(result)) + if (!udp_lport_inuse(result, env)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -113,9 +115,10 @@ gotit: udp_port_rover = snum = result; } else { sk_for_each(sk2, node, - &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + &udp_hash[udp_hashfn(snum, VEID(env))]) { if (inet_sk(sk2)->num == snum && sk2 != sk && + ve_accessible_strict(sk2->owner_env, env) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && @@ -127,7 +130,7 @@ gotit: inet_sk(sk)->num = snum; if (sk_unhashed(sk)) { - sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); + sk_add_node(sk, &udp_hash[udp_hashfn(snum, VEID(env))]); sock_prot_inc_use(sk->sk_prot); } write_unlock_bh(&udp_hash_lock); @@ -160,12 +163,15 @@ static struct sock *udp_v6_lookup(struct struct hlist_node *node; unsigned short hnum = ntohs(dport); int badness = -1; + struct ve_struct *env; read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + env = get_exec_env(); + sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { struct inet_sock *inet = inet_sk(sk); - if (inet->num == hnum && sk->sk_family == PF_INET6) { + if (inet->num == hnum && sk->sk_family == PF_INET6 && + ve_accessible_strict(sk->owner_env, env)) { struct ipv6_pinfo *np = inet6_sk(sk); int score = 0; if (inet->dport) { @@ -314,14 +320,13 @@ static void udpv6_err(struct sk_buff *sk { struct ipv6_pinfo *np; struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; - struct net_device *dev = skb->dev; struct in6_addr *saddr = &hdr->saddr; struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr*)(skb->data+offset); struct sock *sk; int err; - sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); + sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, inet6_iif(skb)); if (sk == NULL) return; @@ -414,8 +419,9 @@ static void udpv6_mcast_deliver(struct u int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); - dif = skb->dev->ifindex; + sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), + VEID(skb->owner_env))]); + dif = inet6_iif(skb); sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) { kfree_skb(skb); @@ -496,7 +502,7 @@ static int udpv6_rcv(struct sk_buff **ps * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. 
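
The UDP hunks above swap the open-coded `snum & (UDP_HTABLE_SIZE - 1)` for udp_hashfn(port, VEID(env)) and add ve_accessible_strict() to the bind-conflict and lookup tests. udp_hashfn() itself is defined elsewhere in the patch series; the sketch below assumes a plausible shape purely for illustration:

    #include <stdio.h>

    #define UDP_HTABLE_SIZE 128

    /* hypothetical body: fold the VE ID into the slot so per-VE
     * sockets spread over the shared table; correctness still comes
     * from the explicit owner checks, not from the hash */
    static unsigned udp_hashfn(unsigned port, unsigned veid)
    {
            return (port + veid * 2654435761u) & (UDP_HTABLE_SIZE - 1);
    }

    int main(void)
    {
            printf("VE 101: slot %u\n", udp_hashfn(53, 101));
            printf("VE 202: slot %u\n", udp_hashfn(53, 202));
            return 0;
    }
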
*/ - sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); + sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, inet6_iif(skb)); if (sk == NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -1049,7 +1055,7 @@ static int udp6_seq_show(struct seq_file static struct file_operations udp6_seq_fops; static struct udp_seq_afinfo udp6_seq_afinfo = { .owner = THIS_MODULE, - .name = "udp6", + .name = "net/udp6", .family = AF_INET6, .seq_show = udp6_seq_show, .seq_fops = &udp6_seq_fops, diff -uprN linux-2.6.18/net/irda/irttp.c linux-2.6.18.ovz/net/irda/irttp.c --- linux-2.6.18/net/irda/irttp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/irda/irttp.c 2007-06-13 06:55:07.000000000 -0400 @@ -1098,7 +1098,7 @@ int irttp_connect_request(struct tsap_cb return -ENOMEM; /* Reserve space for MUX_CONTROL and LAP header */ - skb_reserve(tx_skb, TTP_MAX_HEADER); + skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER); } else { tx_skb = userdata; /* @@ -1346,7 +1346,7 @@ int irttp_connect_response(struct tsap_c return -ENOMEM; /* Reserve space for MUX_CONTROL and LAP header */ - skb_reserve(tx_skb, TTP_MAX_HEADER); + skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER); } else { tx_skb = userdata; /* diff -uprN linux-2.6.18/net/netfilter/Kconfig linux-2.6.18.ovz/net/netfilter/Kconfig --- linux-2.6.18/net/netfilter/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/Kconfig 2007-06-13 06:55:07.000000000 -0400 @@ -197,7 +197,9 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_CONNSECMARK tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK) + depends on NETFILTER_XTABLES && \ + ((NF_CONNTRACK && NF_CONNTRACK_SECMARK) || \ + (IP_NF_CONNTRACK && IP_NF_CONNTRACK_SECMARK)) help The CONNSECMARK target copies security markings from packets to connections, and restores security markings from connections @@ -342,7 +344,7 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE_NETFILTER + depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. diff -uprN linux-2.6.18/net/netfilter/core.c linux-2.6.18.ovz/net/netfilter/core.c --- linux-2.6.18/net/netfilter/core.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/core.c 2007-06-13 06:55:08.000000000 -0400 @@ -54,16 +54,34 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo); * of skbuffs queued for userspace, and not deregister a hook unless * this is zero, but that sucks. Now, we simply check when the * packets come back: if the hook is gone, the packet is discarded. 
*/ +static DEFINE_SPINLOCK(nf_hook_lock); + struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks); -static DEFINE_SPINLOCK(nf_hook_lock); +#ifdef CONFIG_VE_IPTABLES +#define VE_NF_HOOKS(env, x, y) \ + ((struct list_head (*)[NF_MAX_HOOKS])(env->_nf_hooks))[x][y] +#else +#define VE_NF_HOOKS(env, x, y) nf_hooks[x][y] +#endif int nf_register_hook(struct nf_hook_ops *reg) { struct list_head *i; + struct ve_struct *env; + + env = get_exec_env(); + if (!ve_is_super(env)) { + struct nf_hook_ops *tmp; + tmp = kmalloc(sizeof(struct nf_hook_ops), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + memcpy(tmp, reg, sizeof(struct nf_hook_ops)); + reg = tmp; + } spin_lock_bh(&nf_hook_lock); - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + list_for_each(i, &VE_NF_HOOKS(env, reg->pf, reg->hooknum)) { if (reg->priority < ((struct nf_hook_ops *)i)->priority) break; } @@ -77,11 +95,29 @@ EXPORT_SYMBOL(nf_register_hook); void nf_unregister_hook(struct nf_hook_ops *reg) { + struct nf_hook_ops *i; + struct ve_struct *env; + + env = get_exec_env(); + if (!ve_is_super(env)) { + list_for_each_entry_rcu(i, + &VE_NF_HOOKS(env, reg->pf, reg->hooknum), list) { + if (reg->hook == i->hook) { + reg = i; + break; + } + } + if (reg != i) + return; + } + spin_lock_bh(&nf_hook_lock); list_del_rcu(®->list); spin_unlock_bh(&nf_hook_lock); synchronize_net(); + if (!ve_is_super(env)) + kfree(reg); } EXPORT_SYMBOL(nf_unregister_hook); @@ -166,13 +202,15 @@ int nf_hook_slow(int pf, unsigned int ho struct list_head *elem; unsigned int verdict; int ret = 0; + struct ve_struct *env; /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = &nf_hooks[pf][hook]; + env = get_exec_env(); + elem = &VE_NF_HOOKS(env, pf, hook); next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + verdict = nf_iterate(&VE_NF_HOOKS(env, pf, hook), pskb, hook, indev, outdev, &elem, okfn, hook_thresh); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; @@ -245,13 +283,54 @@ struct proc_dir_entry *proc_net_netfilte EXPORT_SYMBOL(proc_net_netfilter); #endif -void __init netfilter_init(void) +void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS]) { int i, h; for (i = 0; i < NPROTO; i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); + INIT_LIST_HEAD(&nh[i][h]); } +} + +int init_netfilter(void) +{ +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *envid; + + envid = get_exec_env(); + envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL); + if (envid->_nf_hooks == NULL) + return -ENOMEM; + + /* FIXME: charge ubc */ + + init_nf_hooks(envid->_nf_hooks); + return 0; +#else + init_nf_hooks(nf_hooks); + return 0; +#endif +} +EXPORT_SYMBOL(init_netfilter); + +#ifdef CONFIG_VE_IPTABLES +void fini_netfilter(void) +{ + struct ve_struct *envid; + + envid = get_exec_env(); + if (envid->_nf_hooks != NULL) + kfree(envid->_nf_hooks); + envid->_nf_hooks = NULL; + + /* FIXME: uncharge ubc */ +} +EXPORT_SYMBOL(fini_netfilter); +#endif + +void __init netfilter_init(void) +{ + init_netfilter(); #ifdef CONFIG_PROC_FS proc_net_netfilter = proc_mkdir("netfilter", proc_net); @@ -264,3 +343,4 @@ void __init netfilter_init(void) if (netfilter_log_init() < 0) panic("cannot initialize nf_log"); } + diff -uprN linux-2.6.18/net/netfilter/nf_conntrack_proto_sctp.c linux-2.6.18.ovz/net/netfilter/nf_conntrack_proto_sctp.c --- linux-2.6.18/net/netfilter/nf_conntrack_proto_sctp.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/nf_conntrack_proto_sctp.c 2007-06-13 
06:55:08.000000000 -0400 @@ -469,7 +469,8 @@ static int sctp_new(struct nf_conn *conn SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_MAX) { + if (newconntrack == SCTP_CONNTRACK_NONE || + newconntrack == SCTP_CONNTRACK_MAX) { DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); return 0; } diff -uprN linux-2.6.18/net/netfilter/nf_queue.c linux-2.6.18.ovz/net/netfilter/nf_queue.c --- linux-2.6.18/net/netfilter/nf_queue.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/nf_queue.c 2007-06-13 06:55:08.000000000 -0400 @@ -185,12 +185,12 @@ void nf_reinject(struct sk_buff *skb, st /* Drop reference to owner of hook which queued us. */ module_put(info->elem->owner); - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) { if (i == elem) break; } - if (i == &nf_hooks[info->pf][info->hook]) { + if (i == &ve_nf_hooks[info->pf][info->hook]) { /* The module which sent it to userspace is gone. */ NFDEBUG("%s: module disappeared, dropping packet.\n", __FUNCTION__); @@ -211,7 +211,7 @@ void nf_reinject(struct sk_buff *skb, st if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook], &skb, info->hook, info->indev, info->outdev, &elem, info->okfn, INT_MIN); diff -uprN linux-2.6.18/net/netfilter/nf_sockopt.c linux-2.6.18.ovz/net/netfilter/nf_sockopt.c --- linux-2.6.18/net/netfilter/nf_sockopt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/nf_sockopt.c 2007-06-13 06:55:08.000000000 -0400 @@ -80,6 +80,11 @@ static int nf_sockopt(struct sock *sk, i struct nf_sockopt_ops *ops; int ret; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_nf_hooks) + return -ENOPROTOOPT; +#endif + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return -EINTR; @@ -138,6 +143,11 @@ static int compat_nf_sockopt(struct sock struct nf_sockopt_ops *ops; int ret; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_nf_hooks) + return -ENOPROTOOPT; +#endif + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return -EINTR; diff -uprN linux-2.6.18/net/netfilter/nfnetlink.c linux-2.6.18.ovz/net/netfilter/nfnetlink.c --- linux-2.6.18/net/netfilter/nfnetlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/nfnetlink.c 2007-06-13 06:55:08.000000000 -0400 @@ -228,7 +228,7 @@ static int nfnetlink_rcv_msg(struct sk_b NFNL_SUBSYS_ID(nlh->nlmsg_type), NFNL_MSG_TYPE(nlh->nlmsg_type)); - if (security_netlink_recv(skb, CAP_NET_ADMIN)) { + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) { DEBUGP("missing CAP_NET_ADMIN\n"); *errp = -EPERM; return -1; diff -uprN linux-2.6.18/net/netfilter/x_tables.c linux-2.6.18.ovz/net/netfilter/x_tables.c --- linux-2.6.18/net/netfilter/x_tables.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/x_tables.c 2007-06-13 06:55:08.000000000 -0400 @@ -24,6 +24,10 @@ #include #include +#include + +#include +#include MODULE_LICENSE("GPL"); @@ -42,6 +46,14 @@ struct xt_af { static struct xt_af *xt; +#ifdef CONFIG_VE_IPTABLES +/* include ve.h and define get_exec_env */ +#include +#define xt_tables(af) (get_exec_env()->_xt_tables[af]) +#else +#define xt_tables(af) xt[af].tables +#endif + #ifdef DEBUG_IP_FIREWALL_USER #define duprintf(format, args...) 
printk(format , ## args) #else @@ -60,6 +72,46 @@ static const char *xt_prefix[NPROTO] = { [NF_ARP] = "arp", }; +#ifdef CONFIG_USER_RESOURCE +static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info) +{ + struct user_beancounter *ub; + + for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent); + return ub; +} + +static void uncharge_xtables(struct xt_table_info *info, unsigned long size) +{ + struct user_beancounter *ub; + + ub = xt_table_ub(info); + uncharge_beancounter(ub, UB_NUMXTENT, size); +} + +static int recharge_xtables(int check_ub, + struct xt_table_info *new, struct xt_table_info *old) +{ + struct user_beancounter *ub; + long change; + + ub = xt_table_ub(new); + BUG_ON(check_ub && ub != xt_table_ub(old)); + + change = (long)new->number - (long)old->number; + if (change > 0) { + if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT)) + return -ENOMEM; + } else if (change < 0) + uncharge_beancounter(ub, UB_NUMXTENT, -change); + + return 0; +} +#else +#define recharge_xtables(c, new, old) (0) +#define uncharge_xtables(info, s) do { } while (0) +#endif /* CONFIG_USER_RESOURCE */ + /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) @@ -71,7 +123,7 @@ xt_register_target(struct xt_target *tar return ret; list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_target); @@ -98,7 +150,7 @@ xt_register_match(struct xt_match *match list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_match); @@ -248,24 +300,25 @@ int xt_check_match(const struct xt_match unsigned short proto, int inv_proto) { if (XT_ALIGN(match->matchsize) != size) { - printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], match->name, - XT_ALIGN(match->matchsize), size); + ve_printk(VE_LOG, "%s_tables: %s match: invalid size %Zu != " + "%u\n", xt_prefix[family], match->name, + XT_ALIGN(match->matchsize), size); return -EINVAL; } if (match->table && strcmp(match->table, table)) { - printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], match->name, match->table, table); + ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table," + " not %s\n", xt_prefix[family], match->name, + match->table, table); return -EINVAL; } if (match->hooks && (hook_mask & ~match->hooks) != 0) { - printk("%s_tables: %s match: bad hook_mask %u\n", + ve_printk(VE_LOG, "%s_tables: %s match: bad hook_mask %u\n", xt_prefix[family], match->name, hook_mask); return -EINVAL; } if (match->proto && (match->proto != proto || inv_proto)) { - printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], match->name, match->proto); + ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol" + " %u\n", xt_prefix[family], match->name, match->proto); return -EINVAL; } return 0; @@ -325,24 +378,26 @@ int xt_check_target(const struct xt_targ unsigned short proto, int inv_proto) { if (XT_ALIGN(target->targetsize) != size) { - printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], target->name, - XT_ALIGN(target->targetsize), size); + ve_printk(VE_LOG, "%s_tables: %s target: invalid size %Zu != " + "%u\n", xt_prefix[family], target->name, + XT_ALIGN(target->targetsize), size); return -EINVAL; } if (target->table && strcmp(target->table, table)) { - printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], target->name, target->table, 
table); + ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s " + "table, not %s\n", xt_prefix[family], target->name, + target->table, table); return -EINVAL; } if (target->hooks && (hook_mask & ~target->hooks) != 0) { - printk("%s_tables: %s target: bad hook_mask %u\n", + ve_printk(VE_LOG, "%s_tables: %s target: bad hook_mask %u\n", xt_prefix[family], target->name, hook_mask); return -EINVAL; } if (target->proto && (target->proto != proto || inv_proto)) { - printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], target->name, target->proto); + ve_printk(VE_LOG, "%s_tables: %s target: only valid for " + "protocol %u\n", xt_prefix[family], target->name, + target->proto); return -EINVAL; } return 0; @@ -406,19 +461,19 @@ struct xt_table_info *xt_alloc_table_inf if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) return NULL; - newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); + newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL_UBC); if (!newinfo) return NULL; - newinfo->size = size; + newinfo->alloc_size = newinfo->size = size; for_each_possible_cpu(cpu) { if (size <= PAGE_SIZE) newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, + GFP_KERNEL_UBC, cpu_to_node(cpu)); else - newinfo->entries[cpu] = vmalloc_node(size, + newinfo->entries[cpu] = ub_vmalloc_node(size, cpu_to_node(cpu)); if (newinfo->entries[cpu] == NULL) { @@ -436,7 +491,7 @@ void xt_free_table_info(struct xt_table_ int cpu; for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) + if (info->alloc_size <= PAGE_SIZE) kfree(info->entries[cpu]); else vfree(info->entries[cpu]); @@ -453,7 +508,7 @@ struct xt_table *xt_find_table_lock(int if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); - list_for_each_entry(t, &xt[af].tables, list) + list_for_each_entry(t, &xt_tables(af), list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; mutex_unlock(&xt[af].mutex); @@ -501,6 +556,13 @@ xt_replace_table(struct xt_table *table, return NULL; } oldinfo = private; + + if (recharge_xtables(num_counters != 0, newinfo, oldinfo)) { + write_unlock_bh(&table->lock); + *error = -ENOMEM; + return NULL; + } + table->private = newinfo; newinfo->initial_entries = oldinfo->initial_entries; write_unlock_bh(&table->lock); @@ -521,7 +583,7 @@ int xt_register_table(struct xt_table *t return ret; /* Don't autoload: we'd eat our tail... 
*/ - if (list_named_find(&xt[table->af].tables, table->name)) { + if (list_named_find(&xt_tables(table->af), table->name)) { ret = -EEXIST; goto unlock; } @@ -538,7 +600,7 @@ int xt_register_table(struct xt_table *t /* save number of initial entries */ private->initial_entries = private->number; - list_prepend(&xt[table->af].tables, table); + list_prepend(&xt_tables(table->af), table); ret = 0; unlock: @@ -547,19 +609,67 @@ int xt_register_table(struct xt_table *t } EXPORT_SYMBOL_GPL(xt_register_table); +struct xt_table * virt_xt_register_table(struct xt_table *table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) +{ + int ret; + struct module *mod = table->me; + + if (!ve_is_super(get_exec_env())) { + struct xt_table *tmp; + __module_get(mod); + ret = -ENOMEM; + tmp = ub_kmalloc(sizeof(struct xt_table), GFP_KERNEL); + if (!tmp) + goto nomem; + memcpy(tmp, table, sizeof(struct xt_table)); + table = tmp; + } + + ret = xt_register_table(table, bootstrap, newinfo); + if (ret) + goto out; + + return table; +out: + if (!ve_is_super(get_exec_env())) { + kfree(table); +nomem: + module_put(mod); + } + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(virt_xt_register_table); + void *xt_unregister_table(struct xt_table *table) { struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); private = table->private; - LIST_DELETE(&xt[table->af].tables, table); + LIST_DELETE(&xt_tables(table->af), table); mutex_unlock(&xt[table->af].mutex); + uncharge_xtables(private, private->number); + return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); +void *virt_xt_unregister_table(struct xt_table *table) +{ + void *ret; + + ret = xt_unregister_table(table); + if (!ve_is_super(get_exec_env())) { + module_put(table->me); + kfree(table); + } + return ret; +} +EXPORT_SYMBOL_GPL(virt_xt_unregister_table); + #ifdef CONFIG_PROC_FS static char *xt_proto_prefix[NPROTO] = { [AF_INET] = "ip", @@ -594,7 +704,7 @@ static struct list_head *type2list(u_int list = &xt[af].match; break; case TABLE: - list = &xt[af].tables; + list = &xt_tables(af); break; default: list = NULL; @@ -707,6 +817,7 @@ int xt_proto_init(int af) return -EINVAL; + INIT_LIST_HEAD(&xt_tables(af)); #ifdef CONFIG_PROC_FS strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); @@ -795,6 +906,6 @@ static void __exit xt_fini(void) kfree(xt); } -module_init(xt_init); +subsys_initcall(xt_init); module_exit(xt_fini); diff -uprN linux-2.6.18/net/netfilter/xt_MARK.c linux-2.6.18.ovz/net/netfilter/xt_MARK.c --- linux-2.6.18/net/netfilter/xt_MARK.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/xt_MARK.c 2007-06-13 06:55:08.000000000 -0400 @@ -82,7 +82,8 @@ checkentry_v0(const char *tablename, struct xt_mark_target_info *markinfo = targinfo; if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide" + " mark\n"); return 0; } return 1; @@ -101,12 +102,13 @@ checkentry_v1(const char *tablename, if (markinfo->mode != XT_MARK_SET && markinfo->mode != XT_MARK_AND && markinfo->mode != XT_MARK_OR) { - printk(KERN_WARNING "MARK: unknown mode %u\n", + ve_printk(VE_LOG, KERN_WARNING "MARK: unknown mode %u\n", markinfo->mode); return 0; } if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide" + " mark\n"); return 0; } return 1; diff -uprN linux-2.6.18/net/netfilter/xt_limit.c 
linux-2.6.18.ovz/net/netfilter/xt_limit.c --- linux-2.6.18/net/netfilter/xt_limit.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netfilter/xt_limit.c 2007-06-13 06:55:08.000000000 -0400 @@ -118,7 +118,7 @@ ipt_limit_checkentry(const char *tablena /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in xt_limit, try lower: %u/%u\n", + ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n", r->avg, r->burst); return 0; } @@ -136,11 +136,96 @@ ipt_limit_checkentry(const char *tablena return 1; } +#ifdef CONFIG_COMPAT +static int ipt_limit_compat_to_user(void *match, void **dstptr, + int *size, int off) +{ + struct xt_entry_match *pm; + struct xt_rateinfo *pinfo; + struct compat_xt_rateinfo rinfo; + u_int16_t msize; + + pm = (struct xt_entry_match *)match; + msize = pm->u.user.match_size; + if (__copy_to_user(*dstptr, pm, sizeof(struct xt_entry_match))) + return -EFAULT; + pinfo = (struct xt_rateinfo *)pm->data; + memset(&rinfo, 0, sizeof(struct compat_xt_rateinfo)); + rinfo.avg = pinfo->avg; + rinfo.burst = pinfo->burst; + if (__copy_to_user(*dstptr + sizeof(struct xt_entry_match), + &rinfo, sizeof(struct compat_xt_rateinfo))) + return -EFAULT; + msize -= off; + if (put_user(msize, (u_int16_t *)*dstptr)) + return -EFAULT; + *size -= off; + *dstptr += msize; + return 0; +} + +static int ipt_limit_compat_from_user(void *match, void **dstptr, + int *size, int off) +{ + struct compat_xt_entry_match *pm; + struct xt_entry_match *dstpm; + struct compat_xt_rateinfo *pinfo; + struct xt_rateinfo rinfo; + u_int16_t msize; + + pm = (struct compat_xt_entry_match *)match; + dstpm = (struct xt_entry_match *)*dstptr; + msize = pm->u.user.match_size; + memcpy(*dstptr, pm, sizeof(struct compat_xt_entry_match)); + pinfo = (struct compat_xt_rateinfo *)pm->data; + memset(&rinfo, 0, sizeof(struct xt_rateinfo)); + rinfo.avg = pinfo->avg; + rinfo.burst = pinfo->burst; + memcpy(*dstptr + sizeof(struct compat_xt_entry_match), + &rinfo, sizeof(struct xt_rateinfo)); + msize += off; + dstpm->u.user.match_size = msize; + *size += off; + *dstptr += msize; + return 0; +} + +static int ipt_limit_compat(void *match, void **dstptr, + int *size, int convert) +{ + int ret, off; + + off = XT_ALIGN(sizeof(struct xt_rateinfo)) - + COMPAT_XT_ALIGN(sizeof(struct compat_xt_rateinfo)); + switch (convert) { + case COMPAT_TO_USER: + ret = ipt_limit_compat_to_user(match, + dstptr, size, off); + break; + case COMPAT_FROM_USER: + ret = ipt_limit_compat_from_user(match, + dstptr, size, off); + break; + case COMPAT_CALC_SIZE: + *size += off; + ret = 0; + break; + default: + ret = -ENOPROTOOPT; + break; + } + return ret; +} +#endif + static struct xt_match ipt_limit_reg = { .name = "limit", .match = ipt_limit_match, .matchsize = sizeof(struct xt_rateinfo), .checkentry = ipt_limit_checkentry, +#ifdef CONFIG_COMPAT + .compat = ipt_limit_compat, +#endif .family = AF_INET, .me = THIS_MODULE, }; @@ -149,6 +234,9 @@ static struct xt_match limit6_reg = { .match = ipt_limit_match, .matchsize = sizeof(struct xt_rateinfo), .checkentry = ipt_limit_checkentry, +#ifdef CONFIG_COMPAT + .compat = ipt_limit_compat, +#endif .family = AF_INET6, .me = THIS_MODULE, }; diff -uprN linux-2.6.18/net/netlink/af_netlink.c linux-2.6.18.ovz/net/netlink/af_netlink.c --- linux-2.6.18/net/netlink/af_netlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netlink/af_netlink.c 2007-06-13 06:55:08.000000000 -0400 @@ -60,26 +60,13 @@ #include #include #include +#include 
+ +#include +#include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - spinlock_t cb_lock; - void (*data_ready)(struct sock *sk, int bytes); - struct module *module; -}; #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 @@ -209,7 +196,10 @@ static __inline__ struct sock *netlink_l read_lock(&nl_table_lock); head = nl_pid_hashfn(hash, pid); sk_for_each(sk, node, head) { - if (nlk_sk(sk)->pid == pid) { + /* VEs should find sockets created by the kernel */ + if ((nlk_sk(sk)->pid == pid) && + (!pid || ve_accessible_strict(sk->owner_env, + get_exec_env()))){ sock_hold(sk); goto found; } @@ -327,7 +317,9 @@ static int netlink_insert(struct sock *s head = nl_pid_hashfn(hash, pid); len = 0; sk_for_each(osk, node, head) { - if (nlk_sk(osk)->pid == pid) + if ((nlk_sk(osk)->pid == pid) && + ve_accessible_strict(osk->owner_env, + get_exec_env())) break; len++; } @@ -380,6 +372,8 @@ static int __netlink_create(struct socke sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk) return -ENOMEM; + if (ub_other_sock_charge(sk)) + goto out_free; sock_init_data(sock, sk); @@ -390,6 +384,10 @@ static int __netlink_create(struct socke sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; + +out_free: + sk_free(sk); + return -ENOMEM; } static int netlink_create(struct socket *sock, int protocol) @@ -443,6 +441,7 @@ static int netlink_release(struct socket return 0; netlink_remove(sk); + sock_orphan(sk); nlk = nlk_sk(sk); spin_lock(&nlk->cb_lock); @@ -457,7 +456,6 @@ static int netlink_release(struct socket /* OK. Socket is unlinked, and, therefore, no new packets will arrive */ - sock_orphan(sk); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); @@ -498,7 +496,7 @@ static int netlink_autobind(struct socke struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->tgid; + s32 pid = virt_pid(current); int err; static s32 rover = -4097; @@ -507,7 +505,9 @@ retry: netlink_table_grab(); head = nl_pid_hashfn(hash, pid); sk_for_each(osk, node, head) { - if (nlk_sk(osk)->pid == pid) { + if ((nlk_sk(osk)->pid == pid) && + ve_accessible_strict(osk->owner_env, + get_exec_env())) { /* Bind collision, search negative pid values. 
*/ pid = rover--; if (rover > -4097) @@ -532,7 +532,7 @@ retry: static inline int netlink_capable(struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); + capable(CAP_VE_NET_ADMIN); } static void @@ -877,6 +877,9 @@ static inline int do_one_broadcast(struc !test_bit(p->group - 1, nlk->groups)) goto out; + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) + goto out; + if (p->failure) { netlink_overrun(sk); goto out; @@ -974,6 +977,9 @@ static inline int do_one_set_err(struct !test_bit(p->group - 1, nlk->groups)) goto out; + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) + goto out; + sk->sk_err = p->code; sk->sk_error_report(sk); out: @@ -1109,12 +1115,17 @@ static int netlink_sendmsg(struct kiocb struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr=msg->msg_name; + struct sockaddr_nl *addr = msg->msg_name; u32 dst_pid; - u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; + struct sock *dstsk; + long timeo; + int no_ubc, no_buf; + unsigned long chargesize; + + DECLARE_WAITQUEUE(wait, current); if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; @@ -1125,17 +1136,16 @@ static int netlink_sendmsg(struct kiocb if (err < 0) return err; + /* Broadcasts from user to kernel are disabled. This is OK + * according to ANK */ if (msg->msg_namelen) { if (addr->nl_family != AF_NETLINK) return -EINVAL; dst_pid = addr->nl_pid; - dst_group = ffs(addr->nl_groups); - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + if (addr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM; - } else { + } else dst_pid = nlk->dst_pid; - dst_group = nlk->dst_group; - } if (!nlk->pid) { err = netlink_autobind(sock); @@ -1148,12 +1158,12 @@ static int netlink_sendmsg(struct kiocb goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); - if (skb==NULL) + if (skb == NULL) goto out; NETLINK_CB(skb).pid = nlk->pid; NETLINK_CB(skb).dst_pid = dst_pid; - NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).dst_group = 0; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); selinux_get_task_sid(current, &(NETLINK_CB(skb).sid)); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); @@ -1165,25 +1175,88 @@ static int netlink_sendmsg(struct kiocb */ err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { - kfree_skb(skb); - goto out; - } + if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) + goto out_free; err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; + if (err) + goto out_free; + + timeo = sock_sndtimeo(sk, msg->msg_flags&MSG_DONTWAIT); +retry: + dstsk = netlink_getsockbypid(sk, dst_pid); + if (IS_ERR(dstsk)) { + err = PTR_ERR(dstsk); + goto out_free; } - if (dst_group) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); + nlk = nlk_sk(dstsk); +#ifdef NL_EMULATE_DEV + if (nlk->handler) { + skb_orphan(skb); + err = nlk->handler(protocol, skb); + goto out_put; + } +#endif + + /* BTW, it could be done once, before the retry loop */ + chargesize = skb_charge_fullsize(skb); + no_ubc = ub_sock_getwres_other(sk, chargesize); + no_buf = atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || + test_bit(0, &nlk->state); + if (no_ubc || no_buf) { + wait_queue_head_t *sleep; + + if (!no_ubc) + ub_sock_retwres_other(sk, chargesize, + SOCK_MIN_UBCSPACE_CH); + err = -EAGAIN; + if 
(timeo == 0) { + kfree_skb(skb); + goto out_put; + } + + /* wakeups arrive on different queues */ + sleep = no_ubc ? sk->sk_sleep : &nlk->wait; + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sleep, &wait); + + /* this if can't be moved up because ub_sock_snd_queue_add() + * may change task state to TASK_RUNNING */ + if (no_ubc) + ub_sock_sndqueueadd_other(sk, chargesize); + + if ((atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || + test_bit(0, &nlk->state) || no_ubc) && + !sock_flag(dstsk, SOCK_DEAD)) + timeo = schedule_timeout(timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(sleep, &wait); + if (no_ubc) + ub_sock_sndqueuedel(sk); + sock_put(dstsk); + + if (!signal_pending(current)) + goto retry; + err = sock_intr_errno(timeo); + goto out_free; } - err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + skb_orphan(skb); + skb_set_owner_r(skb, dstsk); + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); + skb_queue_tail(&dstsk->sk_receive_queue, skb); + dstsk->sk_data_ready(dstsk, len); + err = len; +out_put: + sock_put(dstsk); out: return err; + +out_free: + kfree_skb(skb); + return err; } static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, @@ -1346,6 +1419,10 @@ static int netlink_dump(struct sock *sk) skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); if (!skb) return -ENOBUFS; + if (ub_nlrcvbuf_charge(skb, sk) < 0) { + kfree_skb(skb); + return -EACCES; + } spin_lock(&nlk->cb_lock); @@ -1407,9 +1484,9 @@ int netlink_dump_start(struct sock *ssk, return -ECONNREFUSED; } nlk = nlk_sk(sk); - /* A dump is in progress... */ + /* A dump or destruction is in progress... */ spin_lock(&nlk->cb_lock); - if (nlk->cb) { + if (nlk->cb || sock_flag(sk, SOCK_DEAD)) { spin_unlock(&nlk->cb_lock); netlink_destroy_callback(cb); sock_put(sk); @@ -1513,8 +1590,15 @@ void netlink_run_queue(struct sock *sk, *qlen = skb_queue_len(&sk->sk_receive_queue); for (; *qlen; (*qlen)--) { + int ret; + struct ve_struct *old_env; skb = skb_dequeue(&sk->sk_receive_queue); - if (netlink_rcv_skb(skb, cb)) { + + old_env = set_exec_env(skb->owner_env); + ret = netlink_rcv_skb(skb, cb); + (void)set_exec_env(old_env); + + if (ret) { if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); else { @@ -1776,6 +1860,7 @@ static int __init netlink_proto_init(voi sock_register(&netlink_family_ops); #ifdef CONFIG_PROC_FS + /* FIXME: virtualize before giving access from VEs */ proc_net_fops_create("netlink", 0, &netlink_seq_fops); #endif /* The netlink device handler may be needed early. 
*/ diff -uprN linux-2.6.18/net/netlink/attr.c linux-2.6.18.ovz/net/netlink/attr.c --- linux-2.6.18/net/netlink/attr.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netlink/attr.c 2007-06-13 06:55:08.000000000 -0400 @@ -118,7 +118,7 @@ int nla_parse(struct nlattr *tb[], int m } if (unlikely(rem > 0)) - printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing " "attributes.\n", rem); err = 0; diff -uprN linux-2.6.18/net/netlink/genetlink.c linux-2.6.18.ovz/net/netlink/genetlink.c --- linux-2.6.18/net/netlink/genetlink.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/netlink/genetlink.c 2007-06-13 06:55:08.000000000 -0400 @@ -319,7 +319,8 @@ static int genl_rcv_msg(struct sk_buff * goto errout; } - if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb, CAP_NET_ADMIN)) { + if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb, + CAP_VE_NET_ADMIN)) { err = -EPERM; goto errout; } diff -uprN linux-2.6.18/net/packet/af_packet.c linux-2.6.18.ovz/net/packet/af_packet.c --- linux-2.6.18/net/packet/af_packet.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/packet/af_packet.c 2007-06-13 06:55:08.000000000 -0400 @@ -78,6 +78,8 @@ #include #include +#include + #ifdef CONFIG_INET #include #endif @@ -279,7 +281,8 @@ static int packet_rcv_spkt(struct sk_buf * so that this procedure is noop. */ - if (skb->pkt_type == PACKET_LOOPBACK) + if (skb->pkt_type == PACKET_LOOPBACK || + !ve_accessible(skb->owner_env, sk->owner_env)) goto out; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) @@ -471,6 +474,9 @@ static int packet_rcv(struct sk_buff *sk sk = pt->af_packet_priv; po = pkt_sk(sk); + if (!ve_accessible(skb->owner_env, sk->owner_env)) + goto drop; + skb->dev = dev; if (dev->hard_header) { @@ -530,6 +536,9 @@ static int packet_rcv(struct sk_buff *sk if (pskb_trim(skb, snaplen)) goto drop_n_acct; + if (ub_sockrcvbuf_charge(sk, skb)) + goto drop_n_acct; + skb_set_owner_r(skb, sk); skb->dev = NULL; dst_release(skb->dst); @@ -580,6 +589,9 @@ static int tpacket_rcv(struct sk_buff *s sk = pt->af_packet_priv; po = pkt_sk(sk); + if (!ve_accessible(skb->owner_env, sk->owner_env)) + goto drop; + if (dev->hard_header) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb->mac.raw); @@ -627,6 +639,12 @@ static int tpacket_rcv(struct sk_buff *s snaplen = 0; } + if (copy_skb && + ub_sockrcvbuf_charge(sk, copy_skb)) { + spin_lock(&sk->sk_receive_queue.lock); + goto ring_is_full; + } + spin_lock(&sk->sk_receive_queue.lock); h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); @@ -1007,6 +1025,8 @@ static int packet_create(struct socket * sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); if (sk == NULL) goto out; + if (ub_other_sock_charge(sk)) + goto out_free; sock->ops = &packet_ops; #ifdef CONFIG_SOCK_PACKET @@ -1045,6 +1065,9 @@ static int packet_create(struct socket * sk_add_node(sk, &packet_sklist); write_unlock_bh(&packet_sklist_lock); return(0); + +out_free: + sk_free(sk); out: return err; } @@ -1427,11 +1450,16 @@ static int packet_notifier(struct notifi struct sock *sk; struct hlist_node *node; struct net_device *dev = (struct net_device*)data; + struct ve_struct *ve; + ve = get_exec_env(); read_lock(&packet_sklist_lock); sk_for_each(sk, node, &packet_sklist) { struct packet_sock *po = pkt_sk(sk); + if (!ve_accessible_strict(sk->owner_env, ve)) + continue; + switch (msg) { case NETDEV_UNREGISTER: #ifdef CONFIG_PACKET_MULTICAST @@ -1842,6 +1870,8 
@@ static inline struct sock *packet_seq_id struct hlist_node *node; sk_for_each(s, node, &packet_sklist) { + if (!ve_accessible(s->owner_env, get_exec_env())) + continue; if (!off--) return s; } @@ -1857,9 +1887,14 @@ static void *packet_seq_start(struct seq static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return (v == SEQ_START_TOKEN) - ? sk_head(&packet_sklist) - : sk_next((struct sock*)v) ; + do { + v = (v == SEQ_START_TOKEN) + ? sk_head(&packet_sklist) + : sk_next((struct sock*)v); + } while (v != NULL && + !ve_accessible(((struct sock*)v)->owner_env, + get_exec_env())); + return v; } static void packet_seq_stop(struct seq_file *seq, void *v) @@ -1915,7 +1950,7 @@ static struct file_operations packet_seq static void __exit packet_exit(void) { - proc_net_remove("packet"); + remove_proc_glob_entry("net/packet", NULL); unregister_netdevice_notifier(&packet_netdev_notifier); sock_unregister(PF_PACKET); proto_unregister(&packet_proto); @@ -1930,7 +1965,7 @@ static int __init packet_init(void) sock_register(&packet_family_ops); register_netdevice_notifier(&packet_netdev_notifier); - proc_net_fops_create("packet", 0, &packet_seq_fops); + proc_glob_fops_create("net/packet", 0, &packet_seq_fops); out: return rc; } diff -uprN linux-2.6.18/net/sched/act_gact.c linux-2.6.18.ovz/net/sched/act_gact.c --- linux-2.6.18/net/sched/act_gact.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/act_gact.c 2007-06-13 06:55:08.000000000 -0400 @@ -54,14 +54,14 @@ static DEFINE_RWLOCK(gact_lock); #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *p) { - if (net_random()%p->pval) + if (!p->pval || net_random()%p->pval) return p->action; return p->paction; } static int gact_determ(struct tcf_gact *p) { - if (p->bstats.packets%p->pval) + if (!p->pval || p->bstats.packets%p->pval) return p->action; return p->paction; } diff -uprN linux-2.6.18/net/sched/act_police.c linux-2.6.18.ovz/net/sched/act_police.c --- linux-2.6.18/net/sched/act_police.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/act_police.c 2007-06-13 06:55:08.000000000 -0400 @@ -44,6 +44,18 @@ static struct tcf_police *tcf_police_ht[ /* Policer hash table lock */ static DEFINE_RWLOCK(police_lock); +/* old policer structure from before tc actions */ +struct tc_police_compat +{ + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; + /* Each policer is serialized by its individual spinlock */ static __inline__ unsigned tcf_police_hash(u32 index) @@ -169,12 +181,15 @@ static int tcf_act_police_locate(struct struct tc_police *parm; struct tcf_police *p; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + int size; if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) return -EINVAL; - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + if (tb[TCA_POLICE_TBF-1] == NULL) + return -EINVAL; + size = RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); @@ -413,12 +428,15 @@ struct tcf_police * tcf_police_locate(st struct tcf_police *p; struct rtattr *tb[TCA_POLICE_MAX]; struct tc_police *parm; + int size; if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) return NULL; - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + if (tb[TCA_POLICE_TBF-1] == NULL) + return NULL; + size = 
RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return NULL; parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); diff -uprN linux-2.6.18/net/sched/cls_api.c linux-2.6.18.ovz/net/sched/cls_api.c --- linux-2.6.18/net/sched/cls_api.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/cls_api.c 2007-06-13 06:55:08.000000000 -0400 @@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buf if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) return skb->len; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); if (!tcm->tcm_parent) q = dev->qdisc_sleeping; else @@ -458,7 +458,7 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); dev_put(dev); return skb->len; } diff -uprN linux-2.6.18/net/sched/cls_basic.c linux-2.6.18.ovz/net/sched/cls_basic.c --- linux-2.6.18/net/sched/cls_basic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/cls_basic.c 2007-06-13 06:55:08.000000000 -0400 @@ -194,7 +194,7 @@ static int basic_change(struct tcf_proto if (handle) f->handle = handle; else { - int i = 0x80000000; + unsigned int i = 0x80000000; do { if (++head->hgenerator == 0x7FFFFFFF) head->hgenerator = 1; diff -uprN linux-2.6.18/net/sched/sch_api.c linux-2.6.18.ovz/net/sched/sch_api.c --- linux-2.6.18/net/sched/sch_api.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/sch_api.c 2007-06-13 06:55:08.000000000 -0400 @@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_de { struct Qdisc *q; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (q->handle == handle) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return q; } } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return NULL; } @@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff continue; if (idx > s_idx) s_q_idx = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); q_idx = 0; list_for_each_entry(q, &dev->qdisc_list, list) { if (q_idx < s_q_idx) { @@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff } if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); goto done; } q_idx++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); } done: @@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff s_t = cb->args[0]; t = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (t < s_t || !q->ops->cl_ops || (tcm->tcm_parent && @@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff break; t++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); cb->args[0] = t; diff -uprN linux-2.6.18/net/sched/sch_cbq.c linux-2.6.18.ovz/net/sched/sch_cbq.c --- linux-2.6.18/net/sched/sch_cbq.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/sch_cbq.c 2007-06-13 06:55:08.000000000 -0400 @@ -931,8 +931,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int if (cl->deficit <= 0) { q->active[prio] = cl; - cl = cl->next_alive; cl->deficit += cl->quantum; + cl = cl->next_alive; } return skb; @@ -1108,17 +1108,19 @@ static void cbq_normalize_quanta(struct for (h=0; h<16; h++) { for (cl = q->classes[h]; cl; cl = cl->next) { + long mtu; /* BUGGGG... Beware! This expression suffer of arithmetic overflows! 
*/ if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); - cl->quantum = cl->qdisc->dev->mtu/2 + 1; + cl->quantum = (cl->weight * cl->allot) / + (q->quanta[prio] / q->nclasses[prio]); } + mtu = cl->qdisc->dev->mtu; + if (cl->quantum <= mtu/2) + cl->quantum = mtu/2 + 1; + else if (cl->quantum > 32*mtu) + cl->quantum = 32*mtu; } } } diff -uprN linux-2.6.18/net/sched/sch_generic.c linux-2.6.18.ovz/net/sched/sch_generic.c --- linux-2.6.18/net/sched/sch_generic.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/sch_generic.c 2007-06-13 06:55:08.000000000 -0400 @@ -45,11 +45,10 @@ The idea is the following: - enqueue, dequeue are serialized via top level device spinlock dev->queue_lock. - - tree walking is protected by read_lock_bh(qdisc_tree_lock) + - tree walking is protected by read_lock(qdisc_tree_lock) and this lock is used only in process context. - - updates to tree are made under rtnl semaphore or - from softirq context (__qdisc_destroy rcu-callback) - hence this lock needs local bh disabling. + - updates to tree are made only under rtnl semaphore, + hence this lock may be made without local bh disabling. qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! */ @@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock); void qdisc_lock_tree(struct net_device *dev) { - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); spin_lock_bh(&dev->queue_lock); } void qdisc_unlock_tree(struct net_device *dev) { spin_unlock_bh(&dev->queue_lock); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } /* @@ -96,6 +95,7 @@ static inline int qdisc_restart(struct n /* Dequeue packet */ if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) { + struct ve_struct *envid; unsigned nolock = (dev->features & NETIF_F_LLTX); dev->gso_skb = NULL; @@ -109,6 +109,7 @@ static inline int qdisc_restart(struct n * of lock congestion it should return -1 and the packet * will be requeued. 
*/ + envid = set_exec_env(skb->owner_env); if (!nolock) { if (!netif_tx_trylock(dev)) { collision: @@ -123,6 +124,7 @@ static inline int qdisc_restart(struct n kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + (void)set_exec_env(envid); return -1; } __get_cpu_var(netdev_rx_stat).cpu_collision++; @@ -143,6 +145,7 @@ static inline int qdisc_restart(struct n netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); + (void)set_exec_env(envid); return -1; } if (ret == NETDEV_TX_LOCKED && nolock) { @@ -176,6 +179,7 @@ requeue: else q->ops->requeue(skb, q); netif_schedule(dev); + (void)set_exec_env(envid); return 1; } BUG_ON((int) q->q.qlen < 0); @@ -483,20 +487,6 @@ void qdisc_reset(struct Qdisc *qdisc) static void __qdisc_destroy(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); - struct Qdisc_ops *ops = qdisc->ops; - -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); -#endif - write_lock(&qdisc_tree_lock); - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - write_unlock(&qdisc_tree_lock); - module_put(ops->owner); - - dev_put(qdisc->dev); kfree((char *) qdisc - qdisc->padded); } @@ -504,32 +494,23 @@ static void __qdisc_destroy(struct rcu_h void qdisc_destroy(struct Qdisc *qdisc) { - struct list_head cql = LIST_HEAD_INIT(cql); - struct Qdisc *cq, *q, *n; + struct Qdisc_ops *ops = qdisc->ops; if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) + !atomic_dec_and_test(&qdisc->refcnt)) return; - if (!list_empty(&qdisc->list)) { - if (qdisc->ops->cl_ops == NULL) - list_del(&qdisc->list); - else - list_move(&qdisc->list, &cql); - } - - /* unlink inner qdiscs from dev->qdisc_list immediately */ - list_for_each_entry(cq, &cql, list) - list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) - if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { - if (q->ops->cl_ops == NULL) - list_del_init(&q->list); - else - list_move_tail(&q->list, &cql); - } - list_for_each_entry_safe(cq, n, &cql, list) - list_del_init(&cq->list); + list_del(&qdisc->list); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + module_put(ops->owner); + dev_put(qdisc->dev); call_rcu(&qdisc->q_rcu, __qdisc_destroy); } @@ -549,15 +530,15 @@ void dev_activate(struct net_device *dev printk(KERN_INFO "%s: activation failed\n", dev->name); return; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } else { qdisc = &noqueue_qdisc; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); dev->qdisc_sleeping = qdisc; - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } if (!netif_carrier_ok(dev)) @@ -641,3 +622,4 @@ EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_lock_tree); EXPORT_SYMBOL(qdisc_unlock_tree); +EXPORT_SYMBOL(dev_shutdown); diff -uprN linux-2.6.18/net/sched/sch_teql.c linux-2.6.18.ovz/net/sched/sch_teql.c --- linux-2.6.18/net/sched/sch_teql.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sched/sch_teql.c 2007-06-13 06:55:08.000000000 -0400 @@ -189,6 +189,9 @@ static int teql_qdisc_init(struct Qdisc struct teql_master *m = (struct teql_master*)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); + if 
(!capable(CAP_NET_ADMIN)) + return -EPERM; + if (dev->hard_header_len > m->dev->hard_header_len) return -EINVAL; diff -uprN linux-2.6.18/net/sctp/input.c linux-2.6.18.ovz/net/sctp/input.c --- linux-2.6.18/net/sctp/input.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sctp/input.c 2007-06-13 06:55:08.000000000 -0400 @@ -135,6 +135,9 @@ int sctp_rcv(struct sk_buff *skb) SCTP_INC_STATS_BH(SCTP_MIB_INSCTPPACKS); + if (skb_linearize(skb)) + goto discard_it; + sh = (struct sctphdr *) skb->h.raw; /* Pull up the IP and SCTP headers. */ diff -uprN linux-2.6.18/net/socket.c linux-2.6.18.ovz/net/socket.c --- linux-2.6.18/net/socket.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/socket.c 2007-06-13 06:55:08.000000000 -0400 @@ -85,6 +85,8 @@ #include #include #include +#include +#include #include #include @@ -205,15 +207,6 @@ static DEFINE_PER_CPU(int, sockets_in_us * divide and look after the messy bits. */ -#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - - 16 for IP, 16 for IPX, - 24 for IPv6, - about 80 for AX.25 - must be at least one bigger than - the AF_UNIX size (see net/unix/af_unix.c - :unix_mkname()). - */ - /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space @@ -1121,6 +1114,49 @@ int sock_wake_async(struct socket *sock, return 0; } +int vz_security_proto_check(int family, int type, int protocol) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (family) { + case PF_UNSPEC: + case PF_PACKET: + case PF_NETLINK: + case PF_UNIX: + break; + case PF_INET: + switch (protocol) { + case IPPROTO_IP: + case IPPROTO_ICMP: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_RAW: + break; + default: + return -EAFNOSUPPORT; + } + break; + case PF_INET6: + switch (protocol) { + case IPPROTO_IP: + case IPPROTO_ICMPV6: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_RAW: + break; + default: + return -EAFNOSUPPORT; + } + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} + static int __sock_create(int family, int type, int protocol, struct socket **res, int kern) { int err; @@ -1148,6 +1184,11 @@ static int __sock_create(int family, int family = PF_PACKET; } + /* VZ compatibility layer */ + err = vz_security_proto_check(family, type, protocol); + if (err < 0) + return err; + err = security_socket_create(family, type, protocol, kern); if (err) return err; @@ -1424,7 +1465,7 @@ asmlinkage long sys_accept(int fd, struc err = sock_attach_fd(newsock, newfile); if (err < 0) - goto out_fd; + goto out_fd_simple; err = security_socket_accept(sock, newsock); if (err) @@ -1455,6 +1496,11 @@ out_put: fput_light(sock->file, fput_needed); out: return err; +out_fd_simple: + sock_release(newsock); + put_filp(newfile); + put_unused_fd(newfd); + goto out_put; out_fd: fput(newfile); put_unused_fd(newfd); diff -uprN linux-2.6.18/net/sunrpc/clnt.c linux-2.6.18.ovz/net/sunrpc/clnt.c --- linux-2.6.18/net/sunrpc/clnt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/clnt.c 2007-06-13 06:55:08.000000000 -0400 @@ -64,6 +64,35 @@ static u32 * call_header(struct rpc_task static u32 * call_verify(struct rpc_task *task); +/* + * Grand abort timeout (stop the client if it occurs) + */ +int xprt_abort_timeout = RPC_MAX_ABORT_TIMEOUT; + +static int rpc_abort_hard(struct rpc_task *task) +{ + struct rpc_clnt *clnt; + clnt = task->tk_client; + + if (clnt->cl_pr_time == 0) { + clnt->cl_pr_time = jiffies; + return 0; + } + if (xprt_abort_timeout == RPC_MAX_ABORT_TIMEOUT) + return 0; + if 
(time_before(jiffies, clnt->cl_pr_time + xprt_abort_timeout * HZ)) + return 0; + + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + return -ETIMEDOUT; +} + +static void rpc_abort_clear(struct rpc_task *task) +{ + task->tk_client->cl_pr_time = 0; +} + static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { @@ -175,10 +204,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch } /* save the nodename */ - clnt->cl_nodelen = strlen(system_utsname.nodename); + clnt->cl_nodelen = strlen(utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); return clnt; out_no_auth: @@ -250,6 +279,7 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; + new->cl_broken = 0; if (!IS_ERR(new->cl_dentry)) dget(new->cl_dentry); rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); @@ -450,7 +480,7 @@ int rpc_call_sync(struct rpc_clnt *clnt, int status; /* If this client is slain all further I/O fails */ - if (clnt->cl_dead) + if (clnt->cl_dead || clnt->cl_broken) return -EIO; BUG_ON(flags & RPC_TASK_ASYNC); @@ -492,7 +522,7 @@ rpc_call_async(struct rpc_clnt *clnt, st /* If this client is slain all further I/O fails */ status = -EIO; - if (clnt->cl_dead) + if (clnt->cl_dead || clnt->cl_broken) goto out_release; flags |= RPC_TASK_ASYNC; @@ -805,6 +835,7 @@ call_bind_status(struct rpc_task *task) if (task->tk_status >= 0) { dprintk("RPC: %4d call_bind_status (status %d)\n", task->tk_pid, task->tk_status); + rpc_abort_clear(task); task->tk_status = 0; task->tk_action = call_connect; return; @@ -819,7 +850,7 @@ call_bind_status(struct rpc_task *task) case -ETIMEDOUT: dprintk("RPC: %4d rpcbind request timed out\n", task->tk_pid); - if (RPC_IS_SOFT(task)) { + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) { status = -EIO; break; } @@ -895,8 +926,10 @@ call_connect_status(struct rpc_task *tas case -ENOTCONN: case -ETIMEDOUT: case -EAGAIN: - task->tk_action = call_bind; - break; + if (!rpc_abort_hard(task)) { + task->tk_action = call_bind; + break; + } default: rpc_exit(task, -EIO); break; @@ -1023,7 +1056,7 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; - if (RPC_IS_SOFT(task)) { + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) { printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); @@ -1064,7 +1097,7 @@ call_decode(struct rpc_task *task) } if (task->tk_status < 12) { - if (!RPC_IS_SOFT(task)) { + if (!RPC_IS_SOFT(task) && !rpc_abort_hard(task)) { task->tk_action = call_bind; clnt->cl_stats->rpcretrans++; goto out_retry; @@ -1075,6 +1108,7 @@ call_decode(struct rpc_task *task) return; } + rpc_abort_clear(task); /* * Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_received. 
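The clnt.c hunks above add a coarse "grand abort" timer to the RPC client: the first hard failure stamps cl_pr_time, every later failure checks whether the sysctl-tunable xprt_abort_timeout window has expired, and once it has, the client is marked cl_broken and all of its tasks are killed via rpc_killall_tasks(); any successful bind or decode resets the stamp through rpc_abort_clear(). The minimal user-space sketch below models just that state machine; the field and helper names mirror the patch, but the time source, the ABORT_DISABLED sentinel, and the main() harness are illustrative stand-ins, not kernel code.

#include <stdio.h>
#include <time.h>

#define ABORT_DISABLED (~0U)           /* stand-in for RPC_MAX_ABORT_TIMEOUT */

static unsigned int abort_timeout = 5; /* seconds; models xprt_abort_timeout */

struct clnt {
	time_t pr_time;                /* start of current failure streak, 0 = none */
	int broken;                    /* set once the streak outlives the window */
};

/* Mirrors rpc_abort_hard(): nonzero once the client should be given up on
 * (the patch returns -ETIMEDOUT and kills all pending tasks there). */
static int abort_hard(struct clnt *c)
{
	if (c->pr_time == 0) {
		c->pr_time = time(NULL);  /* first failure: start the clock */
		return 0;
	}
	if (abort_timeout == ABORT_DISABLED)
		return 0;                 /* aborting disabled: retry forever */
	if (time(NULL) < c->pr_time + abort_timeout)
		return 0;                 /* still inside the grace window */
	c->broken = 1;                    /* window expired: stop issuing calls */
	return -1;
}

/* Mirrors rpc_abort_clear(): any success forgives the streak. */
static void abort_clear(struct clnt *c)
{
	c->pr_time = 0;
}

int main(void)
{
	struct clnt c = { 0, 0 };

	/* Two quick failures: the clock starts, but the window has not expired. */
	printf("%d %d\n", abort_hard(&c), abort_hard(&c)); /* 0 0 */
	abort_clear(&c);                  /* one success resets the state */
	printf("broken=%d\n", c.broken);  /* broken=0 */
	return 0;
}

Once broken is set, rpc_call_sync() and rpc_call_async() in the patch refuse new work with -EIO; a caller of this sketch would get the same short-circuit by testing c.broken before submitting.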
diff -uprN linux-2.6.18/net/sunrpc/rpc_pipe.c linux-2.6.18.ovz/net/sunrpc/rpc_pipe.c --- linux-2.6.18/net/sunrpc/rpc_pipe.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/rpc_pipe.c 2007-06-13 06:55:08.000000000 -0400 @@ -815,6 +815,7 @@ static struct file_system_type rpc_pipe_ .name = "rpc_pipefs", .get_sb = rpc_get_sb, .kill_sb = kill_litter_super, + .fs_flags = FS_VIRTUALIZED, }; static void diff -uprN linux-2.6.18/net/sunrpc/sched.c linux-2.6.18.ovz/net/sunrpc/sched.c --- linux-2.6.18/net/sunrpc/sched.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/sched.c 2007-06-13 06:55:08.000000000 -0400 @@ -324,16 +324,6 @@ static void rpc_make_runnable(struct rpc } /* - * Place a newly initialized task on the workqueue. - */ -static inline void -rpc_schedule_run(struct rpc_task *task) -{ - rpc_set_active(task); - rpc_make_runnable(task); -} - -/* * Prepare for sleeping on a wait queue. * By always appending tasks to the list we ensure FIFO behavior. * NB: An RPC task will only receive interrupt-driven events as long @@ -608,7 +598,9 @@ EXPORT_SYMBOL(rpc_exit_task); static int __rpc_execute(struct rpc_task *task) { int status = 0; + struct ve_struct *env; + env = set_exec_env(task->tk_client->cl_xprt->owner_env); dprintk("RPC: %4d rpc_execute flgs %x\n", task->tk_pid, task->tk_flags); @@ -662,10 +654,14 @@ static int __rpc_execute(struct rpc_task rpc_clear_running(task); if (RPC_IS_ASYNC(task)) { /* Careful! we may have raced... */ - if (RPC_IS_QUEUED(task)) + if (RPC_IS_QUEUED(task)) { + (void)set_exec_env(env); return 0; - if (rpc_test_and_set_running(task)) + } + if (rpc_test_and_set_running(task)) { + (void)set_exec_env(env); return 0; + } continue; } @@ -696,6 +692,7 @@ static int __rpc_execute(struct rpc_task rpc_mark_complete_task(task); /* Release all resources associated with the task */ rpc_release_task(task); + (void)set_exec_env(env); return status; } @@ -814,6 +811,13 @@ void rpc_init_task(struct rpc_task *task /* Add to global list of all tasks */ spin_lock(&rpc_sched_lock); list_add_tail(&task->tk_task, &all_tasks); + + /* Prevent the task from running if the client is marked dead */ + if (task->tk_client != NULL && task->tk_client->cl_dead) { + task->tk_flags |= RPC_TASK_KILLED; + rpc_exit(task, -EIO); + rpc_wake_up_task(task); + } spin_unlock(&rpc_sched_lock); BUG_ON(task->tk_ops == NULL); @@ -992,10 +996,12 @@ fail: void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) { + rpc_set_active(child); + spin_lock_bh(&childq.lock); /* N.B. Is it possible for the child to have already finished? 
*/ __rpc_sleep_on(&childq, task, func, NULL); - rpc_schedule_run(child); + rpc_make_runnable(child); spin_unlock_bh(&childq.lock); } diff -uprN linux-2.6.18/net/sunrpc/svcsock.c linux-2.6.18.ovz/net/sunrpc/svcsock.c --- linux-2.6.18/net/sunrpc/svcsock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/svcsock.c 2007-06-13 06:55:08.000000000 -0400 @@ -361,6 +361,9 @@ svc_sendto(struct svc_rqst *rqstp, struc size_t base = xdr->page_base; unsigned int pglen = xdr->page_len; unsigned int flags = MSG_MORE; + struct ve_struct *old_env; + + old_env = set_exec_env(sock->sk->owner_env); slen = xdr->len; @@ -425,6 +428,8 @@ out: rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, rqstp->rq_addr.sin_addr.s_addr); + (void)set_exec_env(old_env); + return len; } @@ -437,9 +442,12 @@ svc_recv_available(struct svc_sock *svsk mm_segment_t oldfs; struct socket *sock = svsk->sk_sock; int avail, err; + struct ve_struct *old_env; oldfs = get_fs(); set_fs(KERNEL_DS); + old_env = set_exec_env(sock->sk->owner_env); err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); + (void)set_exec_env(old_env); set_fs(oldfs); return (err >= 0)? avail : err; @@ -454,6 +462,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str struct msghdr msg; struct socket *sock; int len, alen; + struct ve_struct *old_env; rqstp->rq_addrlen = sizeof(rqstp->rq_addr); sock = rqstp->rq_sock->sk_sock; @@ -465,7 +474,9 @@ svc_recvfrom(struct svc_rqst *rqstp, str msg.msg_flags = MSG_DONTWAIT; + old_env = set_exec_env(sock->sk->owner_env); len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); + (void)set_exec_env(old_env); /* sock_recvmsg doesn't fill in the name/namelen, so we must.. * possibly we should cache this in the svc_sock structure @@ -902,7 +913,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return 0; } - if (test_bit(SK_CONN, &svsk->sk_flags)) { + if (svsk->sk_sk->sk_state == TCP_LISTEN) { svc_tcp_accept(svsk); svc_sock_received(svsk); return 0; @@ -1440,6 +1451,8 @@ svc_delete_socket(struct svc_sock *svsk) serv = svsk->sk_server; sk = svsk->sk_sk; + /* XXX: serialization? 
*/ + sk->sk_user_data = NULL; sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; diff -uprN linux-2.6.18/net/sunrpc/sysctl.c linux-2.6.18.ovz/net/sunrpc/sysctl.c --- linux-2.6.18/net/sunrpc/sysctl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/sysctl.c 2007-06-13 06:55:08.000000000 -0400 @@ -123,6 +123,8 @@ static unsigned int min_slot_table_size static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; +static int xprt_min_abort_timeout = RPC_MIN_ABORT_TIMEOUT; +static int xprt_max_abort_timeout = RPC_MAX_ABORT_TIMEOUT; static ctl_table debug_table[] = { { @@ -201,6 +203,17 @@ static ctl_table debug_table[] = { .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, + { + .ctl_name = CTL_ABORT_TIMEOUT, + .procname = "abort_timeout", + .data = &xprt_abort_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_abort_timeout, + .extra2 = &xprt_max_abort_timeout + }, { .ctl_name = 0 } }; diff -uprN linux-2.6.18/net/sunrpc/xprt.c linux-2.6.18.ovz/net/sunrpc/xprt.c --- linux-2.6.18/net/sunrpc/xprt.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/xprt.c 2007-06-13 06:55:08.000000000 -0400 @@ -482,10 +482,13 @@ int xprt_adjust_timeout(struct rpc_rqst static void xprt_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); xprt_disconnect(xprt); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); + (void)set_exec_env(ve); } /** @@ -897,6 +900,7 @@ static struct rpc_xprt *xprt_setup(int p return ERR_PTR(-ENOMEM); xprt->addr = *ap; + xprt->owner_env = get_ve(get_exec_env()); switch (proto) { case IPPROTO_UDP: @@ -912,6 +916,7 @@ static struct rpc_xprt *xprt_setup(int p break; } if (result) { + put_ve(xprt->owner_env); kfree(xprt); return ERR_PTR(result); } @@ -975,6 +980,7 @@ int xprt_destroy(struct rpc_xprt *xprt) xprt->shutdown = 1; del_timer_sync(&xprt->timer); xprt->ops->destroy(xprt); + put_ve(xprt->owner_env); kfree(xprt); return 0; diff -uprN linux-2.6.18/net/sunrpc/xprtsock.c linux-2.6.18.ovz/net/sunrpc/xprtsock.c --- linux-2.6.18/net/sunrpc/xprtsock.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/sunrpc/xprtsock.c 2007-06-13 06:55:08.000000000 -0400 @@ -1015,7 +1015,9 @@ static void xs_udp_connect_worker(void * struct rpc_xprt *xprt = (struct rpc_xprt *) args; struct socket *sock = xprt->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; @@ -1061,6 +1063,7 @@ static void xs_udp_connect_worker(void * out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + (void)set_exec_env(ve); } /* @@ -1098,7 +1101,9 @@ static void xs_tcp_connect_worker(void * struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; @@ -1174,6 +1179,7 @@ out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); + (void)set_exec_env(ve); } /** diff -uprN linux-2.6.18/net/unix/af_unix.c linux-2.6.18.ovz/net/unix/af_unix.c --- linux-2.6.18/net/unix/af_unix.c 2006-09-19 
23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/unix/af_unix.c 2007-06-13 06:55:08.000000000 -0400 @@ -117,6 +117,9 @@ #include #include +#include +#include + int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; @@ -252,6 +255,8 @@ static struct sock *__unix_find_socket_b sk_for_each(s, node, &unix_socket_table[hash ^ type]) { struct unix_sock *u = unix_sk(s); + if (!ve_accessible(s->owner_env, get_exec_env())) + continue; if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) goto found; @@ -456,7 +461,7 @@ static int unix_listen(struct socket *so sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ - sk->sk_peercred.pid = current->tgid; + sk->sk_peercred.pid = virt_tgid(current); sk->sk_peercred.uid = current->euid; sk->sk_peercred.gid = current->egid; err = 0; @@ -578,6 +583,8 @@ static struct sock * unix_create1(struct sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); if (!sk) goto out; + if (ub_other_sock_charge(sk)) + goto out_sk_free; atomic_inc(&unix_nr_socks); @@ -598,6 +605,9 @@ static struct sock * unix_create1(struct unix_insert_socket(unix_sockets_unbound, sk); out: return sk; +out_sk_free: + sk_free(sk); + return NULL; } static int unix_create(struct socket *sock, int protocol) @@ -958,6 +968,7 @@ static int unix_stream_connect(struct so int st; int err; long timeo; + unsigned long chargesize; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -986,6 +997,10 @@ static int unix_stream_connect(struct so skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; + chargesize = skb_charge_fullsize(skb); + if (ub_sock_getwres_other(newsk, chargesize) < 0) + goto out; + ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); restart: /* Find listening sock. 
*/ @@ -1069,7 +1084,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - newsk->sk_peercred.pid = current->tgid; + newsk->sk_peercred.pid = virt_tgid(current); newsk->sk_peercred.uid = current->euid; newsk->sk_peercred.gid = current->egid; newu = unix_sk(newsk); @@ -1133,7 +1148,7 @@ static int unix_socketpair(struct socket sock_hold(skb); unix_peer(ska)=skb; unix_peer(skb)=ska; - ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; + ska->sk_peercred.pid = skb->sk_peercred.pid = virt_tgid(current); ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; @@ -1237,7 +1252,7 @@ static void unix_detach_fds(struct scm_c unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +void unix_destruct_fds(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); @@ -1248,6 +1263,7 @@ static void unix_destruct_fds(struct sk_ scm_destroy(&scm); sock_wfree(skb); } +EXPORT_SYMBOL_GPL(unix_destruct_fds); static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { @@ -1460,6 +1476,16 @@ static int unix_stream_sendmsg(struct ki size = len-sent; + if (msg->msg_flags & MSG_DONTWAIT) + ub_sock_makewres_other(sk, skb_charge_size(size)); + if (sock_bc(sk) != NULL && + sock_bc(sk)->poll_reserv >= + SOCK_MIN_UBCSPACE && + skb_charge_size(size) > + sock_bc(sk)->poll_reserv) + size = skb_charge_datalen(sock_bc(sk)->poll_reserv); + + /* Keep two messages in the pipe so it schedules better */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; @@ -1471,7 +1497,8 @@ static int unix_stream_sendmsg(struct ki * Grab a buffer */ - skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, + msg->msg_flags&MSG_DONTWAIT, &err); if (skb==NULL) goto out_err; @@ -1897,6 +1924,7 @@ static unsigned int unix_poll(struct fil { struct sock *sk = sock->sk; unsigned int mask; + int no_ub_res; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -1909,6 +1937,10 @@ static unsigned int unix_poll(struct fil if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; + no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ub_res) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) @@ -1922,7 +1954,7 @@ static unsigned int unix_poll(struct fil * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. 
*/ - if (unix_writable(sk)) + if (!no_ub_res && unix_writable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; @@ -2074,7 +2106,7 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); #ifdef CONFIG_PROC_FS - proc_net_fops_create("unix", 0, &unix_seq_fops); + proc_glob_fops_create("net/unix", 0, &unix_seq_fops); #endif unix_sysctl_register(); out: @@ -2085,7 +2117,7 @@ static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); unix_sysctl_unregister(); - proc_net_remove("unix"); + remove_proc_glob_entry("net/unix", NULL); proto_unregister(&unix_proto); } diff -uprN linux-2.6.18/net/unix/garbage.c linux-2.6.18.ovz/net/unix/garbage.c --- linux-2.6.18/net/unix/garbage.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/unix/garbage.c 2007-06-13 06:55:08.000000000 -0400 @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -136,7 +137,7 @@ void unix_notinflight(struct file *fp) atomic_dec(&unix_tot_inflight); } } - +EXPORT_SYMBOL_GPL(unix_notinflight); /* * Garbage Collector Support Functions diff -uprN linux-2.6.18/net/xfrm/xfrm_user.c linux-2.6.18.ovz/net/xfrm/xfrm_user.c --- linux-2.6.18/net/xfrm/xfrm_user.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/net/xfrm/xfrm_user.c 2007-06-13 06:55:08.000000000 -0400 @@ -1435,7 +1435,7 @@ static int xfrm_user_rcv_msg(struct sk_b link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) { + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) { *errp = -EPERM; return -1; } diff -uprN linux-2.6.18/scripts/kconfig/Makefile linux-2.6.18.ovz/scripts/kconfig/Makefile --- linux-2.6.18/scripts/kconfig/Makefile 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/scripts/kconfig/Makefile 2007-06-13 06:55:04.000000000 -0400 @@ -42,6 +42,10 @@ update-po-config: $(obj)/kxgettext $(Q)rm -f arch/um/Kconfig_arch $(Q)rm -f scripts/kconfig/linux_*.pot scripts/kconfig/config.pot +nonint_oldconfig: scripts/kconfig/conf + ./scripts/kconfig/conf -b arch/$(ARCH)/Kconfig + + PHONY += randconfig allyesconfig allnoconfig allmodconfig defconfig randconfig: $(obj)/conf diff -uprN linux-2.6.18/scripts/kconfig/conf.c linux-2.6.18.ovz/scripts/kconfig/conf.c --- linux-2.6.18/scripts/kconfig/conf.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/scripts/kconfig/conf.c 2007-06-13 06:55:04.000000000 -0400 @@ -21,6 +21,7 @@ enum { ask_all, ask_new, ask_silent, + dont_ask, set_default, set_yes, set_mod, @@ -37,6 +38,8 @@ static struct menu *rootEntry; static char nohelp_text[] = N_("Sorry, no help available for this option yet.\n"); +static int return_value = 0; + static void strip(char *str) { char *p = str; @@ -103,6 +106,12 @@ static void conf_askvalue(struct symbol fflush(stdout); fgets(line, 128, stdin); return; + case dont_ask: + if (!sym_has_value(sym)) { + fprintf(stderr,"CONFIG_%s\n",sym->name); + return_value++; + } + return; case set_default: printf("%s\n", def); return; @@ -346,6 +355,10 @@ static int conf_choice(struct menu *menu printf("?"); printf("]: "); switch (input_mode) { + case dont_ask: + cnt = def; + printf("%d\n", cnt); + break; case ask_new: case ask_silent: if (!is_new) { @@ -482,7 +495,10 @@ static void check_conf(struct menu *menu if (!conf_cnt++) printf(_("*\n* Restart config...\n*\n")); rootEntry = menu_get_parent_menu(menu); - conf(rootEntry); + if (input_mode == dont_ask) + fprintf(stderr,"CONFIG_%s\n",sym->name); + else + conf(rootEntry); } } @@ -501,6 +517,9 @@ int main(int ac, char **av) 
case 'o': input_mode = ask_new; break; + case 'b': + input_mode = dont_ask; + break; case 's': input_mode = ask_silent; valid_stdin = isatty(0) && isatty(1) && isatty(2); @@ -566,6 +585,7 @@ int main(int ac, char **av) } case ask_all: case ask_new: + case dont_ask: conf_read(NULL); break; case set_no: @@ -612,7 +632,7 @@ int main(int ac, char **av) do { conf_cnt = 0; check_conf(&rootmenu); - } while (conf_cnt); + } while ((conf_cnt) && (input_mode != dont_ask)); if (conf_write(NULL)) { fprintf(stderr, _("\n*** Error during writing of the kernel configuration.\n\n")); return 1; @@ -623,5 +643,5 @@ skip_check: return 1; } - return 0; + return return_value; } diff -uprN linux-2.6.18/scripts/mod/modpost.c linux-2.6.18.ovz/scripts/mod/modpost.c --- linux-2.6.18/scripts/mod/modpost.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/scripts/mod/modpost.c 2007-06-13 06:55:08.000000000 -0400 @@ -1187,16 +1187,20 @@ static void add_header(struct buffer *b, /** * Record CRCs for unresolved symbols **/ -static void add_versions(struct buffer *b, struct module *mod) +static int add_versions(struct buffer *b, struct module *mod) { struct symbol *s, *exp; - + int err; + + err = 0; for (s = mod->unres; s; s = s->next) { exp = find_symbol(s->name); if (!exp || exp->module == mod) { - if (have_vmlinux && !s->weak) + if (have_vmlinux && !s->weak) { warn("\"%s\" [%s.ko] undefined!\n", s->name, mod->name); + err = 1; + } continue; } s->module = exp->module; @@ -1205,7 +1209,7 @@ static void add_versions(struct buffer * } if (!modversions) - return; + return err; buf_printf(b, "\n"); buf_printf(b, "static const struct modversion_info ____versions[]\n"); @@ -1225,6 +1229,7 @@ static void add_versions(struct buffer * } buf_printf(b, "};\n"); + return err; } static void add_depends(struct buffer *b, struct module *mod, @@ -1401,7 +1406,7 @@ int main(int argc, char **argv) char fname[SZ]; char *kernel_read = NULL, *module_read = NULL; char *dump_write = NULL; - int opt; + int opt, err; while ((opt = getopt(argc, argv, "i:I:mo:a")) != -1) { switch(opt) { @@ -1441,6 +1446,7 @@ int main(int argc, char **argv) check_exports(mod); } + err = 0; for (mod = modules; mod; mod = mod->next) { if (mod->skip) continue; @@ -1448,7 +1454,7 @@ int main(int argc, char **argv) buf.pos = 0; add_header(&buf, mod); - add_versions(&buf, mod); + err |= add_versions(&buf, mod); add_depends(&buf, mod, modules); add_moddevtable(&buf, mod); add_srcversion(&buf, mod); @@ -1460,5 +1466,5 @@ int main(int argc, char **argv) if (dump_write) write_dump(dump_write); - return 0; + return err; } diff -uprN linux-2.6.18/security/Kconfig linux-2.6.18.ovz/security/Kconfig --- linux-2.6.18/security/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/security/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS config SECURITY bool "Enable different security models" - depends on SYSFS + depends on SYSFS && !VE help This allows you to choose different security modules to be configured into your kernel. 
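The conf.c hunks above add a non-interactive mode: with -b (the new nonint_oldconfig target), a symbol without a saved value is reported on stderr and counted instead of prompting, and the count becomes conf's exit status so a build wrapper can fail when unanswered options remain. A small standalone model of that flow, assuming only the behaviour shown in the hunks; the symbol data and helper names here are illustrative:

    #include <stdio.h>

    struct symbol {
        const char *name;
        int has_value;       /* models sym_has_value(sym) */
    };

    static int return_value; /* becomes the exit status, as in the patch */

    static void conf_askvalue_dont_ask(const struct symbol *sym)
    {
        if (!sym->has_value) {
            fprintf(stderr, "CONFIG_%s\n", sym->name);  /* report, never prompt */
            return_value++;                             /* count unanswered options */
        }
    }

    int main(void)
    {
        struct symbol syms[] = { { "NET", 1 }, { "VE", 0 }, { "BEANCOUNTERS", 0 } };
        size_t i;

        for (i = 0; i < sizeof(syms) / sizeof(syms[0]); i++)
            conf_askvalue_dont_ask(&syms[i]);
        return return_value;  /* 2 here: two options still need answers */
    }

The modpost change just above follows the same exit-status discipline: add_versions() now returns an error for undefined symbols, main() ORs the per-module results together, and a nonzero exit fails the build instead of merely printing a warning.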
diff -uprN linux-2.6.18/security/commoncap.c linux-2.6.18.ovz/security/commoncap.c --- linux-2.6.18/security/commoncap.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/security/commoncap.c 2007-06-13 06:55:08.000000000 -0400 @@ -34,6 +34,10 @@ EXPORT_SYMBOL(cap_netlink_send); int cap_netlink_recv(struct sk_buff *skb, int cap) { + if (likely(cap == CAP_VE_NET_ADMIN) && + cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + return 0; + if (!cap_raised(NETLINK_CB(skb).eff_cap, cap)) return -EPERM; return 0; @@ -196,7 +200,7 @@ int cap_inode_setxattr(struct dentry *de { if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -205,7 +209,7 @@ int cap_inode_removexattr(struct dentry { if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -311,7 +315,7 @@ void cap_task_reparent_to_init (struct t int cap_syslog (int type) { - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; } diff -uprN linux-2.6.18/security/seclvl.c linux-2.6.18.ovz/security/seclvl.c --- linux-2.6.18/security/seclvl.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/security/seclvl.c 2007-06-13 06:55:08.000000000 -0400 @@ -370,6 +370,8 @@ static int seclvl_settime(struct timespe current->group_leader->pid); return -EPERM; } /* if attempt to decrement time */ + if (tv->tv_sec > 1924988400) /* disallow dates after 2030 */ + return -EPERM; /* CVE-2005-4352 */ } /* if seclvl > 1 */ return 0; } diff -uprN linux-2.6.18/security/selinux/Kconfig linux-2.6.18.ovz/security/selinux/Kconfig --- linux-2.6.18/security/selinux/Kconfig 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/security/selinux/Kconfig 2007-06-13 06:55:08.000000000 -0400 @@ -1,6 +1,6 @@ config SECURITY_SELINUX bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET + depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE select NETWORK_SECMARK default n help diff -uprN linux-2.6.18/security/selinux/hooks.c linux-2.6.18.ovz/security/selinux/hooks.c --- linux-2.6.18/security/selinux/hooks.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/security/selinux/hooks.c 2007-06-13 06:55:08.000000000 -0400 @@ -4363,12 +4363,12 @@ static int selinux_setprocattr(struct ta struct task_struct *g, *t; struct mm_struct *mm = p->mm; read_lock(&tasklist_lock); - do_each_thread(g, t) + do_each_thread_ve(g, t) if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); return -EPERM; } - while_each_thread(g, t); + while_each_thread_ve(g, t); read_unlock(&tasklist_lock); } diff -uprN linux-2.6.18/sound/core/control.c linux-2.6.18.ovz/sound/core/control.c --- linux-2.6.18/sound/core/control.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/core/control.c 2007-06-13 06:55:08.000000000 -0400 @@ -997,6 +997,7 @@ static int snd_ctl_elem_add(struct snd_c if (ue == NULL) return -ENOMEM; ue->info = *info; + ue->info.access = 0; ue->elem_data = (char *)ue + sizeof(*ue); ue->elem_data_size = private_size; kctl.private_free = snd_ctl_elem_user_free; diff -uprN linux-2.6.18/sound/core/hwdep.c linux-2.6.18.ovz/sound/core/hwdep.c --- linux-2.6.18/sound/core/hwdep.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/core/hwdep.c 2007-06-13 06:55:08.000000000
-0400 @@ -158,6 +158,7 @@ static int snd_hwdep_release(struct inod { int err = -ENXIO; struct snd_hwdep *hw = file->private_data; + struct module *mod = hw->card->module; mutex_lock(&hw->open_mutex); if (hw->ops.release) { err = hw->ops.release(hw, file); @@ -167,7 +168,7 @@ static int snd_hwdep_release(struct inod hw->used--; snd_card_file_remove(hw->card, file); mutex_unlock(&hw->open_mutex); - module_put(hw->card->module); + module_put(mod); return err; } diff -uprN linux-2.6.18/sound/core/info.c linux-2.6.18.ovz/sound/core/info.c --- linux-2.6.18/sound/core/info.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/core/info.c 2007-06-13 06:55:08.000000000 -0400 @@ -119,7 +119,10 @@ int snd_iprintf(struct snd_info_buffer * len = buffer->len - buffer->size; va_start(args, fmt); for (;;) { - res = vsnprintf(buffer->buffer + buffer->curr, len, fmt, args); + va_list ap; + va_copy(ap, args); + res = vsnprintf(buffer->buffer + buffer->curr, len, fmt, ap); + va_end(ap); if (res < len) break; err = resize_info_buffer(buffer, buffer->len + PAGE_SIZE); diff -uprN linux-2.6.18/sound/core/info_oss.c linux-2.6.18.ovz/sound/core/info_oss.c --- linux-2.6.18/sound/core/info_oss.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/core/info_oss.c 2007-06-13 06:55:08.000000000 -0400 @@ -96,11 +96,11 @@ static void snd_sndstat_proc_read(struct { snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA v" CONFIG_SND_VERSION " emulation code)\n"); snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n", - system_utsname.sysname, - system_utsname.nodename, - system_utsname.release, - system_utsname.version, - system_utsname.machine); + init_utsname()->sysname, + init_utsname()->nodename, + init_utsname()->release, + init_utsname()->version, + init_utsname()->machine); snd_iprintf(buffer, "Config options: 0\n"); snd_iprintf(buffer, "\nInstalled drivers: \n"); snd_iprintf(buffer, "Type 10: ALSA emulation\n"); diff -uprN linux-2.6.18/sound/core/rtctimer.c linux-2.6.18.ovz/sound/core/rtctimer.c --- linux-2.6.18/sound/core/rtctimer.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/core/rtctimer.c 2007-06-13 06:55:08.000000000 -0400 @@ -50,7 +50,9 @@ static int rtctimer_stop(struct snd_time * The hardware dependent description for this timer. 
*/ static struct snd_timer_hardware rtc_hw = { - .flags = SNDRV_TIMER_HW_FIRST|SNDRV_TIMER_HW_AUTO, + .flags = SNDRV_TIMER_HW_AUTO | + SNDRV_TIMER_HW_FIRST | + SNDRV_TIMER_HW_TASKLET, .ticks = 100000000L, /* FIXME: XXX */ .open = rtctimer_open, .close = rtctimer_close, @@ -60,6 +62,7 @@ static struct snd_timer_hardware rtc_hw static int rtctimer_freq = RTC_FREQ; /* frequency */ static struct snd_timer *rtctimer; +static struct tasklet_struct rtc_tasklet; static rtc_task_t rtc_task; @@ -81,6 +84,7 @@ rtctimer_close(struct snd_timer *t) rtc_task_t *rtc = t->private_data; if (rtc) { rtc_unregister(rtc); + tasklet_kill(&rtc_tasklet); t->private_data = NULL; } return 0; @@ -105,12 +109,17 @@ rtctimer_stop(struct snd_timer *timer) return 0; } +static void rtctimer_tasklet(unsigned long data) +{ + snd_timer_interrupt((struct snd_timer *)data, 1); +} + /* * interrupt */ static void rtctimer_interrupt(void *private_data) { - snd_timer_interrupt(private_data, 1); + tasklet_hi_schedule(private_data); } @@ -139,9 +148,11 @@ static int __init rtctimer_init(void) timer->hw = rtc_hw; timer->hw.resolution = NANO_SEC / rtctimer_freq; + tasklet_init(&rtc_tasklet, rtctimer_tasklet, (unsigned long)timer); + /* set up RTC callback */ rtc_task.func = rtctimer_interrupt; - rtc_task.private_data = timer; + rtc_task.private_data = &rtc_tasklet; err = snd_timer_global_register(timer); if (err < 0) { diff -uprN linux-2.6.18/sound/pci/au88x0/au88x0.c linux-2.6.18.ovz/sound/pci/au88x0/au88x0.c --- linux-2.6.18/sound/pci/au88x0/au88x0.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/pci/au88x0/au88x0.c 2007-06-13 06:55:08.000000000 -0400 @@ -128,6 +128,7 @@ static int snd_vortex_dev_free(struct sn // Take down PCI interface. synchronize_irq(vortex->irq); free_irq(vortex->irq, vortex); + iounmap(vortex->mmio); pci_release_regions(vortex->pci_dev); pci_disable_device(vortex->pci_dev); kfree(vortex); diff -uprN linux-2.6.18/sound/pci/emu10k1/emu10k1_main.c linux-2.6.18.ovz/sound/pci/emu10k1/emu10k1_main.c --- linux-2.6.18/sound/pci/emu10k1/emu10k1_main.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/pci/emu10k1/emu10k1_main.c 2007-06-13 06:55:08.000000000 -0400 @@ -1460,8 +1460,8 @@ void snd_emu10k1_resume_regs(struct snd_ /* resore for spdif */ if (emu->audigy) - outl(emu->port + A_IOCFG, emu->saved_a_iocfg); - outl(emu->port + HCFG, emu->saved_hcfg); + outl(emu->saved_a_iocfg, emu->port + A_IOCFG); + outl(emu->saved_hcfg, emu->port + HCFG); val = emu->saved_ptr; for (reg = saved_regs; *reg != 0xff; reg++) diff -uprN linux-2.6.18/sound/ppc/keywest.c linux-2.6.18.ovz/sound/ppc/keywest.c --- linux-2.6.18/sound/ppc/keywest.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/ppc/keywest.c 2007-06-13 06:55:08.000000000 -0400 @@ -117,6 +117,9 @@ int __init snd_pmac_tumbler_post_init(vo { int err; + if (!keywest_ctx || !keywest_ctx->client) + return -ENXIO; + if ((err = keywest_ctx->init_client(keywest_ctx)) < 0) { snd_printk(KERN_ERR "tumbler: %i :cannot initialize the MCS\n", err); return err; diff -uprN linux-2.6.18/sound/usb/usx2y/usbusx2yaudio.c linux-2.6.18.ovz/sound/usb/usx2y/usbusx2yaudio.c --- linux-2.6.18/sound/usb/usx2y/usbusx2yaudio.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/usb/usx2y/usbusx2yaudio.c 2007-06-13 06:55:08.000000000 -0400 @@ -322,7 +322,7 @@ static void i_usX2Y_urb_complete(struct usX2Y_error_urb_status(usX2Y, subs, urb); return; } - if (likely((0xFFFF & urb->start_frame) == usX2Y->wait_iso_frame)) + if (likely(urb->start_frame == 
usX2Y->wait_iso_frame)) subs->completed_urb = urb; else { usX2Y_error_sequence(usX2Y, subs, urb); @@ -335,13 +335,9 @@ static void i_usX2Y_urb_complete(struct atomic_read(&capsubs->state) >= state_PREPARED && (playbacksubs->completed_urb || atomic_read(&playbacksubs->state) < state_PREPARED)) { - if (!usX2Y_usbframe_complete(capsubs, playbacksubs, urb->start_frame)) { - if (nr_of_packs() <= urb->start_frame && - urb->start_frame <= (2 * nr_of_packs() - 1)) // uhci and ohci - usX2Y->wait_iso_frame = urb->start_frame - nr_of_packs(); - else - usX2Y->wait_iso_frame += nr_of_packs(); - } else { + if (!usX2Y_usbframe_complete(capsubs, playbacksubs, urb->start_frame)) + usX2Y->wait_iso_frame += nr_of_packs(); + else { snd_printdd("\n"); usX2Y_clients_stop(usX2Y); } @@ -495,7 +491,6 @@ static int usX2Y_urbs_start(struct snd_u if (subs != NULL && atomic_read(&subs->state) >= state_PREPARED) goto start; } - usX2Y->wait_iso_frame = -1; start: usX2Y_subs_startup(subs); @@ -516,10 +511,9 @@ static int usX2Y_urbs_start(struct snd_u snd_printk (KERN_ERR "cannot submit datapipe for urb %d, err = %d\n", i, err); err = -EPIPE; goto cleanup; - } else { - if (0 > usX2Y->wait_iso_frame) + } else + if (i == 0) usX2Y->wait_iso_frame = urb->start_frame; - } urb->transfer_flags = 0; } else { atomic_set(&subs->state, state_STARTING1); diff -uprN linux-2.6.18/sound/usb/usx2y/usx2yhwdeppcm.c linux-2.6.18.ovz/sound/usb/usx2y/usx2yhwdeppcm.c --- linux-2.6.18/sound/usb/usx2y/usx2yhwdeppcm.c 2006-09-19 23:42:06.000000000 -0400 +++ linux-2.6.18.ovz/sound/usb/usx2y/usx2yhwdeppcm.c 2007-06-13 06:55:08.000000000 -0400 @@ -243,7 +243,7 @@ static void i_usX2Y_usbpcm_urb_complete( usX2Y_error_urb_status(usX2Y, subs, urb); return; } - if (likely((0xFFFF & urb->start_frame) == usX2Y->wait_iso_frame)) + if (likely(urb->start_frame == usX2Y->wait_iso_frame)) subs->completed_urb = urb; else { usX2Y_error_sequence(usX2Y, subs, urb); @@ -256,13 +256,9 @@ static void i_usX2Y_usbpcm_urb_complete( if (capsubs->completed_urb && atomic_read(&capsubs->state) >= state_PREPARED && (NULL == capsubs2 || capsubs2->completed_urb) && (playbacksubs->completed_urb || atomic_read(&playbacksubs->state) < state_PREPARED)) { - if (!usX2Y_usbpcm_usbframe_complete(capsubs, capsubs2, playbacksubs, urb->start_frame)) { - if (nr_of_packs() <= urb->start_frame && - urb->start_frame <= (2 * nr_of_packs() - 1)) // uhci and ohci - usX2Y->wait_iso_frame = urb->start_frame - nr_of_packs(); - else - usX2Y->wait_iso_frame += nr_of_packs(); - } else { + if (!usX2Y_usbpcm_usbframe_complete(capsubs, capsubs2, playbacksubs, urb->start_frame)) + usX2Y->wait_iso_frame += nr_of_packs(); + else { snd_printdd("\n"); usX2Y_clients_stop(usX2Y); } @@ -433,7 +429,6 @@ static int usX2Y_usbpcm_urbs_start(struc if (subs != NULL && atomic_read(&subs->state) >= state_PREPARED) goto start; } - usX2Y->wait_iso_frame = -1; start: usX2Y_usbpcm_subs_startup(subs); @@ -459,7 +454,7 @@ static int usX2Y_usbpcm_urbs_start(struc goto cleanup; } else { snd_printdd("%i\n", urb->start_frame); - if (0 > usX2Y->wait_iso_frame) + if (u == 0) usX2Y->wait_iso_frame = urb->start_frame; } urb->transfer_flags = 0; @@ -632,7 +627,7 @@ static int usX2Y_pcms_lock_check(struct for (s = 0; s < 2; ++s) { struct snd_pcm_substream *substream; substream = pcm->streams[s].substream; - if (SUBSTREAM_BUSY(substream)) + if (substream && SUBSTREAM_BUSY(substream)) err = -EBUSY; } }
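The usx2y hunks above replace the uhci/ohci start-frame guesswork with plain sequencing: the expected frame is latched from the first submitted URB, advanced by nr_of_packs() per completed URB, and any mismatch is a sequence error; the 0xFFFF masking is dropped so the full frame number is compared. A compact userspace model of that bookkeeping, assuming only what the hunks show; PACKS and the frame numbers are illustrative:

    #include <stdio.h>

    #define PACKS 2              /* stands in for nr_of_packs() */

    static int wait_iso_frame;   /* expected start_frame of the next URB */

    static void urbs_start(int first_start_frame)
    {
        wait_iso_frame = first_start_frame;   /* latched from URB 0 only */
    }

    static int urb_complete(int start_frame)
    {
        if (start_frame != wait_iso_frame)
            return -1;                        /* sequence error: stop the stream */
        wait_iso_frame += PACKS;              /* expect the following window */
        return 0;
    }

    int main(void)
    {
        int a, b, c;

        urbs_start(100);
        a = urb_complete(100);    /*  0: in sequence */
        b = urb_complete(102);    /*  0: advanced by PACKS */
        c = urb_complete(105);    /* -1: out of sequence */
        printf("%d %d %d\n", a, b, c);
        return 0;
    }

Latching from the first URB (i == 0, u == 0 in the patch) also removes the need for the old -1 sentinel in wait_iso_frame, which is why the usX2Y->wait_iso_frame = -1 initializations are deleted.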