Magellan Linux

Annotation of /trunk/kernel-alx/patches-3.14/0120-3.14.21-all-fixes.patch

Revision 2506
Fri Oct 17 07:55:45 2014 UTC by niro
File size: 82171 bytes
-patches for 3.14
1 niro 2506 diff --git a/Makefile b/Makefile
2     index beb7e6f0803b..41e6e19fe2e9 100644
3     --- a/Makefile
4     +++ b/Makefile
5     @@ -1,6 +1,6 @@
6     VERSION = 3
7     PATCHLEVEL = 14
8     -SUBLEVEL = 20
9     +SUBLEVEL = 21
10     EXTRAVERSION =
11     NAME = Remembering Coco
12    
13     diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
14     index fb5e4c658f7a..ef470a7a3d0f 100644
15     --- a/arch/unicore32/include/asm/mmu_context.h
16     +++ b/arch/unicore32/include/asm/mmu_context.h
17     @@ -14,6 +14,8 @@
18    
19     #include <linux/compiler.h>
20     #include <linux/sched.h>
21     +#include <linux/mm.h>
22     +#include <linux/vmacache.h>
23     #include <linux/io.h>
24    
25     #include <asm/cacheflush.h>
26     @@ -73,7 +75,7 @@ do { \
27     else \
28     mm->mmap = NULL; \
29     rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
30     - mm->mmap_cache = NULL; \
31     + vmacache_invalidate(mm); \
32     mm->map_count--; \
33     remove_vma(high_vma); \
34     } \
35     diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
36     index c706d50a8b06..8c16c2f97026 100644
37     --- a/drivers/block/drbd/drbd_nl.c
38     +++ b/drivers/block/drbd/drbd_nl.c
39     @@ -525,6 +525,12 @@ void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
40     struct task_struct *opa;
41    
42     kref_get(&tconn->kref);
43     + /* We may just have force_sig()'ed this thread
44     + * to get it out of some blocking network function.
45     + * Clear signals; otherwise kthread_run(), which internally uses
46     + * wait_on_completion_killable(), will mistake our pending signal
47     + * for a new fatal signal and fail. */
48     + flush_signals(current);
49     opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
50     if (IS_ERR(opa)) {
51     conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
52     diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c
53     index 0e27844e8c2d..8089dd2cd9d8 100644
54     --- a/drivers/cpufreq/integrator-cpufreq.c
55     +++ b/drivers/cpufreq/integrator-cpufreq.c
56     @@ -213,9 +213,9 @@ static int __init integrator_cpufreq_probe(struct platform_device *pdev)
57     return cpufreq_register_driver(&integrator_driver);
58     }
59    
60     -static void __exit integrator_cpufreq_remove(struct platform_device *pdev)
61     +static int __exit integrator_cpufreq_remove(struct platform_device *pdev)
62     {
63     - cpufreq_unregister_driver(&integrator_driver);
64     + return cpufreq_unregister_driver(&integrator_driver);
65     }
66    
67     static const struct of_device_id integrator_cpufreq_match[] = {
68     diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
69     index d278be110805..1855cdca39cd 100644
70     --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
71     +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
72     @@ -827,6 +827,16 @@ void i915_check_and_clear_faults(struct drm_device *dev)
73     POSTING_READ(RING_FAULT_REG(&dev_priv->ring[RCS]));
74     }
75    
76     +static void i915_ggtt_flush(struct drm_i915_private *dev_priv)
77     +{
78     + if (INTEL_INFO(dev_priv->dev)->gen < 6) {
79     + intel_gtt_chipset_flush();
80     + } else {
81     + I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
82     + POSTING_READ(GFX_FLSH_CNTL_GEN6);
83     + }
84     +}
85     +
86     void i915_gem_suspend_gtt_mappings(struct drm_device *dev)
87     {
88     struct drm_i915_private *dev_priv = dev->dev_private;
89     @@ -843,6 +853,8 @@ void i915_gem_suspend_gtt_mappings(struct drm_device *dev)
90     dev_priv->gtt.base.start / PAGE_SIZE,
91     dev_priv->gtt.base.total / PAGE_SIZE,
92     true);
93     +
94     + i915_ggtt_flush(dev_priv);
95     }
96    
97     void i915_gem_restore_gtt_mappings(struct drm_device *dev)
98     @@ -863,7 +875,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
99     i915_gem_gtt_bind_object(obj, obj->cache_level);
100     }
101    
102     - i915_gem_chipset_flush(dev);
103     + i915_ggtt_flush(dev_priv);
104     }
105    
106     int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
107     diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
108     index 18cda77b4f79..4913c0690872 100644
109     --- a/drivers/md/raid5.c
110     +++ b/drivers/md/raid5.c
111     @@ -64,6 +64,10 @@
112     #define cpu_to_group(cpu) cpu_to_node(cpu)
113     #define ANY_GROUP NUMA_NO_NODE
114    
115     +static bool devices_handle_discard_safely = false;
116     +module_param(devices_handle_discard_safely, bool, 0644);
117     +MODULE_PARM_DESC(devices_handle_discard_safely,
118     + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
119     static struct workqueue_struct *raid5_wq;
120     /*
121     * Stripe cache
122     @@ -6117,7 +6121,7 @@ static int run(struct mddev *mddev)
123     mddev->queue->limits.discard_granularity = stripe;
124     /*
125     * unaligned part of discard request will be ignored, so can't
126     - * guarantee discard_zerors_data
127     + * guarantee discard_zeroes_data
128     */
129     mddev->queue->limits.discard_zeroes_data = 0;
130    
131     @@ -6142,6 +6146,18 @@ static int run(struct mddev *mddev)
132     !bdev_get_queue(rdev->bdev)->
133     limits.discard_zeroes_data)
134     discard_supported = false;
135     + /* Unfortunately, discard_zeroes_data is not currently
136     + * a guarantee - just a hint. So we only allow DISCARD
137     + * if the sysadmin has confirmed that only safe devices
138     + * are in use by setting a module parameter.
139     + */
140     + if (!devices_handle_discard_safely) {
141     + if (discard_supported) {
142     + pr_info("md/raid456: discard support disabled due to uncertainty.\n");
143     + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
144     + }
145     + discard_supported = false;
146     + }
147     }
148    
149     if (discard_supported &&
150     diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c
151     index a127925c9d61..06faea4d60ee 100644
152     --- a/drivers/media/v4l2-core/videobuf2-core.c
153     +++ b/drivers/media/v4l2-core/videobuf2-core.c
154     @@ -745,6 +745,7 @@ static int __reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req)
155     * to the userspace.
156     */
157     req->count = allocated_buffers;
158     + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
159    
160     return 0;
161     }
162     @@ -793,6 +794,7 @@ static int __create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create
163     memset(q->plane_sizes, 0, sizeof(q->plane_sizes));
164     memset(q->alloc_ctx, 0, sizeof(q->alloc_ctx));
165     q->memory = create->memory;
166     + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
167     }
168    
169     num_buffers = min(create->count, VIDEO_MAX_FRAME - q->num_buffers);
170     @@ -1447,6 +1449,7 @@ static int vb2_internal_qbuf(struct vb2_queue *q, struct v4l2_buffer *b)
171     * dequeued in dqbuf.
172     */
173     list_add_tail(&vb->queued_entry, &q->queued_list);
174     + q->waiting_for_buffers = false;
175     vb->state = VB2_BUF_STATE_QUEUED;
176    
177     /*
178     @@ -1841,6 +1844,7 @@ static int vb2_internal_streamoff(struct vb2_queue *q, enum v4l2_buf_type type)
179     * and videobuf, effectively returning control over them to userspace.
180     */
181     __vb2_queue_cancel(q);
182     + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type);
183    
184     dprintk(3, "Streamoff successful\n");
185     return 0;
186     @@ -2150,9 +2154,16 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait)
187     }
188    
189     /*
190     - * There is nothing to wait for if no buffers have already been queued.
191     + * There is nothing to wait for if the queue isn't streaming.
192     */
193     - if (list_empty(&q->queued_list))
194     + if (!vb2_is_streaming(q))
195     + return res | POLLERR;
196     + /*
197     + * For compatibility with vb1: if QBUF hasn't been called yet, then
198     + * return POLLERR as well. This only affects capture queues, output
199     + * queues will always initialize waiting_for_buffers to false.
200     + */
201     + if (q->waiting_for_buffers)
202     return res | POLLERR;
203    
204     if (list_empty(&q->done_list))
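The vb2_poll() change above restores the videobuf1-style contract for capture queues. A minimal userspace sketch of that contract follows; the device path and the assumption that no buffers have been queued yet are illustrative, not part of the patch.

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

/* Returns 1 if poll() flags the queue as unusable (POLLERR), which is
 * what a capture queue reports before REQBUFS/QBUF with this change,
 * instead of blocking in poll() forever. */
static int capture_poll_before_qbuf(const char *dev)
{
	struct pollfd pfd = { .events = POLLIN };
	int ret;

	pfd.fd = open(dev, O_RDWR);
	if (pfd.fd < 0)
		return -1;
	poll(&pfd, 1, 0);
	ret = (pfd.revents & POLLERR) ? 1 : 0;
	close(pfd.fd);
	return ret;
}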
205     diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
206     index f15d4353f30f..5d12d69e2045 100644
207     --- a/fs/cifs/cifsglob.h
208     +++ b/fs/cifs/cifsglob.h
209     @@ -399,6 +399,8 @@ struct smb_version_operations {
210     const struct cifs_fid *, u32 *);
211     int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
212     int);
213     + /* check if we need to issue closedir */
214     + bool (*dir_needs_close)(struct cifsFileInfo *);
215     };
216    
217     struct smb_version_values {
218     diff --git a/fs/cifs/file.c b/fs/cifs/file.c
219     index 8175b18df819..d375322b6cec 100644
220     --- a/fs/cifs/file.c
221     +++ b/fs/cifs/file.c
222     @@ -762,7 +762,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
223    
224     cifs_dbg(FYI, "Freeing private data in close dir\n");
225     spin_lock(&cifs_file_list_lock);
226     - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
227     + if (server->ops->dir_needs_close(cfile)) {
228     cfile->invalidHandle = true;
229     spin_unlock(&cifs_file_list_lock);
230     if (server->ops->close_dir)
231     diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
232     index 2bbf11b09214..b334a89d6a66 100644
233     --- a/fs/cifs/readdir.c
234     +++ b/fs/cifs/readdir.c
235     @@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
236     /* close and restart search */
237     cifs_dbg(FYI, "search backing up - close and restart search\n");
238     spin_lock(&cifs_file_list_lock);
239     - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
240     + if (server->ops->dir_needs_close(cfile)) {
241     cfile->invalidHandle = true;
242     spin_unlock(&cifs_file_list_lock);
243     if (server->ops->close_dir)
244     diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
245     index d1fdfa848703..e9ad8d37bb00 100644
246     --- a/fs/cifs/smb1ops.c
247     +++ b/fs/cifs/smb1ops.c
248     @@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
249     tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
250     if (tmprc == -EOPNOTSUPP)
251     *symlink = true;
252     - else
253     + else if (tmprc == 0)
254     CIFSSMBClose(xid, tcon, fid.netfid);
255     }
256    
257     @@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock)
258     return oplock == OPLOCK_READ;
259     }
260    
261     +static bool
262     +cifs_dir_needs_close(struct cifsFileInfo *cfile)
263     +{
264     + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
265     +}
266     +
267     struct smb_version_operations smb1_operations = {
268     .send_cancel = send_nt_cancel,
269     .compare_fids = cifs_compare_fids,
270     @@ -1078,6 +1084,7 @@ struct smb_version_operations smb1_operations = {
271     .query_mf_symlink = cifs_query_mf_symlink,
272     .create_mf_symlink = cifs_create_mf_symlink,
273     .is_read_op = cifs_is_read_op,
274     + .dir_needs_close = cifs_dir_needs_close,
275     #ifdef CONFIG_CIFS_XATTR
276     .query_all_EAs = CIFSSMBQAllEAs,
277     .set_EA = CIFSSMBSetEA,
278     diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
279     index e31a9dfdcd39..a491814cb2c0 100644
280     --- a/fs/cifs/smb2maperror.c
281     +++ b/fs/cifs/smb2maperror.c
282     @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
283     {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
284     {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
285     {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
286     - {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"},
287     + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
288     {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
289     {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
290     {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
291     @@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
292     {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
293     "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
294     {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
295     + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
296     + "STATUS_REPARSE_NOT_HANDLED"},
297     {STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
298     "STATUS_DEVICE_REQUIRES_CLEANING"},
299     {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
300     diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
301     index f8977b2d9187..34a17d425be6 100644
302     --- a/fs/cifs/smb2ops.c
303     +++ b/fs/cifs/smb2ops.c
304     @@ -1102,6 +1102,12 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
305     return le32_to_cpu(lc->lcontext.LeaseState);
306     }
307    
308     +static bool
309     +smb2_dir_needs_close(struct cifsFileInfo *cfile)
310     +{
311     + return !cfile->invalidHandle;
312     +}
313     +
314     struct smb_version_operations smb20_operations = {
315     .compare_fids = smb2_compare_fids,
316     .setup_request = smb2_setup_request,
317     @@ -1175,6 +1181,7 @@ struct smb_version_operations smb20_operations = {
318     .create_lease_buf = smb2_create_lease_buf,
319     .parse_lease_buf = smb2_parse_lease_buf,
320     .clone_range = smb2_clone_range,
321     + .dir_needs_close = smb2_dir_needs_close,
322     };
323    
324     struct smb_version_operations smb21_operations = {
325     @@ -1250,6 +1257,7 @@ struct smb_version_operations smb21_operations = {
326     .create_lease_buf = smb2_create_lease_buf,
327     .parse_lease_buf = smb2_parse_lease_buf,
328     .clone_range = smb2_clone_range,
329     + .dir_needs_close = smb2_dir_needs_close,
330     };
331    
332     struct smb_version_operations smb30_operations = {
333     @@ -1328,6 +1336,7 @@ struct smb_version_operations smb30_operations = {
334     .parse_lease_buf = smb3_parse_lease_buf,
335     .clone_range = smb2_clone_range,
336     .validate_negotiate = smb3_validate_negotiate,
337     + .dir_needs_close = smb2_dir_needs_close,
338     };
339    
340     struct smb_version_values smb20_values = {
341     diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
342     index 9aab8fe0e508..348792911e1f 100644
343     --- a/fs/cifs/smb2pdu.c
344     +++ b/fs/cifs/smb2pdu.c
345     @@ -2136,6 +2136,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
346     rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
347    
348     if (rc) {
349     + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
350     + srch_inf->endOfSearch = true;
351     + rc = 0;
352     + }
353     cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
354     goto qdir_exit;
355     }
356     @@ -2173,11 +2177,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
357     else
358     cifs_dbg(VFS, "illegal search buffer type\n");
359    
360     - if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
361     - srch_inf->endOfSearch = 1;
362     - else
363     - srch_inf->endOfSearch = 0;
364     -
365     return rc;
366    
367     qdir_exit:
368     diff --git a/fs/exec.c b/fs/exec.c
369     index 31e46b1b358b..ea4449d0536a 100644
370     --- a/fs/exec.c
371     +++ b/fs/exec.c
372     @@ -26,6 +26,7 @@
373     #include <linux/file.h>
374     #include <linux/fdtable.h>
375     #include <linux/mm.h>
376     +#include <linux/vmacache.h>
377     #include <linux/stat.h>
378     #include <linux/fcntl.h>
379     #include <linux/swap.h>
380     @@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code);
381     static int exec_mmap(struct mm_struct *mm)
382     {
383     struct task_struct *tsk;
384     - struct mm_struct * old_mm, *active_mm;
385     + struct mm_struct *old_mm, *active_mm;
386    
387     /* Notify parent that we're no longer interested in the old VM */
388     tsk = current;
389     @@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *mm)
390     tsk->mm = mm;
391     tsk->active_mm = mm;
392     activate_mm(active_mm, mm);
393     + tsk->mm->vmacache_seqnum = 0;
394     + vmacache_flush(tsk);
395     task_unlock(tsk);
396     if (old_mm) {
397     up_read(&old_mm->mmap_sem);
398     diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
399     index d19b30ababf1..a4a8ed56e438 100644
400     --- a/fs/hugetlbfs/inode.c
401     +++ b/fs/hugetlbfs/inode.c
402     @@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
403     int error;
404     int i;
405    
406     + if (!hugepages_supported()) {
407     + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
408     + return -ENOTSUPP;
409     + }
410     +
411     error = bdi_init(&hugetlbfs_backing_dev_info);
412     if (error)
413     return error;
414     diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
415     index 8f788193e3d4..c4b2646b6d7c 100644
416     --- a/fs/proc/task_mmu.c
417     +++ b/fs/proc/task_mmu.c
418     @@ -1,4 +1,5 @@
419     #include <linux/mm.h>
420     +#include <linux/vmacache.h>
421     #include <linux/hugetlb.h>
422     #include <linux/huge_mm.h>
423     #include <linux/mount.h>
424     @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
425    
426     /*
427     * We remember last_addr rather than next_addr to hit with
428     - * mmap_cache most of the time. We have zero last_addr at
429     + * vmacache most of the time. We have zero last_addr at
430     * the beginning and also after lseek. We will have -1 last_addr
431     * after the end of the vmas.
432     */
433     diff --git a/fs/udf/inode.c b/fs/udf/inode.c
434     index 982ce05c87ed..287cd5f23421 100644
435     --- a/fs/udf/inode.c
436     +++ b/fs/udf/inode.c
437     @@ -1271,13 +1271,22 @@ update_time:
438     return 0;
439     }
440    
441     +/*
442     + * Maximum length of linked list formed by ICB hierarchy. The chosen number is
443     + * arbitrary - just that we hopefully don't limit any real use of rewritten
444     + * inode on write-once media but avoid looping for too long on corrupted media.
445     + */
446     +#define UDF_MAX_ICB_NESTING 1024
447     +
448     static void __udf_read_inode(struct inode *inode)
449     {
450     struct buffer_head *bh = NULL;
451     struct fileEntry *fe;
452     uint16_t ident;
453     struct udf_inode_info *iinfo = UDF_I(inode);
454     + unsigned int indirections = 0;
455    
456     +reread:
457     /*
458     * Set defaults, but the inode is still incomplete!
459     * Note: get_new_inode() sets the following on a new inode:
460     @@ -1314,28 +1323,26 @@ static void __udf_read_inode(struct inode *inode)
461     ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
462     &ident);
463     if (ident == TAG_IDENT_IE && ibh) {
464     - struct buffer_head *nbh = NULL;
465     struct kernel_lb_addr loc;
466     struct indirectEntry *ie;
467    
468     ie = (struct indirectEntry *)ibh->b_data;
469     loc = lelb_to_cpu(ie->indirectICB.extLocation);
470    
471     - if (ie->indirectICB.extLength &&
472     - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
473     - &ident))) {
474     - if (ident == TAG_IDENT_FE ||
475     - ident == TAG_IDENT_EFE) {
476     - memcpy(&iinfo->i_location,
477     - &loc,
478     - sizeof(struct kernel_lb_addr));
479     - brelse(bh);
480     - brelse(ibh);
481     - brelse(nbh);
482     - __udf_read_inode(inode);
483     + if (ie->indirectICB.extLength) {
484     + brelse(bh);
485     + brelse(ibh);
486     + memcpy(&iinfo->i_location, &loc,
487     + sizeof(struct kernel_lb_addr));
488     + if (++indirections > UDF_MAX_ICB_NESTING) {
489     + udf_err(inode->i_sb,
490     + "too many ICBs in ICB hierarchy"
491     + " (max %d supported)\n",
492     + UDF_MAX_ICB_NESTING);
493     + make_bad_inode(inode);
494     return;
495     }
496     - brelse(nbh);
497     + goto reread;
498     }
499     }
500     brelse(ibh);
501     diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
502     index 3fe661fe96d1..b19d3dc2e651 100644
503     --- a/include/linux/cpuset.h
504     +++ b/include/linux/cpuset.h
505     @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void);
506     extern void cpuset_print_task_mems_allowed(struct task_struct *p);
507    
508     /*
509     - * get_mems_allowed is required when making decisions involving mems_allowed
510     - * such as during page allocation. mems_allowed can be updated in parallel
511     - * and depending on the new value an operation can fail potentially causing
512     - * process failure. A retry loop with get_mems_allowed and put_mems_allowed
513     - * prevents these artificial failures.
514     + * read_mems_allowed_begin is required when making decisions involving
515     + * mems_allowed such as during page allocation. mems_allowed can be updated in
516     + * parallel and depending on the new value an operation can fail potentially
517     + * causing process failure. A retry loop with read_mems_allowed_begin and
518     + * read_mems_allowed_retry prevents these artificial failures.
519     */
520     -static inline unsigned int get_mems_allowed(void)
521     +static inline unsigned int read_mems_allowed_begin(void)
522     {
523     return read_seqcount_begin(&current->mems_allowed_seq);
524     }
525    
526     /*
527     - * If this returns false, the operation that took place after get_mems_allowed
528     - * may have failed. It is up to the caller to retry the operation if
529     + * If this returns true, the operation that took place after
530     + * read_mems_allowed_begin may have failed artificially due to a concurrent
531     + * update of mems_allowed. It is up to the caller to retry the operation if
532     * appropriate.
533     */
534     -static inline bool put_mems_allowed(unsigned int seq)
535     +static inline bool read_mems_allowed_retry(unsigned int seq)
536     {
537     - return !read_seqcount_retry(&current->mems_allowed_seq, seq);
538     + return read_seqcount_retry(&current->mems_allowed_seq, seq);
539     }
540    
541     static inline void set_mems_allowed(nodemask_t nodemask)
542     @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
543     {
544     }
545    
546     -static inline unsigned int get_mems_allowed(void)
547     +static inline unsigned int read_mems_allowed_begin(void)
548     {
549     return 0;
550     }
551    
552     -static inline bool put_mems_allowed(unsigned int seq)
553     +static inline bool read_mems_allowed_retry(unsigned int seq)
554     {
555     - return true;
556     + return false;
557     }
558    
559     #endif /* !CONFIG_CPUSETS */
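The renamed helpers are meant to be used in a begin/retry loop; the sketch below simply restates the pattern that the mm/filemap.c hunk later in this patch adopts, with the gfp mask passed in by a hypothetical caller.

static struct page *spread_page_alloc(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		page = alloc_pages_exact_node(cpuset_mem_spread_node(), gfp, 0);
		/* Retry only if the allocation failed *and* mems_allowed
		 * changed underneath us, i.e. the failure may be artificial. */
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

	return page;
}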
560     diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
561     index bd1e9bcec547..42b05c4c53e5 100644
562     --- a/include/linux/hugetlb.h
563     +++ b/include/linux/hugetlb.h
564     @@ -400,6 +400,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
565     return &mm->page_table_lock;
566     }
567    
568     +static inline bool hugepages_supported(void)
569     +{
570     + /*
571     + * Some platform decide whether they support huge pages at boot
572     + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
573     + * there is no such support
574     + */
575     + return HPAGE_SHIFT != 0;
576     +}
577     +
578     #else /* CONFIG_HUGETLB_PAGE */
579     struct hstate {};
580     #define alloc_huge_page_node(h, nid) NULL
581     diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
582     index 1f44466c1e9d..c367cbdf73ab 100644
583     --- a/include/linux/jiffies.h
584     +++ b/include/linux/jiffies.h
585     @@ -258,23 +258,11 @@ extern unsigned long preset_lpj;
586     #define SEC_JIFFIE_SC (32 - SHIFT_HZ)
587     #endif
588     #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
589     -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19)
590     #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
591     TICK_NSEC -1) / (u64)TICK_NSEC))
592    
593     #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
594     TICK_NSEC -1) / (u64)TICK_NSEC))
595     -#define USEC_CONVERSION \
596     - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\
597     - TICK_NSEC -1) / (u64)TICK_NSEC))
598     -/*
599     - * USEC_ROUND is used in the timeval to jiffie conversion. See there
600     - * for more details. It is the scaled resolution rounding value. Note
601     - * that it is a 64-bit value. Since, when it is applied, we are already
602     - * in jiffies (albit scaled), it is nothing but the bits we will shift
603     - * off.
604     - */
605     -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1)
606     /*
607     * The maximum jiffie value is (MAX_INT >> 1). Here we translate that
608     * into seconds. The 64-bit case will overflow if we are not careful,
609     diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
610     index 290901a8c1de..2b58d192ea24 100644
611     --- a/include/linux/mm_types.h
612     +++ b/include/linux/mm_types.h
613     @@ -342,9 +342,9 @@ struct mm_rss_stat {
614    
615     struct kioctx_table;
616     struct mm_struct {
617     - struct vm_area_struct * mmap; /* list of VMAs */
618     + struct vm_area_struct *mmap; /* list of VMAs */
619     struct rb_root mm_rb;
620     - struct vm_area_struct * mmap_cache; /* last find_vma result */
621     + u32 vmacache_seqnum; /* per-thread vmacache */
622     #ifdef CONFIG_MMU
623     unsigned long (*get_unmapped_area) (struct file *filp,
624     unsigned long addr, unsigned long len,
625     diff --git a/include/linux/plist.h b/include/linux/plist.h
626     index aa0fb390bd29..8b6c970cff6c 100644
627     --- a/include/linux/plist.h
628     +++ b/include/linux/plist.h
629     @@ -98,6 +98,13 @@ struct plist_node {
630     }
631    
632     /**
633     + * PLIST_HEAD - declare and init plist_head
634     + * @head: name for struct plist_head variable
635     + */
636     +#define PLIST_HEAD(head) \
637     + struct plist_head head = PLIST_HEAD_INIT(head)
638     +
639     +/**
640     * PLIST_NODE_INIT - static struct plist_node initializer
641     * @node: struct plist_node variable name
642     * @__prio: initial node priority
643     @@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
644     extern void plist_add(struct plist_node *node, struct plist_head *head);
645     extern void plist_del(struct plist_node *node, struct plist_head *head);
646    
647     +extern void plist_requeue(struct plist_node *node, struct plist_head *head);
648     +
649     /**
650     * plist_for_each - iterate over the plist
651     * @pos: the type * to use as a loop counter
652     @@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
653     list_for_each_entry(pos, &(head)->node_list, node_list)
654    
655     /**
656     + * plist_for_each_continue - continue iteration over the plist
657     + * @pos: the type * to use as a loop cursor
658     + * @head: the head for your list
659     + *
660     + * Continue to iterate over plist, continuing after the current position.
661     + */
662     +#define plist_for_each_continue(pos, head) \
663     + list_for_each_entry_continue(pos, &(head)->node_list, node_list)
664     +
665     +/**
666     * plist_for_each_safe - iterate safely over a plist of given type
667     * @pos: the type * to use as a loop counter
668     * @n: another type * to use as temporary storage
669     @@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
670     list_for_each_entry(pos, &(head)->node_list, mem.node_list)
671    
672     /**
673     + * plist_for_each_entry_continue - continue iteration over list of given type
674     + * @pos: the type * to use as a loop cursor
675     + * @head: the head for your list
676     + * @m: the name of the list_struct within the struct
677     + *
678     + * Continue to iterate over list of given type, continuing after
679     + * the current position.
680     + */
681     +#define plist_for_each_entry_continue(pos, head, m) \
682     + list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
683     +
684     +/**
685     * plist_for_each_entry_safe - iterate safely over list of given type
686     * @pos: the type * to use as a loop counter
687     * @n: another type * to use as temporary storage
688     @@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
689     #endif
690    
691     /**
692     + * plist_next - get the next entry in list
693     + * @pos: the type * to cursor
694     + */
695     +#define plist_next(pos) \
696     + list_next_entry(pos, node_list)
697     +
698     +/**
699     + * plist_prev - get the prev entry in list
700     + * @pos: the type * to cursor
701     + */
702     +#define plist_prev(pos) \
703     + list_prev_entry(pos, node_list)
704     +
705     +/**
706     * plist_first - return the first node (and thus, highest priority)
707     * @head: the &struct plist_head pointer
708     *
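A minimal usage sketch of the helpers added above (the nodes here are stand-ins, not the swap_info_struct members that motivate this backport): plist_requeue() moves a node behind the other entries of its priority, which is what later enables round-robin use of equal-priority swap devices.

static PLIST_HEAD(demo_head);
static struct plist_node node_a, node_b;

static void plist_requeue_demo(void)
{
	plist_node_init(&node_a, 1);
	plist_node_init(&node_b, 1);
	plist_add(&node_a, &demo_head);
	plist_add(&node_b, &demo_head);		/* list order: a, b */
	plist_requeue(&node_a, &demo_head);	/* list order: b, a */
}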
709     diff --git a/include/linux/sched.h b/include/linux/sched.h
710     index ccd0c6f24f2c..d7ca410ace93 100644
711     --- a/include/linux/sched.h
712     +++ b/include/linux/sched.h
713     @@ -59,6 +59,10 @@ struct sched_param {
714    
715     #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
716    
717     +#define VMACACHE_BITS 2
718     +#define VMACACHE_SIZE (1U << VMACACHE_BITS)
719     +#define VMACACHE_MASK (VMACACHE_SIZE - 1)
720     +
721     /*
722     * Extended scheduling parameters data structure.
723     *
724     @@ -1228,6 +1232,9 @@ struct task_struct {
725     #ifdef CONFIG_COMPAT_BRK
726     unsigned brk_randomized:1;
727     #endif
728     + /* per-thread vma caching */
729     + u32 vmacache_seqnum;
730     + struct vm_area_struct *vmacache[VMACACHE_SIZE];
731     #if defined(SPLIT_RSS_COUNTING)
732     struct task_rss_stat rss_stat;
733     #endif
734     diff --git a/include/linux/swap.h b/include/linux/swap.h
735     index 46ba0c6c219f..789324976801 100644
736     --- a/include/linux/swap.h
737     +++ b/include/linux/swap.h
738     @@ -214,8 +214,9 @@ struct percpu_cluster {
739     struct swap_info_struct {
740     unsigned long flags; /* SWP_USED etc: see above */
741     signed short prio; /* swap priority of this type */
742     + struct plist_node list; /* entry in swap_active_head */
743     + struct plist_node avail_list; /* entry in swap_avail_head */
744     signed char type; /* strange name for an index */
745     - signed char next; /* next type on the swap list */
746     unsigned int max; /* extent of the swap_map */
747     unsigned char *swap_map; /* vmalloc'ed array of usage counts */
748     struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
749     @@ -255,11 +256,6 @@ struct swap_info_struct {
750     struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
751     };
752    
753     -struct swap_list_t {
754     - int head; /* head of priority-ordered swapfile list */
755     - int next; /* swapfile to be used next */
756     -};
757     -
758     /* linux/mm/page_alloc.c */
759     extern unsigned long totalram_pages;
760     extern unsigned long totalreserve_pages;
761     diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
762     index e282624e8c10..388293a91e8c 100644
763     --- a/include/linux/swapfile.h
764     +++ b/include/linux/swapfile.h
765     @@ -6,7 +6,7 @@
766     * want to expose them to the dozens of source files that include swap.h
767     */
768     extern spinlock_t swap_lock;
769     -extern struct swap_list_t swap_list;
770     +extern struct plist_head swap_active_head;
771     extern struct swap_info_struct *swap_info[];
772     extern int try_to_unuse(unsigned int, bool, unsigned long);
773    
774     diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
775     new file mode 100644
776     index 000000000000..c3fa0fd43949
777     --- /dev/null
778     +++ b/include/linux/vmacache.h
779     @@ -0,0 +1,38 @@
780     +#ifndef __LINUX_VMACACHE_H
781     +#define __LINUX_VMACACHE_H
782     +
783     +#include <linux/sched.h>
784     +#include <linux/mm.h>
785     +
786     +/*
787     + * Hash based on the page number. Provides a good hit rate for
788     + * workloads with good locality and those with random accesses as well.
789     + */
790     +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
791     +
792     +static inline void vmacache_flush(struct task_struct *tsk)
793     +{
794     + memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
795     +}
796     +
797     +extern void vmacache_flush_all(struct mm_struct *mm);
798     +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
799     +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
800     + unsigned long addr);
801     +
802     +#ifndef CONFIG_MMU
803     +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
804     + unsigned long start,
805     + unsigned long end);
806     +#endif
807     +
808     +static inline void vmacache_invalidate(struct mm_struct *mm)
809     +{
810     + mm->vmacache_seqnum++;
811     +
812     + /* deal with overflows */
813     + if (unlikely(mm->vmacache_seqnum == 0))
814     + vmacache_flush_all(mm);
815     +}
816     +
817     +#endif /* __LINUX_VMACACHE_H */
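mm/vmacache.c itself is not part of this hunk, so the lookup below is only a sketch of the idea behind vmacache_invalidate(): a cached entry is trusted only while the task's sequence number matches the mm's, so a single increment lazily invalidates every thread's cache without touching the other tasks.

static struct vm_area_struct *vmacache_find_sketch(struct mm_struct *mm,
						   unsigned long addr)
{
	struct vm_area_struct *vma;

	/* Any mapping change bumped mm->vmacache_seqnum, so a mismatch
	 * means this task's cache is stale and the caller must fall back
	 * to the rbtree walk (which then repopulates the cache). */
	if (current->vmacache_seqnum != mm->vmacache_seqnum)
		return NULL;

	vma = current->vmacache[VMACACHE_HASH(addr)];
	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
		return vma;
	return NULL;
}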
818     diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
819     index bef53ce555d2..b10682cb138c 100644
820     --- a/include/media/videobuf2-core.h
821     +++ b/include/media/videobuf2-core.h
822     @@ -329,6 +329,9 @@ struct v4l2_fh;
823     * @retry_start_streaming: start_streaming() was called, but there were not enough
824     * buffers queued. If set, then retry calling start_streaming when
825     * queuing a new buffer.
826     + * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for
827     + * buffers. Only set for capture queues if qbuf has not yet been
828     + * called since poll() needs to return POLLERR in that situation.
829     * @fileio: file io emulator internal data, used only if emulator is active
830     */
831     struct vb2_queue {
832     @@ -362,6 +365,7 @@ struct vb2_queue {
833    
834     unsigned int streaming:1;
835     unsigned int retry_start_streaming:1;
836     + unsigned int waiting_for_buffers:1;
837    
838     struct vb2_fileio_data *fileio;
839     };
840     diff --git a/init/Kconfig b/init/Kconfig
841     index 93c5ef0c5210..8b9521a2d2c1 100644
842     --- a/init/Kconfig
843     +++ b/init/Kconfig
844     @@ -1389,6 +1389,7 @@ config FUTEX
845    
846     config HAVE_FUTEX_CMPXCHG
847     bool
848     + depends on FUTEX
849     help
850     Architectures should select this if futex_atomic_cmpxchg_inatomic()
851     is implemented and always working. This removes a couple of runtime
852     diff --git a/kernel/cpuset.c b/kernel/cpuset.c
853     index 6b27e5c0cd86..15b3ea693225 100644
854     --- a/kernel/cpuset.c
855     +++ b/kernel/cpuset.c
856     @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
857     task_lock(tsk);
858     /*
859     * Determine if a loop is necessary if another thread is doing
860     - * get_mems_allowed(). If at least one node remains unchanged and
861     + * read_mems_allowed_begin(). If at least one node remains unchanged and
862     * tsk does not have a mempolicy, then an empty nodemask will not be
863     * possible when mems_allowed is larger than a word.
864     */
865     diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
866     index 334b3980ffc1..8865caec45fb 100644
867     --- a/kernel/debug/debug_core.c
868     +++ b/kernel/debug/debug_core.c
869     @@ -49,6 +49,7 @@
870     #include <linux/pid.h>
871     #include <linux/smp.h>
872     #include <linux/mm.h>
873     +#include <linux/vmacache.h>
874     #include <linux/rcupdate.h>
875    
876     #include <asm/cacheflush.h>
877     @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
878     if (!CACHE_FLUSH_IS_SAFE)
879     return;
880    
881     - if (current->mm && current->mm->mmap_cache) {
882     - flush_cache_range(current->mm->mmap_cache,
883     - addr, addr + BREAK_INSTR_SIZE);
884     + if (current->mm) {
885     + int i;
886     +
887     + for (i = 0; i < VMACACHE_SIZE; i++) {
888     + if (!current->vmacache[i])
889     + continue;
890     + flush_cache_range(current->vmacache[i],
891     + addr, addr + BREAK_INSTR_SIZE);
892     + }
893     }
894     +
895     /* Force flush instruction cache if it was outside the mm */
896     flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
897     }
898     diff --git a/kernel/events/core.c b/kernel/events/core.c
899     index 3a140ca37777..4ced342f1ba9 100644
900     --- a/kernel/events/core.c
901     +++ b/kernel/events/core.c
902     @@ -7836,8 +7836,10 @@ int perf_event_init_task(struct task_struct *child)
903    
904     for_each_task_context_nr(ctxn) {
905     ret = perf_event_init_context(child, ctxn);
906     - if (ret)
907     + if (ret) {
908     + perf_event_free_task(child);
909     return ret;
910     + }
911     }
912    
913     return 0;
914     diff --git a/kernel/fork.c b/kernel/fork.c
915     index c44bff8097f5..e2c685396295 100644
916     --- a/kernel/fork.c
917     +++ b/kernel/fork.c
918     @@ -28,6 +28,8 @@
919     #include <linux/mman.h>
920     #include <linux/mmu_notifier.h>
921     #include <linux/fs.h>
922     +#include <linux/mm.h>
923     +#include <linux/vmacache.h>
924     #include <linux/nsproxy.h>
925     #include <linux/capability.h>
926     #include <linux/cpu.h>
927     @@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
928    
929     mm->locked_vm = 0;
930     mm->mmap = NULL;
931     - mm->mmap_cache = NULL;
932     + mm->vmacache_seqnum = 0;
933     mm->map_count = 0;
934     cpumask_clear(mm_cpumask(mm));
935     mm->mm_rb = RB_ROOT;
936     @@ -876,6 +878,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
937     if (!oldmm)
938     return 0;
939    
940     + /* initialize the new vmacache entries */
941     + vmacache_flush(tsk);
942     +
943     if (clone_flags & CLONE_VM) {
944     atomic_inc(&oldmm->mm_users);
945     mm = oldmm;
946     @@ -1323,7 +1328,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
947     goto bad_fork_cleanup_policy;
948     retval = audit_alloc(p);
949     if (retval)
950     - goto bad_fork_cleanup_policy;
951     + goto bad_fork_cleanup_perf;
952     /* copy all the process information */
953     retval = copy_semundo(clone_flags, p);
954     if (retval)
955     @@ -1522,8 +1527,9 @@ bad_fork_cleanup_semundo:
956     exit_sem(p);
957     bad_fork_cleanup_audit:
958     audit_free(p);
959     -bad_fork_cleanup_policy:
960     +bad_fork_cleanup_perf:
961     perf_event_free_task(p);
962     +bad_fork_cleanup_policy:
963     #ifdef CONFIG_NUMA
964     mpol_put(p->mempolicy);
965     bad_fork_cleanup_cgroup:
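The label shuffle above follows the usual kernel error-unwind discipline: unwind labels mirror the setup order in reverse, so a failure after perf_event_init_context() setup must pass through a label that calls perf_event_free_task(). A generic sketch of the pattern, using hypothetical setup/teardown helpers:

static int setup_everything(void)
{
	int err;

	err = setup_a();		/* hypothetical helpers */
	if (err)
		return err;
	err = setup_b();
	if (err)
		goto undo_a;		/* only undo what already succeeded */
	err = setup_c();
	if (err)
		goto undo_b;
	return 0;

undo_b:
	teardown_b();
undo_a:
	teardown_a();
	return err;
}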
966     diff --git a/kernel/time.c b/kernel/time.c
967     index 7c7964c33ae7..3c49ab45f822 100644
968     --- a/kernel/time.c
969     +++ b/kernel/time.c
970     @@ -496,17 +496,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
971     * that a remainder subtract here would not do the right thing as the
972     * resolution values don't fall on second boundries. I.e. the line:
973     * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
974     + * Note that due to the small error in the multiplier here, this
975     + * rounding is incorrect for sufficiently large values of tv_nsec, but
976     + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
977     + * OK.
978     *
979     * Rather, we just shift the bits off the right.
980     *
981     * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
982     * value to a scaled second value.
983     */
984     -unsigned long
985     -timespec_to_jiffies(const struct timespec *value)
986     +static unsigned long
987     +__timespec_to_jiffies(unsigned long sec, long nsec)
988     {
989     - unsigned long sec = value->tv_sec;
990     - long nsec = value->tv_nsec + TICK_NSEC - 1;
991     + nsec = nsec + TICK_NSEC - 1;
992    
993     if (sec >= MAX_SEC_IN_JIFFIES){
994     sec = MAX_SEC_IN_JIFFIES;
995     @@ -517,6 +520,13 @@ timespec_to_jiffies(const struct timespec *value)
996     (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
997    
998     }
999     +
1000     +unsigned long
1001     +timespec_to_jiffies(const struct timespec *value)
1002     +{
1003     + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
1004     +}
1005     +
1006     EXPORT_SYMBOL(timespec_to_jiffies);
1007    
1008     void
1009     @@ -533,31 +543,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
1010     }
1011     EXPORT_SYMBOL(jiffies_to_timespec);
1012    
1013     -/* Same for "timeval"
1014     +/*
1015     + * We could use a similar algorithm to timespec_to_jiffies (with a
1016     + * different multiplier for usec instead of nsec). But this has a
1017     + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
1018     + * usec value, since it's not necessarily integral.
1019     *
1020     - * Well, almost. The problem here is that the real system resolution is
1021     - * in nanoseconds and the value being converted is in micro seconds.
1022     - * Also for some machines (those that use HZ = 1024, in-particular),
1023     - * there is a LARGE error in the tick size in microseconds.
1024     -
1025     - * The solution we use is to do the rounding AFTER we convert the
1026     - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
1027     - * Instruction wise, this should cost only an additional add with carry
1028     - * instruction above the way it was done above.
1029     + * We could instead round in the intermediate scaled representation
1030     + * (i.e. in units of 1/2^(large scale) jiffies) but that's also
1031     + * perilous: the scaling introduces a small positive error, which
1032     + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
1033     + * units to the intermediate before shifting) leads to accidental
1034     + * overflow and overestimates.
1035     + *
1036     + * At the cost of one additional multiplication by a constant, just
1037     + * use the timespec implementation.
1038     */
1039     unsigned long
1040     timeval_to_jiffies(const struct timeval *value)
1041     {
1042     - unsigned long sec = value->tv_sec;
1043     - long usec = value->tv_usec;
1044     -
1045     - if (sec >= MAX_SEC_IN_JIFFIES){
1046     - sec = MAX_SEC_IN_JIFFIES;
1047     - usec = 0;
1048     - }
1049     - return (((u64)sec * SEC_CONVERSION) +
1050     - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
1051     - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
1052     + return __timespec_to_jiffies(value->tv_sec,
1053     + value->tv_usec * NSEC_PER_USEC);
1054     }
1055     EXPORT_SYMBOL(timeval_to_jiffies);
1056    
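A small illustration of the behaviour the rewrite preserves (the HZ value and numbers are assumptions for the example, not taken from the patch): with CONFIG_HZ=100, TICK_NSEC is 10,000,000 ns, so a 25 ms interval rounds up to 3 jiffies on both paths, and timeval_to_jiffies() can no longer overestimate via the removed USEC_CONVERSION/USEC_ROUND math.

static void jiffies_conversion_demo(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 25 * NSEC_PER_MSEC };
	struct timeval  tv = { .tv_sec = 0, .tv_usec = 25 * USEC_PER_MSEC };

	/* Both print 3 with HZ=100: the request is rounded up, never down,
	 * so callers always wait at least as long as they asked for. */
	pr_info("ts=%lu jiffies, tv=%lu jiffies\n",
		timespec_to_jiffies(&ts), timeval_to_jiffies(&tv));
}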
1057     diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
1058     index 773aba836e81..774a0807fe81 100644
1059     --- a/kernel/trace/ring_buffer.c
1060     +++ b/kernel/trace/ring_buffer.c
1061     @@ -3372,7 +3372,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
1062     iter->head = cpu_buffer->reader_page->read;
1063    
1064     iter->cache_reader_page = iter->head_page;
1065     - iter->cache_read = iter->head;
1066     + iter->cache_read = cpu_buffer->read;
1067    
1068     if (iter->head)
1069     iter->read_stamp = cpu_buffer->read_stamp;
1070     diff --git a/lib/plist.c b/lib/plist.c
1071     index 1ebc95f7a46f..0f2084d30798 100644
1072     --- a/lib/plist.c
1073     +++ b/lib/plist.c
1074     @@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
1075     plist_check_head(head);
1076     }
1077    
1078     +/**
1079     + * plist_requeue - Requeue @node at end of same-prio entries.
1080     + *
1081     + * This is essentially an optimized plist_del() followed by
1082     + * plist_add(). It moves an entry already in the plist to
1083     + * after any other same-priority entries.
1084     + *
1085     + * @node: &struct plist_node pointer - entry to be moved
1086     + * @head: &struct plist_head pointer - list head
1087     + */
1088     +void plist_requeue(struct plist_node *node, struct plist_head *head)
1089     +{
1090     + struct plist_node *iter;
1091     + struct list_head *node_next = &head->node_list;
1092     +
1093     + plist_check_head(head);
1094     + BUG_ON(plist_head_empty(head));
1095     + BUG_ON(plist_node_empty(node));
1096     +
1097     + if (node == plist_last(head))
1098     + return;
1099     +
1100     + iter = plist_next(node);
1101     +
1102     + if (node->prio != iter->prio)
1103     + return;
1104     +
1105     + plist_del(node, head);
1106     +
1107     + plist_for_each_continue(iter, head) {
1108     + if (node->prio != iter->prio) {
1109     + node_next = &iter->node_list;
1110     + break;
1111     + }
1112     + }
1113     + list_add_tail(&node->node_list, node_next);
1114     +
1115     + plist_check_head(head);
1116     +}
1117     +
1118     #ifdef CONFIG_DEBUG_PI_LIST
1119     #include <linux/sched.h>
1120     #include <linux/module.h>
1121     @@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
1122     BUG_ON(prio_pos->prio_list.next != &first->prio_list);
1123     }
1124    
1125     +static void __init plist_test_requeue(struct plist_node *node)
1126     +{
1127     + plist_requeue(node, &test_head);
1128     +
1129     + if (node != plist_last(&test_head))
1130     + BUG_ON(node->prio == plist_next(node)->prio);
1131     +}
1132     +
1133     static int __init plist_test(void)
1134     {
1135     int nr_expect = 0, i, loop;
1136     @@ -193,6 +241,10 @@ static int __init plist_test(void)
1137     nr_expect--;
1138     }
1139     plist_test_check(nr_expect);
1140     + if (!plist_node_empty(test_node + i)) {
1141     + plist_test_requeue(test_node + i);
1142     + plist_test_check(nr_expect);
1143     + }
1144     }
1145    
1146     for (i = 0; i < ARRAY_SIZE(test_node); i++) {
1147     diff --git a/mm/Makefile b/mm/Makefile
1148     index 310c90a09264..c561f1f6bca0 100644
1149     --- a/mm/Makefile
1150     +++ b/mm/Makefile
1151     @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
1152     readahead.o swap.o truncate.o vmscan.o shmem.o \
1153     util.o mmzone.o vmstat.o backing-dev.o \
1154     mm_init.o mmu_context.o percpu.o slab_common.o \
1155     - compaction.o balloon_compaction.o \
1156     + compaction.o balloon_compaction.o vmacache.o \
1157     interval_tree.o list_lru.o $(mmu-y)
1158    
1159     obj-y += init-mm.o
1160     diff --git a/mm/compaction.c b/mm/compaction.c
1161     index 5f702ef0a65f..5e38e5706f62 100644
1162     --- a/mm/compaction.c
1163     +++ b/mm/compaction.c
1164     @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
1165     /* Returns true if the page is within a block suitable for migration to */
1166     static bool suitable_migration_target(struct page *page)
1167     {
1168     - int migratetype = get_pageblock_migratetype(page);
1169     -
1170     - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
1171     - if (migratetype == MIGRATE_RESERVE)
1172     - return false;
1173     -
1174     - if (is_migrate_isolate(migratetype))
1175     - return false;
1176     -
1177     - /* If the page is a large free page, then allow migration */
1178     + /* If the page is a large free page, then disallow migration */
1179     if (PageBuddy(page) && page_order(page) >= pageblock_order)
1180     - return true;
1181     + return false;
1182    
1183     /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
1184     - if (migrate_async_suitable(migratetype))
1185     + if (migrate_async_suitable(get_pageblock_migratetype(page)))
1186     return true;
1187    
1188     /* Otherwise skip the block */
1189     @@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
1190     struct page *cursor, *valid_page = NULL;
1191     unsigned long flags;
1192     bool locked = false;
1193     + bool checked_pageblock = false;
1194    
1195     cursor = pfn_to_page(blockpfn);
1196    
1197     @@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
1198     break;
1199    
1200     /* Recheck this is a suitable migration target under lock */
1201     - if (!strict && !suitable_migration_target(page))
1202     - break;
1203     + if (!strict && !checked_pageblock) {
1204     + /*
1205     + * We need to check suitability of pageblock only once
1206     + * and this isolate_freepages_block() is called with
1207     + * pageblock range, so just check once is sufficient.
1208     + */
1209     + checked_pageblock = true;
1210     + if (!suitable_migration_target(page))
1211     + break;
1212     + }
1213    
1214     /* Recheck this is a buddy page under lock */
1215     if (!PageBuddy(page))
1216     @@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1217     unsigned long last_pageblock_nr = 0, pageblock_nr;
1218     unsigned long nr_scanned = 0, nr_isolated = 0;
1219     struct list_head *migratelist = &cc->migratepages;
1220     - isolate_mode_t mode = 0;
1221     struct lruvec *lruvec;
1222     unsigned long flags;
1223     bool locked = false;
1224     struct page *page = NULL, *valid_page = NULL;
1225     bool skipped_async_unsuitable = false;
1226     + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
1227     + (unevictable ? ISOLATE_UNEVICTABLE : 0);
1228    
1229     /*
1230     * Ensure that there are not too many pages isolated from the LRU
1231     @@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1232     cond_resched();
1233     for (; low_pfn < end_pfn; low_pfn++) {
1234     /* give a chance to irqs before checking need_resched() */
1235     - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
1236     + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
1237     if (should_release_lock(&zone->lru_lock)) {
1238     spin_unlock_irqrestore(&zone->lru_lock, flags);
1239     locked = false;
1240     @@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1241    
1242     /* If isolation recently failed, do not retry */
1243     pageblock_nr = low_pfn >> pageblock_order;
1244     - if (!isolation_suitable(cc, page))
1245     - goto next_pageblock;
1246     + if (last_pageblock_nr != pageblock_nr) {
1247     + int mt;
1248     +
1249     + last_pageblock_nr = pageblock_nr;
1250     + if (!isolation_suitable(cc, page))
1251     + goto next_pageblock;
1252     +
1253     + /*
1254     + * For async migration, also only scan in MOVABLE
1255     + * blocks. Async migration is optimistic to see if
1256     + * the minimum amount of work satisfies the allocation
1257     + */
1258     + mt = get_pageblock_migratetype(page);
1259     + if (!cc->sync && !migrate_async_suitable(mt)) {
1260     + cc->finished_update_migrate = true;
1261     + skipped_async_unsuitable = true;
1262     + goto next_pageblock;
1263     + }
1264     + }
1265    
1266     /*
1267     * Skip if free. page_order cannot be used without zone->lock
1268     @@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1269     continue;
1270    
1271     /*
1272     - * For async migration, also only scan in MOVABLE blocks. Async
1273     - * migration is optimistic to see if the minimum amount of work
1274     - * satisfies the allocation
1275     - */
1276     - if (!cc->sync && last_pageblock_nr != pageblock_nr &&
1277     - !migrate_async_suitable(get_pageblock_migratetype(page))) {
1278     - cc->finished_update_migrate = true;
1279     - skipped_async_unsuitable = true;
1280     - goto next_pageblock;
1281     - }
1282     -
1283     - /*
1284     * Check may be lockless but that's ok as we recheck later.
1285     * It's possible to migrate LRU pages and balloon pages
1286     * Skip any other type of page
1287     @@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1288     if (unlikely(balloon_page_movable(page))) {
1289     if (locked && balloon_page_isolate(page)) {
1290     /* Successfully isolated */
1291     - cc->finished_update_migrate = true;
1292     - list_add(&page->lru, migratelist);
1293     - cc->nr_migratepages++;
1294     - nr_isolated++;
1295     - goto check_compact_cluster;
1296     + goto isolate_success;
1297     }
1298     }
1299     continue;
1300     @@ -584,6 +586,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1301     continue;
1302     }
1303    
1304     + /*
1305     + * Migration will fail if an anonymous page is pinned in memory,
1306     + * so avoid taking lru_lock and isolating it unnecessarily in an
1307     + * admittedly racy check.
1308     + */
1309     + if (!page_mapping(page) &&
1310     + page_count(page) > page_mapcount(page))
1311     + continue;
1312     +
1313     /* Check if it is ok to still hold the lock */
1314     locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
1315     locked, cc);
1316     @@ -598,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1317     continue;
1318     }
1319    
1320     - if (!cc->sync)
1321     - mode |= ISOLATE_ASYNC_MIGRATE;
1322     -
1323     - if (unevictable)
1324     - mode |= ISOLATE_UNEVICTABLE;
1325     -
1326     lruvec = mem_cgroup_page_lruvec(page, zone);
1327    
1328     /* Try isolate the page */
1329     @@ -613,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
1330     VM_BUG_ON_PAGE(PageTransCompound(page), page);
1331    
1332     /* Successfully isolated */
1333     - cc->finished_update_migrate = true;
1334     del_page_from_lru_list(page, lruvec, page_lru(page));
1335     +
1336     +isolate_success:
1337     + cc->finished_update_migrate = true;
1338     list_add(&page->lru, migratelist);
1339     cc->nr_migratepages++;
1340     nr_isolated++;
1341    
1342     -check_compact_cluster:
1343     /* Avoid isolating too much */
1344     if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
1345     ++low_pfn;
1346     @@ -630,7 +636,6 @@ check_compact_cluster:
1347    
1348     next_pageblock:
1349     low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
1350     - last_pageblock_nr = pageblock_nr;
1351     }
1352    
1353     acct_isolated(zone, locked, cc);
1354     @@ -1188,6 +1193,7 @@ static void compact_node(int nid)
1355     struct compact_control cc = {
1356     .order = -1,
1357     .sync = true,
1358     + .ignore_skip_hint = true,
1359     };
1360    
1361     __compact_pgdat(NODE_DATA(nid), &cc);
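The page_count()/page_mapcount() test added to the migrate scanner is a heuristic; the helper below just restates it in isolation (the name is made up for illustration): an anonymous page holding references beyond its mappings is probably pinned (get_user_pages, O_DIRECT and the like), so migration would fail anyway and isolating it only wastes lru_lock time.

static inline bool anon_page_probably_pinned(struct page *page)
{
	/* Racy by design, exactly like the inline check in the hunk above:
	 * a false negative just means we try (and likely fail) to migrate. */
	return !page_mapping(page) && page_count(page) > page_mapcount(page);
}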
1362     diff --git a/mm/filemap.c b/mm/filemap.c
1363     index 7a13f6ac5421..c2cc7c95eff1 100644
1364     --- a/mm/filemap.c
1365     +++ b/mm/filemap.c
1366     @@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
1367     {
1368     int ret = 0;
1369     /* Check for outstanding write errors */
1370     - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
1371     + if (test_bit(AS_ENOSPC, &mapping->flags) &&
1372     + test_and_clear_bit(AS_ENOSPC, &mapping->flags))
1373     ret = -ENOSPC;
1374     - if (test_and_clear_bit(AS_EIO, &mapping->flags))
1375     + if (test_bit(AS_EIO, &mapping->flags) &&
1376     + test_and_clear_bit(AS_EIO, &mapping->flags))
1377     ret = -EIO;
1378     return ret;
1379     }
1380     @@ -520,10 +522,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
1381     if (cpuset_do_page_mem_spread()) {
1382     unsigned int cpuset_mems_cookie;
1383     do {
1384     - cpuset_mems_cookie = get_mems_allowed();
1385     + cpuset_mems_cookie = read_mems_allowed_begin();
1386     n = cpuset_mem_spread_node();
1387     page = alloc_pages_exact_node(n, gfp, 0);
1388     - } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
1389     + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
1390    
1391     return page;
1392     }
1393     diff --git a/mm/frontswap.c b/mm/frontswap.c
1394     index 1b24bdcb3197..c30eec536f03 100644
1395     --- a/mm/frontswap.c
1396     +++ b/mm/frontswap.c
1397     @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
1398    
1399     static unsigned long __frontswap_curr_pages(void)
1400     {
1401     - int type;
1402     unsigned long totalpages = 0;
1403     struct swap_info_struct *si = NULL;
1404    
1405     assert_spin_locked(&swap_lock);
1406     - for (type = swap_list.head; type >= 0; type = si->next) {
1407     - si = swap_info[type];
1408     + plist_for_each_entry(si, &swap_active_head, list)
1409     totalpages += atomic_read(&si->frontswap_pages);
1410     - }
1411     return totalpages;
1412     }
1413    
1414     @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
1415     int si_frontswap_pages;
1416     unsigned long total_pages_to_unuse = total;
1417     unsigned long pages = 0, pages_to_unuse = 0;
1418     - int type;
1419    
1420     assert_spin_locked(&swap_lock);
1421     - for (type = swap_list.head; type >= 0; type = si->next) {
1422     - si = swap_info[type];
1423     + plist_for_each_entry(si, &swap_active_head, list) {
1424     si_frontswap_pages = atomic_read(&si->frontswap_pages);
1425     if (total_pages_to_unuse < si_frontswap_pages) {
1426     pages = pages_to_unuse = total_pages_to_unuse;
1427     @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
1428     }
1429     vm_unacct_memory(pages);
1430     *unused = pages_to_unuse;
1431     - *swapid = type;
1432     + *swapid = si->type;
1433     ret = 0;
1434     break;
1435     }
1436     @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
1437     /*
1438     * we don't want to hold swap_lock while doing a very
1439     * lengthy try_to_unuse, but swap_list may change
1440     - * so restart scan from swap_list.head each time
1441     + * so restart scan from swap_active_head each time
1442     */
1443     spin_lock(&swap_lock);
1444     ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
1445     diff --git a/mm/huge_memory.c b/mm/huge_memory.c
1446     index 1c42d0c36d0b..718bfa16a36f 100644
1447     --- a/mm/huge_memory.c
1448     +++ b/mm/huge_memory.c
1449     @@ -1819,21 +1819,24 @@ static int __split_huge_page_map(struct page *page,
1450     if (pmd) {
1451     pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1452     pmd_populate(mm, &_pmd, pgtable);
1453     + if (pmd_write(*pmd))
1454     + BUG_ON(page_mapcount(page) != 1);
1455    
1456     haddr = address;
1457     for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1458     pte_t *pte, entry;
1459     BUG_ON(PageCompound(page+i));
1460     + /*
1461     + * Note that pmd_numa is not transferred deliberately
1462     + * to avoid any possibility that pte_numa leaks to
1463     + * a PROT_NONE VMA by accident.
1464     + */
1465     entry = mk_pte(page + i, vma->vm_page_prot);
1466     entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1467     if (!pmd_write(*pmd))
1468     entry = pte_wrprotect(entry);
1469     - else
1470     - BUG_ON(page_mapcount(page) != 1);
1471     if (!pmd_young(*pmd))
1472     entry = pte_mkold(entry);
1473     - if (pmd_numa(*pmd))
1474     - entry = pte_mknuma(entry);
1475     pte = pte_offset_map(&_pmd, haddr);
1476     BUG_ON(!pte_none(*pte));
1477     set_pte_at(mm, haddr, pte, entry);
1478     diff --git a/mm/hugetlb.c b/mm/hugetlb.c
1479     index 923f38e62bcf..67d0c175efcf 100644
1480     --- a/mm/hugetlb.c
1481     +++ b/mm/hugetlb.c
1482     @@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
1483     goto err;
1484    
1485     retry_cpuset:
1486     - cpuset_mems_cookie = get_mems_allowed();
1487     + cpuset_mems_cookie = read_mems_allowed_begin();
1488     zonelist = huge_zonelist(vma, address,
1489     htlb_alloc_mask(h), &mpol, &nodemask);
1490    
1491     @@ -562,7 +562,7 @@ retry_cpuset:
1492     }
1493    
1494     mpol_cond_put(mpol);
1495     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1496     + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
1497     goto retry_cpuset;
1498     return page;
1499    
1500     @@ -2071,6 +2071,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1501     unsigned long tmp;
1502     int ret;
1503    
1504     + if (!hugepages_supported())
1505     + return -ENOTSUPP;
1506     +
1507     tmp = h->max_huge_pages;
1508    
1509     if (write && h->order >= MAX_ORDER)
1510     @@ -2124,6 +2127,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1511     unsigned long tmp;
1512     int ret;
1513    
1514     + if (!hugepages_supported())
1515     + return -ENOTSUPP;
1516     +
1517     tmp = h->nr_overcommit_huge_pages;
1518    
1519     if (write && h->order >= MAX_ORDER)
1520     @@ -2149,6 +2155,8 @@ out:
1521     void hugetlb_report_meminfo(struct seq_file *m)
1522     {
1523     struct hstate *h = &default_hstate;
1524     + if (!hugepages_supported())
1525     + return;
1526     seq_printf(m,
1527     "HugePages_Total: %5lu\n"
1528     "HugePages_Free: %5lu\n"
1529     @@ -2165,6 +2173,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
1530     int hugetlb_report_node_meminfo(int nid, char *buf)
1531     {
1532     struct hstate *h = &default_hstate;
1533     + if (!hugepages_supported())
1534     + return 0;
1535     return sprintf(buf,
1536     "Node %d HugePages_Total: %5u\n"
1537     "Node %d HugePages_Free: %5u\n"
1538     @@ -2179,6 +2189,9 @@ void hugetlb_show_meminfo(void)
1539     struct hstate *h;
1540     int nid;
1541    
1542     + if (!hugepages_supported())
1543     + return;
1544     +
1545     for_each_node_state(nid, N_MEMORY)
1546     for_each_hstate(h)
1547     pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
1548     diff --git a/mm/mempolicy.c b/mm/mempolicy.c
1549     index 15a8ea031526..796c7e6cf93b 100644
1550     --- a/mm/mempolicy.c
1551     +++ b/mm/mempolicy.c
1552     @@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp)
1553     * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1554     * @nodemask for filtering the zonelist.
1555     *
1556     - * Must be protected by get_mems_allowed()
1557     + * Must be protected by read_mems_allowed_begin()
1558     */
1559     struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1560     gfp_t gfp_flags, struct mempolicy **mpol,
1561     @@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1562    
1563     retry_cpuset:
1564     pol = get_vma_policy(current, vma, addr);
1565     - cpuset_mems_cookie = get_mems_allowed();
1566     + cpuset_mems_cookie = read_mems_allowed_begin();
1567    
1568     if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1569     unsigned nid;
1570     @@ -2069,7 +2069,7 @@ retry_cpuset:
1571     nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1572     mpol_cond_put(pol);
1573     page = alloc_page_interleave(gfp, order, nid);
1574     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1575     + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
1576     goto retry_cpuset;
1577    
1578     return page;
1579     @@ -2079,7 +2079,7 @@ retry_cpuset:
1580     policy_nodemask(gfp, pol));
1581     if (unlikely(mpol_needs_cond_ref(pol)))
1582     __mpol_put(pol);
1583     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1584     + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
1585     goto retry_cpuset;
1586     return page;
1587     }
1588     @@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1589     pol = &default_policy;
1590    
1591     retry_cpuset:
1592     - cpuset_mems_cookie = get_mems_allowed();
1593     + cpuset_mems_cookie = read_mems_allowed_begin();
1594    
1595     /*
1596     * No reference counting needed for current->mempolicy
1597     @@ -2126,7 +2126,7 @@ retry_cpuset:
1598     policy_zonelist(gfp, pol, numa_node_id()),
1599     policy_nodemask(gfp, pol));
1600    
1601     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1602     + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
1603     goto retry_cpuset;
1604    
1605     return page;
1606     diff --git a/mm/migrate.c b/mm/migrate.c
1607     index bed48809e5d0..13f47fbe3550 100644
1608     --- a/mm/migrate.c
1609     +++ b/mm/migrate.c
1610     @@ -148,8 +148,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
1611     pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
1612     if (pte_swp_soft_dirty(*ptep))
1613     pte = pte_mksoft_dirty(pte);
1614     +
1615     + /* Recheck VMA as permissions can change since migration started */
1616     if (is_write_migration_entry(entry))
1617     - pte = pte_mkwrite(pte);
1618     + pte = maybe_mkwrite(pte, vma);
1619     +
1620     #ifdef CONFIG_HUGETLB_PAGE
1621     if (PageHuge(new)) {
1622     pte = pte_mkhuge(pte);
1623     diff --git a/mm/mmap.c b/mm/mmap.c
1624     index 20ff0c33274c..dfe90657a6db 100644
1625     --- a/mm/mmap.c
1626     +++ b/mm/mmap.c
1627     @@ -10,6 +10,7 @@
1628     #include <linux/slab.h>
1629     #include <linux/backing-dev.h>
1630     #include <linux/mm.h>
1631     +#include <linux/vmacache.h>
1632     #include <linux/shm.h>
1633     #include <linux/mman.h>
1634     #include <linux/pagemap.h>
1635     @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
1636     prev->vm_next = next = vma->vm_next;
1637     if (next)
1638     next->vm_prev = prev;
1639     - if (mm->mmap_cache == vma)
1640     - mm->mmap_cache = prev;
1641     +
1642     + /* Kill the cache */
1643     + vmacache_invalidate(mm);
1644     }
1645    
1646     /*
1647     @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
1648     /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1649     struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1650     {
1651     - struct vm_area_struct *vma = NULL;
1652     + struct rb_node *rb_node;
1653     + struct vm_area_struct *vma;
1654    
1655     /* Check the cache first. */
1656     - /* (Cache hit rate is typically around 35%.) */
1657     - vma = ACCESS_ONCE(mm->mmap_cache);
1658     - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1659     - struct rb_node *rb_node;
1660     + vma = vmacache_find(mm, addr);
1661     + if (likely(vma))
1662     + return vma;
1663    
1664     - rb_node = mm->mm_rb.rb_node;
1665     - vma = NULL;
1666     + rb_node = mm->mm_rb.rb_node;
1667     + vma = NULL;
1668    
1669     - while (rb_node) {
1670     - struct vm_area_struct *vma_tmp;
1671     -
1672     - vma_tmp = rb_entry(rb_node,
1673     - struct vm_area_struct, vm_rb);
1674     -
1675     - if (vma_tmp->vm_end > addr) {
1676     - vma = vma_tmp;
1677     - if (vma_tmp->vm_start <= addr)
1678     - break;
1679     - rb_node = rb_node->rb_left;
1680     - } else
1681     - rb_node = rb_node->rb_right;
1682     - }
1683     - if (vma)
1684     - mm->mmap_cache = vma;
1685     + while (rb_node) {
1686     + struct vm_area_struct *tmp;
1687     +
1688     + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1689     +
1690     + if (tmp->vm_end > addr) {
1691     + vma = tmp;
1692     + if (tmp->vm_start <= addr)
1693     + break;
1694     + rb_node = rb_node->rb_left;
1695     + } else
1696     + rb_node = rb_node->rb_right;
1697     }
1698     +
1699     + if (vma)
1700     + vmacache_update(addr, vma);
1701     return vma;
1702     }
1703    
1704     @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1705     } else
1706     mm->highest_vm_end = prev ? prev->vm_end : 0;
1707     tail_vma->vm_next = NULL;
1708     - mm->mmap_cache = NULL; /* Kill the cache. */
1709     +
1710     + /* Kill the cache */
1711     + vmacache_invalidate(mm);
1712     }
1713    
1714     /*
1715     diff --git a/mm/nommu.c b/mm/nommu.c
1716     index 8740213b1647..3ee4f74fbfbe 100644
1717     --- a/mm/nommu.c
1718     +++ b/mm/nommu.c
1719     @@ -15,6 +15,7 @@
1720    
1721     #include <linux/export.h>
1722     #include <linux/mm.h>
1723     +#include <linux/vmacache.h>
1724     #include <linux/mman.h>
1725     #include <linux/swap.h>
1726     #include <linux/file.h>
1727     @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
1728     */
1729     static void delete_vma_from_mm(struct vm_area_struct *vma)
1730     {
1731     + int i;
1732     struct address_space *mapping;
1733     struct mm_struct *mm = vma->vm_mm;
1734     + struct task_struct *curr = current;
1735    
1736     kenter("%p", vma);
1737    
1738     protect_vma(vma, 0);
1739    
1740     mm->map_count--;
1741     - if (mm->mmap_cache == vma)
1742     - mm->mmap_cache = NULL;
1743     + for (i = 0; i < VMACACHE_SIZE; i++) {
1744     + /* if the vma is cached, invalidate the entire cache */
1745     + if (curr->vmacache[i] == vma) {
1746     + vmacache_invalidate(curr->mm);
1747     + break;
1748     + }
1749     + }
1750    
1751     /* remove the VMA from the mapping */
1752     if (vma->vm_file) {
1753     @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1754     struct vm_area_struct *vma;
1755    
1756     /* check the cache first */
1757     - vma = ACCESS_ONCE(mm->mmap_cache);
1758     - if (vma && vma->vm_start <= addr && vma->vm_end > addr)
1759     + vma = vmacache_find(mm, addr);
1760     + if (likely(vma))
1761     return vma;
1762    
1763     /* trawl the list (there may be multiple mappings in which addr
1764     @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1765     if (vma->vm_start > addr)
1766     return NULL;
1767     if (vma->vm_end > addr) {
1768     - mm->mmap_cache = vma;
1769     + vmacache_update(addr, vma);
1770     return vma;
1771     }
1772     }
1773     @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
1774     unsigned long end = addr + len;
1775    
1776     /* check the cache first */
1777     - vma = mm->mmap_cache;
1778     - if (vma && vma->vm_start == addr && vma->vm_end == end)
1779     + vma = vmacache_find_exact(mm, addr, end);
1780     + if (vma)
1781     return vma;
1782    
1783     /* trawl the list (there may be multiple mappings in which addr
1784     @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
1785     if (vma->vm_start > addr)
1786     return NULL;
1787     if (vma->vm_end == end) {
1788     - mm->mmap_cache = vma;
1789     + vmacache_update(addr, vma);
1790     return vma;
1791     }
1792     }
1793     diff --git a/mm/page_alloc.c b/mm/page_alloc.c
1794     index 62e400d00e3f..ff0f6b13f32f 100644
1795     --- a/mm/page_alloc.c
1796     +++ b/mm/page_alloc.c
1797     @@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
1798     {
1799     int i;
1800    
1801     - for_each_online_node(i)
1802     + for_each_node_state(i, N_MEMORY)
1803     if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1804     node_set(i, NODE_DATA(nid)->reclaim_nodes);
1805     else
1806     @@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1807     return NULL;
1808    
1809     retry_cpuset:
1810     - cpuset_mems_cookie = get_mems_allowed();
1811     + cpuset_mems_cookie = read_mems_allowed_begin();
1812    
1813     /* The preferred zone is used for statistics later */
1814     first_zones_zonelist(zonelist, high_zoneidx,
1815     @@ -2791,7 +2791,7 @@ out:
1816     * the mask is being updated. If a page allocation is about to fail,
1817     * check if the cpuset changed during allocation and if so, retry.
1818     */
1819     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1820     + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
1821     goto retry_cpuset;
1822    
1823     memcg_kmem_commit_charge(page, memcg, order);
1824     @@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
1825     goto out;
1826    
1827     do {
1828     - cpuset_mems_cookie = get_mems_allowed();
1829     + cpuset_mems_cookie = read_mems_allowed_begin();
1830     ret = !node_isset(nid, cpuset_current_mems_allowed);
1831     - } while (!put_mems_allowed(cpuset_mems_cookie));
1832     + } while (read_mems_allowed_retry(cpuset_mems_cookie));
1833     out:
1834     return ret;
1835     }
1836     @@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
1837    
1838     pgdat->node_id = nid;
1839     pgdat->node_start_pfn = node_start_pfn;
1840     - init_zone_allows_reclaim(nid);
1841     + if (node_state(nid, N_MEMORY))
1842     + init_zone_allows_reclaim(nid);
1843     #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1844     get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1845     #endif
1846     diff --git a/mm/readahead.c b/mm/readahead.c
1847     index 0de2360d65f3..1fa0d6fca556 100644
1848     --- a/mm/readahead.c
1849     +++ b/mm/readahead.c
1850     @@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
1851     return 0;
1852     }
1853    
1854     +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
1855     /*
1856     * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
1857     * sensible upper limit.
1858     */
1859     unsigned long max_sane_readahead(unsigned long nr)
1860     {
1861     - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
1862     - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
1863     + return min(nr, MAX_READAHEAD);
1864     }
1865    
1866     /*
1867     diff --git a/mm/slab.c b/mm/slab.c
1868     index ea854eb2388c..0b1c2a58559d 100644
1869     --- a/mm/slab.c
1870     +++ b/mm/slab.c
1871     @@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
1872     local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
1873    
1874     retry_cpuset:
1875     - cpuset_mems_cookie = get_mems_allowed();
1876     + cpuset_mems_cookie = read_mems_allowed_begin();
1877     zonelist = node_zonelist(slab_node(), flags);
1878    
1879     retry:
1880     @@ -3180,7 +3180,7 @@ retry:
1881     }
1882     }
1883    
1884     - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
1885     + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
1886     goto retry_cpuset;
1887     return obj;
1888     }
1889     diff --git a/mm/slub.c b/mm/slub.c
1890     index 25f14ad8f817..7611f148ee81 100644
1891     --- a/mm/slub.c
1892     +++ b/mm/slub.c
1893     @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1894     return NULL;
1895    
1896     do {
1897     - cpuset_mems_cookie = get_mems_allowed();
1898     + cpuset_mems_cookie = read_mems_allowed_begin();
1899     zonelist = node_zonelist(slab_node(), flags);
1900     for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1901     struct kmem_cache_node *n;
1902     @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1903     object = get_partial_node(s, n, c, flags);
1904     if (object) {
1905     /*
1906     - * Return the object even if
1907     - * put_mems_allowed indicated that
1908     - * the cpuset mems_allowed was
1909     - * updated in parallel. It's a
1910     - * harmless race between the alloc
1911     - * and the cpuset update.
1912     + * Don't check read_mems_allowed_retry()
1913     + * here - if mems_allowed was updated in
1914     + * parallel, that was a harmless race
1915     + * between allocation and the cpuset
1916     + * update
1917     */
1918     - put_mems_allowed(cpuset_mems_cookie);
1919     return object;
1920     }
1921     }
1922     }
1923     - } while (!put_mems_allowed(cpuset_mems_cookie));
1924     + } while (read_mems_allowed_retry(cpuset_mems_cookie));
1925     #endif
1926     return NULL;
1927     }
1928     diff --git a/mm/swapfile.c b/mm/swapfile.c
1929     index 4a7f7e6992b6..beeeef8a1b2d 100644
1930     --- a/mm/swapfile.c
1931     +++ b/mm/swapfile.c
1932     @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
1933     /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
1934     long total_swap_pages;
1935     static int least_priority;
1936     -static atomic_t highest_priority_index = ATOMIC_INIT(-1);
1937    
1938     static const char Bad_file[] = "Bad swap file entry ";
1939     static const char Unused_file[] = "Unused swap file entry ";
1940     static const char Bad_offset[] = "Bad swap offset entry ";
1941     static const char Unused_offset[] = "Unused swap offset entry ";
1942    
1943     -struct swap_list_t swap_list = {-1, -1};
1944     +/*
1945     + * all active swap_info_structs
1946     + * protected with swap_lock, and ordered by priority.
1947     + */
1948     +PLIST_HEAD(swap_active_head);
1949     +
1950     +/*
1951     + * all available (active, not full) swap_info_structs
1952     + * protected with swap_avail_lock, ordered by priority.
1953     + * This is used by get_swap_page() instead of swap_active_head
1954     + * because swap_active_head includes all swap_info_structs,
1955     + * but get_swap_page() doesn't need to look at full ones.
1956     + * This uses its own lock instead of swap_lock because when a
1957     + * swap_info_struct changes between not-full/full, it needs to
1958     + * add/remove itself to/from this list, but the swap_info_struct->lock
1959     + * is held and the locking order requires swap_lock to be taken
1960     + * before any swap_info_struct->lock.
1961     + */
1962     +static PLIST_HEAD(swap_avail_head);
1963     +static DEFINE_SPINLOCK(swap_avail_lock);
1964    
1965     struct swap_info_struct *swap_info[MAX_SWAPFILES];
1966    
1967     @@ -591,6 +609,9 @@ checks:
1968     if (si->inuse_pages == si->pages) {
1969     si->lowest_bit = si->max;
1970     si->highest_bit = 0;
1971     + spin_lock(&swap_avail_lock);
1972     + plist_del(&si->avail_list, &swap_avail_head);
1973     + spin_unlock(&swap_avail_lock);
1974     }
1975     si->swap_map[offset] = usage;
1976     inc_cluster_info_page(si, si->cluster_info, offset);
1977     @@ -640,71 +661,65 @@ no_page:
1978    
1979     swp_entry_t get_swap_page(void)
1980     {
1981     - struct swap_info_struct *si;
1982     + struct swap_info_struct *si, *next;
1983     pgoff_t offset;
1984     - int type, next;
1985     - int wrapped = 0;
1986     - int hp_index;
1987    
1988     - spin_lock(&swap_lock);
1989     if (atomic_long_read(&nr_swap_pages) <= 0)
1990     goto noswap;
1991     atomic_long_dec(&nr_swap_pages);
1992    
1993     - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
1994     - hp_index = atomic_xchg(&highest_priority_index, -1);
1995     - /*
1996     - * highest_priority_index records current highest priority swap
1997     - * type which just frees swap entries. If its priority is
1998     - * higher than that of swap_list.next swap type, we use it. It
1999     - * isn't protected by swap_lock, so it can be an invalid value
2000     - * if the corresponding swap type is swapoff. We double check
2001     - * the flags here. It's even possible the swap type is swapoff
2002     - * and swapon again and its priority is changed. In such rare
2003     - * case, low prority swap type might be used, but eventually
2004     - * high priority swap will be used after several rounds of
2005     - * swap.
2006     - */
2007     - if (hp_index != -1 && hp_index != type &&
2008     - swap_info[type]->prio < swap_info[hp_index]->prio &&
2009     - (swap_info[hp_index]->flags & SWP_WRITEOK)) {
2010     - type = hp_index;
2011     - swap_list.next = type;
2012     - }
2013     -
2014     - si = swap_info[type];
2015     - next = si->next;
2016     - if (next < 0 ||
2017     - (!wrapped && si->prio != swap_info[next]->prio)) {
2018     - next = swap_list.head;
2019     - wrapped++;
2020     - }
2021     + spin_lock(&swap_avail_lock);
2022    
2023     +start_over:
2024     + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
2025     + /* requeue si to after same-priority siblings */
2026     + plist_requeue(&si->avail_list, &swap_avail_head);
2027     + spin_unlock(&swap_avail_lock);
2028     spin_lock(&si->lock);
2029     - if (!si->highest_bit) {
2030     + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
2031     + spin_lock(&swap_avail_lock);
2032     + if (plist_node_empty(&si->avail_list)) {
2033     + spin_unlock(&si->lock);
2034     + goto nextsi;
2035     + }
2036     + WARN(!si->highest_bit,
2037     + "swap_info %d in list but !highest_bit\n",
2038     + si->type);
2039     + WARN(!(si->flags & SWP_WRITEOK),
2040     + "swap_info %d in list but !SWP_WRITEOK\n",
2041     + si->type);
2042     + plist_del(&si->avail_list, &swap_avail_head);
2043     spin_unlock(&si->lock);
2044     - continue;
2045     + goto nextsi;
2046     }
2047     - if (!(si->flags & SWP_WRITEOK)) {
2048     - spin_unlock(&si->lock);
2049     - continue;
2050     - }
2051     -
2052     - swap_list.next = next;
2053    
2054     - spin_unlock(&swap_lock);
2055     /* This is called for allocating swap entry for cache */
2056     offset = scan_swap_map(si, SWAP_HAS_CACHE);
2057     spin_unlock(&si->lock);
2058     if (offset)
2059     - return swp_entry(type, offset);
2060     - spin_lock(&swap_lock);
2061     - next = swap_list.next;
2062     + return swp_entry(si->type, offset);
2063     + pr_debug("scan_swap_map of si %d failed to find offset\n",
2064     + si->type);
2065     + spin_lock(&swap_avail_lock);
2066     +nextsi:
2067     + /*
2068     + * if we got here, it's likely that si was almost full before,
2069     + * and since scan_swap_map() can drop the si->lock, multiple
2070     + * callers probably all tried to get a page from the same si
2071     + * and it filled up before we could get one; or, the si filled
2072     + * up between us dropping swap_avail_lock and taking si->lock.
2073     + * Since we dropped the swap_avail_lock, the swap_avail_head
2074     + * list may have been modified; so if next is still in the
2075     + * swap_avail_head list then try it, otherwise start over.
2076     + */
2077     + if (plist_node_empty(&next->avail_list))
2078     + goto start_over;
2079     }
2080    
2081     + spin_unlock(&swap_avail_lock);
2082     +
2083     atomic_long_inc(&nr_swap_pages);
2084     noswap:
2085     - spin_unlock(&swap_lock);
2086     return (swp_entry_t) {0};
2087     }
2088    
2089     @@ -766,27 +781,6 @@ out:
2090     return NULL;
2091     }
2092    
2093     -/*
2094     - * This swap type frees swap entry, check if it is the highest priority swap
2095     - * type which just frees swap entry. get_swap_page() uses
2096     - * highest_priority_index to search highest priority swap type. The
2097     - * swap_info_struct.lock can't protect us if there are multiple swap types
2098     - * active, so we use atomic_cmpxchg.
2099     - */
2100     -static void set_highest_priority_index(int type)
2101     -{
2102     - int old_hp_index, new_hp_index;
2103     -
2104     - do {
2105     - old_hp_index = atomic_read(&highest_priority_index);
2106     - if (old_hp_index != -1 &&
2107     - swap_info[old_hp_index]->prio >= swap_info[type]->prio)
2108     - break;
2109     - new_hp_index = type;
2110     - } while (atomic_cmpxchg(&highest_priority_index,
2111     - old_hp_index, new_hp_index) != old_hp_index);
2112     -}
2113     -
2114     static unsigned char swap_entry_free(struct swap_info_struct *p,
2115     swp_entry_t entry, unsigned char usage)
2116     {
2117     @@ -828,9 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
2118     dec_cluster_info_page(p, p->cluster_info, offset);
2119     if (offset < p->lowest_bit)
2120     p->lowest_bit = offset;
2121     - if (offset > p->highest_bit)
2122     + if (offset > p->highest_bit) {
2123     + bool was_full = !p->highest_bit;
2124     p->highest_bit = offset;
2125     - set_highest_priority_index(p->type);
2126     + if (was_full && (p->flags & SWP_WRITEOK)) {
2127     + spin_lock(&swap_avail_lock);
2128     + WARN_ON(!plist_node_empty(&p->avail_list));
2129     + if (plist_node_empty(&p->avail_list))
2130     + plist_add(&p->avail_list,
2131     + &swap_avail_head);
2132     + spin_unlock(&swap_avail_lock);
2133     + }
2134     + }
2135     atomic_long_inc(&nr_swap_pages);
2136     p->inuse_pages--;
2137     frontswap_invalidate_page(p->type, offset);
2138     @@ -1765,30 +1768,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2139     unsigned char *swap_map,
2140     struct swap_cluster_info *cluster_info)
2141     {
2142     - int i, prev;
2143     -
2144     if (prio >= 0)
2145     p->prio = prio;
2146     else
2147     p->prio = --least_priority;
2148     + /*
2149     + * the plist prio is negated because plist ordering is
2150     + * low-to-high, while swap ordering is high-to-low
2151     + */
2152     + p->list.prio = -p->prio;
2153     + p->avail_list.prio = -p->prio;
2154     p->swap_map = swap_map;
2155     p->cluster_info = cluster_info;
2156     p->flags |= SWP_WRITEOK;
2157     atomic_long_add(p->pages, &nr_swap_pages);
2158     total_swap_pages += p->pages;
2159    
2160     - /* insert swap space into swap_list: */
2161     - prev = -1;
2162     - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2163     - if (p->prio >= swap_info[i]->prio)
2164     - break;
2165     - prev = i;
2166     - }
2167     - p->next = i;
2168     - if (prev < 0)
2169     - swap_list.head = swap_list.next = p->type;
2170     - else
2171     - swap_info[prev]->next = p->type;
2172     + assert_spin_locked(&swap_lock);
2173     + /*
2174     + * both lists are plists, and thus priority ordered.
2175     + * swap_active_head needs to be priority ordered for swapoff(),
2176     + * which on removal of any swap_info_struct with an auto-assigned
2177     + * (i.e. negative) priority increments the auto-assigned priority
2178     + * of any lower-priority swap_info_structs.
2179     + * swap_avail_head needs to be priority ordered for get_swap_page(),
2180     + * which allocates swap pages from the highest available priority
2181     + * swap_info_struct.
2182     + */
2183     + plist_add(&p->list, &swap_active_head);
2184     + spin_lock(&swap_avail_lock);
2185     + plist_add(&p->avail_list, &swap_avail_head);
2186     + spin_unlock(&swap_avail_lock);
2187     }
2188    
2189     static void enable_swap_info(struct swap_info_struct *p, int prio,
2190     @@ -1823,8 +1833,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2191     struct address_space *mapping;
2192     struct inode *inode;
2193     struct filename *pathname;
2194     - int i, type, prev;
2195     - int err;
2196     + int err, found = 0;
2197     unsigned int old_block_size;
2198    
2199     if (!capable(CAP_SYS_ADMIN))
2200     @@ -1842,17 +1851,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2201     goto out;
2202    
2203     mapping = victim->f_mapping;
2204     - prev = -1;
2205     spin_lock(&swap_lock);
2206     - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
2207     - p = swap_info[type];
2208     + plist_for_each_entry(p, &swap_active_head, list) {
2209     if (p->flags & SWP_WRITEOK) {
2210     - if (p->swap_file->f_mapping == mapping)
2211     + if (p->swap_file->f_mapping == mapping) {
2212     + found = 1;
2213     break;
2214     + }
2215     }
2216     - prev = type;
2217     }
2218     - if (type < 0) {
2219     + if (!found) {
2220     err = -EINVAL;
2221     spin_unlock(&swap_lock);
2222     goto out_dput;
2223     @@ -1864,20 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2224     spin_unlock(&swap_lock);
2225     goto out_dput;
2226     }
2227     - if (prev < 0)
2228     - swap_list.head = p->next;
2229     - else
2230     - swap_info[prev]->next = p->next;
2231     - if (type == swap_list.next) {
2232     - /* just pick something that's safe... */
2233     - swap_list.next = swap_list.head;
2234     - }
2235     + spin_lock(&swap_avail_lock);
2236     + plist_del(&p->avail_list, &swap_avail_head);
2237     + spin_unlock(&swap_avail_lock);
2238     spin_lock(&p->lock);
2239     if (p->prio < 0) {
2240     - for (i = p->next; i >= 0; i = swap_info[i]->next)
2241     - swap_info[i]->prio = p->prio--;
2242     + struct swap_info_struct *si = p;
2243     +
2244     + plist_for_each_entry_continue(si, &swap_active_head, list) {
2245     + si->prio++;
2246     + si->list.prio--;
2247     + si->avail_list.prio--;
2248     + }
2249     least_priority++;
2250     }
2251     + plist_del(&p->list, &swap_active_head);
2252     atomic_long_sub(p->pages, &nr_swap_pages);
2253     total_swap_pages -= p->pages;
2254     p->flags &= ~SWP_WRITEOK;
2255     @@ -1885,7 +1894,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2256     spin_unlock(&swap_lock);
2257    
2258     set_current_oom_origin();
2259     - err = try_to_unuse(type, false, 0); /* force all pages to be unused */
2260     + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
2261     clear_current_oom_origin();
2262    
2263     if (err) {
2264     @@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2265     frontswap_map = frontswap_map_get(p);
2266     spin_unlock(&p->lock);
2267     spin_unlock(&swap_lock);
2268     - frontswap_invalidate_area(type);
2269     + frontswap_invalidate_area(p->type);
2270     frontswap_map_set(p, NULL);
2271     mutex_unlock(&swapon_mutex);
2272     free_percpu(p->percpu_cluster);
2273     @@ -1935,7 +1944,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2274     vfree(cluster_info);
2275     vfree(frontswap_map);
2276     /* Destroy swap account information */
2277     - swap_cgroup_swapoff(type);
2278     + swap_cgroup_swapoff(p->type);
2279    
2280     inode = mapping->host;
2281     if (S_ISBLK(inode->i_mode)) {
2282     @@ -2142,8 +2151,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2283     */
2284     }
2285     INIT_LIST_HEAD(&p->first_swap_extent.list);
2286     + plist_node_init(&p->list, 0);
2287     + plist_node_init(&p->avail_list, 0);
2288     p->flags = SWP_USED;
2289     - p->next = -1;
2290     spin_unlock(&swap_lock);
2291     spin_lock_init(&p->lock);
2292    
2293     diff --git a/mm/vmacache.c b/mm/vmacache.c
2294     new file mode 100644
2295     index 000000000000..1037a3bab505
2296     --- /dev/null
2297     +++ b/mm/vmacache.c
2298     @@ -0,0 +1,114 @@
2299     +/*
2300     + * Copyright (C) 2014 Davidlohr Bueso.
2301     + */
2302     +#include <linux/sched.h>
2303     +#include <linux/mm.h>
2304     +#include <linux/vmacache.h>
2305     +
2306     +/*
2307     + * Flush vma caches for threads that share a given mm.
2308     + *
2309     + * The operation is safe because the caller holds the mmap_sem
2310     + * exclusively and other threads accessing the vma cache will
2311     + * have mmap_sem held at least for read, so no extra locking
2312     + * is required to maintain the vma cache.
2313     + */
2314     +void vmacache_flush_all(struct mm_struct *mm)
2315     +{
2316     + struct task_struct *g, *p;
2317     +
2318     + rcu_read_lock();
2319     + for_each_process_thread(g, p) {
2320     + /*
2321     + * Only flush the vmacache pointers as the
2322     + * mm seqnum is already set and curr's will
2323     + * be set upon invalidation when the next
2324     + * lookup is done.
2325     + */
2326     + if (mm == p->mm)
2327     + vmacache_flush(p);
2328     + }
2329     + rcu_read_unlock();
2330     +}
2331     +
2332     +/*
2333     + * This task may be accessing a foreign mm via (for example)
2334     + * get_user_pages()->find_vma(). The vmacache is task-local and this
2335     + * task's vmacache pertains to a different mm (ie, its own). There is
2336     + * nothing we can do here.
2337     + *
2338     + * Also handle the case where a kernel thread has adopted this mm via use_mm().
2339     + * That kernel thread's vmacache is not applicable to this mm.
2340     + */
2341     +static bool vmacache_valid_mm(struct mm_struct *mm)
2342     +{
2343     + return current->mm == mm && !(current->flags & PF_KTHREAD);
2344     +}
2345     +
2346     +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
2347     +{
2348     + if (vmacache_valid_mm(newvma->vm_mm))
2349     + current->vmacache[VMACACHE_HASH(addr)] = newvma;
2350     +}
2351     +
2352     +static bool vmacache_valid(struct mm_struct *mm)
2353     +{
2354     + struct task_struct *curr;
2355     +
2356     + if (!vmacache_valid_mm(mm))
2357     + return false;
2358     +
2359     + curr = current;
2360     + if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
2361     + /*
2362     + * First attempt will always be invalid, initialize
2363     + * the new cache for this task here.
2364     + */
2365     + curr->vmacache_seqnum = mm->vmacache_seqnum;
2366     + vmacache_flush(curr);
2367     + return false;
2368     + }
2369     + return true;
2370     +}
2371     +
2372     +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
2373     +{
2374     + int i;
2375     +
2376     + if (!vmacache_valid(mm))
2377     + return NULL;
2378     +
2379     + for (i = 0; i < VMACACHE_SIZE; i++) {
2380     + struct vm_area_struct *vma = current->vmacache[i];
2381     +
2382     + if (!vma)
2383     + continue;
2384     + if (WARN_ON_ONCE(vma->vm_mm != mm))
2385     + break;
2386     + if (vma->vm_start <= addr && vma->vm_end > addr)
2387     + return vma;
2388     + }
2389     +
2390     + return NULL;
2391     +}
2392     +
2393     +#ifndef CONFIG_MMU
2394     +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
2395     + unsigned long start,
2396     + unsigned long end)
2397     +{
2398     + int i;
2399     +
2400     + if (!vmacache_valid(mm))
2401     + return NULL;
2402     +
2403     + for (i = 0; i < VMACACHE_SIZE; i++) {
2404     + struct vm_area_struct *vma = current->vmacache[i];
2405     +
2406     + if (vma && vma->vm_start == start && vma->vm_end == end)
2407     + return vma;
2408     + }
2409     +
2410     + return NULL;
2411     +}
2412     +#endif
2413     diff --git a/mm/vmscan.c b/mm/vmscan.c
2414     index 6ef484f0777f..0c0b36e5b4f8 100644
2415     --- a/mm/vmscan.c
2416     +++ b/mm/vmscan.c
2417     @@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
2418     unsigned long freed = 0;
2419     unsigned long long delta;
2420     long total_scan;
2421     - long max_pass;
2422     + long freeable;
2423     long nr;
2424     long new_nr;
2425     int nid = shrinkctl->nid;
2426     long batch_size = shrinker->batch ? shrinker->batch
2427     : SHRINK_BATCH;
2428    
2429     - max_pass = shrinker->count_objects(shrinker, shrinkctl);
2430     - if (max_pass == 0)
2431     + freeable = shrinker->count_objects(shrinker, shrinkctl);
2432     + if (freeable == 0)
2433     return 0;
2434    
2435     /*
2436     @@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
2437    
2438     total_scan = nr;
2439     delta = (4 * nr_pages_scanned) / shrinker->seeks;
2440     - delta *= max_pass;
2441     + delta *= freeable;
2442     do_div(delta, lru_pages + 1);
2443     total_scan += delta;
2444     if (total_scan < 0) {
2445     printk(KERN_ERR
2446     "shrink_slab: %pF negative objects to delete nr=%ld\n",
2447     shrinker->scan_objects, total_scan);
2448     - total_scan = max_pass;
2449     + total_scan = freeable;
2450     }
2451    
2452     /*
2453     @@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
2454     * shrinkers to return -1 all the time. This results in a large
2455     * nr being built up so when a shrink that can do some work
2456     * comes along it empties the entire cache due to nr >>>
2457     - * max_pass. This is bad for sustaining a working set in
2458     + * freeable. This is bad for sustaining a working set in
2459     * memory.
2460     *
2461     * Hence only allow the shrinker to scan the entire cache when
2462     * a large delta change is calculated directly.
2463     */
2464     - if (delta < max_pass / 4)
2465     - total_scan = min(total_scan, max_pass / 2);
2466     + if (delta < freeable / 4)
2467     + total_scan = min(total_scan, freeable / 2);
2468    
2469     /*
2470     * Avoid risking looping forever due to too large nr value:
2471     * never try to free more than twice the estimate number of
2472     * freeable entries.
2473     */
2474     - if (total_scan > max_pass * 2)
2475     - total_scan = max_pass * 2;
2476     + if (total_scan > freeable * 2)
2477     + total_scan = freeable * 2;
2478    
2479     trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
2480     nr_pages_scanned, lru_pages,
2481     - max_pass, delta, total_scan);
2482     + freeable, delta, total_scan);
2483    
2484     /*
2485     * Normally, we should not scan less than batch_size objects in one
2486     @@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
2487     *
2488     * We detect the "tight on memory" situations by looking at the total
2489     * number of objects we want to scan (total_scan). If it is greater
2490     - * than the total number of objects on slab (max_pass), we must be
2491     + * than the total number of objects on slab (freeable), we must be
2492     * scanning at high prio and therefore should try to reclaim as much as
2493     * possible.
2494     */
2495     while (total_scan >= batch_size ||
2496     - total_scan >= max_pass) {
2497     + total_scan >= freeable) {
2498     unsigned long ret;
2499     unsigned long nr_to_scan = min(batch_size, total_scan);
2500    
2501     @@ -1144,7 +1144,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
2502     TTU_UNMAP|TTU_IGNORE_ACCESS,
2503     &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
2504     list_splice(&clean_pages, page_list);
2505     - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
2506     + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
2507     return ret;
2508     }
2509    
2510     @@ -2424,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2511     unsigned long lru_pages = 0;
2512    
2513     nodes_clear(shrink->nodes_to_scan);
2514     - for_each_zone_zonelist(zone, z, zonelist,
2515     - gfp_zone(sc->gfp_mask)) {
2516     + for_each_zone_zonelist_nodemask(zone, z, zonelist,
2517     + gfp_zone(sc->gfp_mask), sc->nodemask) {
2518     if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2519     continue;
2520