Annotation of /trunk/kernel-alx/patches-3.14/0120-3.14.21-all-fixes.patch
Parent Directory | Revision Log
Revision 2506 -
(hide annotations)
(download)
Fri Oct 17 07:55:45 2014 UTC (9 years, 11 months ago) by niro
File size: 82171 byte(s)
-patches for 3.14
1 | niro | 2506 | diff --git a/Makefile b/Makefile |
2 | index beb7e6f0803b..41e6e19fe2e9 100644 | ||
3 | --- a/Makefile | ||
4 | +++ b/Makefile | ||
5 | @@ -1,6 +1,6 @@ | ||
6 | VERSION = 3 | ||
7 | PATCHLEVEL = 14 | ||
8 | -SUBLEVEL = 20 | ||
9 | +SUBLEVEL = 21 | ||
10 | EXTRAVERSION = | ||
11 | NAME = Remembering Coco | ||
12 | |||
13 | diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h | ||
14 | index fb5e4c658f7a..ef470a7a3d0f 100644 | ||
15 | --- a/arch/unicore32/include/asm/mmu_context.h | ||
16 | +++ b/arch/unicore32/include/asm/mmu_context.h | ||
17 | @@ -14,6 +14,8 @@ | ||
18 | |||
19 | #include <linux/compiler.h> | ||
20 | #include <linux/sched.h> | ||
21 | +#include <linux/mm.h> | ||
22 | +#include <linux/vmacache.h> | ||
23 | #include <linux/io.h> | ||
24 | |||
25 | #include <asm/cacheflush.h> | ||
26 | @@ -73,7 +75,7 @@ do { \ | ||
27 | else \ | ||
28 | mm->mmap = NULL; \ | ||
29 | rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ | ||
30 | - mm->mmap_cache = NULL; \ | ||
31 | + vmacache_invalidate(mm); \ | ||
32 | mm->map_count--; \ | ||
33 | remove_vma(high_vma); \ | ||
34 | } \ | ||
35 | diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c | ||
36 | index c706d50a8b06..8c16c2f97026 100644 | ||
37 | --- a/drivers/block/drbd/drbd_nl.c | ||
38 | +++ b/drivers/block/drbd/drbd_nl.c | ||
39 | @@ -525,6 +525,12 @@ void conn_try_outdate_peer_async(struct drbd_tconn *tconn) | ||
40 | struct task_struct *opa; | ||
41 | |||
42 | kref_get(&tconn->kref); | ||
43 | + /* We may just have force_sig()'ed this thread | ||
44 | + * to get it out of some blocking network function. | ||
45 | + * Clear signals; otherwise kthread_run(), which internally uses | ||
46 | + * wait_on_completion_killable(), will mistake our pending signal | ||
47 | + * for a new fatal signal and fail. */ | ||
48 | + flush_signals(current); | ||
49 | opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); | ||
50 | if (IS_ERR(opa)) { | ||
51 | conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); | ||
52 | diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c | ||
53 | index 0e27844e8c2d..8089dd2cd9d8 100644 | ||
54 | --- a/drivers/cpufreq/integrator-cpufreq.c | ||
55 | +++ b/drivers/cpufreq/integrator-cpufreq.c | ||
56 | @@ -213,9 +213,9 @@ static int __init integrator_cpufreq_probe(struct platform_device *pdev) | ||
57 | return cpufreq_register_driver(&integrator_driver); | ||
58 | } | ||
59 | |||
60 | -static void __exit integrator_cpufreq_remove(struct platform_device *pdev) | ||
61 | +static int __exit integrator_cpufreq_remove(struct platform_device *pdev) | ||
62 | { | ||
63 | - cpufreq_unregister_driver(&integrator_driver); | ||
64 | + return cpufreq_unregister_driver(&integrator_driver); | ||
65 | } | ||
66 | |||
67 | static const struct of_device_id integrator_cpufreq_match[] = { | ||
68 | diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c | ||
69 | index d278be110805..1855cdca39cd 100644 | ||
70 | --- a/drivers/gpu/drm/i915/i915_gem_gtt.c | ||
71 | +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c | ||
72 | @@ -827,6 +827,16 @@ void i915_check_and_clear_faults(struct drm_device *dev) | ||
73 | POSTING_READ(RING_FAULT_REG(&dev_priv->ring[RCS])); | ||
74 | } | ||
75 | |||
76 | +static void i915_ggtt_flush(struct drm_i915_private *dev_priv) | ||
77 | +{ | ||
78 | + if (INTEL_INFO(dev_priv->dev)->gen < 6) { | ||
79 | + intel_gtt_chipset_flush(); | ||
80 | + } else { | ||
81 | + I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); | ||
82 | + POSTING_READ(GFX_FLSH_CNTL_GEN6); | ||
83 | + } | ||
84 | +} | ||
85 | + | ||
86 | void i915_gem_suspend_gtt_mappings(struct drm_device *dev) | ||
87 | { | ||
88 | struct drm_i915_private *dev_priv = dev->dev_private; | ||
89 | @@ -843,6 +853,8 @@ void i915_gem_suspend_gtt_mappings(struct drm_device *dev) | ||
90 | dev_priv->gtt.base.start / PAGE_SIZE, | ||
91 | dev_priv->gtt.base.total / PAGE_SIZE, | ||
92 | true); | ||
93 | + | ||
94 | + i915_ggtt_flush(dev_priv); | ||
95 | } | ||
96 | |||
97 | void i915_gem_restore_gtt_mappings(struct drm_device *dev) | ||
98 | @@ -863,7 +875,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev) | ||
99 | i915_gem_gtt_bind_object(obj, obj->cache_level); | ||
100 | } | ||
101 | |||
102 | - i915_gem_chipset_flush(dev); | ||
103 | + i915_ggtt_flush(dev_priv); | ||
104 | } | ||
105 | |||
106 | int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj) | ||
107 | diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c | ||
108 | index 18cda77b4f79..4913c0690872 100644 | ||
109 | --- a/drivers/md/raid5.c | ||
110 | +++ b/drivers/md/raid5.c | ||
111 | @@ -64,6 +64,10 @@ | ||
112 | #define cpu_to_group(cpu) cpu_to_node(cpu) | ||
113 | #define ANY_GROUP NUMA_NO_NODE | ||
114 | |||
115 | +static bool devices_handle_discard_safely = false; | ||
116 | +module_param(devices_handle_discard_safely, bool, 0644); | ||
117 | +MODULE_PARM_DESC(devices_handle_discard_safely, | ||
118 | + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); | ||
119 | static struct workqueue_struct *raid5_wq; | ||
120 | /* | ||
121 | * Stripe cache | ||
122 | @@ -6117,7 +6121,7 @@ static int run(struct mddev *mddev) | ||
123 | mddev->queue->limits.discard_granularity = stripe; | ||
124 | /* | ||
125 | * unaligned part of discard request will be ignored, so can't | ||
126 | - * guarantee discard_zerors_data | ||
127 | + * guarantee discard_zeroes_data | ||
128 | */ | ||
129 | mddev->queue->limits.discard_zeroes_data = 0; | ||
130 | |||
131 | @@ -6142,6 +6146,18 @@ static int run(struct mddev *mddev) | ||
132 | !bdev_get_queue(rdev->bdev)-> | ||
133 | limits.discard_zeroes_data) | ||
134 | discard_supported = false; | ||
135 | + /* Unfortunately, discard_zeroes_data is not currently | ||
136 | + * a guarantee - just a hint. So we only allow DISCARD | ||
137 | + * if the sysadmin has confirmed that only safe devices | ||
138 | + * are in use by setting a module parameter. | ||
139 | + */ | ||
140 | + if (!devices_handle_discard_safely) { | ||
141 | + if (discard_supported) { | ||
142 | + pr_info("md/raid456: discard support disabled due to uncertainty.\n"); | ||
143 | + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); | ||
144 | + } | ||
145 | + discard_supported = false; | ||
146 | + } | ||
147 | } | ||
148 | |||
149 | if (discard_supported && | ||
150 | diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c | ||
151 | index a127925c9d61..06faea4d60ee 100644 | ||
152 | --- a/drivers/media/v4l2-core/videobuf2-core.c | ||
153 | +++ b/drivers/media/v4l2-core/videobuf2-core.c | ||
154 | @@ -745,6 +745,7 @@ static int __reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) | ||
155 | * to the userspace. | ||
156 | */ | ||
157 | req->count = allocated_buffers; | ||
158 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | @@ -793,6 +794,7 @@ static int __create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create | ||
163 | memset(q->plane_sizes, 0, sizeof(q->plane_sizes)); | ||
164 | memset(q->alloc_ctx, 0, sizeof(q->alloc_ctx)); | ||
165 | q->memory = create->memory; | ||
166 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); | ||
167 | } | ||
168 | |||
169 | num_buffers = min(create->count, VIDEO_MAX_FRAME - q->num_buffers); | ||
170 | @@ -1447,6 +1449,7 @@ static int vb2_internal_qbuf(struct vb2_queue *q, struct v4l2_buffer *b) | ||
171 | * dequeued in dqbuf. | ||
172 | */ | ||
173 | list_add_tail(&vb->queued_entry, &q->queued_list); | ||
174 | + q->waiting_for_buffers = false; | ||
175 | vb->state = VB2_BUF_STATE_QUEUED; | ||
176 | |||
177 | /* | ||
178 | @@ -1841,6 +1844,7 @@ static int vb2_internal_streamoff(struct vb2_queue *q, enum v4l2_buf_type type) | ||
179 | * and videobuf, effectively returning control over them to userspace. | ||
180 | */ | ||
181 | __vb2_queue_cancel(q); | ||
182 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); | ||
183 | |||
184 | dprintk(3, "Streamoff successful\n"); | ||
185 | return 0; | ||
186 | @@ -2150,9 +2154,16 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | - * There is nothing to wait for if no buffers have already been queued. | ||
191 | + * There is nothing to wait for if the queue isn't streaming. | ||
192 | */ | ||
193 | - if (list_empty(&q->queued_list)) | ||
194 | + if (!vb2_is_streaming(q)) | ||
195 | + return res | POLLERR; | ||
196 | + /* | ||
197 | + * For compatibility with vb1: if QBUF hasn't been called yet, then | ||
198 | + * return POLLERR as well. This only affects capture queues, output | ||
199 | + * queues will always initialize waiting_for_buffers to false. | ||
200 | + */ | ||
201 | + if (q->waiting_for_buffers) | ||
202 | return res | POLLERR; | ||
203 | |||
204 | if (list_empty(&q->done_list)) | ||
205 | diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h | ||
206 | index f15d4353f30f..5d12d69e2045 100644 | ||
207 | --- a/fs/cifs/cifsglob.h | ||
208 | +++ b/fs/cifs/cifsglob.h | ||
209 | @@ -399,6 +399,8 @@ struct smb_version_operations { | ||
210 | const struct cifs_fid *, u32 *); | ||
211 | int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, | ||
212 | int); | ||
213 | + /* check if we need to issue closedir */ | ||
214 | + bool (*dir_needs_close)(struct cifsFileInfo *); | ||
215 | }; | ||
216 | |||
217 | struct smb_version_values { | ||
218 | diff --git a/fs/cifs/file.c b/fs/cifs/file.c | ||
219 | index 8175b18df819..d375322b6cec 100644 | ||
220 | --- a/fs/cifs/file.c | ||
221 | +++ b/fs/cifs/file.c | ||
222 | @@ -762,7 +762,7 @@ int cifs_closedir(struct inode *inode, struct file *file) | ||
223 | |||
224 | cifs_dbg(FYI, "Freeing private data in close dir\n"); | ||
225 | spin_lock(&cifs_file_list_lock); | ||
226 | - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { | ||
227 | + if (server->ops->dir_needs_close(cfile)) { | ||
228 | cfile->invalidHandle = true; | ||
229 | spin_unlock(&cifs_file_list_lock); | ||
230 | if (server->ops->close_dir) | ||
231 | diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c | ||
232 | index 2bbf11b09214..b334a89d6a66 100644 | ||
233 | --- a/fs/cifs/readdir.c | ||
234 | +++ b/fs/cifs/readdir.c | ||
235 | @@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, | ||
236 | /* close and restart search */ | ||
237 | cifs_dbg(FYI, "search backing up - close and restart search\n"); | ||
238 | spin_lock(&cifs_file_list_lock); | ||
239 | - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { | ||
240 | + if (server->ops->dir_needs_close(cfile)) { | ||
241 | cfile->invalidHandle = true; | ||
242 | spin_unlock(&cifs_file_list_lock); | ||
243 | if (server->ops->close_dir) | ||
244 | diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c | ||
245 | index d1fdfa848703..e9ad8d37bb00 100644 | ||
246 | --- a/fs/cifs/smb1ops.c | ||
247 | +++ b/fs/cifs/smb1ops.c | ||
248 | @@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, | ||
249 | tmprc = CIFS_open(xid, &oparms, &oplock, NULL); | ||
250 | if (tmprc == -EOPNOTSUPP) | ||
251 | *symlink = true; | ||
252 | - else | ||
253 | + else if (tmprc == 0) | ||
254 | CIFSSMBClose(xid, tcon, fid.netfid); | ||
255 | } | ||
256 | |||
257 | @@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock) | ||
258 | return oplock == OPLOCK_READ; | ||
259 | } | ||
260 | |||
261 | +static bool | ||
262 | +cifs_dir_needs_close(struct cifsFileInfo *cfile) | ||
263 | +{ | ||
264 | + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; | ||
265 | +} | ||
266 | + | ||
267 | struct smb_version_operations smb1_operations = { | ||
268 | .send_cancel = send_nt_cancel, | ||
269 | .compare_fids = cifs_compare_fids, | ||
270 | @@ -1078,6 +1084,7 @@ struct smb_version_operations smb1_operations = { | ||
271 | .query_mf_symlink = cifs_query_mf_symlink, | ||
272 | .create_mf_symlink = cifs_create_mf_symlink, | ||
273 | .is_read_op = cifs_is_read_op, | ||
274 | + .dir_needs_close = cifs_dir_needs_close, | ||
275 | #ifdef CONFIG_CIFS_XATTR | ||
276 | .query_all_EAs = CIFSSMBQAllEAs, | ||
277 | .set_EA = CIFSSMBSetEA, | ||
278 | diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c | ||
279 | index e31a9dfdcd39..a491814cb2c0 100644 | ||
280 | --- a/fs/cifs/smb2maperror.c | ||
281 | +++ b/fs/cifs/smb2maperror.c | ||
282 | @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { | ||
283 | {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, | ||
284 | {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, | ||
285 | {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, | ||
286 | - {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, | ||
287 | + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, | ||
288 | {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, | ||
289 | {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, | ||
290 | {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, | ||
291 | @@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { | ||
292 | {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, | ||
293 | "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, | ||
294 | {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, | ||
295 | + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP, | ||
296 | + "STATUS_REPARSE_NOT_HANDLED"}, | ||
297 | {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, | ||
298 | "STATUS_DEVICE_REQUIRES_CLEANING"}, | ||
299 | {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, | ||
300 | diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c | ||
301 | index f8977b2d9187..34a17d425be6 100644 | ||
302 | --- a/fs/cifs/smb2ops.c | ||
303 | +++ b/fs/cifs/smb2ops.c | ||
304 | @@ -1102,6 +1102,12 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch) | ||
305 | return le32_to_cpu(lc->lcontext.LeaseState); | ||
306 | } | ||
307 | |||
308 | +static bool | ||
309 | +smb2_dir_needs_close(struct cifsFileInfo *cfile) | ||
310 | +{ | ||
311 | + return !cfile->invalidHandle; | ||
312 | +} | ||
313 | + | ||
314 | struct smb_version_operations smb20_operations = { | ||
315 | .compare_fids = smb2_compare_fids, | ||
316 | .setup_request = smb2_setup_request, | ||
317 | @@ -1175,6 +1181,7 @@ struct smb_version_operations smb20_operations = { | ||
318 | .create_lease_buf = smb2_create_lease_buf, | ||
319 | .parse_lease_buf = smb2_parse_lease_buf, | ||
320 | .clone_range = smb2_clone_range, | ||
321 | + .dir_needs_close = smb2_dir_needs_close, | ||
322 | }; | ||
323 | |||
324 | struct smb_version_operations smb21_operations = { | ||
325 | @@ -1250,6 +1257,7 @@ struct smb_version_operations smb21_operations = { | ||
326 | .create_lease_buf = smb2_create_lease_buf, | ||
327 | .parse_lease_buf = smb2_parse_lease_buf, | ||
328 | .clone_range = smb2_clone_range, | ||
329 | + .dir_needs_close = smb2_dir_needs_close, | ||
330 | }; | ||
331 | |||
332 | struct smb_version_operations smb30_operations = { | ||
333 | @@ -1328,6 +1336,7 @@ struct smb_version_operations smb30_operations = { | ||
334 | .parse_lease_buf = smb3_parse_lease_buf, | ||
335 | .clone_range = smb2_clone_range, | ||
336 | .validate_negotiate = smb3_validate_negotiate, | ||
337 | + .dir_needs_close = smb2_dir_needs_close, | ||
338 | }; | ||
339 | |||
340 | struct smb_version_values smb20_values = { | ||
341 | diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c | ||
342 | index 9aab8fe0e508..348792911e1f 100644 | ||
343 | --- a/fs/cifs/smb2pdu.c | ||
344 | +++ b/fs/cifs/smb2pdu.c | ||
345 | @@ -2136,6 +2136,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, | ||
346 | rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; | ||
347 | |||
348 | if (rc) { | ||
349 | + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { | ||
350 | + srch_inf->endOfSearch = true; | ||
351 | + rc = 0; | ||
352 | + } | ||
353 | cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); | ||
354 | goto qdir_exit; | ||
355 | } | ||
356 | @@ -2173,11 +2177,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, | ||
357 | else | ||
358 | cifs_dbg(VFS, "illegal search buffer type\n"); | ||
359 | |||
360 | - if (rsp->hdr.Status == STATUS_NO_MORE_FILES) | ||
361 | - srch_inf->endOfSearch = 1; | ||
362 | - else | ||
363 | - srch_inf->endOfSearch = 0; | ||
364 | - | ||
365 | return rc; | ||
366 | |||
367 | qdir_exit: | ||
368 | diff --git a/fs/exec.c b/fs/exec.c | ||
369 | index 31e46b1b358b..ea4449d0536a 100644 | ||
370 | --- a/fs/exec.c | ||
371 | +++ b/fs/exec.c | ||
372 | @@ -26,6 +26,7 @@ | ||
373 | #include <linux/file.h> | ||
374 | #include <linux/fdtable.h> | ||
375 | #include <linux/mm.h> | ||
376 | +#include <linux/vmacache.h> | ||
377 | #include <linux/stat.h> | ||
378 | #include <linux/fcntl.h> | ||
379 | #include <linux/swap.h> | ||
380 | @@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code); | ||
381 | static int exec_mmap(struct mm_struct *mm) | ||
382 | { | ||
383 | struct task_struct *tsk; | ||
384 | - struct mm_struct * old_mm, *active_mm; | ||
385 | + struct mm_struct *old_mm, *active_mm; | ||
386 | |||
387 | /* Notify parent that we're no longer interested in the old VM */ | ||
388 | tsk = current; | ||
389 | @@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *mm) | ||
390 | tsk->mm = mm; | ||
391 | tsk->active_mm = mm; | ||
392 | activate_mm(active_mm, mm); | ||
393 | + tsk->mm->vmacache_seqnum = 0; | ||
394 | + vmacache_flush(tsk); | ||
395 | task_unlock(tsk); | ||
396 | if (old_mm) { | ||
397 | up_read(&old_mm->mmap_sem); | ||
398 | diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c | ||
399 | index d19b30ababf1..a4a8ed56e438 100644 | ||
400 | --- a/fs/hugetlbfs/inode.c | ||
401 | +++ b/fs/hugetlbfs/inode.c | ||
402 | @@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void) | ||
403 | int error; | ||
404 | int i; | ||
405 | |||
406 | + if (!hugepages_supported()) { | ||
407 | + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); | ||
408 | + return -ENOTSUPP; | ||
409 | + } | ||
410 | + | ||
411 | error = bdi_init(&hugetlbfs_backing_dev_info); | ||
412 | if (error) | ||
413 | return error; | ||
414 | diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c | ||
415 | index 8f788193e3d4..c4b2646b6d7c 100644 | ||
416 | --- a/fs/proc/task_mmu.c | ||
417 | +++ b/fs/proc/task_mmu.c | ||
418 | @@ -1,4 +1,5 @@ | ||
419 | #include <linux/mm.h> | ||
420 | +#include <linux/vmacache.h> | ||
421 | #include <linux/hugetlb.h> | ||
422 | #include <linux/huge_mm.h> | ||
423 | #include <linux/mount.h> | ||
424 | @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) | ||
425 | |||
426 | /* | ||
427 | * We remember last_addr rather than next_addr to hit with | ||
428 | - * mmap_cache most of the time. We have zero last_addr at | ||
429 | + * vmacache most of the time. We have zero last_addr at | ||
430 | * the beginning and also after lseek. We will have -1 last_addr | ||
431 | * after the end of the vmas. | ||
432 | */ | ||
433 | diff --git a/fs/udf/inode.c b/fs/udf/inode.c | ||
434 | index 982ce05c87ed..287cd5f23421 100644 | ||
435 | --- a/fs/udf/inode.c | ||
436 | +++ b/fs/udf/inode.c | ||
437 | @@ -1271,13 +1271,22 @@ update_time: | ||
438 | return 0; | ||
439 | } | ||
440 | |||
441 | +/* | ||
442 | + * Maximum length of linked list formed by ICB hierarchy. The chosen number is | ||
443 | + * arbitrary - just that we hopefully don't limit any real use of rewritten | ||
444 | + * inode on write-once media but avoid looping for too long on corrupted media. | ||
445 | + */ | ||
446 | +#define UDF_MAX_ICB_NESTING 1024 | ||
447 | + | ||
448 | static void __udf_read_inode(struct inode *inode) | ||
449 | { | ||
450 | struct buffer_head *bh = NULL; | ||
451 | struct fileEntry *fe; | ||
452 | uint16_t ident; | ||
453 | struct udf_inode_info *iinfo = UDF_I(inode); | ||
454 | + unsigned int indirections = 0; | ||
455 | |||
456 | +reread: | ||
457 | /* | ||
458 | * Set defaults, but the inode is still incomplete! | ||
459 | * Note: get_new_inode() sets the following on a new inode: | ||
460 | @@ -1314,28 +1323,26 @@ static void __udf_read_inode(struct inode *inode) | ||
461 | ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, | ||
462 | &ident); | ||
463 | if (ident == TAG_IDENT_IE && ibh) { | ||
464 | - struct buffer_head *nbh = NULL; | ||
465 | struct kernel_lb_addr loc; | ||
466 | struct indirectEntry *ie; | ||
467 | |||
468 | ie = (struct indirectEntry *)ibh->b_data; | ||
469 | loc = lelb_to_cpu(ie->indirectICB.extLocation); | ||
470 | |||
471 | - if (ie->indirectICB.extLength && | ||
472 | - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, | ||
473 | - &ident))) { | ||
474 | - if (ident == TAG_IDENT_FE || | ||
475 | - ident == TAG_IDENT_EFE) { | ||
476 | - memcpy(&iinfo->i_location, | ||
477 | - &loc, | ||
478 | - sizeof(struct kernel_lb_addr)); | ||
479 | - brelse(bh); | ||
480 | - brelse(ibh); | ||
481 | - brelse(nbh); | ||
482 | - __udf_read_inode(inode); | ||
483 | + if (ie->indirectICB.extLength) { | ||
484 | + brelse(bh); | ||
485 | + brelse(ibh); | ||
486 | + memcpy(&iinfo->i_location, &loc, | ||
487 | + sizeof(struct kernel_lb_addr)); | ||
488 | + if (++indirections > UDF_MAX_ICB_NESTING) { | ||
489 | + udf_err(inode->i_sb, | ||
490 | + "too many ICBs in ICB hierarchy" | ||
491 | + " (max %d supported)\n", | ||
492 | + UDF_MAX_ICB_NESTING); | ||
493 | + make_bad_inode(inode); | ||
494 | return; | ||
495 | } | ||
496 | - brelse(nbh); | ||
497 | + goto reread; | ||
498 | } | ||
499 | } | ||
500 | brelse(ibh); | ||
501 | diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h | ||
502 | index 3fe661fe96d1..b19d3dc2e651 100644 | ||
503 | --- a/include/linux/cpuset.h | ||
504 | +++ b/include/linux/cpuset.h | ||
505 | @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); | ||
506 | extern void cpuset_print_task_mems_allowed(struct task_struct *p); | ||
507 | |||
508 | /* | ||
509 | - * get_mems_allowed is required when making decisions involving mems_allowed | ||
510 | - * such as during page allocation. mems_allowed can be updated in parallel | ||
511 | - * and depending on the new value an operation can fail potentially causing | ||
512 | - * process failure. A retry loop with get_mems_allowed and put_mems_allowed | ||
513 | - * prevents these artificial failures. | ||
514 | + * read_mems_allowed_begin is required when making decisions involving | ||
515 | + * mems_allowed such as during page allocation. mems_allowed can be updated in | ||
516 | + * parallel and depending on the new value an operation can fail potentially | ||
517 | + * causing process failure. A retry loop with read_mems_allowed_begin and | ||
518 | + * read_mems_allowed_retry prevents these artificial failures. | ||
519 | */ | ||
520 | -static inline unsigned int get_mems_allowed(void) | ||
521 | +static inline unsigned int read_mems_allowed_begin(void) | ||
522 | { | ||
523 | return read_seqcount_begin(¤t->mems_allowed_seq); | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | - * If this returns false, the operation that took place after get_mems_allowed | ||
528 | - * may have failed. It is up to the caller to retry the operation if | ||
529 | + * If this returns true, the operation that took place after | ||
530 | + * read_mems_allowed_begin may have failed artificially due to a concurrent | ||
531 | + * update of mems_allowed. It is up to the caller to retry the operation if | ||
532 | * appropriate. | ||
533 | */ | ||
534 | -static inline bool put_mems_allowed(unsigned int seq) | ||
535 | +static inline bool read_mems_allowed_retry(unsigned int seq) | ||
536 | { | ||
537 | - return !read_seqcount_retry(¤t->mems_allowed_seq, seq); | ||
538 | + return read_seqcount_retry(¤t->mems_allowed_seq, seq); | ||
539 | } | ||
540 | |||
541 | static inline void set_mems_allowed(nodemask_t nodemask) | ||
542 | @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) | ||
543 | { | ||
544 | } | ||
545 | |||
546 | -static inline unsigned int get_mems_allowed(void) | ||
547 | +static inline unsigned int read_mems_allowed_begin(void) | ||
548 | { | ||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | -static inline bool put_mems_allowed(unsigned int seq) | ||
553 | +static inline bool read_mems_allowed_retry(unsigned int seq) | ||
554 | { | ||
555 | - return true; | ||
556 | + return false; | ||
557 | } | ||
558 | |||
559 | #endif /* !CONFIG_CPUSETS */ | ||
560 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h | ||
561 | index bd1e9bcec547..42b05c4c53e5 100644 | ||
562 | --- a/include/linux/hugetlb.h | ||
563 | +++ b/include/linux/hugetlb.h | ||
564 | @@ -400,6 +400,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | ||
565 | return &mm->page_table_lock; | ||
566 | } | ||
567 | |||
568 | +static inline bool hugepages_supported(void) | ||
569 | +{ | ||
570 | + /* | ||
571 | + * Some platform decide whether they support huge pages at boot | ||
572 | + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when | ||
573 | + * there is no such support | ||
574 | + */ | ||
575 | + return HPAGE_SHIFT != 0; | ||
576 | +} | ||
577 | + | ||
578 | #else /* CONFIG_HUGETLB_PAGE */ | ||
579 | struct hstate {}; | ||
580 | #define alloc_huge_page_node(h, nid) NULL | ||
581 | diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h | ||
582 | index 1f44466c1e9d..c367cbdf73ab 100644 | ||
583 | --- a/include/linux/jiffies.h | ||
584 | +++ b/include/linux/jiffies.h | ||
585 | @@ -258,23 +258,11 @@ extern unsigned long preset_lpj; | ||
586 | #define SEC_JIFFIE_SC (32 - SHIFT_HZ) | ||
587 | #endif | ||
588 | #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) | ||
589 | -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19) | ||
590 | #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ | ||
591 | TICK_NSEC -1) / (u64)TICK_NSEC)) | ||
592 | |||
593 | #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ | ||
594 | TICK_NSEC -1) / (u64)TICK_NSEC)) | ||
595 | -#define USEC_CONVERSION \ | ||
596 | - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\ | ||
597 | - TICK_NSEC -1) / (u64)TICK_NSEC)) | ||
598 | -/* | ||
599 | - * USEC_ROUND is used in the timeval to jiffie conversion. See there | ||
600 | - * for more details. It is the scaled resolution rounding value. Note | ||
601 | - * that it is a 64-bit value. Since, when it is applied, we are already | ||
602 | - * in jiffies (albit scaled), it is nothing but the bits we will shift | ||
603 | - * off. | ||
604 | - */ | ||
605 | -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1) | ||
606 | /* | ||
607 | * The maximum jiffie value is (MAX_INT >> 1). Here we translate that | ||
608 | * into seconds. The 64-bit case will overflow if we are not careful, | ||
609 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h | ||
610 | index 290901a8c1de..2b58d192ea24 100644 | ||
611 | --- a/include/linux/mm_types.h | ||
612 | +++ b/include/linux/mm_types.h | ||
613 | @@ -342,9 +342,9 @@ struct mm_rss_stat { | ||
614 | |||
615 | struct kioctx_table; | ||
616 | struct mm_struct { | ||
617 | - struct vm_area_struct * mmap; /* list of VMAs */ | ||
618 | + struct vm_area_struct *mmap; /* list of VMAs */ | ||
619 | struct rb_root mm_rb; | ||
620 | - struct vm_area_struct * mmap_cache; /* last find_vma result */ | ||
621 | + u32 vmacache_seqnum; /* per-thread vmacache */ | ||
622 | #ifdef CONFIG_MMU | ||
623 | unsigned long (*get_unmapped_area) (struct file *filp, | ||
624 | unsigned long addr, unsigned long len, | ||
625 | diff --git a/include/linux/plist.h b/include/linux/plist.h | ||
626 | index aa0fb390bd29..8b6c970cff6c 100644 | ||
627 | --- a/include/linux/plist.h | ||
628 | +++ b/include/linux/plist.h | ||
629 | @@ -98,6 +98,13 @@ struct plist_node { | ||
630 | } | ||
631 | |||
632 | /** | ||
633 | + * PLIST_HEAD - declare and init plist_head | ||
634 | + * @head: name for struct plist_head variable | ||
635 | + */ | ||
636 | +#define PLIST_HEAD(head) \ | ||
637 | + struct plist_head head = PLIST_HEAD_INIT(head) | ||
638 | + | ||
639 | +/** | ||
640 | * PLIST_NODE_INIT - static struct plist_node initializer | ||
641 | * @node: struct plist_node variable name | ||
642 | * @__prio: initial node priority | ||
643 | @@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) | ||
644 | extern void plist_add(struct plist_node *node, struct plist_head *head); | ||
645 | extern void plist_del(struct plist_node *node, struct plist_head *head); | ||
646 | |||
647 | +extern void plist_requeue(struct plist_node *node, struct plist_head *head); | ||
648 | + | ||
649 | /** | ||
650 | * plist_for_each - iterate over the plist | ||
651 | * @pos: the type * to use as a loop counter | ||
652 | @@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | ||
653 | list_for_each_entry(pos, &(head)->node_list, node_list) | ||
654 | |||
655 | /** | ||
656 | + * plist_for_each_continue - continue iteration over the plist | ||
657 | + * @pos: the type * to use as a loop cursor | ||
658 | + * @head: the head for your list | ||
659 | + * | ||
660 | + * Continue to iterate over plist, continuing after the current position. | ||
661 | + */ | ||
662 | +#define plist_for_each_continue(pos, head) \ | ||
663 | + list_for_each_entry_continue(pos, &(head)->node_list, node_list) | ||
664 | + | ||
665 | +/** | ||
666 | * plist_for_each_safe - iterate safely over a plist of given type | ||
667 | * @pos: the type * to use as a loop counter | ||
668 | * @n: another type * to use as temporary storage | ||
669 | @@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | ||
670 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) | ||
671 | |||
672 | /** | ||
673 | + * plist_for_each_entry_continue - continue iteration over list of given type | ||
674 | + * @pos: the type * to use as a loop cursor | ||
675 | + * @head: the head for your list | ||
676 | + * @m: the name of the list_struct within the struct | ||
677 | + * | ||
678 | + * Continue to iterate over list of given type, continuing after | ||
679 | + * the current position. | ||
680 | + */ | ||
681 | +#define plist_for_each_entry_continue(pos, head, m) \ | ||
682 | + list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) | ||
683 | + | ||
684 | +/** | ||
685 | * plist_for_each_entry_safe - iterate safely over list of given type | ||
686 | * @pos: the type * to use as a loop counter | ||
687 | * @n: another type * to use as temporary storage | ||
688 | @@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node) | ||
689 | #endif | ||
690 | |||
691 | /** | ||
692 | + * plist_next - get the next entry in list | ||
693 | + * @pos: the type * to cursor | ||
694 | + */ | ||
695 | +#define plist_next(pos) \ | ||
696 | + list_next_entry(pos, node_list) | ||
697 | + | ||
698 | +/** | ||
699 | + * plist_prev - get the prev entry in list | ||
700 | + * @pos: the type * to cursor | ||
701 | + */ | ||
702 | +#define plist_prev(pos) \ | ||
703 | + list_prev_entry(pos, node_list) | ||
704 | + | ||
705 | +/** | ||
706 | * plist_first - return the first node (and thus, highest priority) | ||
707 | * @head: the &struct plist_head pointer | ||
708 | * | ||
709 | diff --git a/include/linux/sched.h b/include/linux/sched.h | ||
710 | index ccd0c6f24f2c..d7ca410ace93 100644 | ||
711 | --- a/include/linux/sched.h | ||
712 | +++ b/include/linux/sched.h | ||
713 | @@ -59,6 +59,10 @@ struct sched_param { | ||
714 | |||
715 | #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ | ||
716 | |||
717 | +#define VMACACHE_BITS 2 | ||
718 | +#define VMACACHE_SIZE (1U << VMACACHE_BITS) | ||
719 | +#define VMACACHE_MASK (VMACACHE_SIZE - 1) | ||
720 | + | ||
721 | /* | ||
722 | * Extended scheduling parameters data structure. | ||
723 | * | ||
724 | @@ -1228,6 +1232,9 @@ struct task_struct { | ||
725 | #ifdef CONFIG_COMPAT_BRK | ||
726 | unsigned brk_randomized:1; | ||
727 | #endif | ||
728 | + /* per-thread vma caching */ | ||
729 | + u32 vmacache_seqnum; | ||
730 | + struct vm_area_struct *vmacache[VMACACHE_SIZE]; | ||
731 | #if defined(SPLIT_RSS_COUNTING) | ||
732 | struct task_rss_stat rss_stat; | ||
733 | #endif | ||
734 | diff --git a/include/linux/swap.h b/include/linux/swap.h | ||
735 | index 46ba0c6c219f..789324976801 100644 | ||
736 | --- a/include/linux/swap.h | ||
737 | +++ b/include/linux/swap.h | ||
738 | @@ -214,8 +214,9 @@ struct percpu_cluster { | ||
739 | struct swap_info_struct { | ||
740 | unsigned long flags; /* SWP_USED etc: see above */ | ||
741 | signed short prio; /* swap priority of this type */ | ||
742 | + struct plist_node list; /* entry in swap_active_head */ | ||
743 | + struct plist_node avail_list; /* entry in swap_avail_head */ | ||
744 | signed char type; /* strange name for an index */ | ||
745 | - signed char next; /* next type on the swap list */ | ||
746 | unsigned int max; /* extent of the swap_map */ | ||
747 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | ||
748 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ | ||
749 | @@ -255,11 +256,6 @@ struct swap_info_struct { | ||
750 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ | ||
751 | }; | ||
752 | |||
753 | -struct swap_list_t { | ||
754 | - int head; /* head of priority-ordered swapfile list */ | ||
755 | - int next; /* swapfile to be used next */ | ||
756 | -}; | ||
757 | - | ||
758 | /* linux/mm/page_alloc.c */ | ||
759 | extern unsigned long totalram_pages; | ||
760 | extern unsigned long totalreserve_pages; | ||
761 | diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h | ||
762 | index e282624e8c10..388293a91e8c 100644 | ||
763 | --- a/include/linux/swapfile.h | ||
764 | +++ b/include/linux/swapfile.h | ||
765 | @@ -6,7 +6,7 @@ | ||
766 | * want to expose them to the dozens of source files that include swap.h | ||
767 | */ | ||
768 | extern spinlock_t swap_lock; | ||
769 | -extern struct swap_list_t swap_list; | ||
770 | +extern struct plist_head swap_active_head; | ||
771 | extern struct swap_info_struct *swap_info[]; | ||
772 | extern int try_to_unuse(unsigned int, bool, unsigned long); | ||
773 | |||
774 | diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h | ||
775 | new file mode 100644 | ||
776 | index 000000000000..c3fa0fd43949 | ||
777 | --- /dev/null | ||
778 | +++ b/include/linux/vmacache.h | ||
779 | @@ -0,0 +1,38 @@ | ||
780 | +#ifndef __LINUX_VMACACHE_H | ||
781 | +#define __LINUX_VMACACHE_H | ||
782 | + | ||
783 | +#include <linux/sched.h> | ||
784 | +#include <linux/mm.h> | ||
785 | + | ||
786 | +/* | ||
787 | + * Hash based on the page number. Provides a good hit rate for | ||
788 | + * workloads with good locality and those with random accesses as well. | ||
789 | + */ | ||
790 | +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) | ||
791 | + | ||
792 | +static inline void vmacache_flush(struct task_struct *tsk) | ||
793 | +{ | ||
794 | + memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); | ||
795 | +} | ||
796 | + | ||
797 | +extern void vmacache_flush_all(struct mm_struct *mm); | ||
798 | +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); | ||
799 | +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, | ||
800 | + unsigned long addr); | ||
801 | + | ||
802 | +#ifndef CONFIG_MMU | ||
803 | +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
804 | + unsigned long start, | ||
805 | + unsigned long end); | ||
806 | +#endif | ||
807 | + | ||
808 | +static inline void vmacache_invalidate(struct mm_struct *mm) | ||
809 | +{ | ||
810 | + mm->vmacache_seqnum++; | ||
811 | + | ||
812 | + /* deal with overflows */ | ||
813 | + if (unlikely(mm->vmacache_seqnum == 0)) | ||
814 | + vmacache_flush_all(mm); | ||
815 | +} | ||
816 | + | ||
817 | +#endif /* __LINUX_VMACACHE_H */ | ||
818 | diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h | ||
819 | index bef53ce555d2..b10682cb138c 100644 | ||
820 | --- a/include/media/videobuf2-core.h | ||
821 | +++ b/include/media/videobuf2-core.h | ||
822 | @@ -329,6 +329,9 @@ struct v4l2_fh; | ||
823 | * @retry_start_streaming: start_streaming() was called, but there were not enough | ||
824 | * buffers queued. If set, then retry calling start_streaming when | ||
825 | * queuing a new buffer. | ||
826 | + * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for | ||
827 | + * buffers. Only set for capture queues if qbuf has not yet been | ||
828 | + * called since poll() needs to return POLLERR in that situation. | ||
829 | * @fileio: file io emulator internal data, used only if emulator is active | ||
830 | */ | ||
831 | struct vb2_queue { | ||
832 | @@ -362,6 +365,7 @@ struct vb2_queue { | ||
833 | |||
834 | unsigned int streaming:1; | ||
835 | unsigned int retry_start_streaming:1; | ||
836 | + unsigned int waiting_for_buffers:1; | ||
837 | |||
838 | struct vb2_fileio_data *fileio; | ||
839 | }; | ||
840 | diff --git a/init/Kconfig b/init/Kconfig | ||
841 | index 93c5ef0c5210..8b9521a2d2c1 100644 | ||
842 | --- a/init/Kconfig | ||
843 | +++ b/init/Kconfig | ||
844 | @@ -1389,6 +1389,7 @@ config FUTEX | ||
845 | |||
846 | config HAVE_FUTEX_CMPXCHG | ||
847 | bool | ||
848 | + depends on FUTEX | ||
849 | help | ||
850 | Architectures should select this if futex_atomic_cmpxchg_inatomic() | ||
851 | is implemented and always working. This removes a couple of runtime | ||
852 | diff --git a/kernel/cpuset.c b/kernel/cpuset.c | ||
853 | index 6b27e5c0cd86..15b3ea693225 100644 | ||
854 | --- a/kernel/cpuset.c | ||
855 | +++ b/kernel/cpuset.c | ||
856 | @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | ||
857 | task_lock(tsk); | ||
858 | /* | ||
859 | * Determine if a loop is necessary if another thread is doing | ||
860 | - * get_mems_allowed(). If at least one node remains unchanged and | ||
861 | + * read_mems_allowed_begin(). If at least one node remains unchanged and | ||
862 | * tsk does not have a mempolicy, then an empty nodemask will not be | ||
863 | * possible when mems_allowed is larger than a word. | ||
864 | */ | ||
865 | diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c | ||
866 | index 334b3980ffc1..8865caec45fb 100644 | ||
867 | --- a/kernel/debug/debug_core.c | ||
868 | +++ b/kernel/debug/debug_core.c | ||
869 | @@ -49,6 +49,7 @@ | ||
870 | #include <linux/pid.h> | ||
871 | #include <linux/smp.h> | ||
872 | #include <linux/mm.h> | ||
873 | +#include <linux/vmacache.h> | ||
874 | #include <linux/rcupdate.h> | ||
875 | |||
876 | #include <asm/cacheflush.h> | ||
877 | @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) | ||
878 | if (!CACHE_FLUSH_IS_SAFE) | ||
879 | return; | ||
880 | |||
881 | - if (current->mm && current->mm->mmap_cache) { | ||
882 | - flush_cache_range(current->mm->mmap_cache, | ||
883 | - addr, addr + BREAK_INSTR_SIZE); | ||
884 | + if (current->mm) { | ||
885 | + int i; | ||
886 | + | ||
887 | + for (i = 0; i < VMACACHE_SIZE; i++) { | ||
888 | + if (!current->vmacache[i]) | ||
889 | + continue; | ||
890 | + flush_cache_range(current->vmacache[i], | ||
891 | + addr, addr + BREAK_INSTR_SIZE); | ||
892 | + } | ||
893 | } | ||
894 | + | ||
895 | /* Force flush instruction cache if it was outside the mm */ | ||
896 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); | ||
897 | } | ||
898 | diff --git a/kernel/events/core.c b/kernel/events/core.c | ||
899 | index 3a140ca37777..4ced342f1ba9 100644 | ||
900 | --- a/kernel/events/core.c | ||
901 | +++ b/kernel/events/core.c | ||
902 | @@ -7836,8 +7836,10 @@ int perf_event_init_task(struct task_struct *child) | ||
903 | |||
904 | for_each_task_context_nr(ctxn) { | ||
905 | ret = perf_event_init_context(child, ctxn); | ||
906 | - if (ret) | ||
907 | + if (ret) { | ||
908 | + perf_event_free_task(child); | ||
909 | return ret; | ||
910 | + } | ||
911 | } | ||
912 | |||
913 | return 0; | ||
914 | diff --git a/kernel/fork.c b/kernel/fork.c | ||
915 | index c44bff8097f5..e2c685396295 100644 | ||
916 | --- a/kernel/fork.c | ||
917 | +++ b/kernel/fork.c | ||
918 | @@ -28,6 +28,8 @@ | ||
919 | #include <linux/mman.h> | ||
920 | #include <linux/mmu_notifier.h> | ||
921 | #include <linux/fs.h> | ||
922 | +#include <linux/mm.h> | ||
923 | +#include <linux/vmacache.h> | ||
924 | #include <linux/nsproxy.h> | ||
925 | #include <linux/capability.h> | ||
926 | #include <linux/cpu.h> | ||
927 | @@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | ||
928 | |||
929 | mm->locked_vm = 0; | ||
930 | mm->mmap = NULL; | ||
931 | - mm->mmap_cache = NULL; | ||
932 | + mm->vmacache_seqnum = 0; | ||
933 | mm->map_count = 0; | ||
934 | cpumask_clear(mm_cpumask(mm)); | ||
935 | mm->mm_rb = RB_ROOT; | ||
936 | @@ -876,6 +878,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | ||
937 | if (!oldmm) | ||
938 | return 0; | ||
939 | |||
940 | + /* initialize the new vmacache entries */ | ||
941 | + vmacache_flush(tsk); | ||
942 | + | ||
943 | if (clone_flags & CLONE_VM) { | ||
944 | atomic_inc(&oldmm->mm_users); | ||
945 | mm = oldmm; | ||
946 | @@ -1323,7 +1328,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | ||
947 | goto bad_fork_cleanup_policy; | ||
948 | retval = audit_alloc(p); | ||
949 | if (retval) | ||
950 | - goto bad_fork_cleanup_policy; | ||
951 | + goto bad_fork_cleanup_perf; | ||
952 | /* copy all the process information */ | ||
953 | retval = copy_semundo(clone_flags, p); | ||
954 | if (retval) | ||
955 | @@ -1522,8 +1527,9 @@ bad_fork_cleanup_semundo: | ||
956 | exit_sem(p); | ||
957 | bad_fork_cleanup_audit: | ||
958 | audit_free(p); | ||
959 | -bad_fork_cleanup_policy: | ||
960 | +bad_fork_cleanup_perf: | ||
961 | perf_event_free_task(p); | ||
962 | +bad_fork_cleanup_policy: | ||
963 | #ifdef CONFIG_NUMA | ||
964 | mpol_put(p->mempolicy); | ||
965 | bad_fork_cleanup_cgroup: | ||
966 | diff --git a/kernel/time.c b/kernel/time.c | ||
967 | index 7c7964c33ae7..3c49ab45f822 100644 | ||
968 | --- a/kernel/time.c | ||
969 | +++ b/kernel/time.c | ||
970 | @@ -496,17 +496,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); | ||
971 | * that a remainder subtract here would not do the right thing as the | ||
972 | * resolution values don't fall on second boundries. I.e. the line: | ||
973 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. | ||
974 | + * Note that due to the small error in the multiplier here, this | ||
975 | + * rounding is incorrect for sufficiently large values of tv_nsec, but | ||
976 | + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're | ||
977 | + * OK. | ||
978 | * | ||
979 | * Rather, we just shift the bits off the right. | ||
980 | * | ||
981 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec | ||
982 | * value to a scaled second value. | ||
983 | */ | ||
984 | -unsigned long | ||
985 | -timespec_to_jiffies(const struct timespec *value) | ||
986 | +static unsigned long | ||
987 | +__timespec_to_jiffies(unsigned long sec, long nsec) | ||
988 | { | ||
989 | - unsigned long sec = value->tv_sec; | ||
990 | - long nsec = value->tv_nsec + TICK_NSEC - 1; | ||
991 | + nsec = nsec + TICK_NSEC - 1; | ||
992 | |||
993 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
994 | sec = MAX_SEC_IN_JIFFIES; | ||
995 | @@ -517,6 +520,13 @@ timespec_to_jiffies(const struct timespec *value) | ||
996 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
997 | |||
998 | } | ||
999 | + | ||
1000 | +unsigned long | ||
1001 | +timespec_to_jiffies(const struct timespec *value) | ||
1002 | +{ | ||
1003 | + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); | ||
1004 | +} | ||
1005 | + | ||
1006 | EXPORT_SYMBOL(timespec_to_jiffies); | ||
1007 | |||
1008 | void | ||
1009 | @@ -533,31 +543,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | ||
1010 | } | ||
1011 | EXPORT_SYMBOL(jiffies_to_timespec); | ||
1012 | |||
1013 | -/* Same for "timeval" | ||
1014 | +/* | ||
1015 | + * We could use a similar algorithm to timespec_to_jiffies (with a | ||
1016 | + * different multiplier for usec instead of nsec). But this has a | ||
1017 | + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the | ||
1018 | + * usec value, since it's not necessarily integral. | ||
1019 | * | ||
1020 | - * Well, almost. The problem here is that the real system resolution is | ||
1021 | - * in nanoseconds and the value being converted is in micro seconds. | ||
1022 | - * Also for some machines (those that use HZ = 1024, in-particular), | ||
1023 | - * there is a LARGE error in the tick size in microseconds. | ||
1024 | - | ||
1025 | - * The solution we use is to do the rounding AFTER we convert the | ||
1026 | - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. | ||
1027 | - * Instruction wise, this should cost only an additional add with carry | ||
1028 | - * instruction above the way it was done above. | ||
1029 | + * We could instead round in the intermediate scaled representation | ||
1030 | + * (i.e. in units of 1/2^(large scale) jiffies) but that's also | ||
1031 | + * perilous: the scaling introduces a small positive error, which | ||
1032 | + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 | ||
1033 | + * units to the intermediate before shifting) leads to accidental | ||
1034 | + * overflow and overestimates. | ||
1035 | + * | ||
1036 | + * At the cost of one additional multiplication by a constant, just | ||
1037 | + * use the timespec implementation. | ||
1038 | */ | ||
1039 | unsigned long | ||
1040 | timeval_to_jiffies(const struct timeval *value) | ||
1041 | { | ||
1042 | - unsigned long sec = value->tv_sec; | ||
1043 | - long usec = value->tv_usec; | ||
1044 | - | ||
1045 | - if (sec >= MAX_SEC_IN_JIFFIES){ | ||
1046 | - sec = MAX_SEC_IN_JIFFIES; | ||
1047 | - usec = 0; | ||
1048 | - } | ||
1049 | - return (((u64)sec * SEC_CONVERSION) + | ||
1050 | - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> | ||
1051 | - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
1052 | + return __timespec_to_jiffies(value->tv_sec, | ||
1053 | + value->tv_usec * NSEC_PER_USEC); | ||
1054 | } | ||
1055 | EXPORT_SYMBOL(timeval_to_jiffies); | ||
1056 | |||
1057 | diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c | ||
1058 | index 773aba836e81..774a0807fe81 100644 | ||
1059 | --- a/kernel/trace/ring_buffer.c | ||
1060 | +++ b/kernel/trace/ring_buffer.c | ||
1061 | @@ -3372,7 +3372,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) | ||
1062 | iter->head = cpu_buffer->reader_page->read; | ||
1063 | |||
1064 | iter->cache_reader_page = iter->head_page; | ||
1065 | - iter->cache_read = iter->head; | ||
1066 | + iter->cache_read = cpu_buffer->read; | ||
1067 | |||
1068 | if (iter->head) | ||
1069 | iter->read_stamp = cpu_buffer->read_stamp; | ||
1070 | diff --git a/lib/plist.c b/lib/plist.c | ||
1071 | index 1ebc95f7a46f..0f2084d30798 100644 | ||
1072 | --- a/lib/plist.c | ||
1073 | +++ b/lib/plist.c | ||
1074 | @@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) | ||
1075 | plist_check_head(head); | ||
1076 | } | ||
1077 | |||
1078 | +/** | ||
1079 | + * plist_requeue - Requeue @node at end of same-prio entries. | ||
1080 | + * | ||
1081 | + * This is essentially an optimized plist_del() followed by | ||
1082 | + * plist_add(). It moves an entry already in the plist to | ||
1083 | + * after any other same-priority entries. | ||
1084 | + * | ||
1085 | + * @node: &struct plist_node pointer - entry to be moved | ||
1086 | + * @head: &struct plist_head pointer - list head | ||
1087 | + */ | ||
1088 | +void plist_requeue(struct plist_node *node, struct plist_head *head) | ||
1089 | +{ | ||
1090 | + struct plist_node *iter; | ||
1091 | + struct list_head *node_next = &head->node_list; | ||
1092 | + | ||
1093 | + plist_check_head(head); | ||
1094 | + BUG_ON(plist_head_empty(head)); | ||
1095 | + BUG_ON(plist_node_empty(node)); | ||
1096 | + | ||
1097 | + if (node == plist_last(head)) | ||
1098 | + return; | ||
1099 | + | ||
1100 | + iter = plist_next(node); | ||
1101 | + | ||
1102 | + if (node->prio != iter->prio) | ||
1103 | + return; | ||
1104 | + | ||
1105 | + plist_del(node, head); | ||
1106 | + | ||
1107 | + plist_for_each_continue(iter, head) { | ||
1108 | + if (node->prio != iter->prio) { | ||
1109 | + node_next = &iter->node_list; | ||
1110 | + break; | ||
1111 | + } | ||
1112 | + } | ||
1113 | + list_add_tail(&node->node_list, node_next); | ||
1114 | + | ||
1115 | + plist_check_head(head); | ||
1116 | +} | ||
1117 | + | ||
1118 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1119 | #include <linux/sched.h> | ||
1120 | #include <linux/module.h> | ||
1121 | @@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect) | ||
1122 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); | ||
1123 | } | ||
1124 | |||
1125 | +static void __init plist_test_requeue(struct plist_node *node) | ||
1126 | +{ | ||
1127 | + plist_requeue(node, &test_head); | ||
1128 | + | ||
1129 | + if (node != plist_last(&test_head)) | ||
1130 | + BUG_ON(node->prio == plist_next(node)->prio); | ||
1131 | +} | ||
1132 | + | ||
1133 | static int __init plist_test(void) | ||
1134 | { | ||
1135 | int nr_expect = 0, i, loop; | ||
1136 | @@ -193,6 +241,10 @@ static int __init plist_test(void) | ||
1137 | nr_expect--; | ||
1138 | } | ||
1139 | plist_test_check(nr_expect); | ||
1140 | + if (!plist_node_empty(test_node + i)) { | ||
1141 | + plist_test_requeue(test_node + i); | ||
1142 | + plist_test_check(nr_expect); | ||
1143 | + } | ||
1144 | } | ||
1145 | |||
1146 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { | ||
1147 | diff --git a/mm/Makefile b/mm/Makefile | ||
1148 | index 310c90a09264..c561f1f6bca0 100644 | ||
1149 | --- a/mm/Makefile | ||
1150 | +++ b/mm/Makefile | ||
1151 | @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | ||
1152 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | ||
1153 | util.o mmzone.o vmstat.o backing-dev.o \ | ||
1154 | mm_init.o mmu_context.o percpu.o slab_common.o \ | ||
1155 | - compaction.o balloon_compaction.o \ | ||
1156 | + compaction.o balloon_compaction.o vmacache.o \ | ||
1157 | interval_tree.o list_lru.o $(mmu-y) | ||
1158 | |||
1159 | obj-y += init-mm.o | ||
1160 | diff --git a/mm/compaction.c b/mm/compaction.c | ||
1161 | index 5f702ef0a65f..5e38e5706f62 100644 | ||
1162 | --- a/mm/compaction.c | ||
1163 | +++ b/mm/compaction.c | ||
1164 | @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, | ||
1165 | /* Returns true if the page is within a block suitable for migration to */ | ||
1166 | static bool suitable_migration_target(struct page *page) | ||
1167 | { | ||
1168 | - int migratetype = get_pageblock_migratetype(page); | ||
1169 | - | ||
1170 | - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
1171 | - if (migratetype == MIGRATE_RESERVE) | ||
1172 | - return false; | ||
1173 | - | ||
1174 | - if (is_migrate_isolate(migratetype)) | ||
1175 | - return false; | ||
1176 | - | ||
1177 | - /* If the page is a large free page, then allow migration */ | ||
1178 | + /* If the page is a large free page, then disallow migration */ | ||
1179 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
1180 | - return true; | ||
1181 | + return false; | ||
1182 | |||
1183 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
1184 | - if (migrate_async_suitable(migratetype)) | ||
1185 | + if (migrate_async_suitable(get_pageblock_migratetype(page))) | ||
1186 | return true; | ||
1187 | |||
1188 | /* Otherwise skip the block */ | ||
1189 | @@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | ||
1190 | struct page *cursor, *valid_page = NULL; | ||
1191 | unsigned long flags; | ||
1192 | bool locked = false; | ||
1193 | + bool checked_pageblock = false; | ||
1194 | |||
1195 | cursor = pfn_to_page(blockpfn); | ||
1196 | |||
1197 | @@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | ||
1198 | break; | ||
1199 | |||
1200 | /* Recheck this is a suitable migration target under lock */ | ||
1201 | - if (!strict && !suitable_migration_target(page)) | ||
1202 | - break; | ||
1203 | + if (!strict && !checked_pageblock) { | ||
1204 | + /* | ||
1205 | + * We need to check suitability of pageblock only once | ||
1206 | + * and this isolate_freepages_block() is called with | ||
1207 | + * pageblock range, so just check once is sufficient. | ||
1208 | + */ | ||
1209 | + checked_pageblock = true; | ||
1210 | + if (!suitable_migration_target(page)) | ||
1211 | + break; | ||
1212 | + } | ||
1213 | |||
1214 | /* Recheck this is a buddy page under lock */ | ||
1215 | if (!PageBuddy(page)) | ||
1216 | @@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1217 | unsigned long last_pageblock_nr = 0, pageblock_nr; | ||
1218 | unsigned long nr_scanned = 0, nr_isolated = 0; | ||
1219 | struct list_head *migratelist = &cc->migratepages; | ||
1220 | - isolate_mode_t mode = 0; | ||
1221 | struct lruvec *lruvec; | ||
1222 | unsigned long flags; | ||
1223 | bool locked = false; | ||
1224 | struct page *page = NULL, *valid_page = NULL; | ||
1225 | bool skipped_async_unsuitable = false; | ||
1226 | + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | | ||
1227 | + (unevictable ? ISOLATE_UNEVICTABLE : 0); | ||
1228 | |||
1229 | /* | ||
1230 | * Ensure that there are not too many pages isolated from the LRU | ||
1231 | @@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1232 | cond_resched(); | ||
1233 | for (; low_pfn < end_pfn; low_pfn++) { | ||
1234 | /* give a chance to irqs before checking need_resched() */ | ||
1235 | - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { | ||
1236 | + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | ||
1237 | if (should_release_lock(&zone->lru_lock)) { | ||
1238 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1239 | locked = false; | ||
1240 | @@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1241 | |||
1242 | /* If isolation recently failed, do not retry */ | ||
1243 | pageblock_nr = low_pfn >> pageblock_order; | ||
1244 | - if (!isolation_suitable(cc, page)) | ||
1245 | - goto next_pageblock; | ||
1246 | + if (last_pageblock_nr != pageblock_nr) { | ||
1247 | + int mt; | ||
1248 | + | ||
1249 | + last_pageblock_nr = pageblock_nr; | ||
1250 | + if (!isolation_suitable(cc, page)) | ||
1251 | + goto next_pageblock; | ||
1252 | + | ||
1253 | + /* | ||
1254 | + * For async migration, also only scan in MOVABLE | ||
1255 | + * blocks. Async migration is optimistic to see if | ||
1256 | + * the minimum amount of work satisfies the allocation | ||
1257 | + */ | ||
1258 | + mt = get_pageblock_migratetype(page); | ||
1259 | + if (!cc->sync && !migrate_async_suitable(mt)) { | ||
1260 | + cc->finished_update_migrate = true; | ||
1261 | + skipped_async_unsuitable = true; | ||
1262 | + goto next_pageblock; | ||
1263 | + } | ||
1264 | + } | ||
1265 | |||
1266 | /* | ||
1267 | * Skip if free. page_order cannot be used without zone->lock | ||
1268 | @@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1269 | continue; | ||
1270 | |||
1271 | /* | ||
1272 | - * For async migration, also only scan in MOVABLE blocks. Async | ||
1273 | - * migration is optimistic to see if the minimum amount of work | ||
1274 | - * satisfies the allocation | ||
1275 | - */ | ||
1276 | - if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
1277 | - !migrate_async_suitable(get_pageblock_migratetype(page))) { | ||
1278 | - cc->finished_update_migrate = true; | ||
1279 | - skipped_async_unsuitable = true; | ||
1280 | - goto next_pageblock; | ||
1281 | - } | ||
1282 | - | ||
1283 | - /* | ||
1284 | * Check may be lockless but that's ok as we recheck later. | ||
1285 | * It's possible to migrate LRU pages and balloon pages | ||
1286 | * Skip any other type of page | ||
1287 | @@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1288 | if (unlikely(balloon_page_movable(page))) { | ||
1289 | if (locked && balloon_page_isolate(page)) { | ||
1290 | /* Successfully isolated */ | ||
1291 | - cc->finished_update_migrate = true; | ||
1292 | - list_add(&page->lru, migratelist); | ||
1293 | - cc->nr_migratepages++; | ||
1294 | - nr_isolated++; | ||
1295 | - goto check_compact_cluster; | ||
1296 | + goto isolate_success; | ||
1297 | } | ||
1298 | } | ||
1299 | continue; | ||
1300 | @@ -584,6 +586,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1301 | continue; | ||
1302 | } | ||
1303 | |||
1304 | + /* | ||
1305 | + * Migration will fail if an anonymous page is pinned in memory, | ||
1306 | + * so avoid taking lru_lock and isolating it unnecessarily in an | ||
1307 | + * admittedly racy check. | ||
1308 | + */ | ||
1309 | + if (!page_mapping(page) && | ||
1310 | + page_count(page) > page_mapcount(page)) | ||
1311 | + continue; | ||
1312 | + | ||
1313 | /* Check if it is ok to still hold the lock */ | ||
1314 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
1315 | locked, cc); | ||
1316 | @@ -598,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1317 | continue; | ||
1318 | } | ||
1319 | |||
1320 | - if (!cc->sync) | ||
1321 | - mode |= ISOLATE_ASYNC_MIGRATE; | ||
1322 | - | ||
1323 | - if (unevictable) | ||
1324 | - mode |= ISOLATE_UNEVICTABLE; | ||
1325 | - | ||
1326 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1327 | |||
1328 | /* Try isolate the page */ | ||
1329 | @@ -613,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | ||
1330 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | ||
1331 | |||
1332 | /* Successfully isolated */ | ||
1333 | - cc->finished_update_migrate = true; | ||
1334 | del_page_from_lru_list(page, lruvec, page_lru(page)); | ||
1335 | + | ||
1336 | +isolate_success: | ||
1337 | + cc->finished_update_migrate = true; | ||
1338 | list_add(&page->lru, migratelist); | ||
1339 | cc->nr_migratepages++; | ||
1340 | nr_isolated++; | ||
1341 | |||
1342 | -check_compact_cluster: | ||
1343 | /* Avoid isolating too much */ | ||
1344 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | ||
1345 | ++low_pfn; | ||
1346 | @@ -630,7 +636,6 @@ check_compact_cluster: | ||
1347 | |||
1348 | next_pageblock: | ||
1349 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; | ||
1350 | - last_pageblock_nr = pageblock_nr; | ||
1351 | } | ||
1352 | |||
1353 | acct_isolated(zone, locked, cc); | ||
1354 | @@ -1188,6 +1193,7 @@ static void compact_node(int nid) | ||
1355 | struct compact_control cc = { | ||
1356 | .order = -1, | ||
1357 | .sync = true, | ||
1358 | + .ignore_skip_hint = true, | ||
1359 | }; | ||
1360 | |||
1361 | __compact_pgdat(NODE_DATA(nid), &cc); | ||
1362 | diff --git a/mm/filemap.c b/mm/filemap.c | ||
1363 | index 7a13f6ac5421..c2cc7c95eff1 100644 | ||
1364 | --- a/mm/filemap.c | ||
1365 | +++ b/mm/filemap.c | ||
1366 | @@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping) | ||
1367 | { | ||
1368 | int ret = 0; | ||
1369 | /* Check for outstanding write errors */ | ||
1370 | - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
1371 | + if (test_bit(AS_ENOSPC, &mapping->flags) && | ||
1372 | + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
1373 | ret = -ENOSPC; | ||
1374 | - if (test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
1375 | + if (test_bit(AS_EIO, &mapping->flags) && | ||
1376 | + test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
1377 | ret = -EIO; | ||
1378 | return ret; | ||
1379 | } | ||
1380 | @@ -520,10 +522,10 @@ struct page *__page_cache_alloc(gfp_t gfp) | ||
1381 | if (cpuset_do_page_mem_spread()) { | ||
1382 | unsigned int cpuset_mems_cookie; | ||
1383 | do { | ||
1384 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1385 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1386 | n = cpuset_mem_spread_node(); | ||
1387 | page = alloc_pages_exact_node(n, gfp, 0); | ||
1388 | - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | ||
1389 | + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); | ||
1390 | |||
1391 | return page; | ||
1392 | } | ||
1393 | diff --git a/mm/frontswap.c b/mm/frontswap.c | ||
1394 | index 1b24bdcb3197..c30eec536f03 100644 | ||
1395 | --- a/mm/frontswap.c | ||
1396 | +++ b/mm/frontswap.c | ||
1397 | @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
1398 | |||
1399 | static unsigned long __frontswap_curr_pages(void) | ||
1400 | { | ||
1401 | - int type; | ||
1402 | unsigned long totalpages = 0; | ||
1403 | struct swap_info_struct *si = NULL; | ||
1404 | |||
1405 | assert_spin_locked(&swap_lock); | ||
1406 | - for (type = swap_list.head; type >= 0; type = si->next) { | ||
1407 | - si = swap_info[type]; | ||
1408 | + plist_for_each_entry(si, &swap_active_head, list) | ||
1409 | totalpages += atomic_read(&si->frontswap_pages); | ||
1410 | - } | ||
1411 | return totalpages; | ||
1412 | } | ||
1413 | |||
1414 | @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | ||
1415 | int si_frontswap_pages; | ||
1416 | unsigned long total_pages_to_unuse = total; | ||
1417 | unsigned long pages = 0, pages_to_unuse = 0; | ||
1418 | - int type; | ||
1419 | |||
1420 | assert_spin_locked(&swap_lock); | ||
1421 | - for (type = swap_list.head; type >= 0; type = si->next) { | ||
1422 | - si = swap_info[type]; | ||
1423 | + plist_for_each_entry(si, &swap_active_head, list) { | ||
1424 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
1425 | if (total_pages_to_unuse < si_frontswap_pages) { | ||
1426 | pages = pages_to_unuse = total_pages_to_unuse; | ||
1427 | @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | ||
1428 | } | ||
1429 | vm_unacct_memory(pages); | ||
1430 | *unused = pages_to_unuse; | ||
1431 | - *swapid = type; | ||
1432 | + *swapid = si->type; | ||
1433 | ret = 0; | ||
1434 | break; | ||
1435 | } | ||
1436 | @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | ||
1437 | /* | ||
1438 | * we don't want to hold swap_lock while doing a very | ||
1439 | * lengthy try_to_unuse, but swap_list may change | ||
1440 | - * so restart scan from swap_list.head each time | ||
1441 | + * so restart scan from swap_active_head each time | ||
1442 | */ | ||
1443 | spin_lock(&swap_lock); | ||
1444 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | ||
1445 | diff --git a/mm/huge_memory.c b/mm/huge_memory.c | ||
1446 | index 1c42d0c36d0b..718bfa16a36f 100644 | ||
1447 | --- a/mm/huge_memory.c | ||
1448 | +++ b/mm/huge_memory.c | ||
1449 | @@ -1819,21 +1819,24 @@ static int __split_huge_page_map(struct page *page, | ||
1450 | if (pmd) { | ||
1451 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | ||
1452 | pmd_populate(mm, &_pmd, pgtable); | ||
1453 | + if (pmd_write(*pmd)) | ||
1454 | + BUG_ON(page_mapcount(page) != 1); | ||
1455 | |||
1456 | haddr = address; | ||
1457 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
1458 | pte_t *pte, entry; | ||
1459 | BUG_ON(PageCompound(page+i)); | ||
1460 | + /* | ||
1461 | + * Note that pmd_numa is not transferred deliberately | ||
1462 | + * to avoid any possibility that pte_numa leaks to | ||
1463 | + * a PROT_NONE VMA by accident. | ||
1464 | + */ | ||
1465 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
1466 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1467 | if (!pmd_write(*pmd)) | ||
1468 | entry = pte_wrprotect(entry); | ||
1469 | - else | ||
1470 | - BUG_ON(page_mapcount(page) != 1); | ||
1471 | if (!pmd_young(*pmd)) | ||
1472 | entry = pte_mkold(entry); | ||
1473 | - if (pmd_numa(*pmd)) | ||
1474 | - entry = pte_mknuma(entry); | ||
1475 | pte = pte_offset_map(&_pmd, haddr); | ||
1476 | BUG_ON(!pte_none(*pte)); | ||
1477 | set_pte_at(mm, haddr, pte, entry); | ||
1478 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c | ||
1479 | index 923f38e62bcf..67d0c175efcf 100644 | ||
1480 | --- a/mm/hugetlb.c | ||
1481 | +++ b/mm/hugetlb.c | ||
1482 | @@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | ||
1483 | goto err; | ||
1484 | |||
1485 | retry_cpuset: | ||
1486 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1487 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1488 | zonelist = huge_zonelist(vma, address, | ||
1489 | htlb_alloc_mask(h), &mpol, &nodemask); | ||
1490 | |||
1491 | @@ -562,7 +562,7 @@ retry_cpuset: | ||
1492 | } | ||
1493 | |||
1494 | mpol_cond_put(mpol); | ||
1495 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1496 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1497 | goto retry_cpuset; | ||
1498 | return page; | ||
1499 | |||
1500 | @@ -2071,6 +2071,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | ||
1501 | unsigned long tmp; | ||
1502 | int ret; | ||
1503 | |||
1504 | + if (!hugepages_supported()) | ||
1505 | + return -ENOTSUPP; | ||
1506 | + | ||
1507 | tmp = h->max_huge_pages; | ||
1508 | |||
1509 | if (write && h->order >= MAX_ORDER) | ||
1510 | @@ -2124,6 +2127,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | ||
1511 | unsigned long tmp; | ||
1512 | int ret; | ||
1513 | |||
1514 | + if (!hugepages_supported()) | ||
1515 | + return -ENOTSUPP; | ||
1516 | + | ||
1517 | tmp = h->nr_overcommit_huge_pages; | ||
1518 | |||
1519 | if (write && h->order >= MAX_ORDER) | ||
1520 | @@ -2149,6 +2155,8 @@ out: | ||
1521 | void hugetlb_report_meminfo(struct seq_file *m) | ||
1522 | { | ||
1523 | struct hstate *h = &default_hstate; | ||
1524 | + if (!hugepages_supported()) | ||
1525 | + return; | ||
1526 | seq_printf(m, | ||
1527 | "HugePages_Total: %5lu\n" | ||
1528 | "HugePages_Free: %5lu\n" | ||
1529 | @@ -2165,6 +2173,8 @@ void hugetlb_report_meminfo(struct seq_file *m) | ||
1530 | int hugetlb_report_node_meminfo(int nid, char *buf) | ||
1531 | { | ||
1532 | struct hstate *h = &default_hstate; | ||
1533 | + if (!hugepages_supported()) | ||
1534 | + return 0; | ||
1535 | return sprintf(buf, | ||
1536 | "Node %d HugePages_Total: %5u\n" | ||
1537 | "Node %d HugePages_Free: %5u\n" | ||
1538 | @@ -2179,6 +2189,9 @@ void hugetlb_show_meminfo(void) | ||
1539 | struct hstate *h; | ||
1540 | int nid; | ||
1541 | |||
1542 | + if (!hugepages_supported()) | ||
1543 | + return; | ||
1544 | + | ||
1545 | for_each_node_state(nid, N_MEMORY) | ||
1546 | for_each_hstate(h) | ||
1547 | pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", | ||
1548 | diff --git a/mm/mempolicy.c b/mm/mempolicy.c | ||
1549 | index 15a8ea031526..796c7e6cf93b 100644 | ||
1550 | --- a/mm/mempolicy.c | ||
1551 | +++ b/mm/mempolicy.c | ||
1552 | @@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp) | ||
1553 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's | ||
1554 | * @nodemask for filtering the zonelist. | ||
1555 | * | ||
1556 | - * Must be protected by get_mems_allowed() | ||
1557 | + * Must be protected by read_mems_allowed_begin() | ||
1558 | */ | ||
1559 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | ||
1560 | gfp_t gfp_flags, struct mempolicy **mpol, | ||
1561 | @@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | ||
1562 | |||
1563 | retry_cpuset: | ||
1564 | pol = get_vma_policy(current, vma, addr); | ||
1565 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1566 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1567 | |||
1568 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | ||
1569 | unsigned nid; | ||
1570 | @@ -2069,7 +2069,7 @@ retry_cpuset: | ||
1571 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | ||
1572 | mpol_cond_put(pol); | ||
1573 | page = alloc_page_interleave(gfp, order, nid); | ||
1574 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1575 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1576 | goto retry_cpuset; | ||
1577 | |||
1578 | return page; | ||
1579 | @@ -2079,7 +2079,7 @@ retry_cpuset: | ||
1580 | policy_nodemask(gfp, pol)); | ||
1581 | if (unlikely(mpol_needs_cond_ref(pol))) | ||
1582 | __mpol_put(pol); | ||
1583 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1584 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1585 | goto retry_cpuset; | ||
1586 | return page; | ||
1587 | } | ||
1588 | @@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | ||
1589 | pol = &default_policy; | ||
1590 | |||
1591 | retry_cpuset: | ||
1592 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1593 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1594 | |||
1595 | /* | ||
1596 | * No reference counting needed for current->mempolicy | ||
1597 | @@ -2126,7 +2126,7 @@ retry_cpuset: | ||
1598 | policy_zonelist(gfp, pol, numa_node_id()), | ||
1599 | policy_nodemask(gfp, pol)); | ||
1600 | |||
1601 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1602 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1603 | goto retry_cpuset; | ||
1604 | |||
1605 | return page; | ||
1606 | diff --git a/mm/migrate.c b/mm/migrate.c | ||
1607 | index bed48809e5d0..13f47fbe3550 100644 | ||
1608 | --- a/mm/migrate.c | ||
1609 | +++ b/mm/migrate.c | ||
1610 | @@ -148,8 +148,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | ||
1611 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | ||
1612 | if (pte_swp_soft_dirty(*ptep)) | ||
1613 | pte = pte_mksoft_dirty(pte); | ||
1614 | + | ||
1615 | + /* Recheck VMA as permissions can change since migration started */ | ||
1616 | if (is_write_migration_entry(entry)) | ||
1617 | - pte = pte_mkwrite(pte); | ||
1618 | + pte = maybe_mkwrite(pte, vma); | ||
1619 | + | ||
1620 | #ifdef CONFIG_HUGETLB_PAGE | ||
1621 | if (PageHuge(new)) { | ||
1622 | pte = pte_mkhuge(pte); | ||
1623 | diff --git a/mm/mmap.c b/mm/mmap.c | ||
1624 | index 20ff0c33274c..dfe90657a6db 100644 | ||
1625 | --- a/mm/mmap.c | ||
1626 | +++ b/mm/mmap.c | ||
1627 | @@ -10,6 +10,7 @@ | ||
1628 | #include <linux/slab.h> | ||
1629 | #include <linux/backing-dev.h> | ||
1630 | #include <linux/mm.h> | ||
1631 | +#include <linux/vmacache.h> | ||
1632 | #include <linux/shm.h> | ||
1633 | #include <linux/mman.h> | ||
1634 | #include <linux/pagemap.h> | ||
1635 | @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1636 | prev->vm_next = next = vma->vm_next; | ||
1637 | if (next) | ||
1638 | next->vm_prev = prev; | ||
1639 | - if (mm->mmap_cache == vma) | ||
1640 | - mm->mmap_cache = prev; | ||
1641 | + | ||
1642 | + /* Kill the cache */ | ||
1643 | + vmacache_invalidate(mm); | ||
1644 | } | ||
1645 | |||
1646 | /* | ||
1647 | @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); | ||
1648 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | ||
1649 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
1650 | { | ||
1651 | - struct vm_area_struct *vma = NULL; | ||
1652 | + struct rb_node *rb_node; | ||
1653 | + struct vm_area_struct *vma; | ||
1654 | |||
1655 | /* Check the cache first. */ | ||
1656 | - /* (Cache hit rate is typically around 35%.) */ | ||
1657 | - vma = ACCESS_ONCE(mm->mmap_cache); | ||
1658 | - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | ||
1659 | - struct rb_node *rb_node; | ||
1660 | + vma = vmacache_find(mm, addr); | ||
1661 | + if (likely(vma)) | ||
1662 | + return vma; | ||
1663 | |||
1664 | - rb_node = mm->mm_rb.rb_node; | ||
1665 | - vma = NULL; | ||
1666 | + rb_node = mm->mm_rb.rb_node; | ||
1667 | + vma = NULL; | ||
1668 | |||
1669 | - while (rb_node) { | ||
1670 | - struct vm_area_struct *vma_tmp; | ||
1671 | - | ||
1672 | - vma_tmp = rb_entry(rb_node, | ||
1673 | - struct vm_area_struct, vm_rb); | ||
1674 | - | ||
1675 | - if (vma_tmp->vm_end > addr) { | ||
1676 | - vma = vma_tmp; | ||
1677 | - if (vma_tmp->vm_start <= addr) | ||
1678 | - break; | ||
1679 | - rb_node = rb_node->rb_left; | ||
1680 | - } else | ||
1681 | - rb_node = rb_node->rb_right; | ||
1682 | - } | ||
1683 | - if (vma) | ||
1684 | - mm->mmap_cache = vma; | ||
1685 | + while (rb_node) { | ||
1686 | + struct vm_area_struct *tmp; | ||
1687 | + | ||
1688 | + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
1689 | + | ||
1690 | + if (tmp->vm_end > addr) { | ||
1691 | + vma = tmp; | ||
1692 | + if (tmp->vm_start <= addr) | ||
1693 | + break; | ||
1694 | + rb_node = rb_node->rb_left; | ||
1695 | + } else | ||
1696 | + rb_node = rb_node->rb_right; | ||
1697 | } | ||
1698 | + | ||
1699 | + if (vma) | ||
1700 | + vmacache_update(addr, vma); | ||
1701 | return vma; | ||
1702 | } | ||
1703 | |||
1704 | @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1705 | } else | ||
1706 | mm->highest_vm_end = prev ? prev->vm_end : 0; | ||
1707 | tail_vma->vm_next = NULL; | ||
1708 | - mm->mmap_cache = NULL; /* Kill the cache. */ | ||
1709 | + | ||
1710 | + /* Kill the cache */ | ||
1711 | + vmacache_invalidate(mm); | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | diff --git a/mm/nommu.c b/mm/nommu.c | ||
1716 | index 8740213b1647..3ee4f74fbfbe 100644 | ||
1717 | --- a/mm/nommu.c | ||
1718 | +++ b/mm/nommu.c | ||
1719 | @@ -15,6 +15,7 @@ | ||
1720 | |||
1721 | #include <linux/export.h> | ||
1722 | #include <linux/mm.h> | ||
1723 | +#include <linux/vmacache.h> | ||
1724 | #include <linux/mman.h> | ||
1725 | #include <linux/swap.h> | ||
1726 | #include <linux/file.h> | ||
1727 | @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | ||
1728 | */ | ||
1729 | static void delete_vma_from_mm(struct vm_area_struct *vma) | ||
1730 | { | ||
1731 | + int i; | ||
1732 | struct address_space *mapping; | ||
1733 | struct mm_struct *mm = vma->vm_mm; | ||
1734 | + struct task_struct *curr = current; | ||
1735 | |||
1736 | kenter("%p", vma); | ||
1737 | |||
1738 | protect_vma(vma, 0); | ||
1739 | |||
1740 | mm->map_count--; | ||
1741 | - if (mm->mmap_cache == vma) | ||
1742 | - mm->mmap_cache = NULL; | ||
1743 | + for (i = 0; i < VMACACHE_SIZE; i++) { | ||
1744 | + /* if the vma is cached, invalidate the entire cache */ | ||
1745 | + if (curr->vmacache[i] == vma) { | ||
1746 | + vmacache_invalidate(curr->mm); | ||
1747 | + break; | ||
1748 | + } | ||
1749 | + } | ||
1750 | |||
1751 | /* remove the VMA from the mapping */ | ||
1752 | if (vma->vm_file) { | ||
1753 | @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
1754 | struct vm_area_struct *vma; | ||
1755 | |||
1756 | /* check the cache first */ | ||
1757 | - vma = ACCESS_ONCE(mm->mmap_cache); | ||
1758 | - if (vma && vma->vm_start <= addr && vma->vm_end > addr) | ||
1759 | + vma = vmacache_find(mm, addr); | ||
1760 | + if (likely(vma)) | ||
1761 | return vma; | ||
1762 | |||
1763 | /* trawl the list (there may be multiple mappings in which addr | ||
1764 | @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
1765 | if (vma->vm_start > addr) | ||
1766 | return NULL; | ||
1767 | if (vma->vm_end > addr) { | ||
1768 | - mm->mmap_cache = vma; | ||
1769 | + vmacache_update(addr, vma); | ||
1770 | return vma; | ||
1771 | } | ||
1772 | } | ||
1773 | @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
1774 | unsigned long end = addr + len; | ||
1775 | |||
1776 | /* check the cache first */ | ||
1777 | - vma = mm->mmap_cache; | ||
1778 | - if (vma && vma->vm_start == addr && vma->vm_end == end) | ||
1779 | + vma = vmacache_find_exact(mm, addr, end); | ||
1780 | + if (vma) | ||
1781 | return vma; | ||
1782 | |||
1783 | /* trawl the list (there may be multiple mappings in which addr | ||
1784 | @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
1785 | if (vma->vm_start > addr) | ||
1786 | return NULL; | ||
1787 | if (vma->vm_end == end) { | ||
1788 | - mm->mmap_cache = vma; | ||
1789 | + vmacache_update(addr, vma); | ||
1790 | return vma; | ||
1791 | } | ||
1792 | } | ||
1793 | diff --git a/mm/page_alloc.c b/mm/page_alloc.c | ||
1794 | index 62e400d00e3f..ff0f6b13f32f 100644 | ||
1795 | --- a/mm/page_alloc.c | ||
1796 | +++ b/mm/page_alloc.c | ||
1797 | @@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) | ||
1798 | { | ||
1799 | int i; | ||
1800 | |||
1801 | - for_each_online_node(i) | ||
1802 | + for_each_node_state(i, N_MEMORY) | ||
1803 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | ||
1804 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1805 | else | ||
1806 | @@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1807 | return NULL; | ||
1808 | |||
1809 | retry_cpuset: | ||
1810 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1811 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1812 | |||
1813 | /* The preferred zone is used for statistics later */ | ||
1814 | first_zones_zonelist(zonelist, high_zoneidx, | ||
1815 | @@ -2791,7 +2791,7 @@ out: | ||
1816 | * the mask is being updated. If a page allocation is about to fail, | ||
1817 | * check if the cpuset changed during allocation and if so, retry. | ||
1818 | */ | ||
1819 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1820 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1821 | goto retry_cpuset; | ||
1822 | |||
1823 | memcg_kmem_commit_charge(page, memcg, order); | ||
1824 | @@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) | ||
1825 | goto out; | ||
1826 | |||
1827 | do { | ||
1828 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1829 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1830 | ret = !node_isset(nid, cpuset_current_mems_allowed); | ||
1831 | - } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
1832 | + } while (read_mems_allowed_retry(cpuset_mems_cookie)); | ||
1833 | out: | ||
1834 | return ret; | ||
1835 | } | ||
1836 | @@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | ||
1837 | |||
1838 | pgdat->node_id = nid; | ||
1839 | pgdat->node_start_pfn = node_start_pfn; | ||
1840 | - init_zone_allows_reclaim(nid); | ||
1841 | + if (node_state(nid, N_MEMORY)) | ||
1842 | + init_zone_allows_reclaim(nid); | ||
1843 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
1844 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | ||
1845 | #endif | ||
1846 | diff --git a/mm/readahead.c b/mm/readahead.c | ||
1847 | index 0de2360d65f3..1fa0d6fca556 100644 | ||
1848 | --- a/mm/readahead.c | ||
1849 | +++ b/mm/readahead.c | ||
1850 | @@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
1851 | return 0; | ||
1852 | } | ||
1853 | |||
1854 | +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) | ||
1855 | /* | ||
1856 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | ||
1857 | * sensible upper limit. | ||
1858 | */ | ||
1859 | unsigned long max_sane_readahead(unsigned long nr) | ||
1860 | { | ||
1861 | - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) | ||
1862 | - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | ||
1863 | + return min(nr, MAX_READAHEAD); | ||
1864 | } | ||
1865 | |||
1866 | /* | ||
1867 | diff --git a/mm/slab.c b/mm/slab.c | ||
1868 | index ea854eb2388c..0b1c2a58559d 100644 | ||
1869 | --- a/mm/slab.c | ||
1870 | +++ b/mm/slab.c | ||
1871 | @@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | ||
1872 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | ||
1873 | |||
1874 | retry_cpuset: | ||
1875 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1876 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1877 | zonelist = node_zonelist(slab_node(), flags); | ||
1878 | |||
1879 | retry: | ||
1880 | @@ -3180,7 +3180,7 @@ retry: | ||
1881 | } | ||
1882 | } | ||
1883 | |||
1884 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
1885 | + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) | ||
1886 | goto retry_cpuset; | ||
1887 | return obj; | ||
1888 | } | ||
1889 | diff --git a/mm/slub.c b/mm/slub.c | ||
1890 | index 25f14ad8f817..7611f148ee81 100644 | ||
1891 | --- a/mm/slub.c | ||
1892 | +++ b/mm/slub.c | ||
1893 | @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | ||
1894 | return NULL; | ||
1895 | |||
1896 | do { | ||
1897 | - cpuset_mems_cookie = get_mems_allowed(); | ||
1898 | + cpuset_mems_cookie = read_mems_allowed_begin(); | ||
1899 | zonelist = node_zonelist(slab_node(), flags); | ||
1900 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | ||
1901 | struct kmem_cache_node *n; | ||
1902 | @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | ||
1903 | object = get_partial_node(s, n, c, flags); | ||
1904 | if (object) { | ||
1905 | /* | ||
1906 | - * Return the object even if | ||
1907 | - * put_mems_allowed indicated that | ||
1908 | - * the cpuset mems_allowed was | ||
1909 | - * updated in parallel. It's a | ||
1910 | - * harmless race between the alloc | ||
1911 | - * and the cpuset update. | ||
1912 | + * Don't check read_mems_allowed_retry() | ||
1913 | + * here - if mems_allowed was updated in | ||
1914 | + * parallel, that was a harmless race | ||
1915 | + * between allocation and the cpuset | ||
1916 | + * update | ||
1917 | */ | ||
1918 | - put_mems_allowed(cpuset_mems_cookie); | ||
1919 | return object; | ||
1920 | } | ||
1921 | } | ||
1922 | } | ||
1923 | - } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
1924 | + } while (read_mems_allowed_retry(cpuset_mems_cookie)); | ||
1925 | #endif | ||
1926 | return NULL; | ||
1927 | } | ||
1928 | diff --git a/mm/swapfile.c b/mm/swapfile.c | ||
1929 | index 4a7f7e6992b6..beeeef8a1b2d 100644 | ||
1930 | --- a/mm/swapfile.c | ||
1931 | +++ b/mm/swapfile.c | ||
1932 | @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; | ||
1933 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | ||
1934 | long total_swap_pages; | ||
1935 | static int least_priority; | ||
1936 | -static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
1937 | |||
1938 | static const char Bad_file[] = "Bad swap file entry "; | ||
1939 | static const char Unused_file[] = "Unused swap file entry "; | ||
1940 | static const char Bad_offset[] = "Bad swap offset entry "; | ||
1941 | static const char Unused_offset[] = "Unused swap offset entry "; | ||
1942 | |||
1943 | -struct swap_list_t swap_list = {-1, -1}; | ||
1944 | +/* | ||
1945 | + * all active swap_info_structs | ||
1946 | + * protected with swap_lock, and ordered by priority. | ||
1947 | + */ | ||
1948 | +PLIST_HEAD(swap_active_head); | ||
1949 | + | ||
1950 | +/* | ||
1951 | + * all available (active, not full) swap_info_structs | ||
1952 | + * protected with swap_avail_lock, ordered by priority. | ||
1953 | + * This is used by get_swap_page() instead of swap_active_head | ||
1954 | + * because swap_active_head includes all swap_info_structs, | ||
1955 | + * but get_swap_page() doesn't need to look at full ones. | ||
1956 | + * This uses its own lock instead of swap_lock because when a | ||
1957 | + * swap_info_struct changes between not-full/full, it needs to | ||
1958 | + * add/remove itself to/from this list, but the swap_info_struct->lock | ||
1959 | + * is held and the locking order requires swap_lock to be taken | ||
1960 | + * before any swap_info_struct->lock. | ||
1961 | + */ | ||
1962 | +static PLIST_HEAD(swap_avail_head); | ||
1963 | +static DEFINE_SPINLOCK(swap_avail_lock); | ||
1964 | |||
1965 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | ||
1966 | |||
1967 | @@ -591,6 +609,9 @@ checks: | ||
1968 | if (si->inuse_pages == si->pages) { | ||
1969 | si->lowest_bit = si->max; | ||
1970 | si->highest_bit = 0; | ||
1971 | + spin_lock(&swap_avail_lock); | ||
1972 | + plist_del(&si->avail_list, &swap_avail_head); | ||
1973 | + spin_unlock(&swap_avail_lock); | ||
1974 | } | ||
1975 | si->swap_map[offset] = usage; | ||
1976 | inc_cluster_info_page(si, si->cluster_info, offset); | ||
1977 | @@ -640,71 +661,65 @@ no_page: | ||
1978 | |||
1979 | swp_entry_t get_swap_page(void) | ||
1980 | { | ||
1981 | - struct swap_info_struct *si; | ||
1982 | + struct swap_info_struct *si, *next; | ||
1983 | pgoff_t offset; | ||
1984 | - int type, next; | ||
1985 | - int wrapped = 0; | ||
1986 | - int hp_index; | ||
1987 | |||
1988 | - spin_lock(&swap_lock); | ||
1989 | if (atomic_long_read(&nr_swap_pages) <= 0) | ||
1990 | goto noswap; | ||
1991 | atomic_long_dec(&nr_swap_pages); | ||
1992 | |||
1993 | - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | ||
1994 | - hp_index = atomic_xchg(&highest_priority_index, -1); | ||
1995 | - /* | ||
1996 | - * highest_priority_index records current highest priority swap | ||
1997 | - * type which just frees swap entries. If its priority is | ||
1998 | - * higher than that of swap_list.next swap type, we use it. It | ||
1999 | - * isn't protected by swap_lock, so it can be an invalid value | ||
2000 | - * if the corresponding swap type is swapoff. We double check | ||
2001 | - * the flags here. It's even possible the swap type is swapoff | ||
2002 | - * and swapon again and its priority is changed. In such rare | ||
2003 | - * case, low prority swap type might be used, but eventually | ||
2004 | - * high priority swap will be used after several rounds of | ||
2005 | - * swap. | ||
2006 | - */ | ||
2007 | - if (hp_index != -1 && hp_index != type && | ||
2008 | - swap_info[type]->prio < swap_info[hp_index]->prio && | ||
2009 | - (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
2010 | - type = hp_index; | ||
2011 | - swap_list.next = type; | ||
2012 | - } | ||
2013 | - | ||
2014 | - si = swap_info[type]; | ||
2015 | - next = si->next; | ||
2016 | - if (next < 0 || | ||
2017 | - (!wrapped && si->prio != swap_info[next]->prio)) { | ||
2018 | - next = swap_list.head; | ||
2019 | - wrapped++; | ||
2020 | - } | ||
2021 | + spin_lock(&swap_avail_lock); | ||
2022 | |||
2023 | +start_over: | ||
2024 | + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
2025 | + /* requeue si to after same-priority siblings */ | ||
2026 | + plist_requeue(&si->avail_list, &swap_avail_head); | ||
2027 | + spin_unlock(&swap_avail_lock); | ||
2028 | spin_lock(&si->lock); | ||
2029 | - if (!si->highest_bit) { | ||
2030 | + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { | ||
2031 | + spin_lock(&swap_avail_lock); | ||
2032 | + if (plist_node_empty(&si->avail_list)) { | ||
2033 | + spin_unlock(&si->lock); | ||
2034 | + goto nextsi; | ||
2035 | + } | ||
2036 | + WARN(!si->highest_bit, | ||
2037 | + "swap_info %d in list but !highest_bit\n", | ||
2038 | + si->type); | ||
2039 | + WARN(!(si->flags & SWP_WRITEOK), | ||
2040 | + "swap_info %d in list but !SWP_WRITEOK\n", | ||
2041 | + si->type); | ||
2042 | + plist_del(&si->avail_list, &swap_avail_head); | ||
2043 | spin_unlock(&si->lock); | ||
2044 | - continue; | ||
2045 | + goto nextsi; | ||
2046 | } | ||
2047 | - if (!(si->flags & SWP_WRITEOK)) { | ||
2048 | - spin_unlock(&si->lock); | ||
2049 | - continue; | ||
2050 | - } | ||
2051 | - | ||
2052 | - swap_list.next = next; | ||
2053 | |||
2054 | - spin_unlock(&swap_lock); | ||
2055 | /* This is called for allocating swap entry for cache */ | ||
2056 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | ||
2057 | spin_unlock(&si->lock); | ||
2058 | if (offset) | ||
2059 | - return swp_entry(type, offset); | ||
2060 | - spin_lock(&swap_lock); | ||
2061 | - next = swap_list.next; | ||
2062 | + return swp_entry(si->type, offset); | ||
2063 | + pr_debug("scan_swap_map of si %d failed to find offset\n", | ||
2064 | + si->type); | ||
2065 | + spin_lock(&swap_avail_lock); | ||
2066 | +nextsi: | ||
2067 | + /* | ||
2068 | + * if we got here, it's likely that si was almost full before, | ||
2069 | + * and since scan_swap_map() can drop the si->lock, multiple | ||
2070 | + * callers probably all tried to get a page from the same si | ||
2071 | + * and it filled up before we could get one; or, the si filled | ||
2072 | + * up between us dropping swap_avail_lock and taking si->lock. | ||
2073 | + * Since we dropped the swap_avail_lock, the swap_avail_head | ||
2074 | + * list may have been modified; so if next is still in the | ||
2075 | + * swap_avail_head list then try it, otherwise start over. | ||
2076 | + */ | ||
2077 | + if (plist_node_empty(&next->avail_list)) | ||
2078 | + goto start_over; | ||
2079 | } | ||
2080 | |||
2081 | + spin_unlock(&swap_avail_lock); | ||
2082 | + | ||
2083 | atomic_long_inc(&nr_swap_pages); | ||
2084 | noswap: | ||
2085 | - spin_unlock(&swap_lock); | ||
2086 | return (swp_entry_t) {0}; | ||
2087 | } | ||
2088 | |||
2089 | @@ -766,27 +781,6 @@ out: | ||
2090 | return NULL; | ||
2091 | } | ||
2092 | |||
2093 | -/* | ||
2094 | - * This swap type frees swap entry, check if it is the highest priority swap | ||
2095 | - * type which just frees swap entry. get_swap_page() uses | ||
2096 | - * highest_priority_index to search highest priority swap type. The | ||
2097 | - * swap_info_struct.lock can't protect us if there are multiple swap types | ||
2098 | - * active, so we use atomic_cmpxchg. | ||
2099 | - */ | ||
2100 | -static void set_highest_priority_index(int type) | ||
2101 | -{ | ||
2102 | - int old_hp_index, new_hp_index; | ||
2103 | - | ||
2104 | - do { | ||
2105 | - old_hp_index = atomic_read(&highest_priority_index); | ||
2106 | - if (old_hp_index != -1 && | ||
2107 | - swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
2108 | - break; | ||
2109 | - new_hp_index = type; | ||
2110 | - } while (atomic_cmpxchg(&highest_priority_index, | ||
2111 | - old_hp_index, new_hp_index) != old_hp_index); | ||
2112 | -} | ||
2113 | - | ||
2114 | static unsigned char swap_entry_free(struct swap_info_struct *p, | ||
2115 | swp_entry_t entry, unsigned char usage) | ||
2116 | { | ||
2117 | @@ -828,9 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | ||
2118 | dec_cluster_info_page(p, p->cluster_info, offset); | ||
2119 | if (offset < p->lowest_bit) | ||
2120 | p->lowest_bit = offset; | ||
2121 | - if (offset > p->highest_bit) | ||
2122 | + if (offset > p->highest_bit) { | ||
2123 | + bool was_full = !p->highest_bit; | ||
2124 | p->highest_bit = offset; | ||
2125 | - set_highest_priority_index(p->type); | ||
2126 | + if (was_full && (p->flags & SWP_WRITEOK)) { | ||
2127 | + spin_lock(&swap_avail_lock); | ||
2128 | + WARN_ON(!plist_node_empty(&p->avail_list)); | ||
2129 | + if (plist_node_empty(&p->avail_list)) | ||
2130 | + plist_add(&p->avail_list, | ||
2131 | + &swap_avail_head); | ||
2132 | + spin_unlock(&swap_avail_lock); | ||
2133 | + } | ||
2134 | + } | ||
2135 | atomic_long_inc(&nr_swap_pages); | ||
2136 | p->inuse_pages--; | ||
2137 | frontswap_invalidate_page(p->type, offset); | ||
2138 | @@ -1765,30 +1768,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | ||
2139 | unsigned char *swap_map, | ||
2140 | struct swap_cluster_info *cluster_info) | ||
2141 | { | ||
2142 | - int i, prev; | ||
2143 | - | ||
2144 | if (prio >= 0) | ||
2145 | p->prio = prio; | ||
2146 | else | ||
2147 | p->prio = --least_priority; | ||
2148 | + /* | ||
2149 | + * the plist prio is negated because plist ordering is | ||
2150 | + * low-to-high, while swap ordering is high-to-low | ||
2151 | + */ | ||
2152 | + p->list.prio = -p->prio; | ||
2153 | + p->avail_list.prio = -p->prio; | ||
2154 | p->swap_map = swap_map; | ||
2155 | p->cluster_info = cluster_info; | ||
2156 | p->flags |= SWP_WRITEOK; | ||
2157 | atomic_long_add(p->pages, &nr_swap_pages); | ||
2158 | total_swap_pages += p->pages; | ||
2159 | |||
2160 | - /* insert swap space into swap_list: */ | ||
2161 | - prev = -1; | ||
2162 | - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | ||
2163 | - if (p->prio >= swap_info[i]->prio) | ||
2164 | - break; | ||
2165 | - prev = i; | ||
2166 | - } | ||
2167 | - p->next = i; | ||
2168 | - if (prev < 0) | ||
2169 | - swap_list.head = swap_list.next = p->type; | ||
2170 | - else | ||
2171 | - swap_info[prev]->next = p->type; | ||
2172 | + assert_spin_locked(&swap_lock); | ||
2173 | + /* | ||
2174 | + * both lists are plists, and thus priority ordered. | ||
2175 | + * swap_active_head needs to be priority ordered for swapoff(), | ||
2176 | + * which on removal of any swap_info_struct with an auto-assigned | ||
2177 | + * (i.e. negative) priority increments the auto-assigned priority | ||
2178 | + * of any lower-priority swap_info_structs. | ||
2179 | + * swap_avail_head needs to be priority ordered for get_swap_page(), | ||
2180 | + * which allocates swap pages from the highest available priority | ||
2181 | + * swap_info_struct. | ||
2182 | + */ | ||
2183 | + plist_add(&p->list, &swap_active_head); | ||
2184 | + spin_lock(&swap_avail_lock); | ||
2185 | + plist_add(&p->avail_list, &swap_avail_head); | ||
2186 | + spin_unlock(&swap_avail_lock); | ||
2187 | } | ||
2188 | |||
2189 | static void enable_swap_info(struct swap_info_struct *p, int prio, | ||
2190 | @@ -1823,8 +1833,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2191 | struct address_space *mapping; | ||
2192 | struct inode *inode; | ||
2193 | struct filename *pathname; | ||
2194 | - int i, type, prev; | ||
2195 | - int err; | ||
2196 | + int err, found = 0; | ||
2197 | unsigned int old_block_size; | ||
2198 | |||
2199 | if (!capable(CAP_SYS_ADMIN)) | ||
2200 | @@ -1842,17 +1851,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2201 | goto out; | ||
2202 | |||
2203 | mapping = victim->f_mapping; | ||
2204 | - prev = -1; | ||
2205 | spin_lock(&swap_lock); | ||
2206 | - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | ||
2207 | - p = swap_info[type]; | ||
2208 | + plist_for_each_entry(p, &swap_active_head, list) { | ||
2209 | if (p->flags & SWP_WRITEOK) { | ||
2210 | - if (p->swap_file->f_mapping == mapping) | ||
2211 | + if (p->swap_file->f_mapping == mapping) { | ||
2212 | + found = 1; | ||
2213 | break; | ||
2214 | + } | ||
2215 | } | ||
2216 | - prev = type; | ||
2217 | } | ||
2218 | - if (type < 0) { | ||
2219 | + if (!found) { | ||
2220 | err = -EINVAL; | ||
2221 | spin_unlock(&swap_lock); | ||
2222 | goto out_dput; | ||
2223 | @@ -1864,20 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2224 | spin_unlock(&swap_lock); | ||
2225 | goto out_dput; | ||
2226 | } | ||
2227 | - if (prev < 0) | ||
2228 | - swap_list.head = p->next; | ||
2229 | - else | ||
2230 | - swap_info[prev]->next = p->next; | ||
2231 | - if (type == swap_list.next) { | ||
2232 | - /* just pick something that's safe... */ | ||
2233 | - swap_list.next = swap_list.head; | ||
2234 | - } | ||
2235 | + spin_lock(&swap_avail_lock); | ||
2236 | + plist_del(&p->avail_list, &swap_avail_head); | ||
2237 | + spin_unlock(&swap_avail_lock); | ||
2238 | spin_lock(&p->lock); | ||
2239 | if (p->prio < 0) { | ||
2240 | - for (i = p->next; i >= 0; i = swap_info[i]->next) | ||
2241 | - swap_info[i]->prio = p->prio--; | ||
2242 | + struct swap_info_struct *si = p; | ||
2243 | + | ||
2244 | + plist_for_each_entry_continue(si, &swap_active_head, list) { | ||
2245 | + si->prio++; | ||
2246 | + si->list.prio--; | ||
2247 | + si->avail_list.prio--; | ||
2248 | + } | ||
2249 | least_priority++; | ||
2250 | } | ||
2251 | + plist_del(&p->list, &swap_active_head); | ||
2252 | atomic_long_sub(p->pages, &nr_swap_pages); | ||
2253 | total_swap_pages -= p->pages; | ||
2254 | p->flags &= ~SWP_WRITEOK; | ||
2255 | @@ -1885,7 +1894,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2256 | spin_unlock(&swap_lock); | ||
2257 | |||
2258 | set_current_oom_origin(); | ||
2259 | - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | ||
2260 | + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ | ||
2261 | clear_current_oom_origin(); | ||
2262 | |||
2263 | if (err) { | ||
2264 | @@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2265 | frontswap_map = frontswap_map_get(p); | ||
2266 | spin_unlock(&p->lock); | ||
2267 | spin_unlock(&swap_lock); | ||
2268 | - frontswap_invalidate_area(type); | ||
2269 | + frontswap_invalidate_area(p->type); | ||
2270 | frontswap_map_set(p, NULL); | ||
2271 | mutex_unlock(&swapon_mutex); | ||
2272 | free_percpu(p->percpu_cluster); | ||
2273 | @@ -1935,7 +1944,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | ||
2274 | vfree(cluster_info); | ||
2275 | vfree(frontswap_map); | ||
2276 | /* Destroy swap account information */ | ||
2277 | - swap_cgroup_swapoff(type); | ||
2278 | + swap_cgroup_swapoff(p->type); | ||
2279 | |||
2280 | inode = mapping->host; | ||
2281 | if (S_ISBLK(inode->i_mode)) { | ||
2282 | @@ -2142,8 +2151,9 @@ static struct swap_info_struct *alloc_swap_info(void) | ||
2283 | */ | ||
2284 | } | ||
2285 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
2286 | + plist_node_init(&p->list, 0); | ||
2287 | + plist_node_init(&p->avail_list, 0); | ||
2288 | p->flags = SWP_USED; | ||
2289 | - p->next = -1; | ||
2290 | spin_unlock(&swap_lock); | ||
2291 | spin_lock_init(&p->lock); | ||
2292 | |||
2293 | diff --git a/mm/vmacache.c b/mm/vmacache.c | ||
2294 | new file mode 100644 | ||
2295 | index 000000000000..1037a3bab505 | ||
2296 | --- /dev/null | ||
2297 | +++ b/mm/vmacache.c | ||
2298 | @@ -0,0 +1,114 @@ | ||
2299 | +/* | ||
2300 | + * Copyright (C) 2014 Davidlohr Bueso. | ||
2301 | + */ | ||
2302 | +#include <linux/sched.h> | ||
2303 | +#include <linux/mm.h> | ||
2304 | +#include <linux/vmacache.h> | ||
2305 | + | ||
2306 | +/* | ||
2307 | + * Flush vma caches for threads that share a given mm. | ||
2308 | + * | ||
2309 | + * The operation is safe because the caller holds the mmap_sem | ||
2310 | + * exclusively and other threads accessing the vma cache will | ||
2311 | + * have mmap_sem held at least for read, so no extra locking | ||
2312 | + * is required to maintain the vma cache. | ||
2313 | + */ | ||
2314 | +void vmacache_flush_all(struct mm_struct *mm) | ||
2315 | +{ | ||
2316 | + struct task_struct *g, *p; | ||
2317 | + | ||
2318 | + rcu_read_lock(); | ||
2319 | + for_each_process_thread(g, p) { | ||
2320 | + /* | ||
2321 | + * Only flush the vmacache pointers as the | ||
2322 | + * mm seqnum is already set and curr's will | ||
2323 | + * be set upon invalidation when the next | ||
2324 | + * lookup is done. | ||
2325 | + */ | ||
2326 | + if (mm == p->mm) | ||
2327 | + vmacache_flush(p); | ||
2328 | + } | ||
2329 | + rcu_read_unlock(); | ||
2330 | +} | ||
2331 | + | ||
2332 | +/* | ||
2333 | + * This task may be accessing a foreign mm via (for example) | ||
2334 | + * get_user_pages()->find_vma(). The vmacache is task-local and this | ||
2335 | + * task's vmacache pertains to a different mm (ie, its own). There is | ||
2336 | + * nothing we can do here. | ||
2337 | + * | ||
2338 | + * Also handle the case where a kernel thread has adopted this mm via use_mm(). | ||
2339 | + * That kernel thread's vmacache is not applicable to this mm. | ||
2340 | + */ | ||
2341 | +static bool vmacache_valid_mm(struct mm_struct *mm) | ||
2342 | +{ | ||
2343 | + return current->mm == mm && !(current->flags & PF_KTHREAD); | ||
2344 | +} | ||
2345 | + | ||
2346 | +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) | ||
2347 | +{ | ||
2348 | + if (vmacache_valid_mm(newvma->vm_mm)) | ||
2349 | + current->vmacache[VMACACHE_HASH(addr)] = newvma; | ||
2350 | +} | ||
2351 | + | ||
2352 | +static bool vmacache_valid(struct mm_struct *mm) | ||
2353 | +{ | ||
2354 | + struct task_struct *curr; | ||
2355 | + | ||
2356 | + if (!vmacache_valid_mm(mm)) | ||
2357 | + return false; | ||
2358 | + | ||
2359 | + curr = current; | ||
2360 | + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { | ||
2361 | + /* | ||
2362 | + * First attempt will always be invalid, initialize | ||
2363 | + * the new cache for this task here. | ||
2364 | + */ | ||
2365 | + curr->vmacache_seqnum = mm->vmacache_seqnum; | ||
2366 | + vmacache_flush(curr); | ||
2367 | + return false; | ||
2368 | + } | ||
2369 | + return true; | ||
2370 | +} | ||
2371 | + | ||
2372 | +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | ||
2373 | +{ | ||
2374 | + int i; | ||
2375 | + | ||
2376 | + if (!vmacache_valid(mm)) | ||
2377 | + return NULL; | ||
2378 | + | ||
2379 | + for (i = 0; i < VMACACHE_SIZE; i++) { | ||
2380 | + struct vm_area_struct *vma = current->vmacache[i]; | ||
2381 | + | ||
2382 | + if (!vma) | ||
2383 | + continue; | ||
2384 | + if (WARN_ON_ONCE(vma->vm_mm != mm)) | ||
2385 | + break; | ||
2386 | + if (vma->vm_start <= addr && vma->vm_end > addr) | ||
2387 | + return vma; | ||
2388 | + } | ||
2389 | + | ||
2390 | + return NULL; | ||
2391 | +} | ||
2392 | + | ||
2393 | +#ifndef CONFIG_MMU | ||
2394 | +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
2395 | + unsigned long start, | ||
2396 | + unsigned long end) | ||
2397 | +{ | ||
2398 | + int i; | ||
2399 | + | ||
2400 | + if (!vmacache_valid(mm)) | ||
2401 | + return NULL; | ||
2402 | + | ||
2403 | + for (i = 0; i < VMACACHE_SIZE; i++) { | ||
2404 | + struct vm_area_struct *vma = current->vmacache[i]; | ||
2405 | + | ||
2406 | + if (vma && vma->vm_start == start && vma->vm_end == end) | ||
2407 | + return vma; | ||
2408 | + } | ||
2409 | + | ||
2410 | + return NULL; | ||
2411 | +} | ||
2412 | +#endif | ||
2413 | diff --git a/mm/vmscan.c b/mm/vmscan.c | ||
2414 | index 6ef484f0777f..0c0b36e5b4f8 100644 | ||
2415 | --- a/mm/vmscan.c | ||
2416 | +++ b/mm/vmscan.c | ||
2417 | @@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | ||
2418 | unsigned long freed = 0; | ||
2419 | unsigned long long delta; | ||
2420 | long total_scan; | ||
2421 | - long max_pass; | ||
2422 | + long freeable; | ||
2423 | long nr; | ||
2424 | long new_nr; | ||
2425 | int nid = shrinkctl->nid; | ||
2426 | long batch_size = shrinker->batch ? shrinker->batch | ||
2427 | : SHRINK_BATCH; | ||
2428 | |||
2429 | - max_pass = shrinker->count_objects(shrinker, shrinkctl); | ||
2430 | - if (max_pass == 0) | ||
2431 | + freeable = shrinker->count_objects(shrinker, shrinkctl); | ||
2432 | + if (freeable == 0) | ||
2433 | return 0; | ||
2434 | |||
2435 | /* | ||
2436 | @@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | ||
2437 | |||
2438 | total_scan = nr; | ||
2439 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | ||
2440 | - delta *= max_pass; | ||
2441 | + delta *= freeable; | ||
2442 | do_div(delta, lru_pages + 1); | ||
2443 | total_scan += delta; | ||
2444 | if (total_scan < 0) { | ||
2445 | printk(KERN_ERR | ||
2446 | "shrink_slab: %pF negative objects to delete nr=%ld\n", | ||
2447 | shrinker->scan_objects, total_scan); | ||
2448 | - total_scan = max_pass; | ||
2449 | + total_scan = freeable; | ||
2450 | } | ||
2451 | |||
2452 | /* | ||
2453 | @@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | ||
2454 | * shrinkers to return -1 all the time. This results in a large | ||
2455 | * nr being built up so when a shrink that can do some work | ||
2456 | * comes along it empties the entire cache due to nr >>> | ||
2457 | - * max_pass. This is bad for sustaining a working set in | ||
2458 | + * freeable. This is bad for sustaining a working set in | ||
2459 | * memory. | ||
2460 | * | ||
2461 | * Hence only allow the shrinker to scan the entire cache when | ||
2462 | * a large delta change is calculated directly. | ||
2463 | */ | ||
2464 | - if (delta < max_pass / 4) | ||
2465 | - total_scan = min(total_scan, max_pass / 2); | ||
2466 | + if (delta < freeable / 4) | ||
2467 | + total_scan = min(total_scan, freeable / 2); | ||
2468 | |||
2469 | /* | ||
2470 | * Avoid risking looping forever due to too large nr value: | ||
2471 | * never try to free more than twice the estimate number of | ||
2472 | * freeable entries. | ||
2473 | */ | ||
2474 | - if (total_scan > max_pass * 2) | ||
2475 | - total_scan = max_pass * 2; | ||
2476 | + if (total_scan > freeable * 2) | ||
2477 | + total_scan = freeable * 2; | ||
2478 | |||
2479 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, | ||
2480 | nr_pages_scanned, lru_pages, | ||
2481 | - max_pass, delta, total_scan); | ||
2482 | + freeable, delta, total_scan); | ||
2483 | |||
2484 | /* | ||
2485 | * Normally, we should not scan less than batch_size objects in one | ||
2486 | @@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | ||
2487 | * | ||
2488 | * We detect the "tight on memory" situations by looking at the total | ||
2489 | * number of objects we want to scan (total_scan). If it is greater | ||
2490 | - * than the total number of objects on slab (max_pass), we must be | ||
2491 | + * than the total number of objects on slab (freeable), we must be | ||
2492 | * scanning at high prio and therefore should try to reclaim as much as | ||
2493 | * possible. | ||
2494 | */ | ||
2495 | while (total_scan >= batch_size || | ||
2496 | - total_scan >= max_pass) { | ||
2497 | + total_scan >= freeable) { | ||
2498 | unsigned long ret; | ||
2499 | unsigned long nr_to_scan = min(batch_size, total_scan); | ||
2500 | |||
2501 | @@ -1144,7 +1144,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
2502 | TTU_UNMAP|TTU_IGNORE_ACCESS, | ||
2503 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); | ||
2504 | list_splice(&clean_pages, page_list); | ||
2505 | - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
2506 | + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
2507 | return ret; | ||
2508 | } | ||
2509 | |||
2510 | @@ -2424,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | ||
2511 | unsigned long lru_pages = 0; | ||
2512 | |||
2513 | nodes_clear(shrink->nodes_to_scan); | ||
2514 | - for_each_zone_zonelist(zone, z, zonelist, | ||
2515 | - gfp_zone(sc->gfp_mask)) { | ||
2516 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2517 | + gfp_zone(sc->gfp_mask), sc->nodemask) { | ||
2518 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
2519 | continue; | ||
2520 |