Contents of /trunk/kernel-alx/patches-3.14/0120-3.14.21-all-fixes.patch
Revision 2506
Fri Oct 17 07:55:45 2014 UTC by niro
File size: 82171 byte(s)
-patches for 3.14
1 | diff --git a/Makefile b/Makefile |
2 | index beb7e6f0803b..41e6e19fe2e9 100644 |
3 | --- a/Makefile |
4 | +++ b/Makefile |
5 | @@ -1,6 +1,6 @@ |
6 | VERSION = 3 |
7 | PATCHLEVEL = 14 |
8 | -SUBLEVEL = 20 |
9 | +SUBLEVEL = 21 |
10 | EXTRAVERSION = |
11 | NAME = Remembering Coco |
12 | |
13 | diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h |
14 | index fb5e4c658f7a..ef470a7a3d0f 100644 |
15 | --- a/arch/unicore32/include/asm/mmu_context.h |
16 | +++ b/arch/unicore32/include/asm/mmu_context.h |
17 | @@ -14,6 +14,8 @@ |
18 | |
19 | #include <linux/compiler.h> |
20 | #include <linux/sched.h> |
21 | +#include <linux/mm.h> |
22 | +#include <linux/vmacache.h> |
23 | #include <linux/io.h> |
24 | |
25 | #include <asm/cacheflush.h> |
26 | @@ -73,7 +75,7 @@ do { \ |
27 | else \ |
28 | mm->mmap = NULL; \ |
29 | rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ |
30 | - mm->mmap_cache = NULL; \ |
31 | + vmacache_invalidate(mm); \ |
32 | mm->map_count--; \ |
33 | remove_vma(high_vma); \ |
34 | } \ |
35 | diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c |
36 | index c706d50a8b06..8c16c2f97026 100644 |
37 | --- a/drivers/block/drbd/drbd_nl.c |
38 | +++ b/drivers/block/drbd/drbd_nl.c |
39 | @@ -525,6 +525,12 @@ void conn_try_outdate_peer_async(struct drbd_tconn *tconn) |
40 | struct task_struct *opa; |
41 | |
42 | kref_get(&tconn->kref); |
43 | + /* We may just have force_sig()'ed this thread |
44 | + * to get it out of some blocking network function. |
45 | + * Clear signals; otherwise kthread_run(), which internally uses |
46 | + * wait_on_completion_killable(), will mistake our pending signal |
47 | + * for a new fatal signal and fail. */ |
48 | + flush_signals(current); |
49 | opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); |
50 | if (IS_ERR(opa)) { |
51 | conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); |
52 | diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c |
53 | index 0e27844e8c2d..8089dd2cd9d8 100644 |
54 | --- a/drivers/cpufreq/integrator-cpufreq.c |
55 | +++ b/drivers/cpufreq/integrator-cpufreq.c |
56 | @@ -213,9 +213,9 @@ static int __init integrator_cpufreq_probe(struct platform_device *pdev) |
57 | return cpufreq_register_driver(&integrator_driver); |
58 | } |
59 | |
60 | -static void __exit integrator_cpufreq_remove(struct platform_device *pdev) |
61 | +static int __exit integrator_cpufreq_remove(struct platform_device *pdev) |
62 | { |
63 | - cpufreq_unregister_driver(&integrator_driver); |
64 | + return cpufreq_unregister_driver(&integrator_driver); |
65 | } |
66 | |
67 | static const struct of_device_id integrator_cpufreq_match[] = { |
68 | diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c |
69 | index d278be110805..1855cdca39cd 100644 |
70 | --- a/drivers/gpu/drm/i915/i915_gem_gtt.c |
71 | +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c |
72 | @@ -827,6 +827,16 @@ void i915_check_and_clear_faults(struct drm_device *dev) |
73 | POSTING_READ(RING_FAULT_REG(&dev_priv->ring[RCS])); |
74 | } |
75 | |
76 | +static void i915_ggtt_flush(struct drm_i915_private *dev_priv) |
77 | +{ |
78 | + if (INTEL_INFO(dev_priv->dev)->gen < 6) { |
79 | + intel_gtt_chipset_flush(); |
80 | + } else { |
81 | + I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); |
82 | + POSTING_READ(GFX_FLSH_CNTL_GEN6); |
83 | + } |
84 | +} |
85 | + |
86 | void i915_gem_suspend_gtt_mappings(struct drm_device *dev) |
87 | { |
88 | struct drm_i915_private *dev_priv = dev->dev_private; |
89 | @@ -843,6 +853,8 @@ void i915_gem_suspend_gtt_mappings(struct drm_device *dev) |
90 | dev_priv->gtt.base.start / PAGE_SIZE, |
91 | dev_priv->gtt.base.total / PAGE_SIZE, |
92 | true); |
93 | + |
94 | + i915_ggtt_flush(dev_priv); |
95 | } |
96 | |
97 | void i915_gem_restore_gtt_mappings(struct drm_device *dev) |
98 | @@ -863,7 +875,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev) |
99 | i915_gem_gtt_bind_object(obj, obj->cache_level); |
100 | } |
101 | |
102 | - i915_gem_chipset_flush(dev); |
103 | + i915_ggtt_flush(dev_priv); |
104 | } |
105 | |
106 | int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj) |
107 | diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c |
108 | index 18cda77b4f79..4913c0690872 100644 |
109 | --- a/drivers/md/raid5.c |
110 | +++ b/drivers/md/raid5.c |
111 | @@ -64,6 +64,10 @@ |
112 | #define cpu_to_group(cpu) cpu_to_node(cpu) |
113 | #define ANY_GROUP NUMA_NO_NODE |
114 | |
115 | +static bool devices_handle_discard_safely = false; |
116 | +module_param(devices_handle_discard_safely, bool, 0644); |
117 | +MODULE_PARM_DESC(devices_handle_discard_safely, |
118 | + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); |
119 | static struct workqueue_struct *raid5_wq; |
120 | /* |
121 | * Stripe cache |
122 | @@ -6117,7 +6121,7 @@ static int run(struct mddev *mddev) |
123 | mddev->queue->limits.discard_granularity = stripe; |
124 | /* |
125 | * unaligned part of discard request will be ignored, so can't |
126 | - * guarantee discard_zerors_data |
127 | + * guarantee discard_zeroes_data |
128 | */ |
129 | mddev->queue->limits.discard_zeroes_data = 0; |
130 | |
131 | @@ -6142,6 +6146,18 @@ static int run(struct mddev *mddev) |
132 | !bdev_get_queue(rdev->bdev)-> |
133 | limits.discard_zeroes_data) |
134 | discard_supported = false; |
135 | + /* Unfortunately, discard_zeroes_data is not currently |
136 | + * a guarantee - just a hint. So we only allow DISCARD |
137 | + * if the sysadmin has confirmed that only safe devices |
138 | + * are in use by setting a module parameter. |
139 | + */ |
140 | + if (!devices_handle_discard_safely) { |
141 | + if (discard_supported) { |
142 | + pr_info("md/raid456: discard support disabled due to uncertainty.\n"); |
143 | + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); |
144 | + } |
145 | + discard_supported = false; |
146 | + } |
147 | } |
148 | |
149 | if (discard_supported && |
150 | diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c |
151 | index a127925c9d61..06faea4d60ee 100644 |
152 | --- a/drivers/media/v4l2-core/videobuf2-core.c |
153 | +++ b/drivers/media/v4l2-core/videobuf2-core.c |
154 | @@ -745,6 +745,7 @@ static int __reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) |
155 | * to the userspace. |
156 | */ |
157 | req->count = allocated_buffers; |
158 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); |
159 | |
160 | return 0; |
161 | } |
162 | @@ -793,6 +794,7 @@ static int __create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create |
163 | memset(q->plane_sizes, 0, sizeof(q->plane_sizes)); |
164 | memset(q->alloc_ctx, 0, sizeof(q->alloc_ctx)); |
165 | q->memory = create->memory; |
166 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); |
167 | } |
168 | |
169 | num_buffers = min(create->count, VIDEO_MAX_FRAME - q->num_buffers); |
170 | @@ -1447,6 +1449,7 @@ static int vb2_internal_qbuf(struct vb2_queue *q, struct v4l2_buffer *b) |
171 | * dequeued in dqbuf. |
172 | */ |
173 | list_add_tail(&vb->queued_entry, &q->queued_list); |
174 | + q->waiting_for_buffers = false; |
175 | vb->state = VB2_BUF_STATE_QUEUED; |
176 | |
177 | /* |
178 | @@ -1841,6 +1844,7 @@ static int vb2_internal_streamoff(struct vb2_queue *q, enum v4l2_buf_type type) |
179 | * and videobuf, effectively returning control over them to userspace. |
180 | */ |
181 | __vb2_queue_cancel(q); |
182 | + q->waiting_for_buffers = !V4L2_TYPE_IS_OUTPUT(q->type); |
183 | |
184 | dprintk(3, "Streamoff successful\n"); |
185 | return 0; |
186 | @@ -2150,9 +2154,16 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) |
187 | } |
188 | |
189 | /* |
190 | - * There is nothing to wait for if no buffers have already been queued. |
191 | + * There is nothing to wait for if the queue isn't streaming. |
192 | */ |
193 | - if (list_empty(&q->queued_list)) |
194 | + if (!vb2_is_streaming(q)) |
195 | + return res | POLLERR; |
196 | + /* |
197 | + * For compatibility with vb1: if QBUF hasn't been called yet, then |
198 | + * return POLLERR as well. This only affects capture queues, output |
199 | + * queues will always initialize waiting_for_buffers to false. |
200 | + */ |
201 | + if (q->waiting_for_buffers) |
202 | return res | POLLERR; |
203 | |
204 | if (list_empty(&q->done_list)) |
205 | diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h |
206 | index f15d4353f30f..5d12d69e2045 100644 |
207 | --- a/fs/cifs/cifsglob.h |
208 | +++ b/fs/cifs/cifsglob.h |
209 | @@ -399,6 +399,8 @@ struct smb_version_operations { |
210 | const struct cifs_fid *, u32 *); |
211 | int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, |
212 | int); |
213 | + /* check if we need to issue closedir */ |
214 | + bool (*dir_needs_close)(struct cifsFileInfo *); |
215 | }; |
216 | |
217 | struct smb_version_values { |
218 | diff --git a/fs/cifs/file.c b/fs/cifs/file.c |
219 | index 8175b18df819..d375322b6cec 100644 |
220 | --- a/fs/cifs/file.c |
221 | +++ b/fs/cifs/file.c |
222 | @@ -762,7 +762,7 @@ int cifs_closedir(struct inode *inode, struct file *file) |
223 | |
224 | cifs_dbg(FYI, "Freeing private data in close dir\n"); |
225 | spin_lock(&cifs_file_list_lock); |
226 | - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { |
227 | + if (server->ops->dir_needs_close(cfile)) { |
228 | cfile->invalidHandle = true; |
229 | spin_unlock(&cifs_file_list_lock); |
230 | if (server->ops->close_dir) |
231 | diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c |
232 | index 2bbf11b09214..b334a89d6a66 100644 |
233 | --- a/fs/cifs/readdir.c |
234 | +++ b/fs/cifs/readdir.c |
235 | @@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, |
236 | /* close and restart search */ |
237 | cifs_dbg(FYI, "search backing up - close and restart search\n"); |
238 | spin_lock(&cifs_file_list_lock); |
239 | - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { |
240 | + if (server->ops->dir_needs_close(cfile)) { |
241 | cfile->invalidHandle = true; |
242 | spin_unlock(&cifs_file_list_lock); |
243 | if (server->ops->close_dir) |
244 | diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c |
245 | index d1fdfa848703..e9ad8d37bb00 100644 |
246 | --- a/fs/cifs/smb1ops.c |
247 | +++ b/fs/cifs/smb1ops.c |
248 | @@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, |
249 | tmprc = CIFS_open(xid, &oparms, &oplock, NULL); |
250 | if (tmprc == -EOPNOTSUPP) |
251 | *symlink = true; |
252 | - else |
253 | + else if (tmprc == 0) |
254 | CIFSSMBClose(xid, tcon, fid.netfid); |
255 | } |
256 | |
257 | @@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock) |
258 | return oplock == OPLOCK_READ; |
259 | } |
260 | |
261 | +static bool |
262 | +cifs_dir_needs_close(struct cifsFileInfo *cfile) |
263 | +{ |
264 | + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; |
265 | +} |
266 | + |
267 | struct smb_version_operations smb1_operations = { |
268 | .send_cancel = send_nt_cancel, |
269 | .compare_fids = cifs_compare_fids, |
270 | @@ -1078,6 +1084,7 @@ struct smb_version_operations smb1_operations = { |
271 | .query_mf_symlink = cifs_query_mf_symlink, |
272 | .create_mf_symlink = cifs_create_mf_symlink, |
273 | .is_read_op = cifs_is_read_op, |
274 | + .dir_needs_close = cifs_dir_needs_close, |
275 | #ifdef CONFIG_CIFS_XATTR |
276 | .query_all_EAs = CIFSSMBQAllEAs, |
277 | .set_EA = CIFSSMBSetEA, |
278 | diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c |
279 | index e31a9dfdcd39..a491814cb2c0 100644 |
280 | --- a/fs/cifs/smb2maperror.c |
281 | +++ b/fs/cifs/smb2maperror.c |
282 | @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { |
283 | {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, |
284 | {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, |
285 | {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, |
286 | - {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, |
287 | + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, |
288 | {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, |
289 | {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, |
290 | {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, |
291 | @@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { |
292 | {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, |
293 | "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, |
294 | {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, |
295 | + {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP, |
296 | + "STATUS_REPARSE_NOT_HANDLED"}, |
297 | {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, |
298 | "STATUS_DEVICE_REQUIRES_CLEANING"}, |
299 | {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, |
300 | diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c |
301 | index f8977b2d9187..34a17d425be6 100644 |
302 | --- a/fs/cifs/smb2ops.c |
303 | +++ b/fs/cifs/smb2ops.c |
304 | @@ -1102,6 +1102,12 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch) |
305 | return le32_to_cpu(lc->lcontext.LeaseState); |
306 | } |
307 | |
308 | +static bool |
309 | +smb2_dir_needs_close(struct cifsFileInfo *cfile) |
310 | +{ |
311 | + return !cfile->invalidHandle; |
312 | +} |
313 | + |
314 | struct smb_version_operations smb20_operations = { |
315 | .compare_fids = smb2_compare_fids, |
316 | .setup_request = smb2_setup_request, |
317 | @@ -1175,6 +1181,7 @@ struct smb_version_operations smb20_operations = { |
318 | .create_lease_buf = smb2_create_lease_buf, |
319 | .parse_lease_buf = smb2_parse_lease_buf, |
320 | .clone_range = smb2_clone_range, |
321 | + .dir_needs_close = smb2_dir_needs_close, |
322 | }; |
323 | |
324 | struct smb_version_operations smb21_operations = { |
325 | @@ -1250,6 +1257,7 @@ struct smb_version_operations smb21_operations = { |
326 | .create_lease_buf = smb2_create_lease_buf, |
327 | .parse_lease_buf = smb2_parse_lease_buf, |
328 | .clone_range = smb2_clone_range, |
329 | + .dir_needs_close = smb2_dir_needs_close, |
330 | }; |
331 | |
332 | struct smb_version_operations smb30_operations = { |
333 | @@ -1328,6 +1336,7 @@ struct smb_version_operations smb30_operations = { |
334 | .parse_lease_buf = smb3_parse_lease_buf, |
335 | .clone_range = smb2_clone_range, |
336 | .validate_negotiate = smb3_validate_negotiate, |
337 | + .dir_needs_close = smb2_dir_needs_close, |
338 | }; |
339 | |
340 | struct smb_version_values smb20_values = { |
341 | diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c |
342 | index 9aab8fe0e508..348792911e1f 100644 |
343 | --- a/fs/cifs/smb2pdu.c |
344 | +++ b/fs/cifs/smb2pdu.c |
345 | @@ -2136,6 +2136,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, |
346 | rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; |
347 | |
348 | if (rc) { |
349 | + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { |
350 | + srch_inf->endOfSearch = true; |
351 | + rc = 0; |
352 | + } |
353 | cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); |
354 | goto qdir_exit; |
355 | } |
356 | @@ -2173,11 +2177,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, |
357 | else |
358 | cifs_dbg(VFS, "illegal search buffer type\n"); |
359 | |
360 | - if (rsp->hdr.Status == STATUS_NO_MORE_FILES) |
361 | - srch_inf->endOfSearch = 1; |
362 | - else |
363 | - srch_inf->endOfSearch = 0; |
364 | - |
365 | return rc; |
366 | |
367 | qdir_exit: |
368 | diff --git a/fs/exec.c b/fs/exec.c |
369 | index 31e46b1b358b..ea4449d0536a 100644 |
370 | --- a/fs/exec.c |
371 | +++ b/fs/exec.c |
372 | @@ -26,6 +26,7 @@ |
373 | #include <linux/file.h> |
374 | #include <linux/fdtable.h> |
375 | #include <linux/mm.h> |
376 | +#include <linux/vmacache.h> |
377 | #include <linux/stat.h> |
378 | #include <linux/fcntl.h> |
379 | #include <linux/swap.h> |
380 | @@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code); |
381 | static int exec_mmap(struct mm_struct *mm) |
382 | { |
383 | struct task_struct *tsk; |
384 | - struct mm_struct * old_mm, *active_mm; |
385 | + struct mm_struct *old_mm, *active_mm; |
386 | |
387 | /* Notify parent that we're no longer interested in the old VM */ |
388 | tsk = current; |
389 | @@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *mm) |
390 | tsk->mm = mm; |
391 | tsk->active_mm = mm; |
392 | activate_mm(active_mm, mm); |
393 | + tsk->mm->vmacache_seqnum = 0; |
394 | + vmacache_flush(tsk); |
395 | task_unlock(tsk); |
396 | if (old_mm) { |
397 | up_read(&old_mm->mmap_sem); |
398 | diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c |
399 | index d19b30ababf1..a4a8ed56e438 100644 |
400 | --- a/fs/hugetlbfs/inode.c |
401 | +++ b/fs/hugetlbfs/inode.c |
402 | @@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void) |
403 | int error; |
404 | int i; |
405 | |
406 | + if (!hugepages_supported()) { |
407 | + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); |
408 | + return -ENOTSUPP; |
409 | + } |
410 | + |
411 | error = bdi_init(&hugetlbfs_backing_dev_info); |
412 | if (error) |
413 | return error; |
414 | diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c |
415 | index 8f788193e3d4..c4b2646b6d7c 100644 |
416 | --- a/fs/proc/task_mmu.c |
417 | +++ b/fs/proc/task_mmu.c |
418 | @@ -1,4 +1,5 @@ |
419 | #include <linux/mm.h> |
420 | +#include <linux/vmacache.h> |
421 | #include <linux/hugetlb.h> |
422 | #include <linux/huge_mm.h> |
423 | #include <linux/mount.h> |
424 | @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) |
425 | |
426 | /* |
427 | * We remember last_addr rather than next_addr to hit with |
428 | - * mmap_cache most of the time. We have zero last_addr at |
429 | + * vmacache most of the time. We have zero last_addr at |
430 | * the beginning and also after lseek. We will have -1 last_addr |
431 | * after the end of the vmas. |
432 | */ |
433 | diff --git a/fs/udf/inode.c b/fs/udf/inode.c |
434 | index 982ce05c87ed..287cd5f23421 100644 |
435 | --- a/fs/udf/inode.c |
436 | +++ b/fs/udf/inode.c |
437 | @@ -1271,13 +1271,22 @@ update_time: |
438 | return 0; |
439 | } |
440 | |
441 | +/* |
442 | + * Maximum length of linked list formed by ICB hierarchy. The chosen number is |
443 | + * arbitrary - just that we hopefully don't limit any real use of rewritten |
444 | + * inode on write-once media but avoid looping for too long on corrupted media. |
445 | + */ |
446 | +#define UDF_MAX_ICB_NESTING 1024 |
447 | + |
448 | static void __udf_read_inode(struct inode *inode) |
449 | { |
450 | struct buffer_head *bh = NULL; |
451 | struct fileEntry *fe; |
452 | uint16_t ident; |
453 | struct udf_inode_info *iinfo = UDF_I(inode); |
454 | + unsigned int indirections = 0; |
455 | |
456 | +reread: |
457 | /* |
458 | * Set defaults, but the inode is still incomplete! |
459 | * Note: get_new_inode() sets the following on a new inode: |
460 | @@ -1314,28 +1323,26 @@ static void __udf_read_inode(struct inode *inode) |
461 | ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, |
462 | &ident); |
463 | if (ident == TAG_IDENT_IE && ibh) { |
464 | - struct buffer_head *nbh = NULL; |
465 | struct kernel_lb_addr loc; |
466 | struct indirectEntry *ie; |
467 | |
468 | ie = (struct indirectEntry *)ibh->b_data; |
469 | loc = lelb_to_cpu(ie->indirectICB.extLocation); |
470 | |
471 | - if (ie->indirectICB.extLength && |
472 | - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, |
473 | - &ident))) { |
474 | - if (ident == TAG_IDENT_FE || |
475 | - ident == TAG_IDENT_EFE) { |
476 | - memcpy(&iinfo->i_location, |
477 | - &loc, |
478 | - sizeof(struct kernel_lb_addr)); |
479 | - brelse(bh); |
480 | - brelse(ibh); |
481 | - brelse(nbh); |
482 | - __udf_read_inode(inode); |
483 | + if (ie->indirectICB.extLength) { |
484 | + brelse(bh); |
485 | + brelse(ibh); |
486 | + memcpy(&iinfo->i_location, &loc, |
487 | + sizeof(struct kernel_lb_addr)); |
488 | + if (++indirections > UDF_MAX_ICB_NESTING) { |
489 | + udf_err(inode->i_sb, |
490 | + "too many ICBs in ICB hierarchy" |
491 | + " (max %d supported)\n", |
492 | + UDF_MAX_ICB_NESTING); |
493 | + make_bad_inode(inode); |
494 | return; |
495 | } |
496 | - brelse(nbh); |
497 | + goto reread; |
498 | } |
499 | } |
500 | brelse(ibh); |
501 | diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h |
502 | index 3fe661fe96d1..b19d3dc2e651 100644 |
503 | --- a/include/linux/cpuset.h |
504 | +++ b/include/linux/cpuset.h |
505 | @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); |
506 | extern void cpuset_print_task_mems_allowed(struct task_struct *p); |
507 | |
508 | /* |
509 | - * get_mems_allowed is required when making decisions involving mems_allowed |
510 | - * such as during page allocation. mems_allowed can be updated in parallel |
511 | - * and depending on the new value an operation can fail potentially causing |
512 | - * process failure. A retry loop with get_mems_allowed and put_mems_allowed |
513 | - * prevents these artificial failures. |
514 | + * read_mems_allowed_begin is required when making decisions involving |
515 | + * mems_allowed such as during page allocation. mems_allowed can be updated in |
516 | + * parallel and depending on the new value an operation can fail potentially |
517 | + * causing process failure. A retry loop with read_mems_allowed_begin and |
518 | + * read_mems_allowed_retry prevents these artificial failures. |
519 | */ |
520 | -static inline unsigned int get_mems_allowed(void) |
521 | +static inline unsigned int read_mems_allowed_begin(void) |
522 | { |
523 | return read_seqcount_begin(¤t->mems_allowed_seq); |
524 | } |
525 | |
526 | /* |
527 | - * If this returns false, the operation that took place after get_mems_allowed |
528 | - * may have failed. It is up to the caller to retry the operation if |
529 | + * If this returns true, the operation that took place after |
530 | + * read_mems_allowed_begin may have failed artificially due to a concurrent |
531 | + * update of mems_allowed. It is up to the caller to retry the operation if |
532 | * appropriate. |
533 | */ |
534 | -static inline bool put_mems_allowed(unsigned int seq) |
535 | +static inline bool read_mems_allowed_retry(unsigned int seq) |
536 | { |
537 | - return !read_seqcount_retry(¤t->mems_allowed_seq, seq); |
538 | + return read_seqcount_retry(¤t->mems_allowed_seq, seq); |
539 | } |
540 | |
541 | static inline void set_mems_allowed(nodemask_t nodemask) |
542 | @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) |
543 | { |
544 | } |
545 | |
546 | -static inline unsigned int get_mems_allowed(void) |
547 | +static inline unsigned int read_mems_allowed_begin(void) |
548 | { |
549 | return 0; |
550 | } |
551 | |
552 | -static inline bool put_mems_allowed(unsigned int seq) |
553 | +static inline bool read_mems_allowed_retry(unsigned int seq) |
554 | { |
555 | - return true; |
556 | + return false; |
557 | } |
558 | |
559 | #endif /* !CONFIG_CPUSETS */ |
560 | diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h |
561 | index bd1e9bcec547..42b05c4c53e5 100644 |
562 | --- a/include/linux/hugetlb.h |
563 | +++ b/include/linux/hugetlb.h |
564 | @@ -400,6 +400,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, |
565 | return &mm->page_table_lock; |
566 | } |
567 | |
568 | +static inline bool hugepages_supported(void) |
569 | +{ |
570 | + /* |
571 | + * Some platform decide whether they support huge pages at boot |
572 | + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when |
573 | + * there is no such support |
574 | + */ |
575 | + return HPAGE_SHIFT != 0; |
576 | +} |
577 | + |
578 | #else /* CONFIG_HUGETLB_PAGE */ |
579 | struct hstate {}; |
580 | #define alloc_huge_page_node(h, nid) NULL |
581 | diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h |
582 | index 1f44466c1e9d..c367cbdf73ab 100644 |
583 | --- a/include/linux/jiffies.h |
584 | +++ b/include/linux/jiffies.h |
585 | @@ -258,23 +258,11 @@ extern unsigned long preset_lpj; |
586 | #define SEC_JIFFIE_SC (32 - SHIFT_HZ) |
587 | #endif |
588 | #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) |
589 | -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19) |
590 | #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ |
591 | TICK_NSEC -1) / (u64)TICK_NSEC)) |
592 | |
593 | #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ |
594 | TICK_NSEC -1) / (u64)TICK_NSEC)) |
595 | -#define USEC_CONVERSION \ |
596 | - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\ |
597 | - TICK_NSEC -1) / (u64)TICK_NSEC)) |
598 | -/* |
599 | - * USEC_ROUND is used in the timeval to jiffie conversion. See there |
600 | - * for more details. It is the scaled resolution rounding value. Note |
601 | - * that it is a 64-bit value. Since, when it is applied, we are already |
602 | - * in jiffies (albit scaled), it is nothing but the bits we will shift |
603 | - * off. |
604 | - */ |
605 | -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1) |
606 | /* |
607 | * The maximum jiffie value is (MAX_INT >> 1). Here we translate that |
608 | * into seconds. The 64-bit case will overflow if we are not careful, |
609 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h |
610 | index 290901a8c1de..2b58d192ea24 100644 |
611 | --- a/include/linux/mm_types.h |
612 | +++ b/include/linux/mm_types.h |
613 | @@ -342,9 +342,9 @@ struct mm_rss_stat { |
614 | |
615 | struct kioctx_table; |
616 | struct mm_struct { |
617 | - struct vm_area_struct * mmap; /* list of VMAs */ |
618 | + struct vm_area_struct *mmap; /* list of VMAs */ |
619 | struct rb_root mm_rb; |
620 | - struct vm_area_struct * mmap_cache; /* last find_vma result */ |
621 | + u32 vmacache_seqnum; /* per-thread vmacache */ |
622 | #ifdef CONFIG_MMU |
623 | unsigned long (*get_unmapped_area) (struct file *filp, |
624 | unsigned long addr, unsigned long len, |
625 | diff --git a/include/linux/plist.h b/include/linux/plist.h |
626 | index aa0fb390bd29..8b6c970cff6c 100644 |
627 | --- a/include/linux/plist.h |
628 | +++ b/include/linux/plist.h |
629 | @@ -98,6 +98,13 @@ struct plist_node { |
630 | } |
631 | |
632 | /** |
633 | + * PLIST_HEAD - declare and init plist_head |
634 | + * @head: name for struct plist_head variable |
635 | + */ |
636 | +#define PLIST_HEAD(head) \ |
637 | + struct plist_head head = PLIST_HEAD_INIT(head) |
638 | + |
639 | +/** |
640 | * PLIST_NODE_INIT - static struct plist_node initializer |
641 | * @node: struct plist_node variable name |
642 | * @__prio: initial node priority |
643 | @@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) |
644 | extern void plist_add(struct plist_node *node, struct plist_head *head); |
645 | extern void plist_del(struct plist_node *node, struct plist_head *head); |
646 | |
647 | +extern void plist_requeue(struct plist_node *node, struct plist_head *head); |
648 | + |
649 | /** |
650 | * plist_for_each - iterate over the plist |
651 | * @pos: the type * to use as a loop counter |
652 | @@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); |
653 | list_for_each_entry(pos, &(head)->node_list, node_list) |
654 | |
655 | /** |
656 | + * plist_for_each_continue - continue iteration over the plist |
657 | + * @pos: the type * to use as a loop cursor |
658 | + * @head: the head for your list |
659 | + * |
660 | + * Continue to iterate over plist, continuing after the current position. |
661 | + */ |
662 | +#define plist_for_each_continue(pos, head) \ |
663 | + list_for_each_entry_continue(pos, &(head)->node_list, node_list) |
664 | + |
665 | +/** |
666 | * plist_for_each_safe - iterate safely over a plist of given type |
667 | * @pos: the type * to use as a loop counter |
668 | * @n: another type * to use as temporary storage |
669 | @@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); |
670 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) |
671 | |
672 | /** |
673 | + * plist_for_each_entry_continue - continue iteration over list of given type |
674 | + * @pos: the type * to use as a loop cursor |
675 | + * @head: the head for your list |
676 | + * @m: the name of the list_struct within the struct |
677 | + * |
678 | + * Continue to iterate over list of given type, continuing after |
679 | + * the current position. |
680 | + */ |
681 | +#define plist_for_each_entry_continue(pos, head, m) \ |
682 | + list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) |
683 | + |
684 | +/** |
685 | * plist_for_each_entry_safe - iterate safely over list of given type |
686 | * @pos: the type * to use as a loop counter |
687 | * @n: another type * to use as temporary storage |
688 | @@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node) |
689 | #endif |
690 | |
691 | /** |
692 | + * plist_next - get the next entry in list |
693 | + * @pos: the type * to cursor |
694 | + */ |
695 | +#define plist_next(pos) \ |
696 | + list_next_entry(pos, node_list) |
697 | + |
698 | +/** |
699 | + * plist_prev - get the prev entry in list |
700 | + * @pos: the type * to cursor |
701 | + */ |
702 | +#define plist_prev(pos) \ |
703 | + list_prev_entry(pos, node_list) |
704 | + |
705 | +/** |
706 | * plist_first - return the first node (and thus, highest priority) |
707 | * @head: the &struct plist_head pointer |
708 | * |
709 | diff --git a/include/linux/sched.h b/include/linux/sched.h |
710 | index ccd0c6f24f2c..d7ca410ace93 100644 |
711 | --- a/include/linux/sched.h |
712 | +++ b/include/linux/sched.h |
713 | @@ -59,6 +59,10 @@ struct sched_param { |
714 | |
715 | #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ |
716 | |
717 | +#define VMACACHE_BITS 2 |
718 | +#define VMACACHE_SIZE (1U << VMACACHE_BITS) |
719 | +#define VMACACHE_MASK (VMACACHE_SIZE - 1) |
720 | + |
721 | /* |
722 | * Extended scheduling parameters data structure. |
723 | * |
724 | @@ -1228,6 +1232,9 @@ struct task_struct { |
725 | #ifdef CONFIG_COMPAT_BRK |
726 | unsigned brk_randomized:1; |
727 | #endif |
728 | + /* per-thread vma caching */ |
729 | + u32 vmacache_seqnum; |
730 | + struct vm_area_struct *vmacache[VMACACHE_SIZE]; |
731 | #if defined(SPLIT_RSS_COUNTING) |
732 | struct task_rss_stat rss_stat; |
733 | #endif |
734 | diff --git a/include/linux/swap.h b/include/linux/swap.h |
735 | index 46ba0c6c219f..789324976801 100644 |
736 | --- a/include/linux/swap.h |
737 | +++ b/include/linux/swap.h |
738 | @@ -214,8 +214,9 @@ struct percpu_cluster { |
739 | struct swap_info_struct { |
740 | unsigned long flags; /* SWP_USED etc: see above */ |
741 | signed short prio; /* swap priority of this type */ |
742 | + struct plist_node list; /* entry in swap_active_head */ |
743 | + struct plist_node avail_list; /* entry in swap_avail_head */ |
744 | signed char type; /* strange name for an index */ |
745 | - signed char next; /* next type on the swap list */ |
746 | unsigned int max; /* extent of the swap_map */ |
747 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
748 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ |
749 | @@ -255,11 +256,6 @@ struct swap_info_struct { |
750 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ |
751 | }; |
752 | |
753 | -struct swap_list_t { |
754 | - int head; /* head of priority-ordered swapfile list */ |
755 | - int next; /* swapfile to be used next */ |
756 | -}; |
757 | - |
758 | /* linux/mm/page_alloc.c */ |
759 | extern unsigned long totalram_pages; |
760 | extern unsigned long totalreserve_pages; |
761 | diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h |
762 | index e282624e8c10..388293a91e8c 100644 |
763 | --- a/include/linux/swapfile.h |
764 | +++ b/include/linux/swapfile.h |
765 | @@ -6,7 +6,7 @@ |
766 | * want to expose them to the dozens of source files that include swap.h |
767 | */ |
768 | extern spinlock_t swap_lock; |
769 | -extern struct swap_list_t swap_list; |
770 | +extern struct plist_head swap_active_head; |
771 | extern struct swap_info_struct *swap_info[]; |
772 | extern int try_to_unuse(unsigned int, bool, unsigned long); |
773 | |
774 | diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h |
775 | new file mode 100644 |
776 | index 000000000000..c3fa0fd43949 |
777 | --- /dev/null |
778 | +++ b/include/linux/vmacache.h |
779 | @@ -0,0 +1,38 @@ |
780 | +#ifndef __LINUX_VMACACHE_H |
781 | +#define __LINUX_VMACACHE_H |
782 | + |
783 | +#include <linux/sched.h> |
784 | +#include <linux/mm.h> |
785 | + |
786 | +/* |
787 | + * Hash based on the page number. Provides a good hit rate for |
788 | + * workloads with good locality and those with random accesses as well. |
789 | + */ |
790 | +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) |
791 | + |
792 | +static inline void vmacache_flush(struct task_struct *tsk) |
793 | +{ |
794 | + memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); |
795 | +} |
796 | + |
797 | +extern void vmacache_flush_all(struct mm_struct *mm); |
798 | +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); |
799 | +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, |
800 | + unsigned long addr); |
801 | + |
802 | +#ifndef CONFIG_MMU |
803 | +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, |
804 | + unsigned long start, |
805 | + unsigned long end); |
806 | +#endif |
807 | + |
808 | +static inline void vmacache_invalidate(struct mm_struct *mm) |
809 | +{ |
810 | + mm->vmacache_seqnum++; |
811 | + |
812 | + /* deal with overflows */ |
813 | + if (unlikely(mm->vmacache_seqnum == 0)) |
814 | + vmacache_flush_all(mm); |
815 | +} |
816 | + |
817 | +#endif /* __LINUX_VMACACHE_H */ |
818 | diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h |
819 | index bef53ce555d2..b10682cb138c 100644 |
820 | --- a/include/media/videobuf2-core.h |
821 | +++ b/include/media/videobuf2-core.h |
822 | @@ -329,6 +329,9 @@ struct v4l2_fh; |
823 | * @retry_start_streaming: start_streaming() was called, but there were not enough |
824 | * buffers queued. If set, then retry calling start_streaming when |
825 | * queuing a new buffer. |
826 | + * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for |
827 | + * buffers. Only set for capture queues if qbuf has not yet been |
828 | + * called since poll() needs to return POLLERR in that situation. |
829 | * @fileio: file io emulator internal data, used only if emulator is active |
830 | */ |
831 | struct vb2_queue { |
832 | @@ -362,6 +365,7 @@ struct vb2_queue { |
833 | |
834 | unsigned int streaming:1; |
835 | unsigned int retry_start_streaming:1; |
836 | + unsigned int waiting_for_buffers:1; |
837 | |
838 | struct vb2_fileio_data *fileio; |
839 | }; |
840 | diff --git a/init/Kconfig b/init/Kconfig |
841 | index 93c5ef0c5210..8b9521a2d2c1 100644 |
842 | --- a/init/Kconfig |
843 | +++ b/init/Kconfig |
844 | @@ -1389,6 +1389,7 @@ config FUTEX |
845 | |
846 | config HAVE_FUTEX_CMPXCHG |
847 | bool |
848 | + depends on FUTEX |
849 | help |
850 | Architectures should select this if futex_atomic_cmpxchg_inatomic() |
851 | is implemented and always working. This removes a couple of runtime |
852 | diff --git a/kernel/cpuset.c b/kernel/cpuset.c |
853 | index 6b27e5c0cd86..15b3ea693225 100644 |
854 | --- a/kernel/cpuset.c |
855 | +++ b/kernel/cpuset.c |
856 | @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, |
857 | task_lock(tsk); |
858 | /* |
859 | * Determine if a loop is necessary if another thread is doing |
860 | - * get_mems_allowed(). If at least one node remains unchanged and |
861 | + * read_mems_allowed_begin(). If at least one node remains unchanged and |
862 | * tsk does not have a mempolicy, then an empty nodemask will not be |
863 | * possible when mems_allowed is larger than a word. |
864 | */ |
865 | diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c |
866 | index 334b3980ffc1..8865caec45fb 100644 |
867 | --- a/kernel/debug/debug_core.c |
868 | +++ b/kernel/debug/debug_core.c |
869 | @@ -49,6 +49,7 @@ |
870 | #include <linux/pid.h> |
871 | #include <linux/smp.h> |
872 | #include <linux/mm.h> |
873 | +#include <linux/vmacache.h> |
874 | #include <linux/rcupdate.h> |
875 | |
876 | #include <asm/cacheflush.h> |
877 | @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) |
878 | if (!CACHE_FLUSH_IS_SAFE) |
879 | return; |
880 | |
881 | - if (current->mm && current->mm->mmap_cache) { |
882 | - flush_cache_range(current->mm->mmap_cache, |
883 | - addr, addr + BREAK_INSTR_SIZE); |
884 | + if (current->mm) { |
885 | + int i; |
886 | + |
887 | + for (i = 0; i < VMACACHE_SIZE; i++) { |
888 | + if (!current->vmacache[i]) |
889 | + continue; |
890 | + flush_cache_range(current->vmacache[i], |
891 | + addr, addr + BREAK_INSTR_SIZE); |
892 | + } |
893 | } |
894 | + |
895 | /* Force flush instruction cache if it was outside the mm */ |
896 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); |
897 | } |
898 | diff --git a/kernel/events/core.c b/kernel/events/core.c |
899 | index 3a140ca37777..4ced342f1ba9 100644 |
900 | --- a/kernel/events/core.c |
901 | +++ b/kernel/events/core.c |
902 | @@ -7836,8 +7836,10 @@ int perf_event_init_task(struct task_struct *child) |
903 | |
904 | for_each_task_context_nr(ctxn) { |
905 | ret = perf_event_init_context(child, ctxn); |
906 | - if (ret) |
907 | + if (ret) { |
908 | + perf_event_free_task(child); |
909 | return ret; |
910 | + } |
911 | } |
912 | |
913 | return 0; |
914 | diff --git a/kernel/fork.c b/kernel/fork.c |
915 | index c44bff8097f5..e2c685396295 100644 |
916 | --- a/kernel/fork.c |
917 | +++ b/kernel/fork.c |
918 | @@ -28,6 +28,8 @@ |
919 | #include <linux/mman.h> |
920 | #include <linux/mmu_notifier.h> |
921 | #include <linux/fs.h> |
922 | +#include <linux/mm.h> |
923 | +#include <linux/vmacache.h> |
924 | #include <linux/nsproxy.h> |
925 | #include <linux/capability.h> |
926 | #include <linux/cpu.h> |
927 | @@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
928 | |
929 | mm->locked_vm = 0; |
930 | mm->mmap = NULL; |
931 | - mm->mmap_cache = NULL; |
932 | + mm->vmacache_seqnum = 0; |
933 | mm->map_count = 0; |
934 | cpumask_clear(mm_cpumask(mm)); |
935 | mm->mm_rb = RB_ROOT; |
936 | @@ -876,6 +878,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) |
937 | if (!oldmm) |
938 | return 0; |
939 | |
940 | + /* initialize the new vmacache entries */ |
941 | + vmacache_flush(tsk); |
942 | + |
943 | if (clone_flags & CLONE_VM) { |
944 | atomic_inc(&oldmm->mm_users); |
945 | mm = oldmm; |
946 | @@ -1323,7 +1328,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, |
947 | goto bad_fork_cleanup_policy; |
948 | retval = audit_alloc(p); |
949 | if (retval) |
950 | - goto bad_fork_cleanup_policy; |
951 | + goto bad_fork_cleanup_perf; |
952 | /* copy all the process information */ |
953 | retval = copy_semundo(clone_flags, p); |
954 | if (retval) |
955 | @@ -1522,8 +1527,9 @@ bad_fork_cleanup_semundo: |
956 | exit_sem(p); |
957 | bad_fork_cleanup_audit: |
958 | audit_free(p); |
959 | -bad_fork_cleanup_policy: |
960 | +bad_fork_cleanup_perf: |
961 | perf_event_free_task(p); |
962 | +bad_fork_cleanup_policy: |
963 | #ifdef CONFIG_NUMA |
964 | mpol_put(p->mempolicy); |
965 | bad_fork_cleanup_cgroup: |
966 | diff --git a/kernel/time.c b/kernel/time.c |
967 | index 7c7964c33ae7..3c49ab45f822 100644 |
968 | --- a/kernel/time.c |
969 | +++ b/kernel/time.c |
970 | @@ -496,17 +496,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); |
971 | * that a remainder subtract here would not do the right thing as the |
972 | * resolution values don't fall on second boundries. I.e. the line: |
973 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. |
974 | + * Note that due to the small error in the multiplier here, this |
975 | + * rounding is incorrect for sufficiently large values of tv_nsec, but |
976 | + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're |
977 | + * OK. |
978 | * |
979 | * Rather, we just shift the bits off the right. |
980 | * |
981 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec |
982 | * value to a scaled second value. |
983 | */ |
984 | -unsigned long |
985 | -timespec_to_jiffies(const struct timespec *value) |
986 | +static unsigned long |
987 | +__timespec_to_jiffies(unsigned long sec, long nsec) |
988 | { |
989 | - unsigned long sec = value->tv_sec; |
990 | - long nsec = value->tv_nsec + TICK_NSEC - 1; |
991 | + nsec = nsec + TICK_NSEC - 1; |
992 | |
993 | if (sec >= MAX_SEC_IN_JIFFIES){ |
994 | sec = MAX_SEC_IN_JIFFIES; |
995 | @@ -517,6 +520,13 @@ timespec_to_jiffies(const struct timespec *value) |
996 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; |
997 | |
998 | } |
999 | + |
1000 | +unsigned long |
1001 | +timespec_to_jiffies(const struct timespec *value) |
1002 | +{ |
1003 | + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); |
1004 | +} |
1005 | + |
1006 | EXPORT_SYMBOL(timespec_to_jiffies); |
1007 | |
1008 | void |
1009 | @@ -533,31 +543,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) |
1010 | } |
1011 | EXPORT_SYMBOL(jiffies_to_timespec); |
1012 | |
1013 | -/* Same for "timeval" |
1014 | +/* |
1015 | + * We could use a similar algorithm to timespec_to_jiffies (with a |
1016 | + * different multiplier for usec instead of nsec). But this has a |
1017 | + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the |
1018 | + * usec value, since it's not necessarily integral. |
1019 | * |
1020 | - * Well, almost. The problem here is that the real system resolution is |
1021 | - * in nanoseconds and the value being converted is in micro seconds. |
1022 | - * Also for some machines (those that use HZ = 1024, in-particular), |
1023 | - * there is a LARGE error in the tick size in microseconds. |
1024 | - |
1025 | - * The solution we use is to do the rounding AFTER we convert the |
1026 | - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. |
1027 | - * Instruction wise, this should cost only an additional add with carry |
1028 | - * instruction above the way it was done above. |
1029 | + * We could instead round in the intermediate scaled representation |
1030 | + * (i.e. in units of 1/2^(large scale) jiffies) but that's also |
1031 | + * perilous: the scaling introduces a small positive error, which |
1032 | + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 |
1033 | + * units to the intermediate before shifting) leads to accidental |
1034 | + * overflow and overestimates. |
1035 | + * |
1036 | + * At the cost of one additional multiplication by a constant, just |
1037 | + * use the timespec implementation. |
1038 | */ |
1039 | unsigned long |
1040 | timeval_to_jiffies(const struct timeval *value) |
1041 | { |
1042 | - unsigned long sec = value->tv_sec; |
1043 | - long usec = value->tv_usec; |
1044 | - |
1045 | - if (sec >= MAX_SEC_IN_JIFFIES){ |
1046 | - sec = MAX_SEC_IN_JIFFIES; |
1047 | - usec = 0; |
1048 | - } |
1049 | - return (((u64)sec * SEC_CONVERSION) + |
1050 | - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> |
1051 | - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; |
1052 | + return __timespec_to_jiffies(value->tv_sec, |
1053 | + value->tv_usec * NSEC_PER_USEC); |
1054 | } |
1055 | EXPORT_SYMBOL(timeval_to_jiffies); |
1056 | |
1057 | diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c |
1058 | index 773aba836e81..774a0807fe81 100644 |
1059 | --- a/kernel/trace/ring_buffer.c |
1060 | +++ b/kernel/trace/ring_buffer.c |
1061 | @@ -3372,7 +3372,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) |
1062 | iter->head = cpu_buffer->reader_page->read; |
1063 | |
1064 | iter->cache_reader_page = iter->head_page; |
1065 | - iter->cache_read = iter->head; |
1066 | + iter->cache_read = cpu_buffer->read; |
1067 | |
1068 | if (iter->head) |
1069 | iter->read_stamp = cpu_buffer->read_stamp; |
1070 | diff --git a/lib/plist.c b/lib/plist.c |
1071 | index 1ebc95f7a46f..0f2084d30798 100644 |
1072 | --- a/lib/plist.c |
1073 | +++ b/lib/plist.c |
1074 | @@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) |
1075 | plist_check_head(head); |
1076 | } |
1077 | |
1078 | +/** |
1079 | + * plist_requeue - Requeue @node at end of same-prio entries. |
1080 | + * |
1081 | + * This is essentially an optimized plist_del() followed by |
1082 | + * plist_add(). It moves an entry already in the plist to |
1083 | + * after any other same-priority entries. |
1084 | + * |
1085 | + * @node: &struct plist_node pointer - entry to be moved |
1086 | + * @head: &struct plist_head pointer - list head |
1087 | + */ |
1088 | +void plist_requeue(struct plist_node *node, struct plist_head *head) |
1089 | +{ |
1090 | + struct plist_node *iter; |
1091 | + struct list_head *node_next = &head->node_list; |
1092 | + |
1093 | + plist_check_head(head); |
1094 | + BUG_ON(plist_head_empty(head)); |
1095 | + BUG_ON(plist_node_empty(node)); |
1096 | + |
1097 | + if (node == plist_last(head)) |
1098 | + return; |
1099 | + |
1100 | + iter = plist_next(node); |
1101 | + |
1102 | + if (node->prio != iter->prio) |
1103 | + return; |
1104 | + |
1105 | + plist_del(node, head); |
1106 | + |
1107 | + plist_for_each_continue(iter, head) { |
1108 | + if (node->prio != iter->prio) { |
1109 | + node_next = &iter->node_list; |
1110 | + break; |
1111 | + } |
1112 | + } |
1113 | + list_add_tail(&node->node_list, node_next); |
1114 | + |
1115 | + plist_check_head(head); |
1116 | +} |
1117 | + |
1118 | #ifdef CONFIG_DEBUG_PI_LIST |
1119 | #include <linux/sched.h> |
1120 | #include <linux/module.h> |
1121 | @@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect) |
1122 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); |
1123 | } |
1124 | |
1125 | +static void __init plist_test_requeue(struct plist_node *node) |
1126 | +{ |
1127 | + plist_requeue(node, &test_head); |
1128 | + |
1129 | + if (node != plist_last(&test_head)) |
1130 | + BUG_ON(node->prio == plist_next(node)->prio); |
1131 | +} |
1132 | + |
1133 | static int __init plist_test(void) |
1134 | { |
1135 | int nr_expect = 0, i, loop; |
1136 | @@ -193,6 +241,10 @@ static int __init plist_test(void) |
1137 | nr_expect--; |
1138 | } |
1139 | plist_test_check(nr_expect); |
1140 | + if (!plist_node_empty(test_node + i)) { |
1141 | + plist_test_requeue(test_node + i); |
1142 | + plist_test_check(nr_expect); |
1143 | + } |
1144 | } |
1145 | |
1146 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { |
1147 | diff --git a/mm/Makefile b/mm/Makefile |
1148 | index 310c90a09264..c561f1f6bca0 100644 |
1149 | --- a/mm/Makefile |
1150 | +++ b/mm/Makefile |
1151 | @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
1152 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
1153 | util.o mmzone.o vmstat.o backing-dev.o \ |
1154 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
1155 | - compaction.o balloon_compaction.o \ |
1156 | + compaction.o balloon_compaction.o vmacache.o \ |
1157 | interval_tree.o list_lru.o $(mmu-y) |
1158 | |
1159 | obj-y += init-mm.o |
1160 | diff --git a/mm/compaction.c b/mm/compaction.c |
1161 | index 5f702ef0a65f..5e38e5706f62 100644 |
1162 | --- a/mm/compaction.c |
1163 | +++ b/mm/compaction.c |
1164 | @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, |
1165 | /* Returns true if the page is within a block suitable for migration to */ |
1166 | static bool suitable_migration_target(struct page *page) |
1167 | { |
1168 | - int migratetype = get_pageblock_migratetype(page); |
1169 | - |
1170 | - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
1171 | - if (migratetype == MIGRATE_RESERVE) |
1172 | - return false; |
1173 | - |
1174 | - if (is_migrate_isolate(migratetype)) |
1175 | - return false; |
1176 | - |
1177 | - /* If the page is a large free page, then allow migration */ |
1178 | + /* If the page is a large free page, then disallow migration */ |
1179 | if (PageBuddy(page) && page_order(page) >= pageblock_order) |
1180 | - return true; |
1181 | + return false; |
1182 | |
1183 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
1184 | - if (migrate_async_suitable(migratetype)) |
1185 | + if (migrate_async_suitable(get_pageblock_migratetype(page))) |
1186 | return true; |
1187 | |
1188 | /* Otherwise skip the block */ |
1189 | @@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, |
1190 | struct page *cursor, *valid_page = NULL; |
1191 | unsigned long flags; |
1192 | bool locked = false; |
1193 | + bool checked_pageblock = false; |
1194 | |
1195 | cursor = pfn_to_page(blockpfn); |
1196 | |
1197 | @@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, |
1198 | break; |
1199 | |
1200 | /* Recheck this is a suitable migration target under lock */ |
1201 | - if (!strict && !suitable_migration_target(page)) |
1202 | - break; |
1203 | + if (!strict && !checked_pageblock) { |
1204 | + /* |
1205 | + * We need to check suitability of pageblock only once |
1206 | + * and this isolate_freepages_block() is called with |
1207 | + * pageblock range, so just check once is sufficient. |
1208 | + */ |
1209 | + checked_pageblock = true; |
1210 | + if (!suitable_migration_target(page)) |
1211 | + break; |
1212 | + } |
1213 | |
1214 | /* Recheck this is a buddy page under lock */ |
1215 | if (!PageBuddy(page)) |
1216 | @@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1217 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
1218 | unsigned long nr_scanned = 0, nr_isolated = 0; |
1219 | struct list_head *migratelist = &cc->migratepages; |
1220 | - isolate_mode_t mode = 0; |
1221 | struct lruvec *lruvec; |
1222 | unsigned long flags; |
1223 | bool locked = false; |
1224 | struct page *page = NULL, *valid_page = NULL; |
1225 | bool skipped_async_unsuitable = false; |
1226 | + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | |
1227 | + (unevictable ? ISOLATE_UNEVICTABLE : 0); |
1228 | |
1229 | /* |
1230 | * Ensure that there are not too many pages isolated from the LRU |
1231 | @@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1232 | cond_resched(); |
1233 | for (; low_pfn < end_pfn; low_pfn++) { |
1234 | /* give a chance to irqs before checking need_resched() */ |
1235 | - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
1236 | + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
1237 | if (should_release_lock(&zone->lru_lock)) { |
1238 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
1239 | locked = false; |
1240 | @@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1241 | |
1242 | /* If isolation recently failed, do not retry */ |
1243 | pageblock_nr = low_pfn >> pageblock_order; |
1244 | - if (!isolation_suitable(cc, page)) |
1245 | - goto next_pageblock; |
1246 | + if (last_pageblock_nr != pageblock_nr) { |
1247 | + int mt; |
1248 | + |
1249 | + last_pageblock_nr = pageblock_nr; |
1250 | + if (!isolation_suitable(cc, page)) |
1251 | + goto next_pageblock; |
1252 | + |
1253 | + /* |
1254 | + * For async migration, also only scan in MOVABLE |
1255 | + * blocks. Async migration is optimistic to see if |
1256 | + * the minimum amount of work satisfies the allocation |
1257 | + */ |
1258 | + mt = get_pageblock_migratetype(page); |
1259 | + if (!cc->sync && !migrate_async_suitable(mt)) { |
1260 | + cc->finished_update_migrate = true; |
1261 | + skipped_async_unsuitable = true; |
1262 | + goto next_pageblock; |
1263 | + } |
1264 | + } |
1265 | |
1266 | /* |
1267 | * Skip if free. page_order cannot be used without zone->lock |
1268 | @@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1269 | continue; |
1270 | |
1271 | /* |
1272 | - * For async migration, also only scan in MOVABLE blocks. Async |
1273 | - * migration is optimistic to see if the minimum amount of work |
1274 | - * satisfies the allocation |
1275 | - */ |
1276 | - if (!cc->sync && last_pageblock_nr != pageblock_nr && |
1277 | - !migrate_async_suitable(get_pageblock_migratetype(page))) { |
1278 | - cc->finished_update_migrate = true; |
1279 | - skipped_async_unsuitable = true; |
1280 | - goto next_pageblock; |
1281 | - } |
1282 | - |
1283 | - /* |
1284 | * Check may be lockless but that's ok as we recheck later. |
1285 | * It's possible to migrate LRU pages and balloon pages |
1286 | * Skip any other type of page |
1287 | @@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1288 | if (unlikely(balloon_page_movable(page))) { |
1289 | if (locked && balloon_page_isolate(page)) { |
1290 | /* Successfully isolated */ |
1291 | - cc->finished_update_migrate = true; |
1292 | - list_add(&page->lru, migratelist); |
1293 | - cc->nr_migratepages++; |
1294 | - nr_isolated++; |
1295 | - goto check_compact_cluster; |
1296 | + goto isolate_success; |
1297 | } |
1298 | } |
1299 | continue; |
1300 | @@ -584,6 +586,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1301 | continue; |
1302 | } |
1303 | |
1304 | + /* |
1305 | + * Migration will fail if an anonymous page is pinned in memory, |
1306 | + * so avoid taking lru_lock and isolating it unnecessarily in an |
1307 | + * admittedly racy check. |
1308 | + */ |
1309 | + if (!page_mapping(page) && |
1310 | + page_count(page) > page_mapcount(page)) |
1311 | + continue; |
1312 | + |
1313 | /* Check if it is ok to still hold the lock */ |
1314 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, |
1315 | locked, cc); |
1316 | @@ -598,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1317 | continue; |
1318 | } |
1319 | |
1320 | - if (!cc->sync) |
1321 | - mode |= ISOLATE_ASYNC_MIGRATE; |
1322 | - |
1323 | - if (unevictable) |
1324 | - mode |= ISOLATE_UNEVICTABLE; |
1325 | - |
1326 | lruvec = mem_cgroup_page_lruvec(page, zone); |
1327 | |
1328 | /* Try isolate the page */ |
1329 | @@ -613,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
1330 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
1331 | |
1332 | /* Successfully isolated */ |
1333 | - cc->finished_update_migrate = true; |
1334 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
1335 | + |
1336 | +isolate_success: |
1337 | + cc->finished_update_migrate = true; |
1338 | list_add(&page->lru, migratelist); |
1339 | cc->nr_migratepages++; |
1340 | nr_isolated++; |
1341 | |
1342 | -check_compact_cluster: |
1343 | /* Avoid isolating too much */ |
1344 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
1345 | ++low_pfn; |
1346 | @@ -630,7 +636,6 @@ check_compact_cluster: |
1347 | |
1348 | next_pageblock: |
1349 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
1350 | - last_pageblock_nr = pageblock_nr; |
1351 | } |
1352 | |
1353 | acct_isolated(zone, locked, cc); |
1354 | @@ -1188,6 +1193,7 @@ static void compact_node(int nid) |
1355 | struct compact_control cc = { |
1356 | .order = -1, |
1357 | .sync = true, |
1358 | + .ignore_skip_hint = true, |
1359 | }; |
1360 | |
1361 | __compact_pgdat(NODE_DATA(nid), &cc); |
1362 | diff --git a/mm/filemap.c b/mm/filemap.c |
1363 | index 7a13f6ac5421..c2cc7c95eff1 100644 |
1364 | --- a/mm/filemap.c |
1365 | +++ b/mm/filemap.c |
1366 | @@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping) |
1367 | { |
1368 | int ret = 0; |
1369 | /* Check for outstanding write errors */ |
1370 | - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) |
1371 | + if (test_bit(AS_ENOSPC, &mapping->flags) && |
1372 | + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) |
1373 | ret = -ENOSPC; |
1374 | - if (test_and_clear_bit(AS_EIO, &mapping->flags)) |
1375 | + if (test_bit(AS_EIO, &mapping->flags) && |
1376 | + test_and_clear_bit(AS_EIO, &mapping->flags)) |
1377 | ret = -EIO; |
1378 | return ret; |
1379 | } |
1380 | @@ -520,10 +522,10 @@ struct page *__page_cache_alloc(gfp_t gfp) |
1381 | if (cpuset_do_page_mem_spread()) { |
1382 | unsigned int cpuset_mems_cookie; |
1383 | do { |
1384 | - cpuset_mems_cookie = get_mems_allowed(); |
1385 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1386 | n = cpuset_mem_spread_node(); |
1387 | page = alloc_pages_exact_node(n, gfp, 0); |
1388 | - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); |
1389 | + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); |
1390 | |
1391 | return page; |
1392 | } |
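
The filemap_check_errors() change above is a cacheline optimization: test_and_clear_bit() is an atomic read-modify-write that takes mapping->flags' cacheline exclusive even when the bit is already clear, so probing with a plain read first keeps the common error-free path read-only. A self-contained sketch of the same double-check shape, using C11 atomics in place of the kernel's bitops:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define AS_EIO    0
#define AS_ENOSPC 1

/* Plain read, analogous to test_bit(): never dirties the line. */
static bool bit_set(atomic_ulong *flags, int bit)
{
        return atomic_load_explicit(flags, memory_order_relaxed) & (1UL << bit);
}

/* Atomic RMW, analogous to test_and_clear_bit(). */
static bool test_and_clear(atomic_ulong *flags, int bit)
{
        return atomic_fetch_and(flags, ~(1UL << bit)) & (1UL << bit);
}

/* Same shape as the patched filemap_check_errors(): RMW only if needed. */
static int check_errors(atomic_ulong *flags)
{
        int ret = 0;

        if (bit_set(flags, AS_ENOSPC) && test_and_clear(flags, AS_ENOSPC))
                ret = -ENOSPC;
        if (bit_set(flags, AS_EIO) && test_and_clear(flags, AS_EIO))
                ret = -EIO;
        return ret;
}

int main(void)
{
        atomic_ulong flags = 1UL << AS_EIO;

        printf("first check:  %d\n", check_errors(&flags));  /* -EIO */
        printf("second check: %d\n", check_errors(&flags));  /* 0, no RMW */
        return 0;
}

The atomic RMW still decides the return value, so two racing callers cannot both report the same error; the plain read only gates whether the RMW is attempted at all.
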
1393 | diff --git a/mm/frontswap.c b/mm/frontswap.c |
1394 | index 1b24bdcb3197..c30eec536f03 100644 |
1395 | --- a/mm/frontswap.c |
1396 | +++ b/mm/frontswap.c |
1397 | @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); |
1398 | |
1399 | static unsigned long __frontswap_curr_pages(void) |
1400 | { |
1401 | - int type; |
1402 | unsigned long totalpages = 0; |
1403 | struct swap_info_struct *si = NULL; |
1404 | |
1405 | assert_spin_locked(&swap_lock); |
1406 | - for (type = swap_list.head; type >= 0; type = si->next) { |
1407 | - si = swap_info[type]; |
1408 | + plist_for_each_entry(si, &swap_active_head, list) |
1409 | totalpages += atomic_read(&si->frontswap_pages); |
1410 | - } |
1411 | return totalpages; |
1412 | } |
1413 | |
1414 | @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, |
1415 | int si_frontswap_pages; |
1416 | unsigned long total_pages_to_unuse = total; |
1417 | unsigned long pages = 0, pages_to_unuse = 0; |
1418 | - int type; |
1419 | |
1420 | assert_spin_locked(&swap_lock); |
1421 | - for (type = swap_list.head; type >= 0; type = si->next) { |
1422 | - si = swap_info[type]; |
1423 | + plist_for_each_entry(si, &swap_active_head, list) { |
1424 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
1425 | if (total_pages_to_unuse < si_frontswap_pages) { |
1426 | pages = pages_to_unuse = total_pages_to_unuse; |
1427 | @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, |
1428 | } |
1429 | vm_unacct_memory(pages); |
1430 | *unused = pages_to_unuse; |
1431 | - *swapid = type; |
1432 | + *swapid = si->type; |
1433 | ret = 0; |
1434 | break; |
1435 | } |
1436 | @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) |
1437 | /* |
1438 | * we don't want to hold swap_lock while doing a very |
1439 | * lengthy try_to_unuse, but swap_list may change |
1440 | - * so restart scan from swap_list.head each time |
1441 | + * so restart scan from swap_active_head each time |
1442 | */ |
1443 | spin_lock(&swap_lock); |
1444 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
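
The frontswap loops above stop chasing integer si->next indexes through swap_info[] and instead iterate typed entries on swap_active_head with plist_for_each_entry(). Under the hood that style of iteration is container_of() arithmetic on a node embedded in each entry; a minimal singly linked sketch of the idea (the real plist is doubly linked and priority-sorted):

#include <stddef.h>
#include <stdio.h>

/* Minimal intrusive node; the kernel's plist node plays this role. */
struct node { struct node *next; };

struct swap_info {
        int type;
        long frontswap_pages;
        struct node list;       /* links this entry into the active list */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Walk every entry, as plist_for_each_entry() does on swap_active_head. */
static long total_frontswap_pages(struct node *head)
{
        long total = 0;

        for (struct node *n = head; n; n = n->next) {
                struct swap_info *si = container_of(n, struct swap_info, list);
                total += si->frontswap_pages;
        }
        return total;
}

int main(void)
{
        struct swap_info b = { .type = 1, .frontswap_pages = 7, .list = { NULL } };
        struct swap_info a = { .type = 0, .frontswap_pages = 5, .list = { &b.list } };

        printf("total = %ld\n", total_frontswap_pages(&a.list));   /* 12 */
        return 0;
}
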
1445 | diff --git a/mm/huge_memory.c b/mm/huge_memory.c |
1446 | index 1c42d0c36d0b..718bfa16a36f 100644 |
1447 | --- a/mm/huge_memory.c |
1448 | +++ b/mm/huge_memory.c |
1449 | @@ -1819,21 +1819,24 @@ static int __split_huge_page_map(struct page *page, |
1450 | if (pmd) { |
1451 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1452 | pmd_populate(mm, &_pmd, pgtable); |
1453 | + if (pmd_write(*pmd)) |
1454 | + BUG_ON(page_mapcount(page) != 1); |
1455 | |
1456 | haddr = address; |
1457 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1458 | pte_t *pte, entry; |
1459 | BUG_ON(PageCompound(page+i)); |
1460 | + /* |
1461 | + * Note that pmd_numa is not transferred deliberately |
1462 | + * to avoid any possibility that pte_numa leaks to |
1463 | + * a PROT_NONE VMA by accident. |
1464 | + */ |
1465 | entry = mk_pte(page + i, vma->vm_page_prot); |
1466 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1467 | if (!pmd_write(*pmd)) |
1468 | entry = pte_wrprotect(entry); |
1469 | - else |
1470 | - BUG_ON(page_mapcount(page) != 1); |
1471 | if (!pmd_young(*pmd)) |
1472 | entry = pte_mkold(entry); |
1473 | - if (pmd_numa(*pmd)) |
1474 | - entry = pte_mknuma(entry); |
1475 | pte = pte_offset_map(&_pmd, haddr); |
1476 | BUG_ON(!pte_none(*pte)); |
1477 | set_pte_at(mm, haddr, pte, entry); |
1478 | diff --git a/mm/hugetlb.c b/mm/hugetlb.c |
1479 | index 923f38e62bcf..67d0c175efcf 100644 |
1480 | --- a/mm/hugetlb.c |
1481 | +++ b/mm/hugetlb.c |
1482 | @@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, |
1483 | goto err; |
1484 | |
1485 | retry_cpuset: |
1486 | - cpuset_mems_cookie = get_mems_allowed(); |
1487 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1488 | zonelist = huge_zonelist(vma, address, |
1489 | htlb_alloc_mask(h), &mpol, &nodemask); |
1490 | |
1491 | @@ -562,7 +562,7 @@ retry_cpuset: |
1492 | } |
1493 | |
1494 | mpol_cond_put(mpol); |
1495 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1496 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
1497 | goto retry_cpuset; |
1498 | return page; |
1499 | |
1500 | @@ -2071,6 +2071,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1501 | unsigned long tmp; |
1502 | int ret; |
1503 | |
1504 | + if (!hugepages_supported()) |
1505 | + return -ENOTSUPP; |
1506 | + |
1507 | tmp = h->max_huge_pages; |
1508 | |
1509 | if (write && h->order >= MAX_ORDER) |
1510 | @@ -2124,6 +2127,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
1511 | unsigned long tmp; |
1512 | int ret; |
1513 | |
1514 | + if (!hugepages_supported()) |
1515 | + return -ENOTSUPP; |
1516 | + |
1517 | tmp = h->nr_overcommit_huge_pages; |
1518 | |
1519 | if (write && h->order >= MAX_ORDER) |
1520 | @@ -2149,6 +2155,8 @@ out: |
1521 | void hugetlb_report_meminfo(struct seq_file *m) |
1522 | { |
1523 | struct hstate *h = &default_hstate; |
1524 | + if (!hugepages_supported()) |
1525 | + return; |
1526 | seq_printf(m, |
1527 | "HugePages_Total: %5lu\n" |
1528 | "HugePages_Free: %5lu\n" |
1529 | @@ -2165,6 +2173,8 @@ void hugetlb_report_meminfo(struct seq_file *m) |
1530 | int hugetlb_report_node_meminfo(int nid, char *buf) |
1531 | { |
1532 | struct hstate *h = &default_hstate; |
1533 | + if (!hugepages_supported()) |
1534 | + return 0; |
1535 | return sprintf(buf, |
1536 | "Node %d HugePages_Total: %5u\n" |
1537 | "Node %d HugePages_Free: %5u\n" |
1538 | @@ -2179,6 +2189,9 @@ void hugetlb_show_meminfo(void) |
1539 | struct hstate *h; |
1540 | int nid; |
1541 | |
1542 | + if (!hugepages_supported()) |
1543 | + return; |
1544 | + |
1545 | for_each_node_state(nid, N_MEMORY) |
1546 | for_each_hstate(h) |
1547 | pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", |
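
All the hugetlb entry points above gain the same gate: when the architecture reports no huge page support, the sysctl handlers fail with ENOTSUPP and the meminfo reporters print nothing, rather than advertising zero-sized hugepage state. A sketch of the gate, with a stubbed hugepages_supported() standing in for the architecture-specific check:

#include <stdbool.h>
#include <stdio.h>

#define ENOTSUPP 524    /* kernel-internal errno used by the patch */

/* Stub: the real hugepages_supported() asks the architecture. */
static bool hugepages_supported(void)
{
        return false;
}

/* Every handler gets the same early bail-out. */
static int hugetlb_sysctl_handler(unsigned long *out)
{
        if (!hugepages_supported())
                return -ENOTSUPP;
        *out = 0;       /* ... the real handler work would go here ... */
        return 0;
}

int main(void)
{
        unsigned long v;

        printf("handler -> %d\n", hugetlb_sysctl_handler(&v));  /* -524 */
        return 0;
}
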
1548 | diff --git a/mm/mempolicy.c b/mm/mempolicy.c |
1549 | index 15a8ea031526..796c7e6cf93b 100644 |
1550 | --- a/mm/mempolicy.c |
1551 | +++ b/mm/mempolicy.c |
1552 | @@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp) |
1553 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
1554 | * @nodemask for filtering the zonelist. |
1555 | * |
1556 | - * Must be protected by get_mems_allowed() |
1557 | + * Must be protected by read_mems_allowed_begin() |
1558 | */ |
1559 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1560 | gfp_t gfp_flags, struct mempolicy **mpol, |
1561 | @@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1562 | |
1563 | retry_cpuset: |
1564 | pol = get_vma_policy(current, vma, addr); |
1565 | - cpuset_mems_cookie = get_mems_allowed(); |
1566 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1567 | |
1568 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1569 | unsigned nid; |
1570 | @@ -2069,7 +2069,7 @@ retry_cpuset: |
1571 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
1572 | mpol_cond_put(pol); |
1573 | page = alloc_page_interleave(gfp, order, nid); |
1574 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1575 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
1576 | goto retry_cpuset; |
1577 | |
1578 | return page; |
1579 | @@ -2079,7 +2079,7 @@ retry_cpuset: |
1580 | policy_nodemask(gfp, pol)); |
1581 | if (unlikely(mpol_needs_cond_ref(pol))) |
1582 | __mpol_put(pol); |
1583 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1584 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
1585 | goto retry_cpuset; |
1586 | return page; |
1587 | } |
1588 | @@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1589 | pol = &default_policy; |
1590 | |
1591 | retry_cpuset: |
1592 | - cpuset_mems_cookie = get_mems_allowed(); |
1593 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1594 | |
1595 | /* |
1596 | * No reference counting needed for current->mempolicy |
1597 | @@ -2126,7 +2126,7 @@ retry_cpuset: |
1598 | policy_zonelist(gfp, pol, numa_node_id()), |
1599 | policy_nodemask(gfp, pol)); |
1600 | |
1601 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1602 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
1603 | goto retry_cpuset; |
1604 | |
1605 | return page; |
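
The get_mems_allowed()/put_mems_allowed() pair used throughout these hunks becomes read_mems_allowed_begin()/read_mems_allowed_retry(): a seqcount read section, where begin snapshots a sequence number and retry reports whether a cpuset writer changed mems_allowed in between. Note the reordered condition, !page && read_mems_allowed_retry(...): the sequence check is only consulted after a failed allocation, so the successful fast path pays nothing. A single-threaded sketch of the read-side loop, with a toy allocator in place of the page allocator:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint mems_seq;    /* bumped by cpuset writers */
static int mems_allowed = 1;    /* toy stand-in for the nodemask */

/* Analogous to read_mems_allowed_begin(): snapshot the sequence. */
static unsigned mems_begin(void)
{
        return atomic_load_explicit(&mems_seq, memory_order_acquire);
}

/* Analogous to read_mems_allowed_retry(): did a writer intervene? */
static int mems_retry(unsigned cookie)
{
        return atomic_load_explicit(&mems_seq, memory_order_acquire) != cookie;
}

static int try_alloc(void)
{
        return mems_allowed ? 42 : 0;   /* 0 plays the role of NULL */
}

int main(void)
{
        unsigned cookie;
        int page;

        do {
                cookie = mems_begin();
                page = try_alloc();
                /* Consult the cookie only when the allocation failed. */
        } while (!page && mems_retry(cookie));

        printf("allocated: %d\n", page);
        return 0;
}

A successful allocation is kept even if the mask changed concurrently; that race is harmless, which is exactly the point the mm/slub.c comment later in this patch spells out.
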
1606 | diff --git a/mm/migrate.c b/mm/migrate.c |
1607 | index bed48809e5d0..13f47fbe3550 100644 |
1608 | --- a/mm/migrate.c |
1609 | +++ b/mm/migrate.c |
1610 | @@ -148,8 +148,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, |
1611 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
1612 | if (pte_swp_soft_dirty(*ptep)) |
1613 | pte = pte_mksoft_dirty(pte); |
1614 | + |
1615 | + /* Recheck VMA as permissions can change since migration started */ |
1616 | if (is_write_migration_entry(entry)) |
1617 | - pte = pte_mkwrite(pte); |
1618 | + pte = maybe_mkwrite(pte, vma); |
1619 | + |
1620 | #ifdef CONFIG_HUGETLB_PAGE |
1621 | if (PageHuge(new)) { |
1622 | pte = pte_mkhuge(pte); |
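
remove_migration_pte() above used to restore the write bit unconditionally from the migration entry; the fix routes it through maybe_mkwrite(), which re-checks the VMA, since an mprotect() may have revoked VM_WRITE while the page was in flight. The helper reduces to a couple of lines; a sketch under toy flag and pte definitions:

#include <stdio.h>

#define VM_WRITE  0x2UL
#define PTE_WRITE 0x1UL

typedef unsigned long pte_t;

/* Sketch of maybe_mkwrite(): grant write only if the VMA still allows it. */
static pte_t maybe_mkwrite(pte_t pte, unsigned long vm_flags)
{
        if (vm_flags & VM_WRITE)
                pte |= PTE_WRITE;
        return pte;
}

int main(void)
{
        /* VMA was mprotect()ed read-only while the page migrated. */
        pte_t pte = maybe_mkwrite(0, 0);
        printf("write bit: %lu\n", pte & PTE_WRITE);        /* 0: stays clean */

        pte = maybe_mkwrite(0, VM_WRITE);
        printf("write bit: %lu\n", pte & PTE_WRITE);        /* 1 */
        return 0;
}
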
1623 | diff --git a/mm/mmap.c b/mm/mmap.c |
1624 | index 20ff0c33274c..dfe90657a6db 100644 |
1625 | --- a/mm/mmap.c |
1626 | +++ b/mm/mmap.c |
1627 | @@ -10,6 +10,7 @@ |
1628 | #include <linux/slab.h> |
1629 | #include <linux/backing-dev.h> |
1630 | #include <linux/mm.h> |
1631 | +#include <linux/vmacache.h> |
1632 | #include <linux/shm.h> |
1633 | #include <linux/mman.h> |
1634 | #include <linux/pagemap.h> |
1635 | @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
1636 | prev->vm_next = next = vma->vm_next; |
1637 | if (next) |
1638 | next->vm_prev = prev; |
1639 | - if (mm->mmap_cache == vma) |
1640 | - mm->mmap_cache = prev; |
1641 | + |
1642 | + /* Kill the cache */ |
1643 | + vmacache_invalidate(mm); |
1644 | } |
1645 | |
1646 | /* |
1647 | @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); |
1648 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1649 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1650 | { |
1651 | - struct vm_area_struct *vma = NULL; |
1652 | + struct rb_node *rb_node; |
1653 | + struct vm_area_struct *vma; |
1654 | |
1655 | /* Check the cache first. */ |
1656 | - /* (Cache hit rate is typically around 35%.) */ |
1657 | - vma = ACCESS_ONCE(mm->mmap_cache); |
1658 | - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { |
1659 | - struct rb_node *rb_node; |
1660 | + vma = vmacache_find(mm, addr); |
1661 | + if (likely(vma)) |
1662 | + return vma; |
1663 | |
1664 | - rb_node = mm->mm_rb.rb_node; |
1665 | - vma = NULL; |
1666 | + rb_node = mm->mm_rb.rb_node; |
1667 | + vma = NULL; |
1668 | |
1669 | - while (rb_node) { |
1670 | - struct vm_area_struct *vma_tmp; |
1671 | - |
1672 | - vma_tmp = rb_entry(rb_node, |
1673 | - struct vm_area_struct, vm_rb); |
1674 | - |
1675 | - if (vma_tmp->vm_end > addr) { |
1676 | - vma = vma_tmp; |
1677 | - if (vma_tmp->vm_start <= addr) |
1678 | - break; |
1679 | - rb_node = rb_node->rb_left; |
1680 | - } else |
1681 | - rb_node = rb_node->rb_right; |
1682 | - } |
1683 | - if (vma) |
1684 | - mm->mmap_cache = vma; |
1685 | + while (rb_node) { |
1686 | + struct vm_area_struct *tmp; |
1687 | + |
1688 | + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
1689 | + |
1690 | + if (tmp->vm_end > addr) { |
1691 | + vma = tmp; |
1692 | + if (tmp->vm_start <= addr) |
1693 | + break; |
1694 | + rb_node = rb_node->rb_left; |
1695 | + } else |
1696 | + rb_node = rb_node->rb_right; |
1697 | } |
1698 | + |
1699 | + if (vma) |
1700 | + vmacache_update(addr, vma); |
1701 | return vma; |
1702 | } |
1703 | |
1704 | @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, |
1705 | } else |
1706 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
1707 | tail_vma->vm_next = NULL; |
1708 | - mm->mmap_cache = NULL; /* Kill the cache. */ |
1709 | + |
1710 | + /* Kill the cache */ |
1711 | + vmacache_invalidate(mm); |
1712 | } |
1713 | |
1714 | /* |
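
find_vma() above now tries the per-task vmacache first and only then walks the rbtree for the first VMA with vm_end > addr, so an address falling in a hole returns the next VMA above it. A self-contained sketch of that descent over a plain binary search tree keyed by start address; the kernel's version is the same walk expressed with rb_node/rb_entry:

#include <stdio.h>

/* Toy VMA: [start, end), arranged in a BST ordered by start address. */
struct vma {
        unsigned long start, end;
        struct vma *left, *right;
};

/* First VMA with end > addr, mirroring find_vma()'s rbtree walk. */
static struct vma *find_vma(struct vma *root, unsigned long addr)
{
        struct vma *found = NULL;

        while (root) {
                if (root->end > addr) {
                        found = root;           /* candidate */
                        if (root->start <= addr)
                                break;          /* addr inside it: exact hit */
                        root = root->left;      /* a lower VMA may still fit */
                } else {
                        root = root->right;     /* entirely below addr */
                }
        }
        return found;
}

int main(void)
{
        struct vma lo  = { 0x1000, 0x2000, NULL, NULL };
        struct vma hi  = { 0x8000, 0x9000, NULL, NULL };
        struct vma mid = { 0x4000, 0x5000, &lo, &hi };

        struct vma *v = find_vma(&mid, 0x4800);
        printf("hit:  [%lx,%lx)\n", v->start, v->end);      /* [4000,5000) */

        v = find_vma(&mid, 0x3000);     /* in a hole: first VMA above */
        printf("next: [%lx,%lx)\n", v->start, v->end);      /* [4000,5000) */
        return 0;
}
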
1715 | diff --git a/mm/nommu.c b/mm/nommu.c |
1716 | index 8740213b1647..3ee4f74fbfbe 100644 |
1717 | --- a/mm/nommu.c |
1718 | +++ b/mm/nommu.c |
1719 | @@ -15,6 +15,7 @@ |
1720 | |
1721 | #include <linux/export.h> |
1722 | #include <linux/mm.h> |
1723 | +#include <linux/vmacache.h> |
1724 | #include <linux/mman.h> |
1725 | #include <linux/swap.h> |
1726 | #include <linux/file.h> |
1727 | @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) |
1728 | */ |
1729 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
1730 | { |
1731 | + int i; |
1732 | struct address_space *mapping; |
1733 | struct mm_struct *mm = vma->vm_mm; |
1734 | + struct task_struct *curr = current; |
1735 | |
1736 | kenter("%p", vma); |
1737 | |
1738 | protect_vma(vma, 0); |
1739 | |
1740 | mm->map_count--; |
1741 | - if (mm->mmap_cache == vma) |
1742 | - mm->mmap_cache = NULL; |
1743 | + for (i = 0; i < VMACACHE_SIZE; i++) { |
1744 | + /* if the vma is cached, invalidate the entire cache */ |
1745 | + if (curr->vmacache[i] == vma) { |
1746 | + vmacache_invalidate(curr->mm); |
1747 | + break; |
1748 | + } |
1749 | + } |
1750 | |
1751 | /* remove the VMA from the mapping */ |
1752 | if (vma->vm_file) { |
1753 | @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1754 | struct vm_area_struct *vma; |
1755 | |
1756 | /* check the cache first */ |
1757 | - vma = ACCESS_ONCE(mm->mmap_cache); |
1758 | - if (vma && vma->vm_start <= addr && vma->vm_end > addr) |
1759 | + vma = vmacache_find(mm, addr); |
1760 | + if (likely(vma)) |
1761 | return vma; |
1762 | |
1763 | /* trawl the list (there may be multiple mappings in which addr |
1764 | @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1765 | if (vma->vm_start > addr) |
1766 | return NULL; |
1767 | if (vma->vm_end > addr) { |
1768 | - mm->mmap_cache = vma; |
1769 | + vmacache_update(addr, vma); |
1770 | return vma; |
1771 | } |
1772 | } |
1773 | @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, |
1774 | unsigned long end = addr + len; |
1775 | |
1776 | /* check the cache first */ |
1777 | - vma = mm->mmap_cache; |
1778 | - if (vma && vma->vm_start == addr && vma->vm_end == end) |
1779 | + vma = vmacache_find_exact(mm, addr, end); |
1780 | + if (vma) |
1781 | return vma; |
1782 | |
1783 | /* trawl the list (there may be multiple mappings in which addr |
1784 | @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, |
1785 | if (vma->vm_start > addr) |
1786 | return NULL; |
1787 | if (vma->vm_end == end) { |
1788 | - mm->mmap_cache = vma; |
1789 | + vmacache_update(addr, vma); |
1790 | return vma; |
1791 | } |
1792 | } |
1793 | diff --git a/mm/page_alloc.c b/mm/page_alloc.c |
1794 | index 62e400d00e3f..ff0f6b13f32f 100644 |
1795 | --- a/mm/page_alloc.c |
1796 | +++ b/mm/page_alloc.c |
1797 | @@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) |
1798 | { |
1799 | int i; |
1800 | |
1801 | - for_each_online_node(i) |
1802 | + for_each_node_state(i, N_MEMORY) |
1803 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) |
1804 | node_set(i, NODE_DATA(nid)->reclaim_nodes); |
1805 | else |
1806 | @@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
1807 | return NULL; |
1808 | |
1809 | retry_cpuset: |
1810 | - cpuset_mems_cookie = get_mems_allowed(); |
1811 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1812 | |
1813 | /* The preferred zone is used for statistics later */ |
1814 | first_zones_zonelist(zonelist, high_zoneidx, |
1815 | @@ -2791,7 +2791,7 @@ out: |
1816 | * the mask is being updated. If a page allocation is about to fail, |
1817 | * check if the cpuset changed during allocation and if so, retry. |
1818 | */ |
1819 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1820 | + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
1821 | goto retry_cpuset; |
1822 | |
1823 | memcg_kmem_commit_charge(page, memcg, order); |
1824 | @@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) |
1825 | goto out; |
1826 | |
1827 | do { |
1828 | - cpuset_mems_cookie = get_mems_allowed(); |
1829 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1830 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
1831 | - } while (!put_mems_allowed(cpuset_mems_cookie)); |
1832 | + } while (read_mems_allowed_retry(cpuset_mems_cookie)); |
1833 | out: |
1834 | return ret; |
1835 | } |
1836 | @@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
1837 | |
1838 | pgdat->node_id = nid; |
1839 | pgdat->node_start_pfn = node_start_pfn; |
1840 | - init_zone_allows_reclaim(nid); |
1841 | + if (node_state(nid, N_MEMORY)) |
1842 | + init_zone_allows_reclaim(nid); |
1843 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
1844 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
1845 | #endif |
1846 | diff --git a/mm/readahead.c b/mm/readahead.c |
1847 | index 0de2360d65f3..1fa0d6fca556 100644 |
1848 | --- a/mm/readahead.c |
1849 | +++ b/mm/readahead.c |
1850 | @@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, |
1851 | return 0; |
1852 | } |
1853 | |
1854 | +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) |
1855 | /* |
1856 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
1857 | * sensible upper limit. |
1858 | */ |
1859 | unsigned long max_sane_readahead(unsigned long nr) |
1860 | { |
1861 | - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) |
1862 | - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
1863 | + return min(nr, MAX_READAHEAD); |
1864 | } |
1865 | |
1866 | /* |
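
The old max_sane_readahead() cap above depended on the local node's free plus inactive-file pages, which collapses to zero on a memoryless NUMA node and so made readahead fail there. The replacement is a fixed cap of (512 * 4096) / PAGE_CACHE_SIZE pages, i.e. 2 MiB of readahead regardless of page size, or 512 pages with 4 KiB pages. The arithmetic, spelled out:

#include <stdio.h>

int main(void)
{
        unsigned long page_cache_size = 4096;   /* typical PAGE_CACHE_SIZE */
        unsigned long max_readahead = (512 * 4096) / page_cache_size;

        /* 2 MiB worth of readahead, independent of the page size. */
        printf("cap: %lu pages (%lu KiB)\n",
               max_readahead, max_readahead * page_cache_size / 1024);

        /* max_sane_readahead(): clamp a request to the cap. */
        unsigned long nr = 10000;
        unsigned long sane = nr < max_readahead ? nr : max_readahead;
        printf("request %lu -> %lu pages\n", nr, sane);     /* -> 512 */
        return 0;
}
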
1867 | diff --git a/mm/slab.c b/mm/slab.c |
1868 | index ea854eb2388c..0b1c2a58559d 100644 |
1869 | --- a/mm/slab.c |
1870 | +++ b/mm/slab.c |
1871 | @@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) |
1872 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
1873 | |
1874 | retry_cpuset: |
1875 | - cpuset_mems_cookie = get_mems_allowed(); |
1876 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1877 | zonelist = node_zonelist(slab_node(), flags); |
1878 | |
1879 | retry: |
1880 | @@ -3180,7 +3180,7 @@ retry: |
1881 | } |
1882 | } |
1883 | |
1884 | - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) |
1885 | + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) |
1886 | goto retry_cpuset; |
1887 | return obj; |
1888 | } |
1889 | diff --git a/mm/slub.c b/mm/slub.c |
1890 | index 25f14ad8f817..7611f148ee81 100644 |
1891 | --- a/mm/slub.c |
1892 | +++ b/mm/slub.c |
1893 | @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1894 | return NULL; |
1895 | |
1896 | do { |
1897 | - cpuset_mems_cookie = get_mems_allowed(); |
1898 | + cpuset_mems_cookie = read_mems_allowed_begin(); |
1899 | zonelist = node_zonelist(slab_node(), flags); |
1900 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1901 | struct kmem_cache_node *n; |
1902 | @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, |
1903 | object = get_partial_node(s, n, c, flags); |
1904 | if (object) { |
1905 | /* |
1906 | - * Return the object even if |
1907 | - * put_mems_allowed indicated that |
1908 | - * the cpuset mems_allowed was |
1909 | - * updated in parallel. It's a |
1910 | - * harmless race between the alloc |
1911 | - * and the cpuset update. |
1912 | + * Don't check read_mems_allowed_retry() |
1913 | + * here - if mems_allowed was updated in |
1914 | + * parallel, that was a harmless race |
1915 | + * between allocation and the cpuset |
1916 | + * update |
1917 | */ |
1918 | - put_mems_allowed(cpuset_mems_cookie); |
1919 | return object; |
1920 | } |
1921 | } |
1922 | } |
1923 | - } while (!put_mems_allowed(cpuset_mems_cookie)); |
1924 | + } while (read_mems_allowed_retry(cpuset_mems_cookie)); |
1925 | #endif |
1926 | return NULL; |
1927 | } |
1928 | diff --git a/mm/swapfile.c b/mm/swapfile.c |
1929 | index 4a7f7e6992b6..beeeef8a1b2d 100644 |
1930 | --- a/mm/swapfile.c |
1931 | +++ b/mm/swapfile.c |
1932 | @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; |
1933 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
1934 | long total_swap_pages; |
1935 | static int least_priority; |
1936 | -static atomic_t highest_priority_index = ATOMIC_INIT(-1); |
1937 | |
1938 | static const char Bad_file[] = "Bad swap file entry "; |
1939 | static const char Unused_file[] = "Unused swap file entry "; |
1940 | static const char Bad_offset[] = "Bad swap offset entry "; |
1941 | static const char Unused_offset[] = "Unused swap offset entry "; |
1942 | |
1943 | -struct swap_list_t swap_list = {-1, -1}; |
1944 | +/* |
1945 | + * all active swap_info_structs |
1946 | + * protected with swap_lock, and ordered by priority. |
1947 | + */ |
1948 | +PLIST_HEAD(swap_active_head); |
1949 | + |
1950 | +/* |
1951 | + * all available (active, not full) swap_info_structs |
1952 | + * protected with swap_avail_lock, ordered by priority. |
1953 | + * This is used by get_swap_page() instead of swap_active_head |
1954 | + * because swap_active_head includes all swap_info_structs, |
1955 | + * but get_swap_page() doesn't need to look at full ones. |
1956 | + * This uses its own lock instead of swap_lock because when a |
1957 | + * swap_info_struct changes between not-full/full, it needs to |
1958 | + * add/remove itself to/from this list, but the swap_info_struct->lock |
1959 | + * is held and the locking order requires swap_lock to be taken |
1960 | + * before any swap_info_struct->lock. |
1961 | + */ |
1962 | +static PLIST_HEAD(swap_avail_head); |
1963 | +static DEFINE_SPINLOCK(swap_avail_lock); |
1964 | |
1965 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
1966 | |
1967 | @@ -591,6 +609,9 @@ checks: |
1968 | if (si->inuse_pages == si->pages) { |
1969 | si->lowest_bit = si->max; |
1970 | si->highest_bit = 0; |
1971 | + spin_lock(&swap_avail_lock); |
1972 | + plist_del(&si->avail_list, &swap_avail_head); |
1973 | + spin_unlock(&swap_avail_lock); |
1974 | } |
1975 | si->swap_map[offset] = usage; |
1976 | inc_cluster_info_page(si, si->cluster_info, offset); |
1977 | @@ -640,71 +661,65 @@ no_page: |
1978 | |
1979 | swp_entry_t get_swap_page(void) |
1980 | { |
1981 | - struct swap_info_struct *si; |
1982 | + struct swap_info_struct *si, *next; |
1983 | pgoff_t offset; |
1984 | - int type, next; |
1985 | - int wrapped = 0; |
1986 | - int hp_index; |
1987 | |
1988 | - spin_lock(&swap_lock); |
1989 | if (atomic_long_read(&nr_swap_pages) <= 0) |
1990 | goto noswap; |
1991 | atomic_long_dec(&nr_swap_pages); |
1992 | |
1993 | - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
1994 | - hp_index = atomic_xchg(&highest_priority_index, -1); |
1995 | - /* |
1996 | - * highest_priority_index records current highest priority swap |
1997 | - * type which just frees swap entries. If its priority is |
1998 | - * higher than that of swap_list.next swap type, we use it. It |
1999 | - * isn't protected by swap_lock, so it can be an invalid value |
2000 | - * if the corresponding swap type is swapoff. We double check |
2001 | - * the flags here. It's even possible the swap type is swapoff |
2002 | - * and swapon again and its priority is changed. In such rare |
2003 | - * case, low prority swap type might be used, but eventually |
2004 | - * high priority swap will be used after several rounds of |
2005 | - * swap. |
2006 | - */ |
2007 | - if (hp_index != -1 && hp_index != type && |
2008 | - swap_info[type]->prio < swap_info[hp_index]->prio && |
2009 | - (swap_info[hp_index]->flags & SWP_WRITEOK)) { |
2010 | - type = hp_index; |
2011 | - swap_list.next = type; |
2012 | - } |
2013 | - |
2014 | - si = swap_info[type]; |
2015 | - next = si->next; |
2016 | - if (next < 0 || |
2017 | - (!wrapped && si->prio != swap_info[next]->prio)) { |
2018 | - next = swap_list.head; |
2019 | - wrapped++; |
2020 | - } |
2021 | + spin_lock(&swap_avail_lock); |
2022 | |
2023 | +start_over: |
2024 | + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { |
2025 | + /* requeue si to after same-priority siblings */ |
2026 | + plist_requeue(&si->avail_list, &swap_avail_head); |
2027 | + spin_unlock(&swap_avail_lock); |
2028 | spin_lock(&si->lock); |
2029 | - if (!si->highest_bit) { |
2030 | + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
2031 | + spin_lock(&swap_avail_lock); |
2032 | + if (plist_node_empty(&si->avail_list)) { |
2033 | + spin_unlock(&si->lock); |
2034 | + goto nextsi; |
2035 | + } |
2036 | + WARN(!si->highest_bit, |
2037 | + "swap_info %d in list but !highest_bit\n", |
2038 | + si->type); |
2039 | + WARN(!(si->flags & SWP_WRITEOK), |
2040 | + "swap_info %d in list but !SWP_WRITEOK\n", |
2041 | + si->type); |
2042 | + plist_del(&si->avail_list, &swap_avail_head); |
2043 | spin_unlock(&si->lock); |
2044 | - continue; |
2045 | + goto nextsi; |
2046 | } |
2047 | - if (!(si->flags & SWP_WRITEOK)) { |
2048 | - spin_unlock(&si->lock); |
2049 | - continue; |
2050 | - } |
2051 | - |
2052 | - swap_list.next = next; |
2053 | |
2054 | - spin_unlock(&swap_lock); |
2055 | /* This is called for allocating swap entry for cache */ |
2056 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
2057 | spin_unlock(&si->lock); |
2058 | if (offset) |
2059 | - return swp_entry(type, offset); |
2060 | - spin_lock(&swap_lock); |
2061 | - next = swap_list.next; |
2062 | + return swp_entry(si->type, offset); |
2063 | + pr_debug("scan_swap_map of si %d failed to find offset\n", |
2064 | + si->type); |
2065 | + spin_lock(&swap_avail_lock); |
2066 | +nextsi: |
2067 | + /* |
2068 | + * if we got here, it's likely that si was almost full before, |
2069 | + * and since scan_swap_map() can drop the si->lock, multiple |
2070 | + * callers probably all tried to get a page from the same si |
2071 | + * and it filled up before we could get one; or, the si filled |
2072 | + * up between us dropping swap_avail_lock and taking si->lock. |
2073 | + * Since we dropped the swap_avail_lock, the swap_avail_head |
2074 | + * list may have been modified; so if next is still in the |
2075 | + * swap_avail_head list then try it, otherwise start over. |
2076 | + */ |
2077 | + if (plist_node_empty(&next->avail_list)) |
2078 | + goto start_over; |
2079 | } |
2080 | |
2081 | + spin_unlock(&swap_avail_lock); |
2082 | + |
2083 | atomic_long_inc(&nr_swap_pages); |
2084 | noswap: |
2085 | - spin_unlock(&swap_lock); |
2086 | return (swp_entry_t) {0}; |
2087 | } |
2088 | |
2089 | @@ -766,27 +781,6 @@ out: |
2090 | return NULL; |
2091 | } |
2092 | |
2093 | -/* |
2094 | - * This swap type frees swap entry, check if it is the highest priority swap |
2095 | - * type which just frees swap entry. get_swap_page() uses |
2096 | - * highest_priority_index to search highest priority swap type. The |
2097 | - * swap_info_struct.lock can't protect us if there are multiple swap types |
2098 | - * active, so we use atomic_cmpxchg. |
2099 | - */ |
2100 | -static void set_highest_priority_index(int type) |
2101 | -{ |
2102 | - int old_hp_index, new_hp_index; |
2103 | - |
2104 | - do { |
2105 | - old_hp_index = atomic_read(&highest_priority_index); |
2106 | - if (old_hp_index != -1 && |
2107 | - swap_info[old_hp_index]->prio >= swap_info[type]->prio) |
2108 | - break; |
2109 | - new_hp_index = type; |
2110 | - } while (atomic_cmpxchg(&highest_priority_index, |
2111 | - old_hp_index, new_hp_index) != old_hp_index); |
2112 | -} |
2113 | - |
2114 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
2115 | swp_entry_t entry, unsigned char usage) |
2116 | { |
2117 | @@ -828,9 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, |
2118 | dec_cluster_info_page(p, p->cluster_info, offset); |
2119 | if (offset < p->lowest_bit) |
2120 | p->lowest_bit = offset; |
2121 | - if (offset > p->highest_bit) |
2122 | + if (offset > p->highest_bit) { |
2123 | + bool was_full = !p->highest_bit; |
2124 | p->highest_bit = offset; |
2125 | - set_highest_priority_index(p->type); |
2126 | + if (was_full && (p->flags & SWP_WRITEOK)) { |
2127 | + spin_lock(&swap_avail_lock); |
2128 | + WARN_ON(!plist_node_empty(&p->avail_list)); |
2129 | + if (plist_node_empty(&p->avail_list)) |
2130 | + plist_add(&p->avail_list, |
2131 | + &swap_avail_head); |
2132 | + spin_unlock(&swap_avail_lock); |
2133 | + } |
2134 | + } |
2135 | atomic_long_inc(&nr_swap_pages); |
2136 | p->inuse_pages--; |
2137 | frontswap_invalidate_page(p->type, offset); |
2138 | @@ -1765,30 +1768,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, |
2139 | unsigned char *swap_map, |
2140 | struct swap_cluster_info *cluster_info) |
2141 | { |
2142 | - int i, prev; |
2143 | - |
2144 | if (prio >= 0) |
2145 | p->prio = prio; |
2146 | else |
2147 | p->prio = --least_priority; |
2148 | + /* |
2149 | + * the plist prio is negated because plist ordering is |
2150 | + * low-to-high, while swap ordering is high-to-low |
2151 | + */ |
2152 | + p->list.prio = -p->prio; |
2153 | + p->avail_list.prio = -p->prio; |
2154 | p->swap_map = swap_map; |
2155 | p->cluster_info = cluster_info; |
2156 | p->flags |= SWP_WRITEOK; |
2157 | atomic_long_add(p->pages, &nr_swap_pages); |
2158 | total_swap_pages += p->pages; |
2159 | |
2160 | - /* insert swap space into swap_list: */ |
2161 | - prev = -1; |
2162 | - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
2163 | - if (p->prio >= swap_info[i]->prio) |
2164 | - break; |
2165 | - prev = i; |
2166 | - } |
2167 | - p->next = i; |
2168 | - if (prev < 0) |
2169 | - swap_list.head = swap_list.next = p->type; |
2170 | - else |
2171 | - swap_info[prev]->next = p->type; |
2172 | + assert_spin_locked(&swap_lock); |
2173 | + /* |
2174 | + * both lists are plists, and thus priority ordered. |
2175 | + * swap_active_head needs to be priority ordered for swapoff(), |
2176 | + * which on removal of any swap_info_struct with an auto-assigned |
2177 | + * (i.e. negative) priority increments the auto-assigned priority |
2178 | + * of any lower-priority swap_info_structs. |
2179 | + * swap_avail_head needs to be priority ordered for get_swap_page(), |
2180 | + * which allocates swap pages from the highest available priority |
2181 | + * swap_info_struct. |
2182 | + */ |
2183 | + plist_add(&p->list, &swap_active_head); |
2184 | + spin_lock(&swap_avail_lock); |
2185 | + plist_add(&p->avail_list, &swap_avail_head); |
2186 | + spin_unlock(&swap_avail_lock); |
2187 | } |
2188 | |
2189 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
2190 | @@ -1823,8 +1833,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2191 | struct address_space *mapping; |
2192 | struct inode *inode; |
2193 | struct filename *pathname; |
2194 | - int i, type, prev; |
2195 | - int err; |
2196 | + int err, found = 0; |
2197 | unsigned int old_block_size; |
2198 | |
2199 | if (!capable(CAP_SYS_ADMIN)) |
2200 | @@ -1842,17 +1851,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2201 | goto out; |
2202 | |
2203 | mapping = victim->f_mapping; |
2204 | - prev = -1; |
2205 | spin_lock(&swap_lock); |
2206 | - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
2207 | - p = swap_info[type]; |
2208 | + plist_for_each_entry(p, &swap_active_head, list) { |
2209 | if (p->flags & SWP_WRITEOK) { |
2210 | - if (p->swap_file->f_mapping == mapping) |
2211 | + if (p->swap_file->f_mapping == mapping) { |
2212 | + found = 1; |
2213 | break; |
2214 | + } |
2215 | } |
2216 | - prev = type; |
2217 | } |
2218 | - if (type < 0) { |
2219 | + if (!found) { |
2220 | err = -EINVAL; |
2221 | spin_unlock(&swap_lock); |
2222 | goto out_dput; |
2223 | @@ -1864,20 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2224 | spin_unlock(&swap_lock); |
2225 | goto out_dput; |
2226 | } |
2227 | - if (prev < 0) |
2228 | - swap_list.head = p->next; |
2229 | - else |
2230 | - swap_info[prev]->next = p->next; |
2231 | - if (type == swap_list.next) { |
2232 | - /* just pick something that's safe... */ |
2233 | - swap_list.next = swap_list.head; |
2234 | - } |
2235 | + spin_lock(&swap_avail_lock); |
2236 | + plist_del(&p->avail_list, &swap_avail_head); |
2237 | + spin_unlock(&swap_avail_lock); |
2238 | spin_lock(&p->lock); |
2239 | if (p->prio < 0) { |
2240 | - for (i = p->next; i >= 0; i = swap_info[i]->next) |
2241 | - swap_info[i]->prio = p->prio--; |
2242 | + struct swap_info_struct *si = p; |
2243 | + |
2244 | + plist_for_each_entry_continue(si, &swap_active_head, list) { |
2245 | + si->prio++; |
2246 | + si->list.prio--; |
2247 | + si->avail_list.prio--; |
2248 | + } |
2249 | least_priority++; |
2250 | } |
2251 | + plist_del(&p->list, &swap_active_head); |
2252 | atomic_long_sub(p->pages, &nr_swap_pages); |
2253 | total_swap_pages -= p->pages; |
2254 | p->flags &= ~SWP_WRITEOK; |
2255 | @@ -1885,7 +1894,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2256 | spin_unlock(&swap_lock); |
2257 | |
2258 | set_current_oom_origin(); |
2259 | - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
2260 | + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
2261 | clear_current_oom_origin(); |
2262 | |
2263 | if (err) { |
2264 | @@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2265 | frontswap_map = frontswap_map_get(p); |
2266 | spin_unlock(&p->lock); |
2267 | spin_unlock(&swap_lock); |
2268 | - frontswap_invalidate_area(type); |
2269 | + frontswap_invalidate_area(p->type); |
2270 | frontswap_map_set(p, NULL); |
2271 | mutex_unlock(&swapon_mutex); |
2272 | free_percpu(p->percpu_cluster); |
2273 | @@ -1935,7 +1944,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
2274 | vfree(cluster_info); |
2275 | vfree(frontswap_map); |
2276 | /* Destroy swap account information */ |
2277 | - swap_cgroup_swapoff(type); |
2278 | + swap_cgroup_swapoff(p->type); |
2279 | |
2280 | inode = mapping->host; |
2281 | if (S_ISBLK(inode->i_mode)) { |
2282 | @@ -2142,8 +2151,9 @@ static struct swap_info_struct *alloc_swap_info(void) |
2283 | */ |
2284 | } |
2285 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2286 | + plist_node_init(&p->list, 0); |
2287 | + plist_node_init(&p->avail_list, 0); |
2288 | p->flags = SWP_USED; |
2289 | - p->next = -1; |
2290 | spin_unlock(&swap_lock); |
2291 | spin_lock_init(&p->lock); |
2292 | |
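
The swapfile rework above replaces the hand-rolled, index-linked swap_list with two plists. Because plist ordering is ascending by node priority while swap devices are picked highest-priority first, the swap priority is stored negated (p->list.prio = -p->prio), and plist_add()'s behind-equal-keys tie-break is what lets plist_requeue() round-robin get_swap_page() among same-priority devices. A minimal sketch of a sorted insert with that same ordering and tie-break (the real plist is a doubly linked two-level list):

#include <stdio.h>

/* Toy plist node: plists sort ascending, so swap prio is negated. */
struct pnode {
        int prio;       /* negated swap priority */
        int type;
        struct pnode *next;
};

/*
 * Sorted insert, landing strictly after existing equal-prio nodes --
 * the tie-break plist_add() gives, which plist_requeue() relies on
 * for round-robin among same-priority swap devices.
 */
static void toy_plist_add(struct pnode **head, struct pnode *n)
{
        while (*head && (*head)->prio <= n->prio)
                head = &(*head)->next;
        n->next = *head;
        *head = n;
}

int main(void)
{
        struct pnode *head = NULL;
        struct pnode fast  = { .prio = -10, .type = 0 };    /* swap prio 10 */
        struct pnode slow  = { .prio = -5,  .type = 1 };    /* swap prio 5 */
        struct pnode slow2 = { .prio = -5,  .type = 2 };    /* ties go after */

        toy_plist_add(&head, &slow);
        toy_plist_add(&head, &fast);
        toy_plist_add(&head, &slow2);

        /* get_swap_page() scans from the head: highest priority first. */
        for (struct pnode *n = head; n; n = n->next)
                printf("type %d (swap prio %d)\n", n->type, -n->prio);
        return 0;
}

Keeping the not-full devices on their own swap_avail_head under swap_avail_lock is the second half of the design: get_swap_page() no longer needs swap_lock at all, and full devices simply drop off the available list until swap_entry_free() puts them back.
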
2293 | diff --git a/mm/vmacache.c b/mm/vmacache.c |
2294 | new file mode 100644 |
2295 | index 000000000000..1037a3bab505 |
2296 | --- /dev/null |
2297 | +++ b/mm/vmacache.c |
2298 | @@ -0,0 +1,114 @@ |
2299 | +/* |
2300 | + * Copyright (C) 2014 Davidlohr Bueso. |
2301 | + */ |
2302 | +#include <linux/sched.h> |
2303 | +#include <linux/mm.h> |
2304 | +#include <linux/vmacache.h> |
2305 | + |
2306 | +/* |
2307 | + * Flush vma caches for threads that share a given mm. |
2308 | + * |
2309 | + * The operation is safe because the caller holds the mmap_sem |
2310 | + * exclusively and other threads accessing the vma cache will |
2311 | + * have mmap_sem held at least for read, so no extra locking |
2312 | + * is required to maintain the vma cache. |
2313 | + */ |
2314 | +void vmacache_flush_all(struct mm_struct *mm) |
2315 | +{ |
2316 | + struct task_struct *g, *p; |
2317 | + |
2318 | + rcu_read_lock(); |
2319 | + for_each_process_thread(g, p) { |
2320 | + /* |
2321 | + * Only flush the vmacache pointers as the |
2322 | + * mm seqnum is already set and curr's will |
2323 | + * be set upon invalidation when the next |
2324 | + * lookup is done. |
2325 | + */ |
2326 | + if (mm == p->mm) |
2327 | + vmacache_flush(p); |
2328 | + } |
2329 | + rcu_read_unlock(); |
2330 | +} |
2331 | + |
2332 | +/* |
2333 | + * This task may be accessing a foreign mm via (for example) |
2334 | + * get_user_pages()->find_vma(). The vmacache is task-local and this |
2335 | + * task's vmacache pertains to a different mm (ie, its own). There is |
2336 | + * nothing we can do here. |
2337 | + * |
2338 | + * Also handle the case where a kernel thread has adopted this mm via use_mm(). |
2339 | + * That kernel thread's vmacache is not applicable to this mm. |
2340 | + */ |
2341 | +static bool vmacache_valid_mm(struct mm_struct *mm) |
2342 | +{ |
2343 | + return current->mm == mm && !(current->flags & PF_KTHREAD); |
2344 | +} |
2345 | + |
2346 | +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) |
2347 | +{ |
2348 | + if (vmacache_valid_mm(newvma->vm_mm)) |
2349 | + current->vmacache[VMACACHE_HASH(addr)] = newvma; |
2350 | +} |
2351 | + |
2352 | +static bool vmacache_valid(struct mm_struct *mm) |
2353 | +{ |
2354 | + struct task_struct *curr; |
2355 | + |
2356 | + if (!vmacache_valid_mm(mm)) |
2357 | + return false; |
2358 | + |
2359 | + curr = current; |
2360 | + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { |
2361 | + /* |
2362 | + * First attempt will always be invalid, initialize |
2363 | + * the new cache for this task here. |
2364 | + */ |
2365 | + curr->vmacache_seqnum = mm->vmacache_seqnum; |
2366 | + vmacache_flush(curr); |
2367 | + return false; |
2368 | + } |
2369 | + return true; |
2370 | +} |
2371 | + |
2372 | +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) |
2373 | +{ |
2374 | + int i; |
2375 | + |
2376 | + if (!vmacache_valid(mm)) |
2377 | + return NULL; |
2378 | + |
2379 | + for (i = 0; i < VMACACHE_SIZE; i++) { |
2380 | + struct vm_area_struct *vma = current->vmacache[i]; |
2381 | + |
2382 | + if (!vma) |
2383 | + continue; |
2384 | + if (WARN_ON_ONCE(vma->vm_mm != mm)) |
2385 | + break; |
2386 | + if (vma->vm_start <= addr && vma->vm_end > addr) |
2387 | + return vma; |
2388 | + } |
2389 | + |
2390 | + return NULL; |
2391 | +} |
2392 | + |
2393 | +#ifndef CONFIG_MMU |
2394 | +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, |
2395 | + unsigned long start, |
2396 | + unsigned long end) |
2397 | +{ |
2398 | + int i; |
2399 | + |
2400 | + if (!vmacache_valid(mm)) |
2401 | + return NULL; |
2402 | + |
2403 | + for (i = 0; i < VMACACHE_SIZE; i++) { |
2404 | + struct vm_area_struct *vma = current->vmacache[i]; |
2405 | + |
2406 | + if (vma && vma->vm_start == start && vma->vm_end == end) |
2407 | + return vma; |
2408 | + } |
2409 | + |
2410 | + return NULL; |
2411 | +} |
2412 | +#endif |
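
mm/vmacache.c above is the replacement for the single shared mm->mmap_cache: each task caches VMACACHE_SIZE recent VMAs, updates are indexed by hashing the address's page number, and invalidation is O(1) because writers only bump mm->vmacache_seqnum; a reader whose task-local seqnum lags flushes its own slots lazily on the next lookup. A compact single-threaded sketch of the scheme (the page-number hash mirrors the kernel's VMACACHE_HASH; struct layouts here are toy stand-ins):

#include <stdio.h>
#include <string.h>

#define VMACACHE_BITS 2
#define VMACACHE_SIZE (1 << VMACACHE_BITS)
#define VMACACHE_HASH(addr) (((addr) >> 12) & (VMACACHE_SIZE - 1))

struct vma { unsigned long start, end; };

struct mm   { unsigned seqnum; };
struct task {
        unsigned seqnum;                 /* snapshot of mm->seqnum */
        struct vma *cache[VMACACHE_SIZE];
};

/* Writers never touch other tasks' slots; they just bump the seqnum. */
static void vmacache_invalidate(struct mm *mm) { mm->seqnum++; }

static struct vma *vmacache_find(struct task *t, struct mm *mm,
                                 unsigned long addr)
{
        if (t->seqnum != mm->seqnum) {          /* stale: lazy flush */
                t->seqnum = mm->seqnum;
                memset(t->cache, 0, sizeof(t->cache));
                return NULL;
        }
        for (int i = 0; i < VMACACHE_SIZE; i++) {   /* scan all slots */
                struct vma *v = t->cache[i];
                if (v && v->start <= addr && v->end > addr)
                        return v;
        }
        return NULL;
}

static void vmacache_update(struct task *t, unsigned long addr, struct vma *v)
{
        t->cache[VMACACHE_HASH(addr)] = v;
}

int main(void)
{
        struct mm mm = { 0 };
        struct task t = { 0, { NULL } };
        struct vma v = { 0x1000, 0x3000 };

        vmacache_update(&t, 0x1234, &v);
        printf("hit: %p\n", (void *)vmacache_find(&t, &mm, 0x1234));

        vmacache_invalidate(&mm);               /* e.g. after munmap() */
        printf("after invalidate: %p\n",
               (void *)vmacache_find(&t, &mm, 0x1234));     /* (nil) */
        return 0;
}

This is why the mm/mmap.c, mm/nommu.c and arch hunks earlier in the patch all collapse to vmacache_invalidate()/vmacache_find()/vmacache_update() calls: the per-mm pointer is gone entirely, and the seqnum makes cross-thread invalidation free.
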
2413 | diff --git a/mm/vmscan.c b/mm/vmscan.c |
2414 | index 6ef484f0777f..0c0b36e5b4f8 100644 |
2415 | --- a/mm/vmscan.c |
2416 | +++ b/mm/vmscan.c |
2417 | @@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, |
2418 | unsigned long freed = 0; |
2419 | unsigned long long delta; |
2420 | long total_scan; |
2421 | - long max_pass; |
2422 | + long freeable; |
2423 | long nr; |
2424 | long new_nr; |
2425 | int nid = shrinkctl->nid; |
2426 | long batch_size = shrinker->batch ? shrinker->batch |
2427 | : SHRINK_BATCH; |
2428 | |
2429 | - max_pass = shrinker->count_objects(shrinker, shrinkctl); |
2430 | - if (max_pass == 0) |
2431 | + freeable = shrinker->count_objects(shrinker, shrinkctl); |
2432 | + if (freeable == 0) |
2433 | return 0; |
2434 | |
2435 | /* |
2436 | @@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, |
2437 | |
2438 | total_scan = nr; |
2439 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
2440 | - delta *= max_pass; |
2441 | + delta *= freeable; |
2442 | do_div(delta, lru_pages + 1); |
2443 | total_scan += delta; |
2444 | if (total_scan < 0) { |
2445 | printk(KERN_ERR |
2446 | "shrink_slab: %pF negative objects to delete nr=%ld\n", |
2447 | shrinker->scan_objects, total_scan); |
2448 | - total_scan = max_pass; |
2449 | + total_scan = freeable; |
2450 | } |
2451 | |
2452 | /* |
2453 | @@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, |
2454 | * shrinkers to return -1 all the time. This results in a large |
2455 | * nr being built up so when a shrink that can do some work |
2456 | * comes along it empties the entire cache due to nr >>> |
2457 | - * max_pass. This is bad for sustaining a working set in |
2458 | + * freeable. This is bad for sustaining a working set in |
2459 | * memory. |
2460 | * |
2461 | * Hence only allow the shrinker to scan the entire cache when |
2462 | * a large delta change is calculated directly. |
2463 | */ |
2464 | - if (delta < max_pass / 4) |
2465 | - total_scan = min(total_scan, max_pass / 2); |
2466 | + if (delta < freeable / 4) |
2467 | + total_scan = min(total_scan, freeable / 2); |
2468 | |
2469 | /* |
2470 | * Avoid risking looping forever due to too large nr value: |
2471 | * never try to free more than twice the estimate number of |
2472 | * freeable entries. |
2473 | */ |
2474 | - if (total_scan > max_pass * 2) |
2475 | - total_scan = max_pass * 2; |
2476 | + if (total_scan > freeable * 2) |
2477 | + total_scan = freeable * 2; |
2478 | |
2479 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, |
2480 | nr_pages_scanned, lru_pages, |
2481 | - max_pass, delta, total_scan); |
2482 | + freeable, delta, total_scan); |
2483 | |
2484 | /* |
2485 | * Normally, we should not scan less than batch_size objects in one |
2486 | @@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, |
2487 | * |
2488 | * We detect the "tight on memory" situations by looking at the total |
2489 | * number of objects we want to scan (total_scan). If it is greater |
2490 | - * than the total number of objects on slab (max_pass), we must be |
2491 | + * than the total number of objects on slab (freeable), we must be |
2492 | * scanning at high prio and therefore should try to reclaim as much as |
2493 | * possible. |
2494 | */ |
2495 | while (total_scan >= batch_size || |
2496 | - total_scan >= max_pass) { |
2497 | + total_scan >= freeable) { |
2498 | unsigned long ret; |
2499 | unsigned long nr_to_scan = min(batch_size, total_scan); |
2500 | |
2501 | @@ -1144,7 +1144,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, |
2502 | TTU_UNMAP|TTU_IGNORE_ACCESS, |
2503 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); |
2504 | list_splice(&clean_pages, page_list); |
2505 | - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); |
2506 | + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); |
2507 | return ret; |
2508 | } |
2509 | |
2510 | @@ -2424,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
2511 | unsigned long lru_pages = 0; |
2512 | |
2513 | nodes_clear(shrink->nodes_to_scan); |
2514 | - for_each_zone_zonelist(zone, z, zonelist, |
2515 | - gfp_zone(sc->gfp_mask)) { |
2516 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2517 | + gfp_zone(sc->gfp_mask), sc->nodemask) { |
2518 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2519 | continue; |
2520 |