Magellan Linux

Annotation of /trunk/kernel-alx/patches-4.4/0155-4.4.56-all-fixes.patch



Revision 2891
Mon Mar 27 13:49:27 2017 UTC by niro
File size: 62180 bytes
linux-4.4.56
1 niro 2891 diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
2     deleted file mode 100644
3     index 54f10478e8e3..000000000000
4     --- a/Documentation/networking/netlink_mmap.txt
5     +++ /dev/null
6     @@ -1,332 +0,0 @@
7     -This file documents how to use memory mapped I/O with netlink.
8     -
9     -Author: Patrick McHardy <kaber@trash.net>
10     -
11     -Overview
12     ---------
13     -
14     -Memory mapped netlink I/O can be used to increase throughput and decrease
15     -overhead of unicast receive and transmit operations. Some netlink subsystems
16     -require high throughput, these are mainly the netfilter subsystems
17     -nfnetlink_queue and nfnetlink_log, but it can also help speed up large
18     -dump operations of f.i. the routing database.
19     -
20     -Memory mapped netlink I/O used two circular ring buffers for RX and TX which
21     -are mapped into the processes address space.
22     -
23     -The RX ring is used by the kernel to directly construct netlink messages into
24     -user-space memory without copying them as done with regular socket I/O,
25     -additionally as long as the ring contains messages no recvmsg() or poll()
26     -syscalls have to be issued by user-space to get more message.
27     -
28     -The TX ring is used to process messages directly from user-space memory, the
29     -kernel processes all messages contained in the ring using a single sendmsg()
30     -call.
31     -
32     -Usage overview
33     ---------------
34     -
35     -In order to use memory mapped netlink I/O, user-space needs three main changes:
36     -
37     -- ring setup
38     -- conversion of the RX path to get messages from the ring instead of recvmsg()
39     -- conversion of the TX path to construct messages into the ring
40     -
41     -Ring setup is done using setsockopt() to provide the ring parameters to the
42     -kernel, then a call to mmap() to map the ring into the processes address space:
43     -
44     -- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
45     -- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
46     -- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
47     -
48     -Usage of either ring is optional, but even if only the RX ring is used the
49     -mapping still needs to be writable in order to update the frame status after
50     -processing.
51     -
52     -Conversion of the reception path involves calling poll() on the file
53     -descriptor, once the socket is readable the frames from the ring are
54     -processed in order until no more messages are available, as indicated by
55     -a status word in the frame header.
56     -
57     -On kernel side, in order to make use of memory mapped I/O on receive, the
58     -originating netlink subsystem needs to support memory mapped I/O, otherwise
59     -it will use an allocated socket buffer as usual and the contents will be
60     - copied to the ring on transmission, nullifying most of the performance gains.
61     -Dumps of kernel databases automatically support memory mapped I/O.
62     -
63     -Conversion of the transmit path involves changing message construction to
64     -use memory from the TX ring instead of (usually) a buffer declared on the
65     -stack and setting up the frame header appropriately. Optionally poll() can
66     -be used to wait for free frames in the TX ring.
67     -
68     -Structured and definitions for using memory mapped I/O are contained in
69     -<linux/netlink.h>.
70     -
71     -RX and TX rings
72     -----------------
73     -
74     -Each ring contains a number of continuous memory blocks, containing frames of
75     -fixed size dependent on the parameters used for ring setup.
76     -
77     -Ring: [ block 0 ]
78     - [ frame 0 ]
79     - [ frame 1 ]
80     - [ block 1 ]
81     - [ frame 2 ]
82     - [ frame 3 ]
83     - ...
84     - [ block n ]
85     - [ frame 2 * n ]
86     - [ frame 2 * n + 1 ]
87     -
88     -The blocks are only visible to the kernel, from the point of view of user-space
89     -the ring just contains the frames in a continuous memory zone.
90     -
91     -The ring parameters used for setting up the ring are defined as follows:
92     -
93     -struct nl_mmap_req {
94     - unsigned int nm_block_size;
95     - unsigned int nm_block_nr;
96     - unsigned int nm_frame_size;
97     - unsigned int nm_frame_nr;
98     -};
99     -
100     -Frames are grouped into blocks, where each block is a continuous region of memory
101     -and holds nm_block_size / nm_frame_size frames. The total number of frames in
102     -the ring is nm_frame_nr. The following invariants hold:
103     -
104     -- frames_per_block = nm_block_size / nm_frame_size
105     -
106     -- nm_frame_nr = frames_per_block * nm_block_nr
107     -
108     -Some parameters are constrained, specifically:
109     -
110     -- nm_block_size must be a multiple of the architectures memory page size.
111     - The getpagesize() function can be used to get the page size.
112     -
113     -- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be
114     - able to hold at least the frame header
115     -
116     -- nm_frame_size must be smaller or equal to nm_block_size
117     -
118     -- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
119     -
120     -- nm_frame_nr must equal the actual number of frames as specified above.
121     -
122     -When the kernel can't allocate physically continuous memory for a ring block,
123     -it will fall back to use physically discontinuous memory. This might affect
124     -performance negatively, in order to avoid this the nm_frame_size parameter
125     -should be chosen to be as small as possible for the required frame size and
126     -the number of blocks should be increased instead.
127     -
128     -Ring frames
129     -------------
130     -
131     -Each frames contain a frame header, consisting of a synchronization word and some
132     -meta-data, and the message itself.
133     -
134     -Frame: [ header message ]
135     -
136     -The frame header is defined as follows:
137     -
138     -struct nl_mmap_hdr {
139     - unsigned int nm_status;
140     - unsigned int nm_len;
141     - __u32 nm_group;
142     - /* credentials */
143     - __u32 nm_pid;
144     - __u32 nm_uid;
145     - __u32 nm_gid;
146     -};
147     -
148     -- nm_status is used for synchronizing processing between the kernel and user-
149     - space and specifies ownership of the frame as well as the operation to perform
150     -
151     -- nm_len contains the length of the message contained in the data area
152     -
153     -- nm_group specified the destination multicast group of message
154     -
155     -- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
156     - process. These values correspond to the data available using SOCK_PASSCRED in
157     - the SCM_CREDENTIALS cmsg.
158     -
159     -The possible values in the status word are:
160     -
161     -- NL_MMAP_STATUS_UNUSED:
162     - RX ring: frame belongs to the kernel and contains no message
163     - for user-space. Approriate action is to invoke poll()
164     - to wait for new messages.
165     -
166     - TX ring: frame belongs to user-space and can be used for
167     - message construction.
168     -
169     -- NL_MMAP_STATUS_RESERVED:
170     - RX ring only: frame is currently used by the kernel for message
171     - construction and contains no valid message yet.
172     - Appropriate action is to invoke poll() to wait for
173     - new messages.
174     -
175     -- NL_MMAP_STATUS_VALID:
176     - RX ring: frame contains a valid message. Approriate action is
177     - to process the message and release the frame back to
178     - the kernel by setting the status to
179     - NL_MMAP_STATUS_UNUSED or queue the frame by setting the
180     - status to NL_MMAP_STATUS_SKIP.
181     -
182     - TX ring: the frame contains a valid message from user-space to
183     - be processed by the kernel. After completing processing
184     - the kernel will release the frame back to user-space by
185     - setting the status to NL_MMAP_STATUS_UNUSED.
186     -
187     -- NL_MMAP_STATUS_COPY:
188     - RX ring only: a message is ready to be processed but could not be
189     - stored in the ring, either because it exceeded the
190     - frame size or because the originating subsystem does
191     - not support memory mapped I/O. Appropriate action is
192     - to invoke recvmsg() to receive the message and release
193     - the frame back to the kernel by setting the status to
194     - NL_MMAP_STATUS_UNUSED.
195     -
196     -- NL_MMAP_STATUS_SKIP:
197     - RX ring only: user-space queued the message for later processing, but
198     - processed some messages following it in the ring. The
199     - kernel should skip this frame when looking for unused
200     - frames.
201     -
202     -The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the
203     -frame header.
204     -
205     -TX limitations
206     ---------------
207     -
208     -As of Jan 2015 the message is always copied from the ring frame to an
209     -allocated buffer due to unresolved security concerns.
210     -See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.").
211     -
212     -Example
213     --------
214     -
215     -Ring setup:
216     -
217     - unsigned int block_size = 16 * getpagesize();
218     - struct nl_mmap_req req = {
219     - .nm_block_size = block_size,
220     - .nm_block_nr = 64,
221     - .nm_frame_size = 16384,
222     - .nm_frame_nr = 64 * block_size / 16384,
223     - };
224     - unsigned int ring_size;
225     - void *rx_ring, *tx_ring;
226     -
227     - /* Configure ring parameters */
228     - if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
229     - exit(1);
230     - if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
231     - exit(1)
232     -
233     - /* Calculate size of each individual ring */
234     - ring_size = req.nm_block_nr * req.nm_block_size;
235     -
236     - /* Map RX/TX rings. The TX ring is located after the RX ring */
237     - rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
238     - MAP_SHARED, fd, 0);
239     - if ((long)rx_ring == -1L)
240     - exit(1);
241     - tx_ring = rx_ring + ring_size:
242     -
243     -Message reception:
244     -
245     -This example assumes some ring parameters of the ring setup are available.
246     -
247     - unsigned int frame_offset = 0;
248     - struct nl_mmap_hdr *hdr;
249     - struct nlmsghdr *nlh;
250     - unsigned char buf[16384];
251     - ssize_t len;
252     -
253     - while (1) {
254     - struct pollfd pfds[1];
255     -
256     - pfds[0].fd = fd;
257     - pfds[0].events = POLLIN | POLLERR;
258     - pfds[0].revents = 0;
259     -
260     - if (poll(pfds, 1, -1) < 0 && errno != -EINTR)
261     - exit(1);
262     -
263     - /* Check for errors. Error handling omitted */
264     - if (pfds[0].revents & POLLERR)
265     - <handle error>
266     -
267     - /* If no new messages, poll again */
268     - if (!(pfds[0].revents & POLLIN))
269     - continue;
270     -
271     - /* Process all frames */
272     - while (1) {
273     - /* Get next frame header */
274     - hdr = rx_ring + frame_offset;
275     -
276     - if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
277     - /* Regular memory mapped frame */
278     - nlh = (void *)hdr + NL_MMAP_HDRLEN;
279     - len = hdr->nm_len;
280     -
281     - /* Release empty message immediately. May happen
282     - * on error during message construction.
283     - */
284     - if (len == 0)
285     - goto release;
286     - } else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
287     - /* Frame queued to socket receive queue */
288     - len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
289     - if (len <= 0)
290     - break;
291     - nlh = buf;
292     - } else
293     - /* No more messages to process, continue polling */
294     - break;
295     -
296     - process_msg(nlh);
297     -release:
298     - /* Release frame back to the kernel */
299     - hdr->nm_status = NL_MMAP_STATUS_UNUSED;
300     -
301     - /* Advance frame offset to next frame */
302     - frame_offset = (frame_offset + frame_size) % ring_size;
303     - }
304     - }
305     -
306     -Message transmission:
307     -
308     -This example assumes some ring parameters of the ring setup are available.
309     -A single message is constructed and transmitted, to send multiple messages
310     -at once they would be constructed in consecutive frames before a final call
311     -to sendto().
312     -
313     - unsigned int frame_offset = 0;
314     - struct nl_mmap_hdr *hdr;
315     - struct nlmsghdr *nlh;
316     - struct sockaddr_nl addr = {
317     - .nl_family = AF_NETLINK,
318     - };
319     -
320     - hdr = tx_ring + frame_offset;
321     - if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
322     - /* No frame available. Use poll() to avoid. */
323     - exit(1);
324     -
325     - nlh = (void *)hdr + NL_MMAP_HDRLEN;
326     -
327     - /* Build message */
328     - build_message(nlh);
329     -
330     - /* Fill frame header: length and status need to be set */
331     - hdr->nm_len = nlh->nlmsg_len;
332     - hdr->nm_status = NL_MMAP_STATUS_VALID;
333     -
334     - if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
335     - exit(1);
336     -
337     - /* Advance frame offset to next frame */
338     - frame_offset = (frame_offset + frame_size) % ring_size;
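
Note: the deletion above removes only the documentation; the NETLINK_MMAP Kconfig option and the ring implementation in af_netlink.c are removed further down in this patch. Userspace that still requests a ring can detect the failure at setsockopt() time and fall back to plain socket I/O. A minimal fallback sketch, with the option value and request layout restated from the uapi header and from the removed document (the exact errno returned by a patched kernel is an assumption):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK 270
	#endif
	#ifndef NETLINK_RX_RING			/* value from include/uapi/linux/netlink.h */
	#define NETLINK_RX_RING 6
	struct nl_mmap_req {			/* layout from the removed document above */
		unsigned int nm_block_size;
		unsigned int nm_block_nr;
		unsigned int nm_frame_size;
		unsigned int nm_frame_nr;
	};
	#endif

	int main(void)
	{
		unsigned int block_size = 16 * 4096;
		struct nl_mmap_req req = {
			.nm_block_size	= block_size,
			.nm_block_nr	= 64,
			.nm_frame_size	= 16384,
			.nm_frame_nr	= 64 * block_size / 16384,
		};
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

		if (fd < 0)
			return 1;

		if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
			/* Kernels with this patch refuse the option (e.g. ENOPROTOOPT):
			 * stay on the regular recvmsg()/sendmsg() path. */
			fprintf(stderr, "mmap netlink unavailable: %s\n", strerror(errno));

		close(fd);
		return 0;
	}
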
339     diff --git a/Makefile b/Makefile
340     index d9cc21df444d..cf9303a5d621 100644
341     --- a/Makefile
342     +++ b/Makefile
343     @@ -1,6 +1,6 @@
344     VERSION = 4
345     PATCHLEVEL = 4
346     -SUBLEVEL = 55
347     +SUBLEVEL = 56
348     EXTRAVERSION =
349     NAME = Blurry Fish Butt
350    
351     diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
352     index 1a8256dd6729..5b2f2306fbcc 100644
353     --- a/arch/x86/kernel/cpu/perf_event.c
354     +++ b/arch/x86/kernel/cpu/perf_event.c
355     @@ -1996,8 +1996,8 @@ static int x86_pmu_event_init(struct perf_event *event)
356    
357     static void refresh_pce(void *ignored)
358     {
359     - if (current->mm)
360     - load_mm_cr4(current->mm);
361     + if (current->active_mm)
362     + load_mm_cr4(current->active_mm);
363     }
364    
365     static void x86_pmu_event_mapped(struct perf_event *event)
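
Note: refresh_pce() runs via a cross-CPU call, and on a CPU executing a kernel thread current->mm is NULL while current->active_mm points at the borrowed mm, so keying CR4.PCE off active_mm updates that CPU as well; CR4.PCE is what permits userspace rdpmc. A minimal self-monitoring sketch that exercises the event-mapping path and reports whether rdpmc is allowed (assumes an x86 machine with an available hardware instruction counter):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int main(void)
	{
		struct perf_event_attr attr;
		struct perf_event_mmap_page *pc;
		long page = sysconf(_SC_PAGESIZE);
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		attr.exclude_kernel = 1;

		/* Count instructions for the calling thread on any CPU. */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		/* Mapping the event is what invokes x86_pmu_event_mapped() and,
		 * through refresh_pce(), sets CR4.PCE for this mm. */
		pc = mmap(NULL, page, PROT_READ, MAP_SHARED, fd, 0);
		if (pc == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		printf("cap_user_rdpmc = %u\n", (unsigned int)pc->cap_user_rdpmc);
		munmap(pc, page);
		close(fd);
		return 0;
	}
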
366     diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
367     index f129a9af6357..b6b0077da1af 100644
368     --- a/arch/x86/kernel/head64.c
369     +++ b/arch/x86/kernel/head64.c
370     @@ -4,6 +4,7 @@
371     * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
372     */
373    
374     +#define DISABLE_BRANCH_PROFILING
375     #include <linux/init.h>
376     #include <linux/linkage.h>
377     #include <linux/types.h>
378     diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
379     index d470cf219a2d..4e5ac46adc9d 100644
380     --- a/arch/x86/mm/kasan_init_64.c
381     +++ b/arch/x86/mm/kasan_init_64.c
382     @@ -1,3 +1,4 @@
383     +#define DISABLE_BRANCH_PROFILING
384     #define pr_fmt(fmt) "kasan: " fmt
385     #include <linux/bootmem.h>
386     #include <linux/kasan.h>
387     diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
388     index d6b619667f1a..349aecbc210a 100644
389     --- a/drivers/net/vrf.c
390     +++ b/drivers/net/vrf.c
391     @@ -345,6 +345,7 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
392    
393     static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
394     {
395     + int len = skb->len;
396     netdev_tx_t ret = is_ip_tx_frame(skb, dev);
397    
398     if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
399     @@ -352,7 +353,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
400    
401     u64_stats_update_begin(&dstats->syncp);
402     dstats->tx_pkts++;
403     - dstats->tx_bytes += skb->len;
404     + dstats->tx_bytes += len;
405     u64_stats_update_end(&dstats->syncp);
406     } else {
407     this_cpu_inc(dev->dstats->tx_drps);
408     diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
409     index 6fa8e165878e..590750ab6564 100644
410     --- a/drivers/net/vxlan.c
411     +++ b/drivers/net/vxlan.c
412     @@ -2600,7 +2600,7 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
413    
414     if (data[IFLA_VXLAN_ID]) {
415     __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
416     - if (id >= VXLAN_VID_MASK)
417     + if (id >= VXLAN_N_VID)
418     return -ERANGE;
419     }
420    
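
Note: the VXLAN network identifier is a 24-bit field. Assuming the 4.4 definitions VXLAN_N_VID = 1u << 24 and VXLAN_VID_MASK = VXLAN_N_VID - 1, the old `id >= VXLAN_VID_MASK` test also rejected the largest valid VNI (16777215), while comparing against VXLAN_N_VID only rejects values that do not fit in 24 bits. A standalone illustration (constants restated here for the example):

	#include <assert.h>

	#define VXLAN_N_VID	(1u << 24)		/* 16777216 */
	#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)	/* 0x00ffffff */

	int main(void)
	{
		unsigned int id = 0x00ffffff;	/* largest valid 24-bit VNI */

		assert(id >= VXLAN_VID_MASK);	/* old check: wrongly out of range */
		assert(!(id >= VXLAN_N_VID));	/* new check: accepted */
		return 0;
	}
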
421     diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
422     index 8a9feb341f31..dd561f916f0b 100644
423     --- a/fs/ext4/crypto_policy.c
424     +++ b/fs/ext4/crypto_policy.c
425     @@ -156,6 +156,12 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
426     WARN_ON(1); /* Should never happen */
427     return 0;
428     }
429     +
430     + /* No restrictions on file types which are never encrypted */
431     + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
432     + !S_ISLNK(child->i_mode))
433     + return 1;
434     +
435     /* no restrictions if the parent directory is not encrypted */
436     if (!ext4_encrypted_inode(parent))
437     return 1;
438     diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
439     index 1fb12f9c97a6..789e2d6724a9 100644
440     --- a/fs/ext4/ioctl.c
441     +++ b/fs/ext4/ioctl.c
442     @@ -633,8 +633,12 @@ resizefs_out:
443     if (err)
444     goto encryption_policy_out;
445    
446     + mutex_lock(&inode->i_mutex);
447     +
448     err = ext4_process_policy(&policy, inode);
449    
450     + mutex_unlock(&inode->i_mutex);
451     +
452     mnt_drop_write_file(filp);
453     encryption_policy_out:
454     return err;
455     diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
456     index e504f548b64e..5bbd1989d5e6 100644
457     --- a/fs/f2fs/crypto_policy.c
458     +++ b/fs/f2fs/crypto_policy.c
459     @@ -149,6 +149,11 @@ int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
460     BUG_ON(1);
461     }
462    
463     + /* No restrictions on file types which are never encrypted */
464     + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
465     + !S_ISLNK(child->i_mode))
466     + return 1;
467     +
468     /* no restrictions if the parent directory is not encrypted */
469     if (!f2fs_encrypted_inode(parent))
470     return 1;
471     diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
472     index a197215ad52b..4b449d263333 100644
473     --- a/fs/f2fs/file.c
474     +++ b/fs/f2fs/file.c
475     @@ -1535,12 +1535,19 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
476     #ifdef CONFIG_F2FS_FS_ENCRYPTION
477     struct f2fs_encryption_policy policy;
478     struct inode *inode = file_inode(filp);
479     + int err;
480    
481     if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
482     sizeof(policy)))
483     return -EFAULT;
484    
485     - return f2fs_process_policy(&policy, inode);
486     + mutex_lock(&inode->i_mutex);
487     +
488     + err = f2fs_process_policy(&policy, inode);
489     +
490     + mutex_unlock(&inode->i_mutex);
491     +
492     + return err;
493     #else
494     return -EOPNOTSUPP;
495     #endif
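
Note: the ext4 and f2fs hunks above serialize *_process_policy() with the inode mutex because two concurrent callers could otherwise race on the same inode's check-then-set of the encryption context. A generic pthread sketch of that check-then-set pattern (illustrative only; the structure and names below are invented, not filesystem code):

	#include <pthread.h>
	#include <stdio.h>

	/* Toy model of per-inode policy state: a check-then-set that must not
	 * run concurrently for the same object. */
	struct toy_inode {
		pthread_mutex_t lock;		/* stands in for inode->i_mutex */
		int policy_set;
		int policy;
	};

	static struct toy_inode ino = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	static void *set_policy(void *arg)
	{
		int want = (int)(long)arg;

		pthread_mutex_lock(&ino.lock);
		if (!ino.policy_set) {		/* first caller wins */
			ino.policy_set = 1;
			ino.policy = want;
		} else if (ino.policy != want) {
			fprintf(stderr, "policy already set, rejecting %d\n", want);
		}
		pthread_mutex_unlock(&ino.lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, set_policy, (void *)1L);
		pthread_create(&b, NULL, set_policy, (void *)2L);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		printf("final policy: %d\n", ino.policy);
		return 0;
	}

Built with -pthread; dropping the lock leaves the outcome of the two racing calls undefined, which is what taking i_mutex in the ioctl paths prevents.
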
496     diff --git a/include/linux/dccp.h b/include/linux/dccp.h
497     index 61d042bbbf60..68449293c4b6 100644
498     --- a/include/linux/dccp.h
499     +++ b/include/linux/dccp.h
500     @@ -163,6 +163,7 @@ struct dccp_request_sock {
501     __u64 dreq_isr;
502     __u64 dreq_gsr;
503     __be32 dreq_service;
504     + spinlock_t dreq_lock;
505     struct list_head dreq_featneg;
506     __u32 dreq_timestamp_echo;
507     __u32 dreq_timestamp_time;
508     diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
509     index f095155d8749..0dba4e4ed2be 100644
510     --- a/include/uapi/linux/netlink.h
511     +++ b/include/uapi/linux/netlink.h
512     @@ -107,8 +107,10 @@ struct nlmsgerr {
513     #define NETLINK_PKTINFO 3
514     #define NETLINK_BROADCAST_ERROR 4
515     #define NETLINK_NO_ENOBUFS 5
516     +#ifndef __KERNEL__
517     #define NETLINK_RX_RING 6
518     #define NETLINK_TX_RING 7
519     +#endif
520     #define NETLINK_LISTEN_ALL_NSID 8
521     #define NETLINK_LIST_MEMBERSHIPS 9
522     #define NETLINK_CAP_ACK 10
523     @@ -134,6 +136,7 @@ struct nl_mmap_hdr {
524     __u32 nm_gid;
525     };
526    
527     +#ifndef __KERNEL__
528     enum nl_mmap_status {
529     NL_MMAP_STATUS_UNUSED,
530     NL_MMAP_STATUS_RESERVED,
531     @@ -145,6 +148,7 @@ enum nl_mmap_status {
532     #define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO
533     #define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
534     #define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
535     +#endif
536    
537     #define NET_MAJOR 36 /* Major 36 is reserved for networking */
538    
539     diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
540     index f2159d30d1f5..d79399394b46 100644
541     --- a/include/uapi/linux/netlink_diag.h
542     +++ b/include/uapi/linux/netlink_diag.h
543     @@ -48,6 +48,8 @@ enum {
544    
545     #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */
546     #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */
547     +#ifndef __KERNEL__
548     #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */
549     +#endif
550    
551     #endif
552     diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h
553     index d08c63f3dd6f..0c5d5dd61b6a 100644
554     --- a/include/uapi/linux/packet_diag.h
555     +++ b/include/uapi/linux/packet_diag.h
556     @@ -64,7 +64,7 @@ struct packet_diag_mclist {
557     __u32 pdmc_count;
558     __u16 pdmc_type;
559     __u16 pdmc_alen;
560     - __u8 pdmc_addr[MAX_ADDR_LEN];
561     + __u8 pdmc_addr[32]; /* MAX_ADDR_LEN */
562     };
563    
564     struct packet_diag_ring {
565     diff --git a/kernel/futex.c b/kernel/futex.c
566     index 9d251dc3ec40..3057dabf726f 100644
567     --- a/kernel/futex.c
568     +++ b/kernel/futex.c
569     @@ -2690,7 +2690,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
570     {
571     struct hrtimer_sleeper timeout, *to = NULL;
572     struct rt_mutex_waiter rt_waiter;
573     - struct rt_mutex *pi_mutex = NULL;
574     struct futex_hash_bucket *hb;
575     union futex_key key2 = FUTEX_KEY_INIT;
576     struct futex_q q = futex_q_init;
577     @@ -2774,6 +2773,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
578     if (q.pi_state && (q.pi_state->owner != current)) {
579     spin_lock(q.lock_ptr);
580     ret = fixup_pi_state_owner(uaddr2, &q, current);
581     + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
582     + rt_mutex_unlock(&q.pi_state->pi_mutex);
583     /*
584     * Drop the reference to the pi state which
585     * the requeue_pi() code acquired for us.
586     @@ -2782,6 +2783,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
587     spin_unlock(q.lock_ptr);
588     }
589     } else {
590     + struct rt_mutex *pi_mutex;
591     +
592     /*
593     * We have been woken up by futex_unlock_pi(), a timeout, or a
594     * signal. futex_unlock_pi() will not destroy the lock_ptr nor
595     @@ -2805,18 +2808,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
596     if (res)
597     ret = (res < 0) ? res : 0;
598    
599     + /*
600     + * If fixup_pi_state_owner() faulted and was unable to handle
601     + * the fault, unlock the rt_mutex and return the fault to
602     + * userspace.
603     + */
604     + if (ret && rt_mutex_owner(pi_mutex) == current)
605     + rt_mutex_unlock(pi_mutex);
606     +
607     /* Unqueue and drop the lock. */
608     unqueue_me_pi(&q);
609     }
610    
611     - /*
612     - * If fixup_pi_state_owner() faulted and was unable to handle the
613     - * fault, unlock the rt_mutex and return the fault to userspace.
614     - */
615     - if (ret == -EFAULT) {
616     - if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
617     - rt_mutex_unlock(pi_mutex);
618     - } else if (ret == -EINTR) {
619     + if (ret == -EINTR) {
620     /*
621     * We've already been requeued, but cannot restart by calling
622     * futex_lock_pi() directly. We could restart this syscall, but
623     diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
624     index f7fba74108a9..e24754a0e052 100644
625     --- a/net/bridge/br_input.c
626     +++ b/net/bridge/br_input.c
627     @@ -29,6 +29,7 @@ EXPORT_SYMBOL(br_should_route_hook);
628     static int
629     br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
630     {
631     + br_drop_fake_rtable(skb);
632     return netif_receive_skb(skb);
633     }
634    
635     diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
636     index 7ddbe7ec81d6..97fc19f001bf 100644
637     --- a/net/bridge/br_netfilter_hooks.c
638     +++ b/net/bridge/br_netfilter_hooks.c
639     @@ -516,21 +516,6 @@ static unsigned int br_nf_pre_routing(void *priv,
640     }
641    
642    
643     -/* PF_BRIDGE/LOCAL_IN ************************************************/
644     -/* The packet is locally destined, which requires a real
645     - * dst_entry, so detach the fake one. On the way up, the
646     - * packet would pass through PRE_ROUTING again (which already
647     - * took place when the packet entered the bridge), but we
648     - * register an IPv4 PRE_ROUTING 'sabotage' hook that will
649     - * prevent this from happening. */
650     -static unsigned int br_nf_local_in(void *priv,
651     - struct sk_buff *skb,
652     - const struct nf_hook_state *state)
653     -{
654     - br_drop_fake_rtable(skb);
655     - return NF_ACCEPT;
656     -}
657     -
658     /* PF_BRIDGE/FORWARD *************************************************/
659     static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
660     {
661     @@ -901,12 +886,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
662     .priority = NF_BR_PRI_BRNF,
663     },
664     {
665     - .hook = br_nf_local_in,
666     - .pf = NFPROTO_BRIDGE,
667     - .hooknum = NF_BR_LOCAL_IN,
668     - .priority = NF_BR_PRI_BRNF,
669     - },
670     - {
671     .hook = br_nf_forward_ip,
672     .pf = NFPROTO_BRIDGE,
673     .hooknum = NF_BR_FORWARD,
674     diff --git a/net/core/dev.c b/net/core/dev.c
675     index 08215a85c742..48399d8ce614 100644
676     --- a/net/core/dev.c
677     +++ b/net/core/dev.c
678     @@ -1677,27 +1677,54 @@ EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
679     static struct static_key netstamp_needed __read_mostly;
680     #ifdef HAVE_JUMP_LABEL
681     static atomic_t netstamp_needed_deferred;
682     +static atomic_t netstamp_wanted;
683     static void netstamp_clear(struct work_struct *work)
684     {
685     int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
686     + int wanted;
687    
688     - while (deferred--)
689     - static_key_slow_dec(&netstamp_needed);
690     + wanted = atomic_add_return(deferred, &netstamp_wanted);
691     + if (wanted > 0)
692     + static_key_enable(&netstamp_needed);
693     + else
694     + static_key_disable(&netstamp_needed);
695     }
696     static DECLARE_WORK(netstamp_work, netstamp_clear);
697     #endif
698    
699     void net_enable_timestamp(void)
700     {
701     +#ifdef HAVE_JUMP_LABEL
702     + int wanted;
703     +
704     + while (1) {
705     + wanted = atomic_read(&netstamp_wanted);
706     + if (wanted <= 0)
707     + break;
708     + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
709     + return;
710     + }
711     + atomic_inc(&netstamp_needed_deferred);
712     + schedule_work(&netstamp_work);
713     +#else
714     static_key_slow_inc(&netstamp_needed);
715     +#endif
716     }
717     EXPORT_SYMBOL(net_enable_timestamp);
718    
719     void net_disable_timestamp(void)
720     {
721     #ifdef HAVE_JUMP_LABEL
722     - /* net_disable_timestamp() can be called from non process context */
723     - atomic_inc(&netstamp_needed_deferred);
724     + int wanted;
725     +
726     + while (1) {
727     + wanted = atomic_read(&netstamp_wanted);
728     + if (wanted <= 1)
729     + break;
730     + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
731     + return;
732     + }
733     + atomic_dec(&netstamp_needed_deferred);
734     schedule_work(&netstamp_work);
735     #else
736     static_key_slow_dec(&netstamp_needed);
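
Note: the reworked net_enable_timestamp()/net_disable_timestamp() keep a separate netstamp_wanted count and only toggle the static key from the deferred work item; the fast path is the usual "modify the counter unless it sits at the boundary value" cmpxchg loop, and the skbuff.c hunks that follow apply the same idea through atomic_inc_not_zero() on the socket refcount. A standalone sketch of the pattern with C11 atomics (illustrative; the kernel uses atomic_read()/atomic_cmpxchg()):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int wanted = 1;

	/* Increment 'wanted' unless it is <= 0 (mirrors net_enable_timestamp()). */
	static bool inc_unless_zero(atomic_int *v)
	{
		int old = atomic_load(v);

		while (old > 0) {
			/* On failure, 'old' is reloaded with the current value. */
			if (atomic_compare_exchange_weak(v, &old, old + 1))
				return true;
		}
		return false;	/* caller must take the deferred slow path */
	}

	/* Decrement 'wanted' unless it is <= 1 (mirrors net_disable_timestamp()). */
	static bool dec_unless_one(atomic_int *v)
	{
		int old = atomic_load(v);

		while (old > 1) {
			if (atomic_compare_exchange_weak(v, &old, old - 1))
				return true;
		}
		return false;
	}

	int main(void)
	{
		printf("inc fast path: %d\n", inc_unless_zero(&wanted));	/* 1 */
		printf("dec fast path: %d\n", dec_unless_one(&wanted));	/* 1 */
		printf("dec fast path: %d\n", dec_unless_one(&wanted));	/* 0: last ref */
		return 0;
	}
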
737     diff --git a/net/core/skbuff.c b/net/core/skbuff.c
738     index 4968b5ddea69..73dfd7729bc9 100644
739     --- a/net/core/skbuff.c
740     +++ b/net/core/skbuff.c
741     @@ -3678,13 +3678,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
742     if (!skb_may_tx_timestamp(sk, false))
743     return;
744    
745     - /* take a reference to prevent skb_orphan() from freeing the socket */
746     - sock_hold(sk);
747     -
748     - *skb_hwtstamps(skb) = *hwtstamps;
749     - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
750     -
751     - sock_put(sk);
752     + /* Take a reference to prevent skb_orphan() from freeing the socket,
753     + * but only if the socket refcount is not zero.
754     + */
755     + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
756     + *skb_hwtstamps(skb) = *hwtstamps;
757     + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
758     + sock_put(sk);
759     + }
760     }
761     EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
762    
763     @@ -3735,7 +3736,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
764     {
765     struct sock *sk = skb->sk;
766     struct sock_exterr_skb *serr;
767     - int err;
768     + int err = 1;
769    
770     skb->wifi_acked_valid = 1;
771     skb->wifi_acked = acked;
772     @@ -3745,14 +3746,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
773     serr->ee.ee_errno = ENOMSG;
774     serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
775    
776     - /* take a reference to prevent skb_orphan() from freeing the socket */
777     - sock_hold(sk);
778     -
779     - err = sock_queue_err_skb(sk, skb);
780     + /* Take a reference to prevent skb_orphan() from freeing the socket,
781     + * but only if the socket refcount is not zero.
782     + */
783     + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
784     + err = sock_queue_err_skb(sk, skb);
785     + sock_put(sk);
786     + }
787     if (err)
788     kfree_skb(skb);
789     -
790     - sock_put(sk);
791     }
792     EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
793    
794     diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
795     index f053198e730c..5e3a7302f774 100644
796     --- a/net/dccp/ccids/ccid2.c
797     +++ b/net/dccp/ccids/ccid2.c
798     @@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
799     for (i = 0; i < hc->tx_seqbufc; i++)
800     kfree(hc->tx_seqbuf[i]);
801     hc->tx_seqbufc = 0;
802     + dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
803     }
804    
805     static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
806     diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
807     index 0759f5b9180e..6467bf392e1b 100644
808     --- a/net/dccp/ipv4.c
809     +++ b/net/dccp/ipv4.c
810     @@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
811    
812     switch (type) {
813     case ICMP_REDIRECT:
814     - dccp_do_redirect(skb, sk);
815     + if (!sock_owned_by_user(sk))
816     + dccp_do_redirect(skb, sk);
817     goto out;
818     case ICMP_SOURCE_QUENCH:
819     /* Just silently ignore these. */
820     diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
821     index 27c4e81efa24..8113ad58fcb4 100644
822     --- a/net/dccp/ipv6.c
823     +++ b/net/dccp/ipv6.c
824     @@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
825     np = inet6_sk(sk);
826    
827     if (type == NDISC_REDIRECT) {
828     - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
829     + if (!sock_owned_by_user(sk)) {
830     + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
831    
832     - if (dst)
833     - dst->ops->redirect(dst, sk, skb);
834     + if (dst)
835     + dst->ops->redirect(dst, sk, skb);
836     + }
837     goto out;
838     }
839    
840     diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
841     index 1994f8af646b..68eed344b471 100644
842     --- a/net/dccp/minisocks.c
843     +++ b/net/dccp/minisocks.c
844     @@ -122,6 +122,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk,
845     /* It is still raw copy of parent, so invalidate
846     * destructor and make plain sk_free() */
847     newsk->sk_destruct = NULL;
848     + bh_unlock_sock(newsk);
849     sk_free(newsk);
850     return NULL;
851     }
852     @@ -145,6 +146,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
853     struct dccp_request_sock *dreq = dccp_rsk(req);
854     bool own_req;
855    
856     + /* TCP/DCCP listeners became lockless.
857     + * DCCP stores complex state in its request_sock, so we need
858     + * a protection for them, now this code runs without being protected
859     + * by the parent (listener) lock.
860     + */
861     + spin_lock_bh(&dreq->dreq_lock);
862     +
863     /* Check for retransmitted REQUEST */
864     if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
865    
866     @@ -159,7 +167,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
867     inet_rtx_syn_ack(sk, req);
868     }
869     /* Network Duplicate, discard packet */
870     - return NULL;
871     + goto out;
872     }
873    
874     DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
875     @@ -185,20 +193,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
876    
877     child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
878     req, &own_req);
879     - if (!child)
880     - goto listen_overflow;
881     -
882     - return inet_csk_complete_hashdance(sk, child, req, own_req);
883     + if (child) {
884     + child = inet_csk_complete_hashdance(sk, child, req, own_req);
885     + goto out;
886     + }
887    
888     -listen_overflow:
889     - dccp_pr_debug("listen_overflow!\n");
890     DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
891     drop:
892     if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
893     req->rsk_ops->send_reset(sk, skb);
894    
895     inet_csk_reqsk_queue_drop(sk, req);
896     - return NULL;
897     +out:
898     + spin_unlock_bh(&dreq->dreq_lock);
899     + return child;
900     }
901    
902     EXPORT_SYMBOL_GPL(dccp_check_req);
903     @@ -249,6 +257,7 @@ int dccp_reqsk_init(struct request_sock *req,
904     {
905     struct dccp_request_sock *dreq = dccp_rsk(req);
906    
907     + spin_lock_init(&dreq->dreq_lock);
908     inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
909     inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
910     inet_rsk(req)->acked = 0;
911     diff --git a/net/ipv4/route.c b/net/ipv4/route.c
912     index ef2f527a119b..da4d68d78590 100644
913     --- a/net/ipv4/route.c
914     +++ b/net/ipv4/route.c
915     @@ -1958,6 +1958,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
916     {
917     int res;
918    
919     + tos &= IPTOS_RT_MASK;
920     rcu_read_lock();
921    
922     /* Multicast recognition logic is moved from route cache to here.
923     diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
924     index b58a38eea059..198fc2314c82 100644
925     --- a/net/ipv4/tcp_ipv4.c
926     +++ b/net/ipv4/tcp_ipv4.c
927     @@ -271,10 +271,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
928     */
929     void tcp_v4_mtu_reduced(struct sock *sk)
930     {
931     - struct dst_entry *dst;
932     struct inet_sock *inet = inet_sk(sk);
933     - u32 mtu = tcp_sk(sk)->mtu_info;
934     + struct dst_entry *dst;
935     + u32 mtu;
936    
937     + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
938     + return;
939     + mtu = tcp_sk(sk)->mtu_info;
940     dst = inet_csk_update_pmtu(sk, mtu);
941     if (!dst)
942     return;
943     @@ -420,7 +423,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
944    
945     switch (type) {
946     case ICMP_REDIRECT:
947     - do_redirect(icmp_skb, sk);
948     + if (!sock_owned_by_user(sk))
949     + do_redirect(icmp_skb, sk);
950     goto out;
951     case ICMP_SOURCE_QUENCH:
952     /* Just silently ignore these. */
953     diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
954     index 193ba1fa8a9a..ebb34d0c5e80 100644
955     --- a/net/ipv4/tcp_timer.c
956     +++ b/net/ipv4/tcp_timer.c
957     @@ -223,7 +223,8 @@ void tcp_delack_timer_handler(struct sock *sk)
958    
959     sk_mem_reclaim_partial(sk);
960    
961     - if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
962     + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
963     + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
964     goto out;
965    
966     if (time_after(icsk->icsk_ack.timeout, jiffies)) {
967     @@ -504,7 +505,8 @@ void tcp_write_timer_handler(struct sock *sk)
968     struct inet_connection_sock *icsk = inet_csk(sk);
969     int event;
970    
971     - if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
972     + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
973     + !icsk->icsk_pending)
974     goto out;
975    
976     if (time_after(icsk->icsk_timeout, jiffies)) {
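
Note: the tcp_ipv4.c and tcp_timer.c hunks above replace single-state comparisons with `(1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)`: each TCPF_* flag is 1 shifted by the matching TCP_* state number, so one bitwise AND tests membership in a whole set of states. A minimal standalone illustration (state values restated from include/net/tcp_states.h):

	#include <stdio.h>

	/* Values as in include/net/tcp_states.h */
	enum {
		TCP_ESTABLISHED = 1, TCP_SYN_SENT, TCP_SYN_RECV, TCP_FIN_WAIT1,
		TCP_FIN_WAIT2, TCP_TIME_WAIT, TCP_CLOSE, TCP_CLOSE_WAIT,
		TCP_LAST_ACK, TCP_LISTEN, TCP_CLOSING,
	};

	#define TCPF_CLOSE	(1 << TCP_CLOSE)
	#define TCPF_LISTEN	(1 << TCP_LISTEN)

	static int should_skip(int sk_state)
	{
		/* True for CLOSE and LISTEN, false for every other state. */
		return (1 << sk_state) & (TCPF_CLOSE | TCPF_LISTEN);
	}

	int main(void)
	{
		printf("CLOSE: %d LISTEN: %d ESTABLISHED: %d\n",
		       !!should_skip(TCP_CLOSE),
		       !!should_skip(TCP_LISTEN),
		       !!should_skip(TCP_ESTABLISHED));
		return 0;
	}
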
977     diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
978     index 34cf46d74554..85bf86458706 100644
979     --- a/net/ipv6/ip6_fib.c
980     +++ b/net/ipv6/ip6_fib.c
981     @@ -903,6 +903,8 @@ add:
982     ins = &rt->dst.rt6_next;
983     iter = *ins;
984     while (iter) {
985     + if (iter->rt6i_metric > rt->rt6i_metric)
986     + break;
987     if (rt6_qualify_for_ecmp(iter)) {
988     *ins = iter->dst.rt6_next;
989     fib6_purge_rt(iter, fn, info->nl_net);
990     diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
991     index 58900c21e4e4..8004532fa882 100644
992     --- a/net/ipv6/ip6_output.c
993     +++ b/net/ipv6/ip6_output.c
994     @@ -742,13 +742,14 @@ slow_path:
995     * Fragment the datagram.
996     */
997    
998     - *prevhdr = NEXTHDR_FRAGMENT;
999     troom = rt->dst.dev->needed_tailroom;
1000    
1001     /*
1002     * Keep copying data until we run out.
1003     */
1004     while (left > 0) {
1005     + u8 *fragnexthdr_offset;
1006     +
1007     len = left;
1008     /* IF: it doesn't fit, use 'mtu' - the data space left */
1009     if (len > mtu)
1010     @@ -793,6 +794,10 @@ slow_path:
1011     */
1012     skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
1013    
1014     + fragnexthdr_offset = skb_network_header(frag);
1015     + fragnexthdr_offset += prevhdr - skb_network_header(skb);
1016     + *fragnexthdr_offset = NEXTHDR_FRAGMENT;
1017     +
1018     /*
1019     * Build fragment header.
1020     */
1021     diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
1022     index 0a8610b33d79..bdcc4d9cedd3 100644
1023     --- a/net/ipv6/ip6_vti.c
1024     +++ b/net/ipv6/ip6_vti.c
1025     @@ -680,6 +680,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
1026     u->link = p->link;
1027     u->i_key = p->i_key;
1028     u->o_key = p->o_key;
1029     + if (u->i_key)
1030     + u->i_flags |= GRE_KEY;
1031     + if (u->o_key)
1032     + u->o_flags |= GRE_KEY;
1033     u->proto = p->proto;
1034    
1035     memcpy(u->name, p->name, sizeof(u->name));
1036     diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
1037     index 76a8c8057a23..1a63c4deef26 100644
1038     --- a/net/ipv6/tcp_ipv6.c
1039     +++ b/net/ipv6/tcp_ipv6.c
1040     @@ -376,10 +376,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
1041     np = inet6_sk(sk);
1042    
1043     if (type == NDISC_REDIRECT) {
1044     - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
1045     + if (!sock_owned_by_user(sk)) {
1046     + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
1047    
1048     - if (dst)
1049     - dst->ops->redirect(dst, sk, skb);
1050     + if (dst)
1051     + dst->ops->redirect(dst, sk, skb);
1052     + }
1053     goto out;
1054     }
1055    
1056     diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
1057     index 445b7cd0826a..48ab93842322 100644
1058     --- a/net/l2tp/l2tp_ip.c
1059     +++ b/net/l2tp/l2tp_ip.c
1060     @@ -383,7 +383,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
1061     drop:
1062     IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
1063     kfree_skb(skb);
1064     - return -1;
1065     + return 0;
1066     }
1067    
1068     /* Userspace will call sendmsg() on the tunnel socket to send L2TP
1069     diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
1070     index 881bc2072809..52cfc4478511 100644
1071     --- a/net/mpls/af_mpls.c
1072     +++ b/net/mpls/af_mpls.c
1073     @@ -1567,6 +1567,7 @@ static void mpls_net_exit(struct net *net)
1074     for (index = 0; index < platform_labels; index++) {
1075     struct mpls_route *rt = rtnl_dereference(platform_label[index]);
1076     RCU_INIT_POINTER(platform_label[index], NULL);
1077     + mpls_notify_route(net, index, rt, NULL, NULL);
1078     mpls_rt_free(rt);
1079     }
1080     rtnl_unlock();
1081     diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
1082     index 2c5e95e9bfbd..5d6e8c05b3d4 100644
1083     --- a/net/netlink/Kconfig
1084     +++ b/net/netlink/Kconfig
1085     @@ -2,15 +2,6 @@
1086     # Netlink Sockets
1087     #
1088    
1089     -config NETLINK_MMAP
1090     - bool "NETLINK: mmaped IO"
1091     - ---help---
1092     - This option enables support for memory mapped netlink IO. This
1093     - reduces overhead by avoiding copying data between kernel- and
1094     - userspace.
1095     -
1096     - If unsure, say N.
1097     -
1098     config NETLINK_DIAG
1099     tristate "NETLINK: socket monitoring interface"
1100     default n
1101     diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
1102     index 360700a2f46c..8e33019d8e7b 100644
1103     --- a/net/netlink/af_netlink.c
1104     +++ b/net/netlink/af_netlink.c
1105     @@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
1106    
1107     dev_hold(dev);
1108    
1109     - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
1110     + if (is_vmalloc_addr(skb->head))
1111     nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
1112     else
1113     nskb = skb_clone(skb, GFP_ATOMIC);
1114     @@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
1115     wake_up_interruptible(&nlk->wait);
1116     }
1117    
1118     -#ifdef CONFIG_NETLINK_MMAP
1119     -static bool netlink_rx_is_mmaped(struct sock *sk)
1120     -{
1121     - return nlk_sk(sk)->rx_ring.pg_vec != NULL;
1122     -}
1123     -
1124     -static bool netlink_tx_is_mmaped(struct sock *sk)
1125     -{
1126     - return nlk_sk(sk)->tx_ring.pg_vec != NULL;
1127     -}
1128     -
1129     -static __pure struct page *pgvec_to_page(const void *addr)
1130     -{
1131     - if (is_vmalloc_addr(addr))
1132     - return vmalloc_to_page(addr);
1133     - else
1134     - return virt_to_page(addr);
1135     -}
1136     -
1137     -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
1138     -{
1139     - unsigned int i;
1140     -
1141     - for (i = 0; i < len; i++) {
1142     - if (pg_vec[i] != NULL) {
1143     - if (is_vmalloc_addr(pg_vec[i]))
1144     - vfree(pg_vec[i]);
1145     - else
1146     - free_pages((unsigned long)pg_vec[i], order);
1147     - }
1148     - }
1149     - kfree(pg_vec);
1150     -}
1151     -
1152     -static void *alloc_one_pg_vec_page(unsigned long order)
1153     -{
1154     - void *buffer;
1155     - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
1156     - __GFP_NOWARN | __GFP_NORETRY;
1157     -
1158     - buffer = (void *)__get_free_pages(gfp_flags, order);
1159     - if (buffer != NULL)
1160     - return buffer;
1161     -
1162     - buffer = vzalloc((1 << order) * PAGE_SIZE);
1163     - if (buffer != NULL)
1164     - return buffer;
1165     -
1166     - gfp_flags &= ~__GFP_NORETRY;
1167     - return (void *)__get_free_pages(gfp_flags, order);
1168     -}
1169     -
1170     -static void **alloc_pg_vec(struct netlink_sock *nlk,
1171     - struct nl_mmap_req *req, unsigned int order)
1172     -{
1173     - unsigned int block_nr = req->nm_block_nr;
1174     - unsigned int i;
1175     - void **pg_vec;
1176     -
1177     - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
1178     - if (pg_vec == NULL)
1179     - return NULL;
1180     -
1181     - for (i = 0; i < block_nr; i++) {
1182     - pg_vec[i] = alloc_one_pg_vec_page(order);
1183     - if (pg_vec[i] == NULL)
1184     - goto err1;
1185     - }
1186     -
1187     - return pg_vec;
1188     -err1:
1189     - free_pg_vec(pg_vec, order, block_nr);
1190     - return NULL;
1191     -}
1192     -
1193     -
1194     -static void
1195     -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
1196     - unsigned int order)
1197     -{
1198     - struct netlink_sock *nlk = nlk_sk(sk);
1199     - struct sk_buff_head *queue;
1200     - struct netlink_ring *ring;
1201     -
1202     - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1203     - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
1204     -
1205     - spin_lock_bh(&queue->lock);
1206     -
1207     - ring->frame_max = req->nm_frame_nr - 1;
1208     - ring->head = 0;
1209     - ring->frame_size = req->nm_frame_size;
1210     - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
1211     -
1212     - swap(ring->pg_vec_len, req->nm_block_nr);
1213     - swap(ring->pg_vec_order, order);
1214     - swap(ring->pg_vec, pg_vec);
1215     -
1216     - __skb_queue_purge(queue);
1217     - spin_unlock_bh(&queue->lock);
1218     -
1219     - WARN_ON(atomic_read(&nlk->mapped));
1220     -
1221     - if (pg_vec)
1222     - free_pg_vec(pg_vec, order, req->nm_block_nr);
1223     -}
1224     -
1225     -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
1226     - bool tx_ring)
1227     -{
1228     - struct netlink_sock *nlk = nlk_sk(sk);
1229     - struct netlink_ring *ring;
1230     - void **pg_vec = NULL;
1231     - unsigned int order = 0;
1232     -
1233     - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
1234     -
1235     - if (atomic_read(&nlk->mapped))
1236     - return -EBUSY;
1237     - if (atomic_read(&ring->pending))
1238     - return -EBUSY;
1239     -
1240     - if (req->nm_block_nr) {
1241     - if (ring->pg_vec != NULL)
1242     - return -EBUSY;
1243     -
1244     - if ((int)req->nm_block_size <= 0)
1245     - return -EINVAL;
1246     - if (!PAGE_ALIGNED(req->nm_block_size))
1247     - return -EINVAL;
1248     - if (req->nm_frame_size < NL_MMAP_HDRLEN)
1249     - return -EINVAL;
1250     - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
1251     - return -EINVAL;
1252     -
1253     - ring->frames_per_block = req->nm_block_size /
1254     - req->nm_frame_size;
1255     - if (ring->frames_per_block == 0)
1256     - return -EINVAL;
1257     - if (ring->frames_per_block * req->nm_block_nr !=
1258     - req->nm_frame_nr)
1259     - return -EINVAL;
1260     -
1261     - order = get_order(req->nm_block_size);
1262     - pg_vec = alloc_pg_vec(nlk, req, order);
1263     - if (pg_vec == NULL)
1264     - return -ENOMEM;
1265     - } else {
1266     - if (req->nm_frame_nr)
1267     - return -EINVAL;
1268     - }
1269     -
1270     - mutex_lock(&nlk->pg_vec_lock);
1271     - if (atomic_read(&nlk->mapped) == 0) {
1272     - __netlink_set_ring(sk, req, tx_ring, pg_vec, order);
1273     - mutex_unlock(&nlk->pg_vec_lock);
1274     - return 0;
1275     - }
1276     -
1277     - mutex_unlock(&nlk->pg_vec_lock);
1278     -
1279     - if (pg_vec)
1280     - free_pg_vec(pg_vec, order, req->nm_block_nr);
1281     -
1282     - return -EBUSY;
1283     -}
1284     -
1285     -static void netlink_mm_open(struct vm_area_struct *vma)
1286     -{
1287     - struct file *file = vma->vm_file;
1288     - struct socket *sock = file->private_data;
1289     - struct sock *sk = sock->sk;
1290     -
1291     - if (sk)
1292     - atomic_inc(&nlk_sk(sk)->mapped);
1293     -}
1294     -
1295     -static void netlink_mm_close(struct vm_area_struct *vma)
1296     -{
1297     - struct file *file = vma->vm_file;
1298     - struct socket *sock = file->private_data;
1299     - struct sock *sk = sock->sk;
1300     -
1301     - if (sk)
1302     - atomic_dec(&nlk_sk(sk)->mapped);
1303     -}
1304     -
1305     -static const struct vm_operations_struct netlink_mmap_ops = {
1306     - .open = netlink_mm_open,
1307     - .close = netlink_mm_close,
1308     -};
1309     -
1310     -static int netlink_mmap(struct file *file, struct socket *sock,
1311     - struct vm_area_struct *vma)
1312     -{
1313     - struct sock *sk = sock->sk;
1314     - struct netlink_sock *nlk = nlk_sk(sk);
1315     - struct netlink_ring *ring;
1316     - unsigned long start, size, expected;
1317     - unsigned int i;
1318     - int err = -EINVAL;
1319     -
1320     - if (vma->vm_pgoff)
1321     - return -EINVAL;
1322     -
1323     - mutex_lock(&nlk->pg_vec_lock);
1324     -
1325     - expected = 0;
1326     - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
1327     - if (ring->pg_vec == NULL)
1328     - continue;
1329     - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
1330     - }
1331     -
1332     - if (expected == 0)
1333     - goto out;
1334     -
1335     - size = vma->vm_end - vma->vm_start;
1336     - if (size != expected)
1337     - goto out;
1338     -
1339     - start = vma->vm_start;
1340     - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
1341     - if (ring->pg_vec == NULL)
1342     - continue;
1343     -
1344     - for (i = 0; i < ring->pg_vec_len; i++) {
1345     - struct page *page;
1346     - void *kaddr = ring->pg_vec[i];
1347     - unsigned int pg_num;
1348     -
1349     - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
1350     - page = pgvec_to_page(kaddr);
1351     - err = vm_insert_page(vma, start, page);
1352     - if (err < 0)
1353     - goto out;
1354     - start += PAGE_SIZE;
1355     - kaddr += PAGE_SIZE;
1356     - }
1357     - }
1358     - }
1359     -
1360     - atomic_inc(&nlk->mapped);
1361     - vma->vm_ops = &netlink_mmap_ops;
1362     - err = 0;
1363     -out:
1364     - mutex_unlock(&nlk->pg_vec_lock);
1365     - return err;
1366     -}
1367     -
1368     -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
1369     -{
1370     -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1371     - struct page *p_start, *p_end;
1372     -
1373     - /* First page is flushed through netlink_{get,set}_status */
1374     - p_start = pgvec_to_page(hdr + PAGE_SIZE);
1375     - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
1376     - while (p_start <= p_end) {
1377     - flush_dcache_page(p_start);
1378     - p_start++;
1379     - }
1380     -#endif
1381     -}
1382     -
1383     -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
1384     -{
1385     - smp_rmb();
1386     - flush_dcache_page(pgvec_to_page(hdr));
1387     - return hdr->nm_status;
1388     -}
1389     -
1390     -static void netlink_set_status(struct nl_mmap_hdr *hdr,
1391     - enum nl_mmap_status status)
1392     -{
1393     - smp_mb();
1394     - hdr->nm_status = status;
1395     - flush_dcache_page(pgvec_to_page(hdr));
1396     -}
1397     -
1398     -static struct nl_mmap_hdr *
1399     -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
1400     -{
1401     - unsigned int pg_vec_pos, frame_off;
1402     -
1403     - pg_vec_pos = pos / ring->frames_per_block;
1404     - frame_off = pos % ring->frames_per_block;
1405     -
1406     - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
1407     -}
1408     -
1409     -static struct nl_mmap_hdr *
1410     -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
1411     - enum nl_mmap_status status)
1412     -{
1413     - struct nl_mmap_hdr *hdr;
1414     -
1415     - hdr = __netlink_lookup_frame(ring, pos);
1416     - if (netlink_get_status(hdr) != status)
1417     - return NULL;
1418     -
1419     - return hdr;
1420     -}
1421     -
1422     -static struct nl_mmap_hdr *
1423     -netlink_current_frame(const struct netlink_ring *ring,
1424     - enum nl_mmap_status status)
1425     -{
1426     - return netlink_lookup_frame(ring, ring->head, status);
1427     -}
1428     -
1429     -static void netlink_increment_head(struct netlink_ring *ring)
1430     -{
1431     - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
1432     -}
1433     -
1434     -static void netlink_forward_ring(struct netlink_ring *ring)
1435     -{
1436     - unsigned int head = ring->head;
1437     - const struct nl_mmap_hdr *hdr;
1438     -
1439     - do {
1440     - hdr = __netlink_lookup_frame(ring, ring->head);
1441     - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
1442     - break;
1443     - if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
1444     - break;
1445     - netlink_increment_head(ring);
1446     - } while (ring->head != head);
1447     -}
1448     -
1449     -static bool netlink_has_valid_frame(struct netlink_ring *ring)
1450     -{
1451     - unsigned int head = ring->head, pos = head;
1452     - const struct nl_mmap_hdr *hdr;
1453     -
1454     - do {
1455     - hdr = __netlink_lookup_frame(ring, pos);
1456     - if (hdr->nm_status == NL_MMAP_STATUS_VALID)
1457     - return true;
1458     - pos = pos != 0 ? pos - 1 : ring->frame_max;
1459     - } while (pos != head);
1460     -
1461     - return false;
1462     -}
1463     -
1464     -static bool netlink_dump_space(struct netlink_sock *nlk)
1465     -{
1466     - struct netlink_ring *ring = &nlk->rx_ring;
1467     - struct nl_mmap_hdr *hdr;
1468     - unsigned int n;
1469     -
1470     - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
1471     - if (hdr == NULL)
1472     - return false;
1473     -
1474     - n = ring->head + ring->frame_max / 2;
1475     - if (n > ring->frame_max)
1476     - n -= ring->frame_max;
1477     -
1478     - hdr = __netlink_lookup_frame(ring, n);
1479     -
1480     - return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
1481     -}
1482     -
1483     -static unsigned int netlink_poll(struct file *file, struct socket *sock,
1484     - poll_table *wait)
1485     -{
1486     - struct sock *sk = sock->sk;
1487     - struct netlink_sock *nlk = nlk_sk(sk);
1488     - unsigned int mask;
1489     - int err;
1490     -
1491     - if (nlk->rx_ring.pg_vec != NULL) {
1492     - /* Memory mapped sockets don't call recvmsg(), so flow control
1493     - * for dumps is performed here. A dump is allowed to continue
1494     - * if at least half the ring is unused.
1495     - */
1496     - while (nlk->cb_running && netlink_dump_space(nlk)) {
1497     - err = netlink_dump(sk);
1498     - if (err < 0) {
1499     - sk->sk_err = -err;
1500     - sk->sk_error_report(sk);
1501     - break;
1502     - }
1503     - }
1504     - netlink_rcv_wake(sk);
1505     - }
1506     -
1507     - mask = datagram_poll(file, sock, wait);
1508     -
1509     - /* We could already have received frames in the normal receive
1510     - * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
1511     - * so if mask contains pollin/etc already, there's no point
1512     - * walking the ring.
1513     - */
1514     - if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
1515     - spin_lock_bh(&sk->sk_receive_queue.lock);
1516     - if (nlk->rx_ring.pg_vec) {
1517     - if (netlink_has_valid_frame(&nlk->rx_ring))
1518     - mask |= POLLIN | POLLRDNORM;
1519     - }
1520     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1521     - }
1522     -
1523     - spin_lock_bh(&sk->sk_write_queue.lock);
1524     - if (nlk->tx_ring.pg_vec) {
1525     - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
1526     - mask |= POLLOUT | POLLWRNORM;
1527     - }
1528     - spin_unlock_bh(&sk->sk_write_queue.lock);
1529     -
1530     - return mask;
1531     -}
1532     -
1533     -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
1534     -{
1535     - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
1536     -}
1537     -
1538     -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
1539     - struct netlink_ring *ring,
1540     - struct nl_mmap_hdr *hdr)
1541     -{
1542     - unsigned int size;
1543     - void *data;
1544     -
1545     - size = ring->frame_size - NL_MMAP_HDRLEN;
1546     - data = (void *)hdr + NL_MMAP_HDRLEN;
1547     -
1548     - skb->head = data;
1549     - skb->data = data;
1550     - skb_reset_tail_pointer(skb);
1551     - skb->end = skb->tail + size;
1552     - skb->len = 0;
1553     -
1554     - skb->destructor = netlink_skb_destructor;
1555     - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
1556     - NETLINK_CB(skb).sk = sk;
1557     -}
1558     -
1559     -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
1560     - u32 dst_portid, u32 dst_group,
1561     - struct scm_cookie *scm)
1562     -{
1563     - struct netlink_sock *nlk = nlk_sk(sk);
1564     - struct netlink_ring *ring;
1565     - struct nl_mmap_hdr *hdr;
1566     - struct sk_buff *skb;
1567     - unsigned int maxlen;
1568     - int err = 0, len = 0;
1569     -
1570     - mutex_lock(&nlk->pg_vec_lock);
1571     -
1572     - ring = &nlk->tx_ring;
1573     - maxlen = ring->frame_size - NL_MMAP_HDRLEN;
1574     -
1575     - do {
1576     - unsigned int nm_len;
1577     -
1578     - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
1579     - if (hdr == NULL) {
1580     - if (!(msg->msg_flags & MSG_DONTWAIT) &&
1581     - atomic_read(&nlk->tx_ring.pending))
1582     - schedule();
1583     - continue;
1584     - }
1585     -
1586     - nm_len = ACCESS_ONCE(hdr->nm_len);
1587     - if (nm_len > maxlen) {
1588     - err = -EINVAL;
1589     - goto out;
1590     - }
1591     -
1592     - netlink_frame_flush_dcache(hdr, nm_len);
1593     -
1594     - skb = alloc_skb(nm_len, GFP_KERNEL);
1595     - if (skb == NULL) {
1596     - err = -ENOBUFS;
1597     - goto out;
1598     - }
1599     - __skb_put(skb, nm_len);
1600     - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
1601     - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
1602     -
1603     - netlink_increment_head(ring);
1604     -
1605     - NETLINK_CB(skb).portid = nlk->portid;
1606     - NETLINK_CB(skb).dst_group = dst_group;
1607     - NETLINK_CB(skb).creds = scm->creds;
1608     -
1609     - err = security_netlink_send(sk, skb);
1610     - if (err) {
1611     - kfree_skb(skb);
1612     - goto out;
1613     - }
1614     -
1615     - if (unlikely(dst_group)) {
1616     - atomic_inc(&skb->users);
1617     - netlink_broadcast(sk, skb, dst_portid, dst_group,
1618     - GFP_KERNEL);
1619     - }
1620     - err = netlink_unicast(sk, skb, dst_portid,
1621     - msg->msg_flags & MSG_DONTWAIT);
1622     - if (err < 0)
1623     - goto out;
1624     - len += err;
1625     -
1626     - } while (hdr != NULL ||
1627     - (!(msg->msg_flags & MSG_DONTWAIT) &&
1628     - atomic_read(&nlk->tx_ring.pending)));
1629     -
1630     - if (len > 0)
1631     - err = len;
1632     -out:
1633     - mutex_unlock(&nlk->pg_vec_lock);
1634     - return err;
1635     -}
1636     -
1637     -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
1638     -{
1639     - struct nl_mmap_hdr *hdr;
1640     -
1641     - hdr = netlink_mmap_hdr(skb);
1642     - hdr->nm_len = skb->len;
1643     - hdr->nm_group = NETLINK_CB(skb).dst_group;
1644     - hdr->nm_pid = NETLINK_CB(skb).creds.pid;
1645     - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
1646     - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
1647     - netlink_frame_flush_dcache(hdr, hdr->nm_len);
1648     - netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
1649     -
1650     - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
1651     - kfree_skb(skb);
1652     -}
1653     -
1654     -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
1655     -{
1656     - struct netlink_sock *nlk = nlk_sk(sk);
1657     - struct netlink_ring *ring = &nlk->rx_ring;
1658     - struct nl_mmap_hdr *hdr;
1659     -
1660     - spin_lock_bh(&sk->sk_receive_queue.lock);
1661     - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
1662     - if (hdr == NULL) {
1663     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1664     - kfree_skb(skb);
1665     - netlink_overrun(sk);
1666     - return;
1667     - }
1668     - netlink_increment_head(ring);
1669     - __skb_queue_tail(&sk->sk_receive_queue, skb);
1670     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1671     -
1672     - hdr->nm_len = skb->len;
1673     - hdr->nm_group = NETLINK_CB(skb).dst_group;
1674     - hdr->nm_pid = NETLINK_CB(skb).creds.pid;
1675     - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
1676     - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
1677     - netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
1678     -}
1679     -
1680     -#else /* CONFIG_NETLINK_MMAP */
1681     -#define netlink_rx_is_mmaped(sk) false
1682     -#define netlink_tx_is_mmaped(sk) false
1683     -#define netlink_mmap sock_no_mmap
1684     -#define netlink_poll datagram_poll
1685     -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0
1686     -#endif /* CONFIG_NETLINK_MMAP */
1687     -
1688     static void netlink_skb_destructor(struct sk_buff *skb)
1689     {
1690     -#ifdef CONFIG_NETLINK_MMAP
1691     - struct nl_mmap_hdr *hdr;
1692     - struct netlink_ring *ring;
1693     - struct sock *sk;
1694     -
1695     - /* If a packet from the kernel to userspace was freed because of an
1696     - * error without being delivered to userspace, the kernel must reset
1697     - * the status. In the direction userspace to kernel, the status is
1698     - * always reset here after the packet was processed and freed.
1699     - */
1700     - if (netlink_skb_is_mmaped(skb)) {
1701     - hdr = netlink_mmap_hdr(skb);
1702     - sk = NETLINK_CB(skb).sk;
1703     -
1704     - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
1705     - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
1706     - ring = &nlk_sk(sk)->tx_ring;
1707     - } else {
1708     - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
1709     - hdr->nm_len = 0;
1710     - netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
1711     - }
1712     - ring = &nlk_sk(sk)->rx_ring;
1713     - }
1714     -
1715     - WARN_ON(atomic_read(&ring->pending) == 0);
1716     - atomic_dec(&ring->pending);
1717     - sock_put(sk);
1718     -
1719     - skb->head = NULL;
1720     - }
1721     -#endif
1722     if (is_vmalloc_addr(skb->head)) {
1723     if (!skb->cloned ||
1724     !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
1725     @@ -936,18 +334,6 @@ static void netlink_sock_destruct(struct sock *sk)
1726     }
1727    
1728     skb_queue_purge(&sk->sk_receive_queue);
1729     -#ifdef CONFIG_NETLINK_MMAP
1730     - if (1) {
1731     - struct nl_mmap_req req;
1732     -
1733     - memset(&req, 0, sizeof(req));
1734     - if (nlk->rx_ring.pg_vec)
1735     - __netlink_set_ring(sk, &req, false, NULL, 0);
1736     - memset(&req, 0, sizeof(req));
1737     - if (nlk->tx_ring.pg_vec)
1738     - __netlink_set_ring(sk, &req, true, NULL, 0);
1739     - }
1740     -#endif /* CONFIG_NETLINK_MMAP */
1741    
1742     if (!sock_flag(sk, SOCK_DEAD)) {
1743     printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
1744     @@ -1201,9 +587,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
1745     mutex_init(nlk->cb_mutex);
1746     }
1747     init_waitqueue_head(&nlk->wait);
1748     -#ifdef CONFIG_NETLINK_MMAP
1749     - mutex_init(&nlk->pg_vec_lock);
1750     -#endif
1751    
1752     sk->sk_destruct = netlink_sock_destruct;
1753     sk->sk_protocol = protocol;
1754     @@ -1745,8 +1128,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
1755     nlk = nlk_sk(sk);
1756    
1757     if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
1758     - test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
1759     - !netlink_skb_is_mmaped(skb)) {
1760     + test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
1761     DECLARE_WAITQUEUE(wait, current);
1762     if (!*timeo) {
1763     if (!ssk || netlink_is_kernel(ssk))
1764     @@ -1784,14 +1166,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
1765    
1766     netlink_deliver_tap(skb);
1767    
1768     -#ifdef CONFIG_NETLINK_MMAP
1769     - if (netlink_skb_is_mmaped(skb))
1770     - netlink_queue_mmaped_skb(sk, skb);
1771     - else if (netlink_rx_is_mmaped(sk))
1772     - netlink_ring_set_copied(sk, skb);
1773     - else
1774     -#endif /* CONFIG_NETLINK_MMAP */
1775     - skb_queue_tail(&sk->sk_receive_queue, skb);
1776     + skb_queue_tail(&sk->sk_receive_queue, skb);
1777     sk->sk_data_ready(sk);
1778     return len;
1779     }
1780     @@ -1815,9 +1190,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
1781     int delta;
1782    
1783     WARN_ON(skb->sk != NULL);
1784     - if (netlink_skb_is_mmaped(skb))
1785     - return skb;
1786     -
1787     delta = skb->end - skb->tail;
1788     if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
1789     return skb;
1790     @@ -1897,71 +1269,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
1791     unsigned int ldiff, u32 dst_portid,
1792     gfp_t gfp_mask)
1793     {
1794     -#ifdef CONFIG_NETLINK_MMAP
1795     - unsigned int maxlen, linear_size;
1796     - struct sock *sk = NULL;
1797     - struct sk_buff *skb;
1798     - struct netlink_ring *ring;
1799     - struct nl_mmap_hdr *hdr;
1800     -
1801     - sk = netlink_getsockbyportid(ssk, dst_portid);
1802     - if (IS_ERR(sk))
1803     - goto out;
1804     -
1805     - ring = &nlk_sk(sk)->rx_ring;
1806     - /* fast-path without atomic ops for common case: non-mmaped receiver */
1807     - if (ring->pg_vec == NULL)
1808     - goto out_put;
1809     -
1810     - /* We need to account the full linear size needed as a ring
1811     - * slot cannot have non-linear parts.
1812     - */
1813     - linear_size = size + ldiff;
1814     - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
1815     - goto out_put;
1816     -
1817     - skb = alloc_skb_head(gfp_mask);
1818     - if (skb == NULL)
1819     - goto err1;
1820     -
1821     - spin_lock_bh(&sk->sk_receive_queue.lock);
1822     - /* check again under lock */
1823     - if (ring->pg_vec == NULL)
1824     - goto out_free;
1825     -
1826     - /* check again under lock */
1827     - maxlen = ring->frame_size - NL_MMAP_HDRLEN;
1828     - if (maxlen < linear_size)
1829     - goto out_free;
1830     -
1831     - netlink_forward_ring(ring);
1832     - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
1833     - if (hdr == NULL)
1834     - goto err2;
1835     -
1836     - netlink_ring_setup_skb(skb, sk, ring, hdr);
1837     - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
1838     - atomic_inc(&ring->pending);
1839     - netlink_increment_head(ring);
1840     -
1841     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1842     - return skb;
1843     -
1844     -err2:
1845     - kfree_skb(skb);
1846     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1847     - netlink_overrun(sk);
1848     -err1:
1849     - sock_put(sk);
1850     - return NULL;
1851     -
1852     -out_free:
1853     - kfree_skb(skb);
1854     - spin_unlock_bh(&sk->sk_receive_queue.lock);
1855     -out_put:
1856     - sock_put(sk);
1857     -out:
1858     -#endif
1859     return alloc_skb(size, gfp_mask);
1860     }
1861     EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
1862     @@ -2242,8 +1549,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1863     if (level != SOL_NETLINK)
1864     return -ENOPROTOOPT;
1865    
1866     - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
1867     - optlen >= sizeof(int) &&
1868     + if (optlen >= sizeof(int) &&
1869     get_user(val, (unsigned int __user *)optval))
1870     return -EFAULT;
1871    
1872     @@ -2296,25 +1602,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1873     }
1874     err = 0;
1875     break;
1876     -#ifdef CONFIG_NETLINK_MMAP
1877     - case NETLINK_RX_RING:
1878     - case NETLINK_TX_RING: {
1879     - struct nl_mmap_req req;
1880     -
1881     - /* Rings might consume more memory than queue limits, require
1882     - * CAP_NET_ADMIN.
1883     - */
1884     - if (!capable(CAP_NET_ADMIN))
1885     - return -EPERM;
1886     - if (optlen < sizeof(req))
1887     - return -EINVAL;
1888     - if (copy_from_user(&req, optval, sizeof(req)))
1889     - return -EFAULT;
1890     - err = netlink_set_ring(sk, &req,
1891     - optname == NETLINK_TX_RING);
1892     - break;
1893     - }
1894     -#endif /* CONFIG_NETLINK_MMAP */
1895     case NETLINK_LISTEN_ALL_NSID:
1896     if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
1897     return -EPERM;
1898     @@ -2484,18 +1771,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1899     smp_rmb();
1900     }
1901    
1902     - /* It's a really convoluted way for userland to ask for mmaped
1903     - * sendmsg(), but that's what we've got...
1904     - */
1905     - if (netlink_tx_is_mmaped(sk) &&
1906     - iter_is_iovec(&msg->msg_iter) &&
1907     - msg->msg_iter.nr_segs == 1 &&
1908     - msg->msg_iter.iov->iov_base == NULL) {
1909     - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
1910     - &scm);
1911     - goto out;
1912     - }
1913     -
1914     err = -EMSGSIZE;
1915     if (len > sk->sk_sndbuf - 32)
1916     goto out;
1917     @@ -2812,8 +2087,7 @@ static int netlink_dump(struct sock *sk)
1918     goto errout_skb;
1919     }
1920    
1921     - if (!netlink_rx_is_mmaped(sk) &&
1922     - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1923     + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1924     goto errout_skb;
1925    
1926     /* NLMSG_GOODSIZE is small to avoid high order allocations being
1927     @@ -2902,16 +2176,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
1928     struct netlink_sock *nlk;
1929     int ret;
1930    
1931     - /* Memory mapped dump requests need to be copied to avoid looping
1932     - * on the pending state in netlink_mmap_sendmsg() while the CB hold
1933     - * a reference to the skb.
1934     - */
1935     - if (netlink_skb_is_mmaped(skb)) {
1936     - skb = skb_copy(skb, GFP_KERNEL);
1937     - if (skb == NULL)
1938     - return -ENOBUFS;
1939     - } else
1940     - atomic_inc(&skb->users);
1941     + atomic_inc(&skb->users);
1942    
1943     sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
1944     if (sk == NULL) {
1945     @@ -3255,7 +2520,7 @@ static const struct proto_ops netlink_ops = {
1946     .socketpair = sock_no_socketpair,
1947     .accept = sock_no_accept,
1948     .getname = netlink_getname,
1949     - .poll = netlink_poll,
1950     + .poll = datagram_poll,
1951     .ioctl = sock_no_ioctl,
1952     .listen = sock_no_listen,
1953     .shutdown = sock_no_shutdown,
1954     @@ -3263,7 +2528,7 @@ static const struct proto_ops netlink_ops = {
1955     .getsockopt = netlink_getsockopt,
1956     .sendmsg = netlink_sendmsg,
1957     .recvmsg = netlink_recvmsg,
1958     - .mmap = netlink_mmap,
1959     + .mmap = sock_no_mmap,
1960     .sendpage = sock_no_sendpage,
1961     };
1962    
1963     diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
1964     index df32cb92d9fc..ea4600aea6b0 100644
1965     --- a/net/netlink/af_netlink.h
1966     +++ b/net/netlink/af_netlink.h
1967     @@ -45,12 +45,6 @@ struct netlink_sock {
1968     int (*netlink_bind)(struct net *net, int group);
1969     void (*netlink_unbind)(struct net *net, int group);
1970     struct module *module;
1971     -#ifdef CONFIG_NETLINK_MMAP
1972     - struct mutex pg_vec_lock;
1973     - struct netlink_ring rx_ring;
1974     - struct netlink_ring tx_ring;
1975     - atomic_t mapped;
1976     -#endif /* CONFIG_NETLINK_MMAP */
1977    
1978     struct rhash_head node;
1979     struct rcu_head rcu;
1980     @@ -62,15 +56,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
1981     return container_of(sk, struct netlink_sock, sk);
1982     }
1983    
1984     -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
1985     -{
1986     -#ifdef CONFIG_NETLINK_MMAP
1987     - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
1988     -#else
1989     - return false;
1990     -#endif /* CONFIG_NETLINK_MMAP */
1991     -}
1992     -
1993     struct netlink_table {
1994     struct rhashtable hash;
1995     struct hlist_head mc_list;
1996     diff --git a/net/netlink/diag.c b/net/netlink/diag.c
1997     index 3ee63a3cff30..8dd836a8dd60 100644
1998     --- a/net/netlink/diag.c
1999     +++ b/net/netlink/diag.c
2000     @@ -8,41 +8,6 @@
2001    
2002     #include "af_netlink.h"
2003    
2004     -#ifdef CONFIG_NETLINK_MMAP
2005     -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
2006     - struct sk_buff *nlskb)
2007     -{
2008     - struct netlink_diag_ring ndr;
2009     -
2010     - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
2011     - ndr.ndr_block_nr = ring->pg_vec_len;
2012     - ndr.ndr_frame_size = ring->frame_size;
2013     - ndr.ndr_frame_nr = ring->frame_max + 1;
2014     -
2015     - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
2016     -}
2017     -
2018     -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
2019     -{
2020     - struct netlink_sock *nlk = nlk_sk(sk);
2021     - int ret;
2022     -
2023     - mutex_lock(&nlk->pg_vec_lock);
2024     - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
2025     - if (!ret)
2026     - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
2027     - nlskb);
2028     - mutex_unlock(&nlk->pg_vec_lock);
2029     -
2030     - return ret;
2031     -}
2032     -#else
2033     -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
2034     -{
2035     - return 0;
2036     -}
2037     -#endif
2038     -
2039     static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
2040     {
2041     struct netlink_sock *nlk = nlk_sk(sk);
2042     @@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
2043     sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
2044     goto out_nlmsg_trim;
2045    
2046     - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
2047     - sk_diag_put_rings_cfg(sk, skb))
2048     - goto out_nlmsg_trim;
2049     -
2050     nlmsg_end(skb, nlh);
2051     return 0;
2052    
2053     diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
2054     index d805cd577a60..3975ac809934 100644
2055     --- a/net/packet/af_packet.c
2056     +++ b/net/packet/af_packet.c
2057     @@ -3021,7 +3021,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2058     int addr_len)
2059     {
2060     struct sock *sk = sock->sk;
2061     - char name[15];
2062     + char name[sizeof(uaddr->sa_data) + 1];
2063    
2064     /*
2065     * Check legality
2066     @@ -3029,7 +3029,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2067    
2068     if (addr_len != sizeof(struct sockaddr))
2069     return -EINVAL;
2070     - strlcpy(name, uaddr->sa_data, sizeof(name));
2071     + /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
2072     + * zero-terminated.
2073     + */
2074     + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
2075     + name[sizeof(uaddr->sa_data)] = 0;
2076    
2077     return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
2078     }
2079     diff --git a/net/sched/act_api.c b/net/sched/act_api.c
2080     index 06e7c4a37245..694a06f1e0d5 100644
2081     --- a/net/sched/act_api.c
2082     +++ b/net/sched/act_api.c
2083     @@ -820,10 +820,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
2084     goto out_module_put;
2085    
2086     err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
2087     - if (err < 0)
2088     + if (err <= 0)
2089     goto out_module_put;
2090     - if (err == 0)
2091     - goto noflush_out;
2092    
2093     nla_nest_end(skb, nest);
2094    
2095     @@ -840,7 +838,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
2096     out_module_put:
2097     module_put(a.ops->owner);
2098     err_out:
2099     -noflush_out:
2100     kfree_skb(skb);
2101     return err;
2102     }
2103     diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
2104     index bb41699c6c49..7ecb14f3db54 100644
2105     --- a/net/sched/act_connmark.c
2106     +++ b/net/sched/act_connmark.c
2107     @@ -109,6 +109,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
2108     if (ret < 0)
2109     return ret;
2110    
2111     + if (!tb[TCA_CONNMARK_PARMS])
2112     + return -EINVAL;
2113     +
2114     parm = nla_data(tb[TCA_CONNMARK_PARMS]);
2115    
2116     if (!tcf_hash_check(parm->index, a, bind)) {