Magellan Linux

Annotation of /trunk/kernel-alx/patches-4.9/0194-4.9.95-all-fixes.patch



Revision 3171
Wed Aug 8 14:17:26 2018 UTC by niro
File size: 186555 byte(s)
-linux-4.9.95
1 niro 3171 diff --git a/Makefile b/Makefile
2     index 02188cf8e9af..1aeec9df709d 100644
3     --- a/Makefile
4     +++ b/Makefile
5     @@ -1,6 +1,6 @@
6     VERSION = 4
7     PATCHLEVEL = 9
8     -SUBLEVEL = 94
9     +SUBLEVEL = 95
10     EXTRAVERSION =
11     NAME = Roaring Lionus
12    
13     diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
14     index d5423ab15ed5..9fe1043e72d2 100644
15     --- a/arch/arm/include/asm/kvm_host.h
16     +++ b/arch/arm/include/asm/kvm_host.h
17     @@ -318,4 +318,10 @@ static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
18     return -ENXIO;
19     }
20    
21     +static inline bool kvm_arm_harden_branch_predictor(void)
22     +{
23     + /* No way to detect it yet, pretend it is not there. */
24     + return false;
25     +}
26     +
27     #endif /* __ARM_KVM_HOST_H__ */
28     diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
29     index a58bbaa3ec60..d10e36235438 100644
30     --- a/arch/arm/include/asm/kvm_mmu.h
31     +++ b/arch/arm/include/asm/kvm_mmu.h
32     @@ -223,6 +223,16 @@ static inline unsigned int kvm_get_vmid_bits(void)
33     return 8;
34     }
35    
36     +static inline void *kvm_get_hyp_vector(void)
37     +{
38     + return kvm_ksym_ref(__kvm_hyp_vector);
39     +}
40     +
41     +static inline int kvm_map_vectors(void)
42     +{
43     + return 0;
44     +}
45     +
46     #endif /* !__ASSEMBLY__ */
47    
48     #endif /* __ARM_KVM_MMU_H__ */
49     diff --git a/arch/arm/include/asm/kvm_psci.h b/arch/arm/include/asm/kvm_psci.h
50     deleted file mode 100644
51     index 6bda945d31fa..000000000000
52     --- a/arch/arm/include/asm/kvm_psci.h
53     +++ /dev/null
54     @@ -1,27 +0,0 @@
55     -/*
56     - * Copyright (C) 2012 - ARM Ltd
57     - * Author: Marc Zyngier <marc.zyngier@arm.com>
58     - *
59     - * This program is free software; you can redistribute it and/or modify
60     - * it under the terms of the GNU General Public License version 2 as
61     - * published by the Free Software Foundation.
62     - *
63     - * This program is distributed in the hope that it will be useful,
64     - * but WITHOUT ANY WARRANTY; without even the implied warranty of
65     - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
66     - * GNU General Public License for more details.
67     - *
68     - * You should have received a copy of the GNU General Public License
69     - * along with this program. If not, see <http://www.gnu.org/licenses/>.
70     - */
71     -
72     -#ifndef __ARM_KVM_PSCI_H__
73     -#define __ARM_KVM_PSCI_H__
74     -
75     -#define KVM_ARM_PSCI_0_1 1
76     -#define KVM_ARM_PSCI_0_2 2
77     -
78     -int kvm_psci_version(struct kvm_vcpu *vcpu);
79     -int kvm_psci_call(struct kvm_vcpu *vcpu);
80     -
81     -#endif /* __ARM_KVM_PSCI_H__ */
82     diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
83     index c38bfbeec306..ef6595c7d697 100644
84     --- a/arch/arm/kvm/arm.c
85     +++ b/arch/arm/kvm/arm.c
86     @@ -29,6 +29,7 @@
87     #include <linux/kvm.h>
88     #include <trace/events/kvm.h>
89     #include <kvm/arm_pmu.h>
90     +#include <kvm/arm_psci.h>
91    
92     #define CREATE_TRACE_POINTS
93     #include "trace.h"
94     @@ -44,7 +45,6 @@
95     #include <asm/kvm_mmu.h>
96     #include <asm/kvm_emulate.h>
97     #include <asm/kvm_coproc.h>
98     -#include <asm/kvm_psci.h>
99     #include <asm/sections.h>
100    
101     #ifdef REQUIRES_VIRT
102     @@ -1088,7 +1088,7 @@ static void cpu_init_hyp_mode(void *dummy)
103     pgd_ptr = kvm_mmu_get_httbr();
104     stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
105     hyp_stack_ptr = stack_page + PAGE_SIZE;
106     - vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
107     + vector_ptr = (unsigned long)kvm_get_hyp_vector();
108    
109     __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
110     __cpu_init_stage2();
111     @@ -1345,6 +1345,13 @@ static int init_hyp_mode(void)
112     goto out_err;
113     }
114    
115     +
116     + err = kvm_map_vectors();
117     + if (err) {
118     + kvm_err("Cannot map vectors\n");
119     + goto out_err;
120     + }
121     +
122     /*
123     * Map the Hyp stack pages
124     */
125     diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
126     index 4e57ebca6e69..de1aedce2a8b 100644
127     --- a/arch/arm/kvm/handle_exit.c
128     +++ b/arch/arm/kvm/handle_exit.c
129     @@ -21,7 +21,7 @@
130     #include <asm/kvm_emulate.h>
131     #include <asm/kvm_coproc.h>
132     #include <asm/kvm_mmu.h>
133     -#include <asm/kvm_psci.h>
134     +#include <kvm/arm_psci.h>
135     #include <trace/events/kvm.h>
136    
137     #include "trace.h"
138     @@ -36,7 +36,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
139     kvm_vcpu_hvc_get_imm(vcpu));
140     vcpu->stat.hvc_exit_stat++;
141    
142     - ret = kvm_psci_call(vcpu);
143     + ret = kvm_hvc_call_handler(vcpu);
144     if (ret < 0) {
145     vcpu_set_reg(vcpu, 0, ~0UL);
146     return 1;
147     diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
148     index a08d7a93aebb..3d962257c166 100644
149     --- a/arch/arm/kvm/psci.c
150     +++ b/arch/arm/kvm/psci.c
151     @@ -15,16 +15,16 @@
152     * along with this program. If not, see <http://www.gnu.org/licenses/>.
153     */
154    
155     +#include <linux/arm-smccc.h>
156     #include <linux/preempt.h>
157     #include <linux/kvm_host.h>
158     #include <linux/wait.h>
159    
160     #include <asm/cputype.h>
161     #include <asm/kvm_emulate.h>
162     -#include <asm/kvm_psci.h>
163     #include <asm/kvm_host.h>
164    
165     -#include <uapi/linux/psci.h>
166     +#include <kvm/arm_psci.h>
167    
168     /*
169     * This is an implementation of the Power State Coordination Interface
170     @@ -33,6 +33,38 @@
171    
172     #define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
173    
174     +static u32 smccc_get_function(struct kvm_vcpu *vcpu)
175     +{
176     + return vcpu_get_reg(vcpu, 0);
177     +}
178     +
179     +static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
180     +{
181     + return vcpu_get_reg(vcpu, 1);
182     +}
183     +
184     +static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
185     +{
186     + return vcpu_get_reg(vcpu, 2);
187     +}
188     +
189     +static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
190     +{
191     + return vcpu_get_reg(vcpu, 3);
192     +}
193     +
194     +static void smccc_set_retval(struct kvm_vcpu *vcpu,
195     + unsigned long a0,
196     + unsigned long a1,
197     + unsigned long a2,
198     + unsigned long a3)
199     +{
200     + vcpu_set_reg(vcpu, 0, a0);
201     + vcpu_set_reg(vcpu, 1, a1);
202     + vcpu_set_reg(vcpu, 2, a2);
203     + vcpu_set_reg(vcpu, 3, a3);
204     +}
205     +
206     static unsigned long psci_affinity_mask(unsigned long affinity_level)
207     {
208     if (affinity_level <= 3)
209     @@ -75,7 +107,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
210     unsigned long context_id;
211     phys_addr_t target_pc;
212    
213     - cpu_id = vcpu_get_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK;
214     + cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
215     if (vcpu_mode_is_32bit(source_vcpu))
216     cpu_id &= ~((u32) 0);
217    
218     @@ -88,14 +120,14 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
219     if (!vcpu)
220     return PSCI_RET_INVALID_PARAMS;
221     if (!vcpu->arch.power_off) {
222     - if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
223     + if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
224     return PSCI_RET_ALREADY_ON;
225     else
226     return PSCI_RET_INVALID_PARAMS;
227     }
228    
229     - target_pc = vcpu_get_reg(source_vcpu, 2);
230     - context_id = vcpu_get_reg(source_vcpu, 3);
231     + target_pc = smccc_get_arg2(source_vcpu);
232     + context_id = smccc_get_arg3(source_vcpu);
233    
234     kvm_reset_vcpu(vcpu);
235    
236     @@ -114,7 +146,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
237     * NOTE: We always update r0 (or x0) because for PSCI v0.1
238     * the general puspose registers are undefined upon CPU_ON.
239     */
240     - vcpu_set_reg(vcpu, 0, context_id);
241     + smccc_set_retval(vcpu, context_id, 0, 0, 0);
242     vcpu->arch.power_off = false;
243     smp_mb(); /* Make sure the above is visible */
244    
245     @@ -134,8 +166,8 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
246     struct kvm *kvm = vcpu->kvm;
247     struct kvm_vcpu *tmp;
248    
249     - target_affinity = vcpu_get_reg(vcpu, 1);
250     - lowest_affinity_level = vcpu_get_reg(vcpu, 2);
251     + target_affinity = smccc_get_arg1(vcpu);
252     + lowest_affinity_level = smccc_get_arg2(vcpu);
253    
254     /* Determine target affinity mask */
255     target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
256     @@ -198,18 +230,10 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
257     kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
258     }
259    
260     -int kvm_psci_version(struct kvm_vcpu *vcpu)
261     -{
262     - if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
263     - return KVM_ARM_PSCI_0_2;
264     -
265     - return KVM_ARM_PSCI_0_1;
266     -}
267     -
268     static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
269     {
270     struct kvm *kvm = vcpu->kvm;
271     - unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
272     + unsigned long psci_fn = smccc_get_function(vcpu);
273     unsigned long val;
274     int ret = 1;
275    
276     @@ -219,7 +243,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
277     * Bits[31:16] = Major Version = 0
278     * Bits[15:0] = Minor Version = 2
279     */
280     - val = 2;
281     + val = KVM_ARM_PSCI_0_2;
282     break;
283     case PSCI_0_2_FN_CPU_SUSPEND:
284     case PSCI_0_2_FN64_CPU_SUSPEND:
285     @@ -276,14 +300,56 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
286     break;
287     }
288    
289     - vcpu_set_reg(vcpu, 0, val);
290     + smccc_set_retval(vcpu, val, 0, 0, 0);
291     + return ret;
292     +}
293     +
294     +static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
295     +{
296     + u32 psci_fn = smccc_get_function(vcpu);
297     + u32 feature;
298     + unsigned long val;
299     + int ret = 1;
300     +
301     + switch(psci_fn) {
302     + case PSCI_0_2_FN_PSCI_VERSION:
303     + val = KVM_ARM_PSCI_1_0;
304     + break;
305     + case PSCI_1_0_FN_PSCI_FEATURES:
306     + feature = smccc_get_arg1(vcpu);
307     + switch(feature) {
308     + case PSCI_0_2_FN_PSCI_VERSION:
309     + case PSCI_0_2_FN_CPU_SUSPEND:
310     + case PSCI_0_2_FN64_CPU_SUSPEND:
311     + case PSCI_0_2_FN_CPU_OFF:
312     + case PSCI_0_2_FN_CPU_ON:
313     + case PSCI_0_2_FN64_CPU_ON:
314     + case PSCI_0_2_FN_AFFINITY_INFO:
315     + case PSCI_0_2_FN64_AFFINITY_INFO:
316     + case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
317     + case PSCI_0_2_FN_SYSTEM_OFF:
318     + case PSCI_0_2_FN_SYSTEM_RESET:
319     + case PSCI_1_0_FN_PSCI_FEATURES:
320     + case ARM_SMCCC_VERSION_FUNC_ID:
321     + val = 0;
322     + break;
323     + default:
324     + val = PSCI_RET_NOT_SUPPORTED;
325     + break;
326     + }
327     + break;
328     + default:
329     + return kvm_psci_0_2_call(vcpu);
330     + }
331     +
332     + smccc_set_retval(vcpu, val, 0, 0, 0);
333     return ret;
334     }
335    
336     static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
337     {
338     struct kvm *kvm = vcpu->kvm;
339     - unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
340     + unsigned long psci_fn = smccc_get_function(vcpu);
341     unsigned long val;
342    
343     switch (psci_fn) {
344     @@ -301,7 +367,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
345     break;
346     }
347    
348     - vcpu_set_reg(vcpu, 0, val);
349     + smccc_set_retval(vcpu, val, 0, 0, 0);
350     return 1;
351     }
352    
353     @@ -319,9 +385,11 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
354     * Errors:
355     * -EINVAL: Unrecognized PSCI function
356     */
357     -int kvm_psci_call(struct kvm_vcpu *vcpu)
358     +static int kvm_psci_call(struct kvm_vcpu *vcpu)
359     {
360     - switch (kvm_psci_version(vcpu)) {
361     + switch (kvm_psci_version(vcpu, vcpu->kvm)) {
362     + case KVM_ARM_PSCI_1_0:
363     + return kvm_psci_1_0_call(vcpu);
364     case KVM_ARM_PSCI_0_2:
365     return kvm_psci_0_2_call(vcpu);
366     case KVM_ARM_PSCI_0_1:
367     @@ -330,3 +398,30 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
368     return -EINVAL;
369     };
370     }
371     +
372     +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
373     +{
374     + u32 func_id = smccc_get_function(vcpu);
375     + u32 val = PSCI_RET_NOT_SUPPORTED;
376     + u32 feature;
377     +
378     + switch (func_id) {
379     + case ARM_SMCCC_VERSION_FUNC_ID:
380     + val = ARM_SMCCC_VERSION_1_1;
381     + break;
382     + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
383     + feature = smccc_get_arg1(vcpu);
384     + switch(feature) {
385     + case ARM_SMCCC_ARCH_WORKAROUND_1:
386     + if (kvm_arm_harden_branch_predictor())
387     + val = 0;
388     + break;
389     + }
390     + break;
391     + default:
392     + return kvm_psci_call(vcpu);
393     + }
394     +
395     + smccc_set_retval(vcpu, val, 0, 0, 0);
396     + return 1;
397     +}
398     diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
399     index c8471cf46cbb..90e58bbbd858 100644
400     --- a/arch/arm64/Kconfig
401     +++ b/arch/arm64/Kconfig
402     @@ -745,6 +745,23 @@ config UNMAP_KERNEL_AT_EL0
403    
404     If unsure, say Y.
405    
406     +config HARDEN_BRANCH_PREDICTOR
407     + bool "Harden the branch predictor against aliasing attacks" if EXPERT
408     + default y
409     + help
410     + Speculation attacks against some high-performance processors rely on
411     + being able to manipulate the branch predictor for a victim context by
412     + executing aliasing branches in the attacker context. Such attacks
413     + can be partially mitigated against by clearing internal branch
414     + predictor state and limiting the prediction logic in some situations.
415     +
416     + This config option will take CPU-specific actions to harden the
417     + branch predictor against aliasing attacks and may rely on specific
418     + instruction sequences or control bits being set by the system
419     + firmware.
420     +
421     + If unsure, say Y.
422     +
423     menuconfig ARMV8_DEPRECATED
424     bool "Emulate deprecated/obsolete ARMv8 instructions"
425     depends on COMPAT
426     diff --git a/arch/arm64/crypto/sha256-core.S b/arch/arm64/crypto/sha256-core.S
427     new file mode 100644
428     index 000000000000..3ce82cc860bc
429     --- /dev/null
430     +++ b/arch/arm64/crypto/sha256-core.S
431     @@ -0,0 +1,2061 @@
432     +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
433     +//
434     +// Licensed under the OpenSSL license (the "License"). You may not use
435     +// this file except in compliance with the License. You can obtain a copy
436     +// in the file LICENSE in the source distribution or at
437     +// https://www.openssl.org/source/license.html
438     +
439     +// ====================================================================
440     +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
441     +// project. The module is, however, dual licensed under OpenSSL and
442     +// CRYPTOGAMS licenses depending on where you obtain it. For further
443     +// details see http://www.openssl.org/~appro/cryptogams/.
444     +//
445     +// Permission to use under GPLv2 terms is granted.
446     +// ====================================================================
447     +//
448     +// SHA256/512 for ARMv8.
449     +//
450     +// Performance in cycles per processed byte and improvement coefficient
451     +// over code generated with "default" compiler:
452     +//
453     +// SHA256-hw SHA256(*) SHA512
454     +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
455     +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
456     +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
457     +// Denver 2.01 10.5 (+26%) 6.70 (+8%)
458     +// X-Gene 20.0 (+100%) 12.8 (+300%(***))
459     +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
460     +//
461     +// (*) Software SHA256 results are of lesser relevance, presented
462     +// mostly for informational purposes.
463     +// (**) The result is a trade-off: it's possible to improve it by
464     +// 10% (or by 1 cycle per round), but at the cost of 20% loss
465     +// on Cortex-A53 (or by 4 cycles per round).
466     +// (***) Super-impressive coefficients over gcc-generated code are
467     +// indication of some compiler "pathology", most notably code
468     +// generated with -mgeneral-regs-only is significanty faster
469     +// and the gap is only 40-90%.
470     +//
471     +// October 2016.
472     +//
473     +// Originally it was reckoned that it makes no sense to implement NEON
474     +// version of SHA256 for 64-bit processors. This is because performance
475     +// improvement on most wide-spread Cortex-A5x processors was observed
476     +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
477     +// observed that 32-bit NEON SHA256 performs significantly better than
478     +// 64-bit scalar version on *some* of the more recent processors. As
479     +// result 64-bit NEON version of SHA256 was added to provide best
480     +// all-round performance. For example it executes ~30% faster on X-Gene
481     +// and Mongoose. [For reference, NEON version of SHA512 is bound to
482     +// deliver much less improvement, likely *negative* on Cortex-A5x.
483     +// Which is why NEON support is limited to SHA256.]
484     +
485     +#ifndef __KERNEL__
486     +# include "arm_arch.h"
487     +#endif
488     +
489     +.text
490     +
491     +.extern OPENSSL_armcap_P
492     +.globl sha256_block_data_order
493     +.type sha256_block_data_order,%function
494     +.align 6
495     +sha256_block_data_order:
496     +#ifndef __KERNEL__
497     +# ifdef __ILP32__
498     + ldrsw x16,.LOPENSSL_armcap_P
499     +# else
500     + ldr x16,.LOPENSSL_armcap_P
501     +# endif
502     + adr x17,.LOPENSSL_armcap_P
503     + add x16,x16,x17
504     + ldr w16,[x16]
505     + tst w16,#ARMV8_SHA256
506     + b.ne .Lv8_entry
507     + tst w16,#ARMV7_NEON
508     + b.ne .Lneon_entry
509     +#endif
510     + stp x29,x30,[sp,#-128]!
511     + add x29,sp,#0
512     +
513     + stp x19,x20,[sp,#16]
514     + stp x21,x22,[sp,#32]
515     + stp x23,x24,[sp,#48]
516     + stp x25,x26,[sp,#64]
517     + stp x27,x28,[sp,#80]
518     + sub sp,sp,#4*4
519     +
520     + ldp w20,w21,[x0] // load context
521     + ldp w22,w23,[x0,#2*4]
522     + ldp w24,w25,[x0,#4*4]
523     + add x2,x1,x2,lsl#6 // end of input
524     + ldp w26,w27,[x0,#6*4]
525     + adr x30,.LK256
526     + stp x0,x2,[x29,#96]
527     +
528     +.Loop:
529     + ldp w3,w4,[x1],#2*4
530     + ldr w19,[x30],#4 // *K++
531     + eor w28,w21,w22 // magic seed
532     + str x1,[x29,#112]
533     +#ifndef __AARCH64EB__
534     + rev w3,w3 // 0
535     +#endif
536     + ror w16,w24,#6
537     + add w27,w27,w19 // h+=K[i]
538     + eor w6,w24,w24,ror#14
539     + and w17,w25,w24
540     + bic w19,w26,w24
541     + add w27,w27,w3 // h+=X[i]
542     + orr w17,w17,w19 // Ch(e,f,g)
543     + eor w19,w20,w21 // a^b, b^c in next round
544     + eor w16,w16,w6,ror#11 // Sigma1(e)
545     + ror w6,w20,#2
546     + add w27,w27,w17 // h+=Ch(e,f,g)
547     + eor w17,w20,w20,ror#9
548     + add w27,w27,w16 // h+=Sigma1(e)
549     + and w28,w28,w19 // (b^c)&=(a^b)
550     + add w23,w23,w27 // d+=h
551     + eor w28,w28,w21 // Maj(a,b,c)
552     + eor w17,w6,w17,ror#13 // Sigma0(a)
553     + add w27,w27,w28 // h+=Maj(a,b,c)
554     + ldr w28,[x30],#4 // *K++, w19 in next round
555     + //add w27,w27,w17 // h+=Sigma0(a)
556     +#ifndef __AARCH64EB__
557     + rev w4,w4 // 1
558     +#endif
559     + ldp w5,w6,[x1],#2*4
560     + add w27,w27,w17 // h+=Sigma0(a)
561     + ror w16,w23,#6
562     + add w26,w26,w28 // h+=K[i]
563     + eor w7,w23,w23,ror#14
564     + and w17,w24,w23
565     + bic w28,w25,w23
566     + add w26,w26,w4 // h+=X[i]
567     + orr w17,w17,w28 // Ch(e,f,g)
568     + eor w28,w27,w20 // a^b, b^c in next round
569     + eor w16,w16,w7,ror#11 // Sigma1(e)
570     + ror w7,w27,#2
571     + add w26,w26,w17 // h+=Ch(e,f,g)
572     + eor w17,w27,w27,ror#9
573     + add w26,w26,w16 // h+=Sigma1(e)
574     + and w19,w19,w28 // (b^c)&=(a^b)
575     + add w22,w22,w26 // d+=h
576     + eor w19,w19,w20 // Maj(a,b,c)
577     + eor w17,w7,w17,ror#13 // Sigma0(a)
578     + add w26,w26,w19 // h+=Maj(a,b,c)
579     + ldr w19,[x30],#4 // *K++, w28 in next round
580     + //add w26,w26,w17 // h+=Sigma0(a)
581     +#ifndef __AARCH64EB__
582     + rev w5,w5 // 2
583     +#endif
584     + add w26,w26,w17 // h+=Sigma0(a)
585     + ror w16,w22,#6
586     + add w25,w25,w19 // h+=K[i]
587     + eor w8,w22,w22,ror#14
588     + and w17,w23,w22
589     + bic w19,w24,w22
590     + add w25,w25,w5 // h+=X[i]
591     + orr w17,w17,w19 // Ch(e,f,g)
592     + eor w19,w26,w27 // a^b, b^c in next round
593     + eor w16,w16,w8,ror#11 // Sigma1(e)
594     + ror w8,w26,#2
595     + add w25,w25,w17 // h+=Ch(e,f,g)
596     + eor w17,w26,w26,ror#9
597     + add w25,w25,w16 // h+=Sigma1(e)
598     + and w28,w28,w19 // (b^c)&=(a^b)
599     + add w21,w21,w25 // d+=h
600     + eor w28,w28,w27 // Maj(a,b,c)
601     + eor w17,w8,w17,ror#13 // Sigma0(a)
602     + add w25,w25,w28 // h+=Maj(a,b,c)
603     + ldr w28,[x30],#4 // *K++, w19 in next round
604     + //add w25,w25,w17 // h+=Sigma0(a)
605     +#ifndef __AARCH64EB__
606     + rev w6,w6 // 3
607     +#endif
608     + ldp w7,w8,[x1],#2*4
609     + add w25,w25,w17 // h+=Sigma0(a)
610     + ror w16,w21,#6
611     + add w24,w24,w28 // h+=K[i]
612     + eor w9,w21,w21,ror#14
613     + and w17,w22,w21
614     + bic w28,w23,w21
615     + add w24,w24,w6 // h+=X[i]
616     + orr w17,w17,w28 // Ch(e,f,g)
617     + eor w28,w25,w26 // a^b, b^c in next round
618     + eor w16,w16,w9,ror#11 // Sigma1(e)
619     + ror w9,w25,#2
620     + add w24,w24,w17 // h+=Ch(e,f,g)
621     + eor w17,w25,w25,ror#9
622     + add w24,w24,w16 // h+=Sigma1(e)
623     + and w19,w19,w28 // (b^c)&=(a^b)
624     + add w20,w20,w24 // d+=h
625     + eor w19,w19,w26 // Maj(a,b,c)
626     + eor w17,w9,w17,ror#13 // Sigma0(a)
627     + add w24,w24,w19 // h+=Maj(a,b,c)
628     + ldr w19,[x30],#4 // *K++, w28 in next round
629     + //add w24,w24,w17 // h+=Sigma0(a)
630     +#ifndef __AARCH64EB__
631     + rev w7,w7 // 4
632     +#endif
633     + add w24,w24,w17 // h+=Sigma0(a)
634     + ror w16,w20,#6
635     + add w23,w23,w19 // h+=K[i]
636     + eor w10,w20,w20,ror#14
637     + and w17,w21,w20
638     + bic w19,w22,w20
639     + add w23,w23,w7 // h+=X[i]
640     + orr w17,w17,w19 // Ch(e,f,g)
641     + eor w19,w24,w25 // a^b, b^c in next round
642     + eor w16,w16,w10,ror#11 // Sigma1(e)
643     + ror w10,w24,#2
644     + add w23,w23,w17 // h+=Ch(e,f,g)
645     + eor w17,w24,w24,ror#9
646     + add w23,w23,w16 // h+=Sigma1(e)
647     + and w28,w28,w19 // (b^c)&=(a^b)
648     + add w27,w27,w23 // d+=h
649     + eor w28,w28,w25 // Maj(a,b,c)
650     + eor w17,w10,w17,ror#13 // Sigma0(a)
651     + add w23,w23,w28 // h+=Maj(a,b,c)
652     + ldr w28,[x30],#4 // *K++, w19 in next round
653     + //add w23,w23,w17 // h+=Sigma0(a)
654     +#ifndef __AARCH64EB__
655     + rev w8,w8 // 5
656     +#endif
657     + ldp w9,w10,[x1],#2*4
658     + add w23,w23,w17 // h+=Sigma0(a)
659     + ror w16,w27,#6
660     + add w22,w22,w28 // h+=K[i]
661     + eor w11,w27,w27,ror#14
662     + and w17,w20,w27
663     + bic w28,w21,w27
664     + add w22,w22,w8 // h+=X[i]
665     + orr w17,w17,w28 // Ch(e,f,g)
666     + eor w28,w23,w24 // a^b, b^c in next round
667     + eor w16,w16,w11,ror#11 // Sigma1(e)
668     + ror w11,w23,#2
669     + add w22,w22,w17 // h+=Ch(e,f,g)
670     + eor w17,w23,w23,ror#9
671     + add w22,w22,w16 // h+=Sigma1(e)
672     + and w19,w19,w28 // (b^c)&=(a^b)
673     + add w26,w26,w22 // d+=h
674     + eor w19,w19,w24 // Maj(a,b,c)
675     + eor w17,w11,w17,ror#13 // Sigma0(a)
676     + add w22,w22,w19 // h+=Maj(a,b,c)
677     + ldr w19,[x30],#4 // *K++, w28 in next round
678     + //add w22,w22,w17 // h+=Sigma0(a)
679     +#ifndef __AARCH64EB__
680     + rev w9,w9 // 6
681     +#endif
682     + add w22,w22,w17 // h+=Sigma0(a)
683     + ror w16,w26,#6
684     + add w21,w21,w19 // h+=K[i]
685     + eor w12,w26,w26,ror#14
686     + and w17,w27,w26
687     + bic w19,w20,w26
688     + add w21,w21,w9 // h+=X[i]
689     + orr w17,w17,w19 // Ch(e,f,g)
690     + eor w19,w22,w23 // a^b, b^c in next round
691     + eor w16,w16,w12,ror#11 // Sigma1(e)
692     + ror w12,w22,#2
693     + add w21,w21,w17 // h+=Ch(e,f,g)
694     + eor w17,w22,w22,ror#9
695     + add w21,w21,w16 // h+=Sigma1(e)
696     + and w28,w28,w19 // (b^c)&=(a^b)
697     + add w25,w25,w21 // d+=h
698     + eor w28,w28,w23 // Maj(a,b,c)
699     + eor w17,w12,w17,ror#13 // Sigma0(a)
700     + add w21,w21,w28 // h+=Maj(a,b,c)
701     + ldr w28,[x30],#4 // *K++, w19 in next round
702     + //add w21,w21,w17 // h+=Sigma0(a)
703     +#ifndef __AARCH64EB__
704     + rev w10,w10 // 7
705     +#endif
706     + ldp w11,w12,[x1],#2*4
707     + add w21,w21,w17 // h+=Sigma0(a)
708     + ror w16,w25,#6
709     + add w20,w20,w28 // h+=K[i]
710     + eor w13,w25,w25,ror#14
711     + and w17,w26,w25
712     + bic w28,w27,w25
713     + add w20,w20,w10 // h+=X[i]
714     + orr w17,w17,w28 // Ch(e,f,g)
715     + eor w28,w21,w22 // a^b, b^c in next round
716     + eor w16,w16,w13,ror#11 // Sigma1(e)
717     + ror w13,w21,#2
718     + add w20,w20,w17 // h+=Ch(e,f,g)
719     + eor w17,w21,w21,ror#9
720     + add w20,w20,w16 // h+=Sigma1(e)
721     + and w19,w19,w28 // (b^c)&=(a^b)
722     + add w24,w24,w20 // d+=h
723     + eor w19,w19,w22 // Maj(a,b,c)
724     + eor w17,w13,w17,ror#13 // Sigma0(a)
725     + add w20,w20,w19 // h+=Maj(a,b,c)
726     + ldr w19,[x30],#4 // *K++, w28 in next round
727     + //add w20,w20,w17 // h+=Sigma0(a)
728     +#ifndef __AARCH64EB__
729     + rev w11,w11 // 8
730     +#endif
731     + add w20,w20,w17 // h+=Sigma0(a)
732     + ror w16,w24,#6
733     + add w27,w27,w19 // h+=K[i]
734     + eor w14,w24,w24,ror#14
735     + and w17,w25,w24
736     + bic w19,w26,w24
737     + add w27,w27,w11 // h+=X[i]
738     + orr w17,w17,w19 // Ch(e,f,g)
739     + eor w19,w20,w21 // a^b, b^c in next round
740     + eor w16,w16,w14,ror#11 // Sigma1(e)
741     + ror w14,w20,#2
742     + add w27,w27,w17 // h+=Ch(e,f,g)
743     + eor w17,w20,w20,ror#9
744     + add w27,w27,w16 // h+=Sigma1(e)
745     + and w28,w28,w19 // (b^c)&=(a^b)
746     + add w23,w23,w27 // d+=h
747     + eor w28,w28,w21 // Maj(a,b,c)
748     + eor w17,w14,w17,ror#13 // Sigma0(a)
749     + add w27,w27,w28 // h+=Maj(a,b,c)
750     + ldr w28,[x30],#4 // *K++, w19 in next round
751     + //add w27,w27,w17 // h+=Sigma0(a)
752     +#ifndef __AARCH64EB__
753     + rev w12,w12 // 9
754     +#endif
755     + ldp w13,w14,[x1],#2*4
756     + add w27,w27,w17 // h+=Sigma0(a)
757     + ror w16,w23,#6
758     + add w26,w26,w28 // h+=K[i]
759     + eor w15,w23,w23,ror#14
760     + and w17,w24,w23
761     + bic w28,w25,w23
762     + add w26,w26,w12 // h+=X[i]
763     + orr w17,w17,w28 // Ch(e,f,g)
764     + eor w28,w27,w20 // a^b, b^c in next round
765     + eor w16,w16,w15,ror#11 // Sigma1(e)
766     + ror w15,w27,#2
767     + add w26,w26,w17 // h+=Ch(e,f,g)
768     + eor w17,w27,w27,ror#9
769     + add w26,w26,w16 // h+=Sigma1(e)
770     + and w19,w19,w28 // (b^c)&=(a^b)
771     + add w22,w22,w26 // d+=h
772     + eor w19,w19,w20 // Maj(a,b,c)
773     + eor w17,w15,w17,ror#13 // Sigma0(a)
774     + add w26,w26,w19 // h+=Maj(a,b,c)
775     + ldr w19,[x30],#4 // *K++, w28 in next round
776     + //add w26,w26,w17 // h+=Sigma0(a)
777     +#ifndef __AARCH64EB__
778     + rev w13,w13 // 10
779     +#endif
780     + add w26,w26,w17 // h+=Sigma0(a)
781     + ror w16,w22,#6
782     + add w25,w25,w19 // h+=K[i]
783     + eor w0,w22,w22,ror#14
784     + and w17,w23,w22
785     + bic w19,w24,w22
786     + add w25,w25,w13 // h+=X[i]
787     + orr w17,w17,w19 // Ch(e,f,g)
788     + eor w19,w26,w27 // a^b, b^c in next round
789     + eor w16,w16,w0,ror#11 // Sigma1(e)
790     + ror w0,w26,#2
791     + add w25,w25,w17 // h+=Ch(e,f,g)
792     + eor w17,w26,w26,ror#9
793     + add w25,w25,w16 // h+=Sigma1(e)
794     + and w28,w28,w19 // (b^c)&=(a^b)
795     + add w21,w21,w25 // d+=h
796     + eor w28,w28,w27 // Maj(a,b,c)
797     + eor w17,w0,w17,ror#13 // Sigma0(a)
798     + add w25,w25,w28 // h+=Maj(a,b,c)
799     + ldr w28,[x30],#4 // *K++, w19 in next round
800     + //add w25,w25,w17 // h+=Sigma0(a)
801     +#ifndef __AARCH64EB__
802     + rev w14,w14 // 11
803     +#endif
804     + ldp w15,w0,[x1],#2*4
805     + add w25,w25,w17 // h+=Sigma0(a)
806     + str w6,[sp,#12]
807     + ror w16,w21,#6
808     + add w24,w24,w28 // h+=K[i]
809     + eor w6,w21,w21,ror#14
810     + and w17,w22,w21
811     + bic w28,w23,w21
812     + add w24,w24,w14 // h+=X[i]
813     + orr w17,w17,w28 // Ch(e,f,g)
814     + eor w28,w25,w26 // a^b, b^c in next round
815     + eor w16,w16,w6,ror#11 // Sigma1(e)
816     + ror w6,w25,#2
817     + add w24,w24,w17 // h+=Ch(e,f,g)
818     + eor w17,w25,w25,ror#9
819     + add w24,w24,w16 // h+=Sigma1(e)
820     + and w19,w19,w28 // (b^c)&=(a^b)
821     + add w20,w20,w24 // d+=h
822     + eor w19,w19,w26 // Maj(a,b,c)
823     + eor w17,w6,w17,ror#13 // Sigma0(a)
824     + add w24,w24,w19 // h+=Maj(a,b,c)
825     + ldr w19,[x30],#4 // *K++, w28 in next round
826     + //add w24,w24,w17 // h+=Sigma0(a)
827     +#ifndef __AARCH64EB__
828     + rev w15,w15 // 12
829     +#endif
830     + add w24,w24,w17 // h+=Sigma0(a)
831     + str w7,[sp,#0]
832     + ror w16,w20,#6
833     + add w23,w23,w19 // h+=K[i]
834     + eor w7,w20,w20,ror#14
835     + and w17,w21,w20
836     + bic w19,w22,w20
837     + add w23,w23,w15 // h+=X[i]
838     + orr w17,w17,w19 // Ch(e,f,g)
839     + eor w19,w24,w25 // a^b, b^c in next round
840     + eor w16,w16,w7,ror#11 // Sigma1(e)
841     + ror w7,w24,#2
842     + add w23,w23,w17 // h+=Ch(e,f,g)
843     + eor w17,w24,w24,ror#9
844     + add w23,w23,w16 // h+=Sigma1(e)
845     + and w28,w28,w19 // (b^c)&=(a^b)
846     + add w27,w27,w23 // d+=h
847     + eor w28,w28,w25 // Maj(a,b,c)
848     + eor w17,w7,w17,ror#13 // Sigma0(a)
849     + add w23,w23,w28 // h+=Maj(a,b,c)
850     + ldr w28,[x30],#4 // *K++, w19 in next round
851     + //add w23,w23,w17 // h+=Sigma0(a)
852     +#ifndef __AARCH64EB__
853     + rev w0,w0 // 13
854     +#endif
855     + ldp w1,w2,[x1]
856     + add w23,w23,w17 // h+=Sigma0(a)
857     + str w8,[sp,#4]
858     + ror w16,w27,#6
859     + add w22,w22,w28 // h+=K[i]
860     + eor w8,w27,w27,ror#14
861     + and w17,w20,w27
862     + bic w28,w21,w27
863     + add w22,w22,w0 // h+=X[i]
864     + orr w17,w17,w28 // Ch(e,f,g)
865     + eor w28,w23,w24 // a^b, b^c in next round
866     + eor w16,w16,w8,ror#11 // Sigma1(e)
867     + ror w8,w23,#2
868     + add w22,w22,w17 // h+=Ch(e,f,g)
869     + eor w17,w23,w23,ror#9
870     + add w22,w22,w16 // h+=Sigma1(e)
871     + and w19,w19,w28 // (b^c)&=(a^b)
872     + add w26,w26,w22 // d+=h
873     + eor w19,w19,w24 // Maj(a,b,c)
874     + eor w17,w8,w17,ror#13 // Sigma0(a)
875     + add w22,w22,w19 // h+=Maj(a,b,c)
876     + ldr w19,[x30],#4 // *K++, w28 in next round
877     + //add w22,w22,w17 // h+=Sigma0(a)
878     +#ifndef __AARCH64EB__
879     + rev w1,w1 // 14
880     +#endif
881     + ldr w6,[sp,#12]
882     + add w22,w22,w17 // h+=Sigma0(a)
883     + str w9,[sp,#8]
884     + ror w16,w26,#6
885     + add w21,w21,w19 // h+=K[i]
886     + eor w9,w26,w26,ror#14
887     + and w17,w27,w26
888     + bic w19,w20,w26
889     + add w21,w21,w1 // h+=X[i]
890     + orr w17,w17,w19 // Ch(e,f,g)
891     + eor w19,w22,w23 // a^b, b^c in next round
892     + eor w16,w16,w9,ror#11 // Sigma1(e)
893     + ror w9,w22,#2
894     + add w21,w21,w17 // h+=Ch(e,f,g)
895     + eor w17,w22,w22,ror#9
896     + add w21,w21,w16 // h+=Sigma1(e)
897     + and w28,w28,w19 // (b^c)&=(a^b)
898     + add w25,w25,w21 // d+=h
899     + eor w28,w28,w23 // Maj(a,b,c)
900     + eor w17,w9,w17,ror#13 // Sigma0(a)
901     + add w21,w21,w28 // h+=Maj(a,b,c)
902     + ldr w28,[x30],#4 // *K++, w19 in next round
903     + //add w21,w21,w17 // h+=Sigma0(a)
904     +#ifndef __AARCH64EB__
905     + rev w2,w2 // 15
906     +#endif
907     + ldr w7,[sp,#0]
908     + add w21,w21,w17 // h+=Sigma0(a)
909     + str w10,[sp,#12]
910     + ror w16,w25,#6
911     + add w20,w20,w28 // h+=K[i]
912     + ror w9,w4,#7
913     + and w17,w26,w25
914     + ror w8,w1,#17
915     + bic w28,w27,w25
916     + ror w10,w21,#2
917     + add w20,w20,w2 // h+=X[i]
918     + eor w16,w16,w25,ror#11
919     + eor w9,w9,w4,ror#18
920     + orr w17,w17,w28 // Ch(e,f,g)
921     + eor w28,w21,w22 // a^b, b^c in next round
922     + eor w16,w16,w25,ror#25 // Sigma1(e)
923     + eor w10,w10,w21,ror#13
924     + add w20,w20,w17 // h+=Ch(e,f,g)
925     + and w19,w19,w28 // (b^c)&=(a^b)
926     + eor w8,w8,w1,ror#19
927     + eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
928     + add w20,w20,w16 // h+=Sigma1(e)
929     + eor w19,w19,w22 // Maj(a,b,c)
930     + eor w17,w10,w21,ror#22 // Sigma0(a)
931     + eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
932     + add w3,w3,w12
933     + add w24,w24,w20 // d+=h
934     + add w20,w20,w19 // h+=Maj(a,b,c)
935     + ldr w19,[x30],#4 // *K++, w28 in next round
936     + add w3,w3,w9
937     + add w20,w20,w17 // h+=Sigma0(a)
938     + add w3,w3,w8
939     +.Loop_16_xx:
940     + ldr w8,[sp,#4]
941     + str w11,[sp,#0]
942     + ror w16,w24,#6
943     + add w27,w27,w19 // h+=K[i]
944     + ror w10,w5,#7
945     + and w17,w25,w24
946     + ror w9,w2,#17
947     + bic w19,w26,w24
948     + ror w11,w20,#2
949     + add w27,w27,w3 // h+=X[i]
950     + eor w16,w16,w24,ror#11
951     + eor w10,w10,w5,ror#18
952     + orr w17,w17,w19 // Ch(e,f,g)
953     + eor w19,w20,w21 // a^b, b^c in next round
954     + eor w16,w16,w24,ror#25 // Sigma1(e)
955     + eor w11,w11,w20,ror#13
956     + add w27,w27,w17 // h+=Ch(e,f,g)
957     + and w28,w28,w19 // (b^c)&=(a^b)
958     + eor w9,w9,w2,ror#19
959     + eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
960     + add w27,w27,w16 // h+=Sigma1(e)
961     + eor w28,w28,w21 // Maj(a,b,c)
962     + eor w17,w11,w20,ror#22 // Sigma0(a)
963     + eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
964     + add w4,w4,w13
965     + add w23,w23,w27 // d+=h
966     + add w27,w27,w28 // h+=Maj(a,b,c)
967     + ldr w28,[x30],#4 // *K++, w19 in next round
968     + add w4,w4,w10
969     + add w27,w27,w17 // h+=Sigma0(a)
970     + add w4,w4,w9
971     + ldr w9,[sp,#8]
972     + str w12,[sp,#4]
973     + ror w16,w23,#6
974     + add w26,w26,w28 // h+=K[i]
975     + ror w11,w6,#7
976     + and w17,w24,w23
977     + ror w10,w3,#17
978     + bic w28,w25,w23
979     + ror w12,w27,#2
980     + add w26,w26,w4 // h+=X[i]
981     + eor w16,w16,w23,ror#11
982     + eor w11,w11,w6,ror#18
983     + orr w17,w17,w28 // Ch(e,f,g)
984     + eor w28,w27,w20 // a^b, b^c in next round
985     + eor w16,w16,w23,ror#25 // Sigma1(e)
986     + eor w12,w12,w27,ror#13
987     + add w26,w26,w17 // h+=Ch(e,f,g)
988     + and w19,w19,w28 // (b^c)&=(a^b)
989     + eor w10,w10,w3,ror#19
990     + eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
991     + add w26,w26,w16 // h+=Sigma1(e)
992     + eor w19,w19,w20 // Maj(a,b,c)
993     + eor w17,w12,w27,ror#22 // Sigma0(a)
994     + eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
995     + add w5,w5,w14
996     + add w22,w22,w26 // d+=h
997     + add w26,w26,w19 // h+=Maj(a,b,c)
998     + ldr w19,[x30],#4 // *K++, w28 in next round
999     + add w5,w5,w11
1000     + add w26,w26,w17 // h+=Sigma0(a)
1001     + add w5,w5,w10
1002     + ldr w10,[sp,#12]
1003     + str w13,[sp,#8]
1004     + ror w16,w22,#6
1005     + add w25,w25,w19 // h+=K[i]
1006     + ror w12,w7,#7
1007     + and w17,w23,w22
1008     + ror w11,w4,#17
1009     + bic w19,w24,w22
1010     + ror w13,w26,#2
1011     + add w25,w25,w5 // h+=X[i]
1012     + eor w16,w16,w22,ror#11
1013     + eor w12,w12,w7,ror#18
1014     + orr w17,w17,w19 // Ch(e,f,g)
1015     + eor w19,w26,w27 // a^b, b^c in next round
1016     + eor w16,w16,w22,ror#25 // Sigma1(e)
1017     + eor w13,w13,w26,ror#13
1018     + add w25,w25,w17 // h+=Ch(e,f,g)
1019     + and w28,w28,w19 // (b^c)&=(a^b)
1020     + eor w11,w11,w4,ror#19
1021     + eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
1022     + add w25,w25,w16 // h+=Sigma1(e)
1023     + eor w28,w28,w27 // Maj(a,b,c)
1024     + eor w17,w13,w26,ror#22 // Sigma0(a)
1025     + eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
1026     + add w6,w6,w15
1027     + add w21,w21,w25 // d+=h
1028     + add w25,w25,w28 // h+=Maj(a,b,c)
1029     + ldr w28,[x30],#4 // *K++, w19 in next round
1030     + add w6,w6,w12
1031     + add w25,w25,w17 // h+=Sigma0(a)
1032     + add w6,w6,w11
1033     + ldr w11,[sp,#0]
1034     + str w14,[sp,#12]
1035     + ror w16,w21,#6
1036     + add w24,w24,w28 // h+=K[i]
1037     + ror w13,w8,#7
1038     + and w17,w22,w21
1039     + ror w12,w5,#17
1040     + bic w28,w23,w21
1041     + ror w14,w25,#2
1042     + add w24,w24,w6 // h+=X[i]
1043     + eor w16,w16,w21,ror#11
1044     + eor w13,w13,w8,ror#18
1045     + orr w17,w17,w28 // Ch(e,f,g)
1046     + eor w28,w25,w26 // a^b, b^c in next round
1047     + eor w16,w16,w21,ror#25 // Sigma1(e)
1048     + eor w14,w14,w25,ror#13
1049     + add w24,w24,w17 // h+=Ch(e,f,g)
1050     + and w19,w19,w28 // (b^c)&=(a^b)
1051     + eor w12,w12,w5,ror#19
1052     + eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
1053     + add w24,w24,w16 // h+=Sigma1(e)
1054     + eor w19,w19,w26 // Maj(a,b,c)
1055     + eor w17,w14,w25,ror#22 // Sigma0(a)
1056     + eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
1057     + add w7,w7,w0
1058     + add w20,w20,w24 // d+=h
1059     + add w24,w24,w19 // h+=Maj(a,b,c)
1060     + ldr w19,[x30],#4 // *K++, w28 in next round
1061     + add w7,w7,w13
1062     + add w24,w24,w17 // h+=Sigma0(a)
1063     + add w7,w7,w12
1064     + ldr w12,[sp,#4]
1065     + str w15,[sp,#0]
1066     + ror w16,w20,#6
1067     + add w23,w23,w19 // h+=K[i]
1068     + ror w14,w9,#7
1069     + and w17,w21,w20
1070     + ror w13,w6,#17
1071     + bic w19,w22,w20
1072     + ror w15,w24,#2
1073     + add w23,w23,w7 // h+=X[i]
1074     + eor w16,w16,w20,ror#11
1075     + eor w14,w14,w9,ror#18
1076     + orr w17,w17,w19 // Ch(e,f,g)
1077     + eor w19,w24,w25 // a^b, b^c in next round
1078     + eor w16,w16,w20,ror#25 // Sigma1(e)
1079     + eor w15,w15,w24,ror#13
1080     + add w23,w23,w17 // h+=Ch(e,f,g)
1081     + and w28,w28,w19 // (b^c)&=(a^b)
1082     + eor w13,w13,w6,ror#19
1083     + eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
1084     + add w23,w23,w16 // h+=Sigma1(e)
1085     + eor w28,w28,w25 // Maj(a,b,c)
1086     + eor w17,w15,w24,ror#22 // Sigma0(a)
1087     + eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
1088     + add w8,w8,w1
1089     + add w27,w27,w23 // d+=h
1090     + add w23,w23,w28 // h+=Maj(a,b,c)
1091     + ldr w28,[x30],#4 // *K++, w19 in next round
1092     + add w8,w8,w14
1093     + add w23,w23,w17 // h+=Sigma0(a)
1094     + add w8,w8,w13
1095     + ldr w13,[sp,#8]
1096     + str w0,[sp,#4]
1097     + ror w16,w27,#6
1098     + add w22,w22,w28 // h+=K[i]
1099     + ror w15,w10,#7
1100     + and w17,w20,w27
1101     + ror w14,w7,#17
1102     + bic w28,w21,w27
1103     + ror w0,w23,#2
1104     + add w22,w22,w8 // h+=X[i]
1105     + eor w16,w16,w27,ror#11
1106     + eor w15,w15,w10,ror#18
1107     + orr w17,w17,w28 // Ch(e,f,g)
1108     + eor w28,w23,w24 // a^b, b^c in next round
1109     + eor w16,w16,w27,ror#25 // Sigma1(e)
1110     + eor w0,w0,w23,ror#13
1111     + add w22,w22,w17 // h+=Ch(e,f,g)
1112     + and w19,w19,w28 // (b^c)&=(a^b)
1113     + eor w14,w14,w7,ror#19
1114     + eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
1115     + add w22,w22,w16 // h+=Sigma1(e)
1116     + eor w19,w19,w24 // Maj(a,b,c)
1117     + eor w17,w0,w23,ror#22 // Sigma0(a)
1118     + eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
1119     + add w9,w9,w2
1120     + add w26,w26,w22 // d+=h
1121     + add w22,w22,w19 // h+=Maj(a,b,c)
1122     + ldr w19,[x30],#4 // *K++, w28 in next round
1123     + add w9,w9,w15
1124     + add w22,w22,w17 // h+=Sigma0(a)
1125     + add w9,w9,w14
1126     + ldr w14,[sp,#12]
1127     + str w1,[sp,#8]
1128     + ror w16,w26,#6
1129     + add w21,w21,w19 // h+=K[i]
1130     + ror w0,w11,#7
1131     + and w17,w27,w26
1132     + ror w15,w8,#17
1133     + bic w19,w20,w26
1134     + ror w1,w22,#2
1135     + add w21,w21,w9 // h+=X[i]
1136     + eor w16,w16,w26,ror#11
1137     + eor w0,w0,w11,ror#18
1138     + orr w17,w17,w19 // Ch(e,f,g)
1139     + eor w19,w22,w23 // a^b, b^c in next round
1140     + eor w16,w16,w26,ror#25 // Sigma1(e)
1141     + eor w1,w1,w22,ror#13
1142     + add w21,w21,w17 // h+=Ch(e,f,g)
1143     + and w28,w28,w19 // (b^c)&=(a^b)
1144     + eor w15,w15,w8,ror#19
1145     + eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
1146     + add w21,w21,w16 // h+=Sigma1(e)
1147     + eor w28,w28,w23 // Maj(a,b,c)
1148     + eor w17,w1,w22,ror#22 // Sigma0(a)
1149     + eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
1150     + add w10,w10,w3
1151     + add w25,w25,w21 // d+=h
1152     + add w21,w21,w28 // h+=Maj(a,b,c)
1153     + ldr w28,[x30],#4 // *K++, w19 in next round
1154     + add w10,w10,w0
1155     + add w21,w21,w17 // h+=Sigma0(a)
1156     + add w10,w10,w15
1157     + ldr w15,[sp,#0]
1158     + str w2,[sp,#12]
1159     + ror w16,w25,#6
1160     + add w20,w20,w28 // h+=K[i]
1161     + ror w1,w12,#7
1162     + and w17,w26,w25
1163     + ror w0,w9,#17
1164     + bic w28,w27,w25
1165     + ror w2,w21,#2
1166     + add w20,w20,w10 // h+=X[i]
1167     + eor w16,w16,w25,ror#11
1168     + eor w1,w1,w12,ror#18
1169     + orr w17,w17,w28 // Ch(e,f,g)
1170     + eor w28,w21,w22 // a^b, b^c in next round
1171     + eor w16,w16,w25,ror#25 // Sigma1(e)
1172     + eor w2,w2,w21,ror#13
1173     + add w20,w20,w17 // h+=Ch(e,f,g)
1174     + and w19,w19,w28 // (b^c)&=(a^b)
1175     + eor w0,w0,w9,ror#19
1176     + eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
1177     + add w20,w20,w16 // h+=Sigma1(e)
1178     + eor w19,w19,w22 // Maj(a,b,c)
1179     + eor w17,w2,w21,ror#22 // Sigma0(a)
1180     + eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
1181     + add w11,w11,w4
1182     + add w24,w24,w20 // d+=h
1183     + add w20,w20,w19 // h+=Maj(a,b,c)
1184     + ldr w19,[x30],#4 // *K++, w28 in next round
1185     + add w11,w11,w1
1186     + add w20,w20,w17 // h+=Sigma0(a)
1187     + add w11,w11,w0
1188     + ldr w0,[sp,#4]
1189     + str w3,[sp,#0]
1190     + ror w16,w24,#6
1191     + add w27,w27,w19 // h+=K[i]
1192     + ror w2,w13,#7
1193     + and w17,w25,w24
1194     + ror w1,w10,#17
1195     + bic w19,w26,w24
1196     + ror w3,w20,#2
1197     + add w27,w27,w11 // h+=X[i]
1198     + eor w16,w16,w24,ror#11
1199     + eor w2,w2,w13,ror#18
1200     + orr w17,w17,w19 // Ch(e,f,g)
1201     + eor w19,w20,w21 // a^b, b^c in next round
1202     + eor w16,w16,w24,ror#25 // Sigma1(e)
1203     + eor w3,w3,w20,ror#13
1204     + add w27,w27,w17 // h+=Ch(e,f,g)
1205     + and w28,w28,w19 // (b^c)&=(a^b)
1206     + eor w1,w1,w10,ror#19
1207     + eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
1208     + add w27,w27,w16 // h+=Sigma1(e)
1209     + eor w28,w28,w21 // Maj(a,b,c)
1210     + eor w17,w3,w20,ror#22 // Sigma0(a)
1211     + eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
1212     + add w12,w12,w5
1213     + add w23,w23,w27 // d+=h
1214     + add w27,w27,w28 // h+=Maj(a,b,c)
1215     + ldr w28,[x30],#4 // *K++, w19 in next round
1216     + add w12,w12,w2
1217     + add w27,w27,w17 // h+=Sigma0(a)
1218     + add w12,w12,w1
1219     + ldr w1,[sp,#8]
1220     + str w4,[sp,#4]
1221     + ror w16,w23,#6
1222     + add w26,w26,w28 // h+=K[i]
1223     + ror w3,w14,#7
1224     + and w17,w24,w23
1225     + ror w2,w11,#17
1226     + bic w28,w25,w23
1227     + ror w4,w27,#2
1228     + add w26,w26,w12 // h+=X[i]
1229     + eor w16,w16,w23,ror#11
1230     + eor w3,w3,w14,ror#18
1231     + orr w17,w17,w28 // Ch(e,f,g)
1232     + eor w28,w27,w20 // a^b, b^c in next round
1233     + eor w16,w16,w23,ror#25 // Sigma1(e)
1234     + eor w4,w4,w27,ror#13
1235     + add w26,w26,w17 // h+=Ch(e,f,g)
1236     + and w19,w19,w28 // (b^c)&=(a^b)
1237     + eor w2,w2,w11,ror#19
1238     + eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
1239     + add w26,w26,w16 // h+=Sigma1(e)
1240     + eor w19,w19,w20 // Maj(a,b,c)
1241     + eor w17,w4,w27,ror#22 // Sigma0(a)
1242     + eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
1243     + add w13,w13,w6
1244     + add w22,w22,w26 // d+=h
1245     + add w26,w26,w19 // h+=Maj(a,b,c)
1246     + ldr w19,[x30],#4 // *K++, w28 in next round
1247     + add w13,w13,w3
1248     + add w26,w26,w17 // h+=Sigma0(a)
1249     + add w13,w13,w2
1250     + ldr w2,[sp,#12]
1251     + str w5,[sp,#8]
1252     + ror w16,w22,#6
1253     + add w25,w25,w19 // h+=K[i]
1254     + ror w4,w15,#7
1255     + and w17,w23,w22
1256     + ror w3,w12,#17
1257     + bic w19,w24,w22
1258     + ror w5,w26,#2
1259     + add w25,w25,w13 // h+=X[i]
1260     + eor w16,w16,w22,ror#11
1261     + eor w4,w4,w15,ror#18
1262     + orr w17,w17,w19 // Ch(e,f,g)
1263     + eor w19,w26,w27 // a^b, b^c in next round
1264     + eor w16,w16,w22,ror#25 // Sigma1(e)
1265     + eor w5,w5,w26,ror#13
1266     + add w25,w25,w17 // h+=Ch(e,f,g)
1267     + and w28,w28,w19 // (b^c)&=(a^b)
1268     + eor w3,w3,w12,ror#19
1269     + eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
1270     + add w25,w25,w16 // h+=Sigma1(e)
1271     + eor w28,w28,w27 // Maj(a,b,c)
1272     + eor w17,w5,w26,ror#22 // Sigma0(a)
1273     + eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
1274     + add w14,w14,w7
1275     + add w21,w21,w25 // d+=h
1276     + add w25,w25,w28 // h+=Maj(a,b,c)
1277     + ldr w28,[x30],#4 // *K++, w19 in next round
1278     + add w14,w14,w4
1279     + add w25,w25,w17 // h+=Sigma0(a)
1280     + add w14,w14,w3
1281     + ldr w3,[sp,#0]
1282     + str w6,[sp,#12]
1283     + ror w16,w21,#6
1284     + add w24,w24,w28 // h+=K[i]
1285     + ror w5,w0,#7
1286     + and w17,w22,w21
1287     + ror w4,w13,#17
1288     + bic w28,w23,w21
1289     + ror w6,w25,#2
1290     + add w24,w24,w14 // h+=X[i]
1291     + eor w16,w16,w21,ror#11
1292     + eor w5,w5,w0,ror#18
1293     + orr w17,w17,w28 // Ch(e,f,g)
1294     + eor w28,w25,w26 // a^b, b^c in next round
1295     + eor w16,w16,w21,ror#25 // Sigma1(e)
1296     + eor w6,w6,w25,ror#13
1297     + add w24,w24,w17 // h+=Ch(e,f,g)
1298     + and w19,w19,w28 // (b^c)&=(a^b)
1299     + eor w4,w4,w13,ror#19
1300     + eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
1301     + add w24,w24,w16 // h+=Sigma1(e)
1302     + eor w19,w19,w26 // Maj(a,b,c)
1303     + eor w17,w6,w25,ror#22 // Sigma0(a)
1304     + eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
1305     + add w15,w15,w8
1306     + add w20,w20,w24 // d+=h
1307     + add w24,w24,w19 // h+=Maj(a,b,c)
1308     + ldr w19,[x30],#4 // *K++, w28 in next round
1309     + add w15,w15,w5
1310     + add w24,w24,w17 // h+=Sigma0(a)
1311     + add w15,w15,w4
1312     + ldr w4,[sp,#4]
1313     + str w7,[sp,#0]
1314     + ror w16,w20,#6
1315     + add w23,w23,w19 // h+=K[i]
1316     + ror w6,w1,#7
1317     + and w17,w21,w20
1318     + ror w5,w14,#17
1319     + bic w19,w22,w20
1320     + ror w7,w24,#2
1321     + add w23,w23,w15 // h+=X[i]
1322     + eor w16,w16,w20,ror#11
1323     + eor w6,w6,w1,ror#18
1324     + orr w17,w17,w19 // Ch(e,f,g)
1325     + eor w19,w24,w25 // a^b, b^c in next round
1326     + eor w16,w16,w20,ror#25 // Sigma1(e)
1327     + eor w7,w7,w24,ror#13
1328     + add w23,w23,w17 // h+=Ch(e,f,g)
1329     + and w28,w28,w19 // (b^c)&=(a^b)
1330     + eor w5,w5,w14,ror#19
1331     + eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
1332     + add w23,w23,w16 // h+=Sigma1(e)
1333     + eor w28,w28,w25 // Maj(a,b,c)
1334     + eor w17,w7,w24,ror#22 // Sigma0(a)
1335     + eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
1336     + add w0,w0,w9
1337     + add w27,w27,w23 // d+=h
1338     + add w23,w23,w28 // h+=Maj(a,b,c)
1339     + ldr w28,[x30],#4 // *K++, w19 in next round
1340     + add w0,w0,w6
1341     + add w23,w23,w17 // h+=Sigma0(a)
1342     + add w0,w0,w5
1343     + ldr w5,[sp,#8]
1344     + str w8,[sp,#4]
1345     + ror w16,w27,#6
1346     + add w22,w22,w28 // h+=K[i]
1347     + ror w7,w2,#7
1348     + and w17,w20,w27
1349     + ror w6,w15,#17
1350     + bic w28,w21,w27
1351     + ror w8,w23,#2
1352     + add w22,w22,w0 // h+=X[i]
1353     + eor w16,w16,w27,ror#11
1354     + eor w7,w7,w2,ror#18
1355     + orr w17,w17,w28 // Ch(e,f,g)
1356     + eor w28,w23,w24 // a^b, b^c in next round
1357     + eor w16,w16,w27,ror#25 // Sigma1(e)
1358     + eor w8,w8,w23,ror#13
1359     + add w22,w22,w17 // h+=Ch(e,f,g)
1360     + and w19,w19,w28 // (b^c)&=(a^b)
1361     + eor w6,w6,w15,ror#19
1362     + eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
1363     + add w22,w22,w16 // h+=Sigma1(e)
1364     + eor w19,w19,w24 // Maj(a,b,c)
1365     + eor w17,w8,w23,ror#22 // Sigma0(a)
1366     + eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
1367     + add w1,w1,w10
1368     + add w26,w26,w22 // d+=h
1369     + add w22,w22,w19 // h+=Maj(a,b,c)
1370     + ldr w19,[x30],#4 // *K++, w28 in next round
1371     + add w1,w1,w7
1372     + add w22,w22,w17 // h+=Sigma0(a)
1373     + add w1,w1,w6
1374     + ldr w6,[sp,#12]
1375     + str w9,[sp,#8]
1376     + ror w16,w26,#6
1377     + add w21,w21,w19 // h+=K[i]
1378     + ror w8,w3,#7
1379     + and w17,w27,w26
1380     + ror w7,w0,#17
1381     + bic w19,w20,w26
1382     + ror w9,w22,#2
1383     + add w21,w21,w1 // h+=X[i]
1384     + eor w16,w16,w26,ror#11
1385     + eor w8,w8,w3,ror#18
1386     + orr w17,w17,w19 // Ch(e,f,g)
1387     + eor w19,w22,w23 // a^b, b^c in next round
1388     + eor w16,w16,w26,ror#25 // Sigma1(e)
1389     + eor w9,w9,w22,ror#13
1390     + add w21,w21,w17 // h+=Ch(e,f,g)
1391     + and w28,w28,w19 // (b^c)&=(a^b)
1392     + eor w7,w7,w0,ror#19
1393     + eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
1394     + add w21,w21,w16 // h+=Sigma1(e)
1395     + eor w28,w28,w23 // Maj(a,b,c)
1396     + eor w17,w9,w22,ror#22 // Sigma0(a)
1397     + eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
1398     + add w2,w2,w11
1399     + add w25,w25,w21 // d+=h
1400     + add w21,w21,w28 // h+=Maj(a,b,c)
1401     + ldr w28,[x30],#4 // *K++, w19 in next round
1402     + add w2,w2,w8
1403     + add w21,w21,w17 // h+=Sigma0(a)
1404     + add w2,w2,w7
1405     + ldr w7,[sp,#0]
1406     + str w10,[sp,#12]
1407     + ror w16,w25,#6
1408     + add w20,w20,w28 // h+=K[i]
1409     + ror w9,w4,#7
1410     + and w17,w26,w25
1411     + ror w8,w1,#17
1412     + bic w28,w27,w25
1413     + ror w10,w21,#2
1414     + add w20,w20,w2 // h+=X[i]
1415     + eor w16,w16,w25,ror#11
1416     + eor w9,w9,w4,ror#18
1417     + orr w17,w17,w28 // Ch(e,f,g)
1418     + eor w28,w21,w22 // a^b, b^c in next round
1419     + eor w16,w16,w25,ror#25 // Sigma1(e)
1420     + eor w10,w10,w21,ror#13
1421     + add w20,w20,w17 // h+=Ch(e,f,g)
1422     + and w19,w19,w28 // (b^c)&=(a^b)
1423     + eor w8,w8,w1,ror#19
1424     + eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
1425     + add w20,w20,w16 // h+=Sigma1(e)
1426     + eor w19,w19,w22 // Maj(a,b,c)
1427     + eor w17,w10,w21,ror#22 // Sigma0(a)
1428     + eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
1429     + add w3,w3,w12
1430     + add w24,w24,w20 // d+=h
1431     + add w20,w20,w19 // h+=Maj(a,b,c)
1432     + ldr w19,[x30],#4 // *K++, w28 in next round
1433     + add w3,w3,w9
1434     + add w20,w20,w17 // h+=Sigma0(a)
1435     + add w3,w3,w8
1436     + cbnz w19,.Loop_16_xx
1437     +
1438     + ldp x0,x2,[x29,#96]
1439     + ldr x1,[x29,#112]
1440     + sub x30,x30,#260 // rewind
1441     +
1442     + ldp w3,w4,[x0]
1443     + ldp w5,w6,[x0,#2*4]
1444     + add x1,x1,#14*4 // advance input pointer
1445     + ldp w7,w8,[x0,#4*4]
1446     + add w20,w20,w3
1447     + ldp w9,w10,[x0,#6*4]
1448     + add w21,w21,w4
1449     + add w22,w22,w5
1450     + add w23,w23,w6
1451     + stp w20,w21,[x0]
1452     + add w24,w24,w7
1453     + add w25,w25,w8
1454     + stp w22,w23,[x0,#2*4]
1455     + add w26,w26,w9
1456     + add w27,w27,w10
1457     + cmp x1,x2
1458     + stp w24,w25,[x0,#4*4]
1459     + stp w26,w27,[x0,#6*4]
1460     + b.ne .Loop
1461     +
1462     + ldp x19,x20,[x29,#16]
1463     + add sp,sp,#4*4
1464     + ldp x21,x22,[x29,#32]
1465     + ldp x23,x24,[x29,#48]
1466     + ldp x25,x26,[x29,#64]
1467     + ldp x27,x28,[x29,#80]
1468     + ldp x29,x30,[sp],#128
1469     + ret
1470     +.size sha256_block_data_order,.-sha256_block_data_order
1471     +
1472     +.align 6
1473     +.type .LK256,%object
1474     +.LK256:
1475     + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1476     + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1477     + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1478     + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1479     + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1480     + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1481     + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1482     + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1483     + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1484     + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1485     + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1486     + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1487     + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1488     + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1489     + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1490     + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1491     + .long 0 //terminator
1492     +.size .LK256,.-.LK256
1493     +#ifndef __KERNEL__
1494     +.align 3
1495     +.LOPENSSL_armcap_P:
1496     +# ifdef __ILP32__
1497     + .long OPENSSL_armcap_P-.
1498     +# else
1499     + .quad OPENSSL_armcap_P-.
1500     +# endif
1501     +#endif
1502     +.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1503     +.align 2
1504     +#ifndef __KERNEL__
1505     +.type sha256_block_armv8,%function
1506     +.align 6
1507     +sha256_block_armv8:
1508     +.Lv8_entry:
1509     + stp x29,x30,[sp,#-16]!
1510     + add x29,sp,#0
1511     +
1512     + ld1 {v0.4s,v1.4s},[x0]
1513     + adr x3,.LK256
1514     +
1515     +.Loop_hw:
1516     + ld1 {v4.16b-v7.16b},[x1],#64
1517     + sub x2,x2,#1
1518     + ld1 {v16.4s},[x3],#16
1519     + rev32 v4.16b,v4.16b
1520     + rev32 v5.16b,v5.16b
1521     + rev32 v6.16b,v6.16b
1522     + rev32 v7.16b,v7.16b
1523     + orr v18.16b,v0.16b,v0.16b // offload
1524     + orr v19.16b,v1.16b,v1.16b
1525     + ld1 {v17.4s},[x3],#16
1526     + add v16.4s,v16.4s,v4.4s
1527     + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1528     + orr v2.16b,v0.16b,v0.16b
1529     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1530     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1531     + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1532     + ld1 {v16.4s},[x3],#16
1533     + add v17.4s,v17.4s,v5.4s
1534     + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1535     + orr v2.16b,v0.16b,v0.16b
1536     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1537     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1538     + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1539     + ld1 {v17.4s},[x3],#16
1540     + add v16.4s,v16.4s,v6.4s
1541     + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1542     + orr v2.16b,v0.16b,v0.16b
1543     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1544     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1545     + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1546     + ld1 {v16.4s},[x3],#16
1547     + add v17.4s,v17.4s,v7.4s
1548     + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1549     + orr v2.16b,v0.16b,v0.16b
1550     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1551     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1552     + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1553     + ld1 {v17.4s},[x3],#16
1554     + add v16.4s,v16.4s,v4.4s
1555     + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1556     + orr v2.16b,v0.16b,v0.16b
1557     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1558     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1559     + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1560     + ld1 {v16.4s},[x3],#16
1561     + add v17.4s,v17.4s,v5.4s
1562     + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1563     + orr v2.16b,v0.16b,v0.16b
1564     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1565     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1566     + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1567     + ld1 {v17.4s},[x3],#16
1568     + add v16.4s,v16.4s,v6.4s
1569     + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1570     + orr v2.16b,v0.16b,v0.16b
1571     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1572     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1573     + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1574     + ld1 {v16.4s},[x3],#16
1575     + add v17.4s,v17.4s,v7.4s
1576     + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1577     + orr v2.16b,v0.16b,v0.16b
1578     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1579     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1580     + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1581     + ld1 {v17.4s},[x3],#16
1582     + add v16.4s,v16.4s,v4.4s
1583     + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1584     + orr v2.16b,v0.16b,v0.16b
1585     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1586     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1587     + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1588     + ld1 {v16.4s},[x3],#16
1589     + add v17.4s,v17.4s,v5.4s
1590     + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1591     + orr v2.16b,v0.16b,v0.16b
1592     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1593     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1594     + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1595     + ld1 {v17.4s},[x3],#16
1596     + add v16.4s,v16.4s,v6.4s
1597     + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1598     + orr v2.16b,v0.16b,v0.16b
1599     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1600     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1601     + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1602     + ld1 {v16.4s},[x3],#16
1603     + add v17.4s,v17.4s,v7.4s
1604     + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1605     + orr v2.16b,v0.16b,v0.16b
1606     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1607     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1608     + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1609     + ld1 {v17.4s},[x3],#16
1610     + add v16.4s,v16.4s,v4.4s
1611     + orr v2.16b,v0.16b,v0.16b
1612     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1613     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1614     +
1615     + ld1 {v16.4s},[x3],#16
1616     + add v17.4s,v17.4s,v5.4s
1617     + orr v2.16b,v0.16b,v0.16b
1618     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1619     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1620     +
1621     + ld1 {v17.4s},[x3]
1622     + add v16.4s,v16.4s,v6.4s
1623     + sub x3,x3,#64*4-16 // rewind
1624     + orr v2.16b,v0.16b,v0.16b
1625     + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1626     + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1627     +
1628     + add v17.4s,v17.4s,v7.4s
1629     + orr v2.16b,v0.16b,v0.16b
1630     + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1631     + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1632     +
1633     + add v0.4s,v0.4s,v18.4s
1634     + add v1.4s,v1.4s,v19.4s
1635     +
1636     + cbnz x2,.Loop_hw
1637     +
1638     + st1 {v0.4s,v1.4s},[x0]
1639     +
1640     + ldr x29,[sp],#16
1641     + ret
1642     +.size sha256_block_armv8,.-sha256_block_armv8
1643     +#endif
1644     +#ifdef __KERNEL__
1645     +.globl sha256_block_neon
1646     +#endif
1647     +.type sha256_block_neon,%function
1648     +.align 4
1649     +sha256_block_neon:
1650     +.Lneon_entry:
1651     + stp x29, x30, [sp, #-16]!
1652     + mov x29, sp
1653     + sub sp,sp,#16*4
1654     +
1655     + adr x16,.LK256
1656     + add x2,x1,x2,lsl#6 // len to point at the end of inp
1657     +
1658     + ld1 {v0.16b},[x1], #16
1659     + ld1 {v1.16b},[x1], #16
1660     + ld1 {v2.16b},[x1], #16
1661     + ld1 {v3.16b},[x1], #16
1662     + ld1 {v4.4s},[x16], #16
1663     + ld1 {v5.4s},[x16], #16
1664     + ld1 {v6.4s},[x16], #16
1665     + ld1 {v7.4s},[x16], #16
1666     + rev32 v0.16b,v0.16b // yes, even on
1667     + rev32 v1.16b,v1.16b // big-endian
1668     + rev32 v2.16b,v2.16b
1669     + rev32 v3.16b,v3.16b
1670     + mov x17,sp
1671     + add v4.4s,v4.4s,v0.4s
1672     + add v5.4s,v5.4s,v1.4s
1673     + add v6.4s,v6.4s,v2.4s
1674     + st1 {v4.4s-v5.4s},[x17], #32
1675     + add v7.4s,v7.4s,v3.4s
1676     + st1 {v6.4s-v7.4s},[x17]
1677     + sub x17,x17,#32
1678     +
1679     + ldp w3,w4,[x0]
1680     + ldp w5,w6,[x0,#8]
1681     + ldp w7,w8,[x0,#16]
1682     + ldp w9,w10,[x0,#24]
1683     + ldr w12,[sp,#0]
1684     + mov w13,wzr
1685     + eor w14,w4,w5
1686     + mov w15,wzr
1687     + b .L_00_48
1688     +
1689     +.align 4
1690     +.L_00_48:
1691     + ext v4.16b,v0.16b,v1.16b,#4
1692     + add w10,w10,w12
1693     + add w3,w3,w15
1694     + and w12,w8,w7
1695     + bic w15,w9,w7
1696     + ext v7.16b,v2.16b,v3.16b,#4
1697     + eor w11,w7,w7,ror#5
1698     + add w3,w3,w13
1699     + mov d19,v3.d[1]
1700     + orr w12,w12,w15
1701     + eor w11,w11,w7,ror#19
1702     + ushr v6.4s,v4.4s,#7
1703     + eor w15,w3,w3,ror#11
1704     + ushr v5.4s,v4.4s,#3
1705     + add w10,w10,w12
1706     + add v0.4s,v0.4s,v7.4s
1707     + ror w11,w11,#6
1708     + sli v6.4s,v4.4s,#25
1709     + eor w13,w3,w4
1710     + eor w15,w15,w3,ror#20
1711     + ushr v7.4s,v4.4s,#18
1712     + add w10,w10,w11
1713     + ldr w12,[sp,#4]
1714     + and w14,w14,w13
1715     + eor v5.16b,v5.16b,v6.16b
1716     + ror w15,w15,#2
1717     + add w6,w6,w10
1718     + sli v7.4s,v4.4s,#14
1719     + eor w14,w14,w4
1720     + ushr v16.4s,v19.4s,#17
1721     + add w9,w9,w12
1722     + add w10,w10,w15
1723     + and w12,w7,w6
1724     + eor v5.16b,v5.16b,v7.16b
1725     + bic w15,w8,w6
1726     + eor w11,w6,w6,ror#5
1727     + sli v16.4s,v19.4s,#15
1728     + add w10,w10,w14
1729     + orr w12,w12,w15
1730     + ushr v17.4s,v19.4s,#10
1731     + eor w11,w11,w6,ror#19
1732     + eor w15,w10,w10,ror#11
1733     + ushr v7.4s,v19.4s,#19
1734     + add w9,w9,w12
1735     + ror w11,w11,#6
1736     + add v0.4s,v0.4s,v5.4s
1737     + eor w14,w10,w3
1738     + eor w15,w15,w10,ror#20
1739     + sli v7.4s,v19.4s,#13
1740     + add w9,w9,w11
1741     + ldr w12,[sp,#8]
1742     + and w13,w13,w14
1743     + eor v17.16b,v17.16b,v16.16b
1744     + ror w15,w15,#2
1745     + add w5,w5,w9
1746     + eor w13,w13,w3
1747     + eor v17.16b,v17.16b,v7.16b
1748     + add w8,w8,w12
1749     + add w9,w9,w15
1750     + and w12,w6,w5
1751     + add v0.4s,v0.4s,v17.4s
1752     + bic w15,w7,w5
1753     + eor w11,w5,w5,ror#5
1754     + add w9,w9,w13
1755     + ushr v18.4s,v0.4s,#17
1756     + orr w12,w12,w15
1757     + ushr v19.4s,v0.4s,#10
1758     + eor w11,w11,w5,ror#19
1759     + eor w15,w9,w9,ror#11
1760     + sli v18.4s,v0.4s,#15
1761     + add w8,w8,w12
1762     + ushr v17.4s,v0.4s,#19
1763     + ror w11,w11,#6
1764     + eor w13,w9,w10
1765     + eor v19.16b,v19.16b,v18.16b
1766     + eor w15,w15,w9,ror#20
1767     + add w8,w8,w11
1768     + sli v17.4s,v0.4s,#13
1769     + ldr w12,[sp,#12]
1770     + and w14,w14,w13
1771     + ror w15,w15,#2
1772     + ld1 {v4.4s},[x16], #16
1773     + add w4,w4,w8
1774     + eor v19.16b,v19.16b,v17.16b
1775     + eor w14,w14,w10
1776     + eor v17.16b,v17.16b,v17.16b
1777     + add w7,w7,w12
1778     + add w8,w8,w15
1779     + and w12,w5,w4
1780     + mov v17.d[1],v19.d[0]
1781     + bic w15,w6,w4
1782     + eor w11,w4,w4,ror#5
1783     + add w8,w8,w14
1784     + add v0.4s,v0.4s,v17.4s
1785     + orr w12,w12,w15
1786     + eor w11,w11,w4,ror#19
1787     + eor w15,w8,w8,ror#11
1788     + add v4.4s,v4.4s,v0.4s
1789     + add w7,w7,w12
1790     + ror w11,w11,#6
1791     + eor w14,w8,w9
1792     + eor w15,w15,w8,ror#20
1793     + add w7,w7,w11
1794     + ldr w12,[sp,#16]
1795     + and w13,w13,w14
1796     + ror w15,w15,#2
1797     + add w3,w3,w7
1798     + eor w13,w13,w9
1799     + st1 {v4.4s},[x17], #16
1800     + ext v4.16b,v1.16b,v2.16b,#4
1801     + add w6,w6,w12
1802     + add w7,w7,w15
1803     + and w12,w4,w3
1804     + bic w15,w5,w3
1805     + ext v7.16b,v3.16b,v0.16b,#4
1806     + eor w11,w3,w3,ror#5
1807     + add w7,w7,w13
1808     + mov d19,v0.d[1]
1809     + orr w12,w12,w15
1810     + eor w11,w11,w3,ror#19
1811     + ushr v6.4s,v4.4s,#7
1812     + eor w15,w7,w7,ror#11
1813     + ushr v5.4s,v4.4s,#3
1814     + add w6,w6,w12
1815     + add v1.4s,v1.4s,v7.4s
1816     + ror w11,w11,#6
1817     + sli v6.4s,v4.4s,#25
1818     + eor w13,w7,w8
1819     + eor w15,w15,w7,ror#20
1820     + ushr v7.4s,v4.4s,#18
1821     + add w6,w6,w11
1822     + ldr w12,[sp,#20]
1823     + and w14,w14,w13
1824     + eor v5.16b,v5.16b,v6.16b
1825     + ror w15,w15,#2
1826     + add w10,w10,w6
1827     + sli v7.4s,v4.4s,#14
1828     + eor w14,w14,w8
1829     + ushr v16.4s,v19.4s,#17
1830     + add w5,w5,w12
1831     + add w6,w6,w15
1832     + and w12,w3,w10
1833     + eor v5.16b,v5.16b,v7.16b
1834     + bic w15,w4,w10
1835     + eor w11,w10,w10,ror#5
1836     + sli v16.4s,v19.4s,#15
1837     + add w6,w6,w14
1838     + orr w12,w12,w15
1839     + ushr v17.4s,v19.4s,#10
1840     + eor w11,w11,w10,ror#19
1841     + eor w15,w6,w6,ror#11
1842     + ushr v7.4s,v19.4s,#19
1843     + add w5,w5,w12
1844     + ror w11,w11,#6
1845     + add v1.4s,v1.4s,v5.4s
1846     + eor w14,w6,w7
1847     + eor w15,w15,w6,ror#20
1848     + sli v7.4s,v19.4s,#13
1849     + add w5,w5,w11
1850     + ldr w12,[sp,#24]
1851     + and w13,w13,w14
1852     + eor v17.16b,v17.16b,v16.16b
1853     + ror w15,w15,#2
1854     + add w9,w9,w5
1855     + eor w13,w13,w7
1856     + eor v17.16b,v17.16b,v7.16b
1857     + add w4,w4,w12
1858     + add w5,w5,w15
1859     + and w12,w10,w9
1860     + add v1.4s,v1.4s,v17.4s
1861     + bic w15,w3,w9
1862     + eor w11,w9,w9,ror#5
1863     + add w5,w5,w13
1864     + ushr v18.4s,v1.4s,#17
1865     + orr w12,w12,w15
1866     + ushr v19.4s,v1.4s,#10
1867     + eor w11,w11,w9,ror#19
1868     + eor w15,w5,w5,ror#11
1869     + sli v18.4s,v1.4s,#15
1870     + add w4,w4,w12
1871     + ushr v17.4s,v1.4s,#19
1872     + ror w11,w11,#6
1873     + eor w13,w5,w6
1874     + eor v19.16b,v19.16b,v18.16b
1875     + eor w15,w15,w5,ror#20
1876     + add w4,w4,w11
1877     + sli v17.4s,v1.4s,#13
1878     + ldr w12,[sp,#28]
1879     + and w14,w14,w13
1880     + ror w15,w15,#2
1881     + ld1 {v4.4s},[x16], #16
1882     + add w8,w8,w4
1883     + eor v19.16b,v19.16b,v17.16b
1884     + eor w14,w14,w6
1885     + eor v17.16b,v17.16b,v17.16b
1886     + add w3,w3,w12
1887     + add w4,w4,w15
1888     + and w12,w9,w8
1889     + mov v17.d[1],v19.d[0]
1890     + bic w15,w10,w8
1891     + eor w11,w8,w8,ror#5
1892     + add w4,w4,w14
1893     + add v1.4s,v1.4s,v17.4s
1894     + orr w12,w12,w15
1895     + eor w11,w11,w8,ror#19
1896     + eor w15,w4,w4,ror#11
1897     + add v4.4s,v4.4s,v1.4s
1898     + add w3,w3,w12
1899     + ror w11,w11,#6
1900     + eor w14,w4,w5
1901     + eor w15,w15,w4,ror#20
1902     + add w3,w3,w11
1903     + ldr w12,[sp,#32]
1904     + and w13,w13,w14
1905     + ror w15,w15,#2
1906     + add w7,w7,w3
1907     + eor w13,w13,w5
1908     + st1 {v4.4s},[x17], #16
1909     + ext v4.16b,v2.16b,v3.16b,#4
1910     + add w10,w10,w12
1911     + add w3,w3,w15
1912     + and w12,w8,w7
1913     + bic w15,w9,w7
1914     + ext v7.16b,v0.16b,v1.16b,#4
1915     + eor w11,w7,w7,ror#5
1916     + add w3,w3,w13
1917     + mov d19,v1.d[1]
1918     + orr w12,w12,w15
1919     + eor w11,w11,w7,ror#19
1920     + ushr v6.4s,v4.4s,#7
1921     + eor w15,w3,w3,ror#11
1922     + ushr v5.4s,v4.4s,#3
1923     + add w10,w10,w12
1924     + add v2.4s,v2.4s,v7.4s
1925     + ror w11,w11,#6
1926     + sli v6.4s,v4.4s,#25
1927     + eor w13,w3,w4
1928     + eor w15,w15,w3,ror#20
1929     + ushr v7.4s,v4.4s,#18
1930     + add w10,w10,w11
1931     + ldr w12,[sp,#36]
1932     + and w14,w14,w13
1933     + eor v5.16b,v5.16b,v6.16b
1934     + ror w15,w15,#2
1935     + add w6,w6,w10
1936     + sli v7.4s,v4.4s,#14
1937     + eor w14,w14,w4
1938     + ushr v16.4s,v19.4s,#17
1939     + add w9,w9,w12
1940     + add w10,w10,w15
1941     + and w12,w7,w6
1942     + eor v5.16b,v5.16b,v7.16b
1943     + bic w15,w8,w6
1944     + eor w11,w6,w6,ror#5
1945     + sli v16.4s,v19.4s,#15
1946     + add w10,w10,w14
1947     + orr w12,w12,w15
1948     + ushr v17.4s,v19.4s,#10
1949     + eor w11,w11,w6,ror#19
1950     + eor w15,w10,w10,ror#11
1951     + ushr v7.4s,v19.4s,#19
1952     + add w9,w9,w12
1953     + ror w11,w11,#6
1954     + add v2.4s,v2.4s,v5.4s
1955     + eor w14,w10,w3
1956     + eor w15,w15,w10,ror#20
1957     + sli v7.4s,v19.4s,#13
1958     + add w9,w9,w11
1959     + ldr w12,[sp,#40]
1960     + and w13,w13,w14
1961     + eor v17.16b,v17.16b,v16.16b
1962     + ror w15,w15,#2
1963     + add w5,w5,w9
1964     + eor w13,w13,w3
1965     + eor v17.16b,v17.16b,v7.16b
1966     + add w8,w8,w12
1967     + add w9,w9,w15
1968     + and w12,w6,w5
1969     + add v2.4s,v2.4s,v17.4s
1970     + bic w15,w7,w5
1971     + eor w11,w5,w5,ror#5
1972     + add w9,w9,w13
1973     + ushr v18.4s,v2.4s,#17
1974     + orr w12,w12,w15
1975     + ushr v19.4s,v2.4s,#10
1976     + eor w11,w11,w5,ror#19
1977     + eor w15,w9,w9,ror#11
1978     + sli v18.4s,v2.4s,#15
1979     + add w8,w8,w12
1980     + ushr v17.4s,v2.4s,#19
1981     + ror w11,w11,#6
1982     + eor w13,w9,w10
1983     + eor v19.16b,v19.16b,v18.16b
1984     + eor w15,w15,w9,ror#20
1985     + add w8,w8,w11
1986     + sli v17.4s,v2.4s,#13
1987     + ldr w12,[sp,#44]
1988     + and w14,w14,w13
1989     + ror w15,w15,#2
1990     + ld1 {v4.4s},[x16], #16
1991     + add w4,w4,w8
1992     + eor v19.16b,v19.16b,v17.16b
1993     + eor w14,w14,w10
1994     + eor v17.16b,v17.16b,v17.16b
1995     + add w7,w7,w12
1996     + add w8,w8,w15
1997     + and w12,w5,w4
1998     + mov v17.d[1],v19.d[0]
1999     + bic w15,w6,w4
2000     + eor w11,w4,w4,ror#5
2001     + add w8,w8,w14
2002     + add v2.4s,v2.4s,v17.4s
2003     + orr w12,w12,w15
2004     + eor w11,w11,w4,ror#19
2005     + eor w15,w8,w8,ror#11
2006     + add v4.4s,v4.4s,v2.4s
2007     + add w7,w7,w12
2008     + ror w11,w11,#6
2009     + eor w14,w8,w9
2010     + eor w15,w15,w8,ror#20
2011     + add w7,w7,w11
2012     + ldr w12,[sp,#48]
2013     + and w13,w13,w14
2014     + ror w15,w15,#2
2015     + add w3,w3,w7
2016     + eor w13,w13,w9
2017     + st1 {v4.4s},[x17], #16
2018     + ext v4.16b,v3.16b,v0.16b,#4
2019     + add w6,w6,w12
2020     + add w7,w7,w15
2021     + and w12,w4,w3
2022     + bic w15,w5,w3
2023     + ext v7.16b,v1.16b,v2.16b,#4
2024     + eor w11,w3,w3,ror#5
2025     + add w7,w7,w13
2026     + mov d19,v2.d[1]
2027     + orr w12,w12,w15
2028     + eor w11,w11,w3,ror#19
2029     + ushr v6.4s,v4.4s,#7
2030     + eor w15,w7,w7,ror#11
2031     + ushr v5.4s,v4.4s,#3
2032     + add w6,w6,w12
2033     + add v3.4s,v3.4s,v7.4s
2034     + ror w11,w11,#6
2035     + sli v6.4s,v4.4s,#25
2036     + eor w13,w7,w8
2037     + eor w15,w15,w7,ror#20
2038     + ushr v7.4s,v4.4s,#18
2039     + add w6,w6,w11
2040     + ldr w12,[sp,#52]
2041     + and w14,w14,w13
2042     + eor v5.16b,v5.16b,v6.16b
2043     + ror w15,w15,#2
2044     + add w10,w10,w6
2045     + sli v7.4s,v4.4s,#14
2046     + eor w14,w14,w8
2047     + ushr v16.4s,v19.4s,#17
2048     + add w5,w5,w12
2049     + add w6,w6,w15
2050     + and w12,w3,w10
2051     + eor v5.16b,v5.16b,v7.16b
2052     + bic w15,w4,w10
2053     + eor w11,w10,w10,ror#5
2054     + sli v16.4s,v19.4s,#15
2055     + add w6,w6,w14
2056     + orr w12,w12,w15
2057     + ushr v17.4s,v19.4s,#10
2058     + eor w11,w11,w10,ror#19
2059     + eor w15,w6,w6,ror#11
2060     + ushr v7.4s,v19.4s,#19
2061     + add w5,w5,w12
2062     + ror w11,w11,#6
2063     + add v3.4s,v3.4s,v5.4s
2064     + eor w14,w6,w7
2065     + eor w15,w15,w6,ror#20
2066     + sli v7.4s,v19.4s,#13
2067     + add w5,w5,w11
2068     + ldr w12,[sp,#56]
2069     + and w13,w13,w14
2070     + eor v17.16b,v17.16b,v16.16b
2071     + ror w15,w15,#2
2072     + add w9,w9,w5
2073     + eor w13,w13,w7
2074     + eor v17.16b,v17.16b,v7.16b
2075     + add w4,w4,w12
2076     + add w5,w5,w15
2077     + and w12,w10,w9
2078     + add v3.4s,v3.4s,v17.4s
2079     + bic w15,w3,w9
2080     + eor w11,w9,w9,ror#5
2081     + add w5,w5,w13
2082     + ushr v18.4s,v3.4s,#17
2083     + orr w12,w12,w15
2084     + ushr v19.4s,v3.4s,#10
2085     + eor w11,w11,w9,ror#19
2086     + eor w15,w5,w5,ror#11
2087     + sli v18.4s,v3.4s,#15
2088     + add w4,w4,w12
2089     + ushr v17.4s,v3.4s,#19
2090     + ror w11,w11,#6
2091     + eor w13,w5,w6
2092     + eor v19.16b,v19.16b,v18.16b
2093     + eor w15,w15,w5,ror#20
2094     + add w4,w4,w11
2095     + sli v17.4s,v3.4s,#13
2096     + ldr w12,[sp,#60]
2097     + and w14,w14,w13
2098     + ror w15,w15,#2
2099     + ld1 {v4.4s},[x16], #16
2100     + add w8,w8,w4
2101     + eor v19.16b,v19.16b,v17.16b
2102     + eor w14,w14,w6
2103     + eor v17.16b,v17.16b,v17.16b
2104     + add w3,w3,w12
2105     + add w4,w4,w15
2106     + and w12,w9,w8
2107     + mov v17.d[1],v19.d[0]
2108     + bic w15,w10,w8
2109     + eor w11,w8,w8,ror#5
2110     + add w4,w4,w14
2111     + add v3.4s,v3.4s,v17.4s
2112     + orr w12,w12,w15
2113     + eor w11,w11,w8,ror#19
2114     + eor w15,w4,w4,ror#11
2115     + add v4.4s,v4.4s,v3.4s
2116     + add w3,w3,w12
2117     + ror w11,w11,#6
2118     + eor w14,w4,w5
2119     + eor w15,w15,w4,ror#20
2120     + add w3,w3,w11
2121     + ldr w12,[x16]
2122     + and w13,w13,w14
2123     + ror w15,w15,#2
2124     + add w7,w7,w3
2125     + eor w13,w13,w5
2126     + st1 {v4.4s},[x17], #16
2127     + cmp w12,#0 // check for K256 terminator
2128     + ldr w12,[sp,#0]
2129     + sub x17,x17,#64
2130     + bne .L_00_48
2131     +
2132     + sub x16,x16,#256 // rewind x16
2133     + cmp x1,x2
2134     + mov x17, #64
2135     + csel x17, x17, xzr, eq
2136     + sub x1,x1,x17 // avoid SEGV
2137     + mov x17,sp
2138     + add w10,w10,w12
2139     + add w3,w3,w15
2140     + and w12,w8,w7
2141     + ld1 {v0.16b},[x1],#16
2142     + bic w15,w9,w7
2143     + eor w11,w7,w7,ror#5
2144     + ld1 {v4.4s},[x16],#16
2145     + add w3,w3,w13
2146     + orr w12,w12,w15
2147     + eor w11,w11,w7,ror#19
2148     + eor w15,w3,w3,ror#11
2149     + rev32 v0.16b,v0.16b
2150     + add w10,w10,w12
2151     + ror w11,w11,#6
2152     + eor w13,w3,w4
2153     + eor w15,w15,w3,ror#20
2154     + add v4.4s,v4.4s,v0.4s
2155     + add w10,w10,w11
2156     + ldr w12,[sp,#4]
2157     + and w14,w14,w13
2158     + ror w15,w15,#2
2159     + add w6,w6,w10
2160     + eor w14,w14,w4
2161     + add w9,w9,w12
2162     + add w10,w10,w15
2163     + and w12,w7,w6
2164     + bic w15,w8,w6
2165     + eor w11,w6,w6,ror#5
2166     + add w10,w10,w14
2167     + orr w12,w12,w15
2168     + eor w11,w11,w6,ror#19
2169     + eor w15,w10,w10,ror#11
2170     + add w9,w9,w12
2171     + ror w11,w11,#6
2172     + eor w14,w10,w3
2173     + eor w15,w15,w10,ror#20
2174     + add w9,w9,w11
2175     + ldr w12,[sp,#8]
2176     + and w13,w13,w14
2177     + ror w15,w15,#2
2178     + add w5,w5,w9
2179     + eor w13,w13,w3
2180     + add w8,w8,w12
2181     + add w9,w9,w15
2182     + and w12,w6,w5
2183     + bic w15,w7,w5
2184     + eor w11,w5,w5,ror#5
2185     + add w9,w9,w13
2186     + orr w12,w12,w15
2187     + eor w11,w11,w5,ror#19
2188     + eor w15,w9,w9,ror#11
2189     + add w8,w8,w12
2190     + ror w11,w11,#6
2191     + eor w13,w9,w10
2192     + eor w15,w15,w9,ror#20
2193     + add w8,w8,w11
2194     + ldr w12,[sp,#12]
2195     + and w14,w14,w13
2196     + ror w15,w15,#2
2197     + add w4,w4,w8
2198     + eor w14,w14,w10
2199     + add w7,w7,w12
2200     + add w8,w8,w15
2201     + and w12,w5,w4
2202     + bic w15,w6,w4
2203     + eor w11,w4,w4,ror#5
2204     + add w8,w8,w14
2205     + orr w12,w12,w15
2206     + eor w11,w11,w4,ror#19
2207     + eor w15,w8,w8,ror#11
2208     + add w7,w7,w12
2209     + ror w11,w11,#6
2210     + eor w14,w8,w9
2211     + eor w15,w15,w8,ror#20
2212     + add w7,w7,w11
2213     + ldr w12,[sp,#16]
2214     + and w13,w13,w14
2215     + ror w15,w15,#2
2216     + add w3,w3,w7
2217     + eor w13,w13,w9
2218     + st1 {v4.4s},[x17], #16
2219     + add w6,w6,w12
2220     + add w7,w7,w15
2221     + and w12,w4,w3
2222     + ld1 {v1.16b},[x1],#16
2223     + bic w15,w5,w3
2224     + eor w11,w3,w3,ror#5
2225     + ld1 {v4.4s},[x16],#16
2226     + add w7,w7,w13
2227     + orr w12,w12,w15
2228     + eor w11,w11,w3,ror#19
2229     + eor w15,w7,w7,ror#11
2230     + rev32 v1.16b,v1.16b
2231     + add w6,w6,w12
2232     + ror w11,w11,#6
2233     + eor w13,w7,w8
2234     + eor w15,w15,w7,ror#20
2235     + add v4.4s,v4.4s,v1.4s
2236     + add w6,w6,w11
2237     + ldr w12,[sp,#20]
2238     + and w14,w14,w13
2239     + ror w15,w15,#2
2240     + add w10,w10,w6
2241     + eor w14,w14,w8
2242     + add w5,w5,w12
2243     + add w6,w6,w15
2244     + and w12,w3,w10
2245     + bic w15,w4,w10
2246     + eor w11,w10,w10,ror#5
2247     + add w6,w6,w14
2248     + orr w12,w12,w15
2249     + eor w11,w11,w10,ror#19
2250     + eor w15,w6,w6,ror#11
2251     + add w5,w5,w12
2252     + ror w11,w11,#6
2253     + eor w14,w6,w7
2254     + eor w15,w15,w6,ror#20
2255     + add w5,w5,w11
2256     + ldr w12,[sp,#24]
2257     + and w13,w13,w14
2258     + ror w15,w15,#2
2259     + add w9,w9,w5
2260     + eor w13,w13,w7
2261     + add w4,w4,w12
2262     + add w5,w5,w15
2263     + and w12,w10,w9
2264     + bic w15,w3,w9
2265     + eor w11,w9,w9,ror#5
2266     + add w5,w5,w13
2267     + orr w12,w12,w15
2268     + eor w11,w11,w9,ror#19
2269     + eor w15,w5,w5,ror#11
2270     + add w4,w4,w12
2271     + ror w11,w11,#6
2272     + eor w13,w5,w6
2273     + eor w15,w15,w5,ror#20
2274     + add w4,w4,w11
2275     + ldr w12,[sp,#28]
2276     + and w14,w14,w13
2277     + ror w15,w15,#2
2278     + add w8,w8,w4
2279     + eor w14,w14,w6
2280     + add w3,w3,w12
2281     + add w4,w4,w15
2282     + and w12,w9,w8
2283     + bic w15,w10,w8
2284     + eor w11,w8,w8,ror#5
2285     + add w4,w4,w14
2286     + orr w12,w12,w15
2287     + eor w11,w11,w8,ror#19
2288     + eor w15,w4,w4,ror#11
2289     + add w3,w3,w12
2290     + ror w11,w11,#6
2291     + eor w14,w4,w5
2292     + eor w15,w15,w4,ror#20
2293     + add w3,w3,w11
2294     + ldr w12,[sp,#32]
2295     + and w13,w13,w14
2296     + ror w15,w15,#2
2297     + add w7,w7,w3
2298     + eor w13,w13,w5
2299     + st1 {v4.4s},[x17], #16
2300     + add w10,w10,w12
2301     + add w3,w3,w15
2302     + and w12,w8,w7
2303     + ld1 {v2.16b},[x1],#16
2304     + bic w15,w9,w7
2305     + eor w11,w7,w7,ror#5
2306     + ld1 {v4.4s},[x16],#16
2307     + add w3,w3,w13
2308     + orr w12,w12,w15
2309     + eor w11,w11,w7,ror#19
2310     + eor w15,w3,w3,ror#11
2311     + rev32 v2.16b,v2.16b
2312     + add w10,w10,w12
2313     + ror w11,w11,#6
2314     + eor w13,w3,w4
2315     + eor w15,w15,w3,ror#20
2316     + add v4.4s,v4.4s,v2.4s
2317     + add w10,w10,w11
2318     + ldr w12,[sp,#36]
2319     + and w14,w14,w13
2320     + ror w15,w15,#2
2321     + add w6,w6,w10
2322     + eor w14,w14,w4
2323     + add w9,w9,w12
2324     + add w10,w10,w15
2325     + and w12,w7,w6
2326     + bic w15,w8,w6
2327     + eor w11,w6,w6,ror#5
2328     + add w10,w10,w14
2329     + orr w12,w12,w15
2330     + eor w11,w11,w6,ror#19
2331     + eor w15,w10,w10,ror#11
2332     + add w9,w9,w12
2333     + ror w11,w11,#6
2334     + eor w14,w10,w3
2335     + eor w15,w15,w10,ror#20
2336     + add w9,w9,w11
2337     + ldr w12,[sp,#40]
2338     + and w13,w13,w14
2339     + ror w15,w15,#2
2340     + add w5,w5,w9
2341     + eor w13,w13,w3
2342     + add w8,w8,w12
2343     + add w9,w9,w15
2344     + and w12,w6,w5
2345     + bic w15,w7,w5
2346     + eor w11,w5,w5,ror#5
2347     + add w9,w9,w13
2348     + orr w12,w12,w15
2349     + eor w11,w11,w5,ror#19
2350     + eor w15,w9,w9,ror#11
2351     + add w8,w8,w12
2352     + ror w11,w11,#6
2353     + eor w13,w9,w10
2354     + eor w15,w15,w9,ror#20
2355     + add w8,w8,w11
2356     + ldr w12,[sp,#44]
2357     + and w14,w14,w13
2358     + ror w15,w15,#2
2359     + add w4,w4,w8
2360     + eor w14,w14,w10
2361     + add w7,w7,w12
2362     + add w8,w8,w15
2363     + and w12,w5,w4
2364     + bic w15,w6,w4
2365     + eor w11,w4,w4,ror#5
2366     + add w8,w8,w14
2367     + orr w12,w12,w15
2368     + eor w11,w11,w4,ror#19
2369     + eor w15,w8,w8,ror#11
2370     + add w7,w7,w12
2371     + ror w11,w11,#6
2372     + eor w14,w8,w9
2373     + eor w15,w15,w8,ror#20
2374     + add w7,w7,w11
2375     + ldr w12,[sp,#48]
2376     + and w13,w13,w14
2377     + ror w15,w15,#2
2378     + add w3,w3,w7
2379     + eor w13,w13,w9
2380     + st1 {v4.4s},[x17], #16
2381     + add w6,w6,w12
2382     + add w7,w7,w15
2383     + and w12,w4,w3
2384     + ld1 {v3.16b},[x1],#16
2385     + bic w15,w5,w3
2386     + eor w11,w3,w3,ror#5
2387     + ld1 {v4.4s},[x16],#16
2388     + add w7,w7,w13
2389     + orr w12,w12,w15
2390     + eor w11,w11,w3,ror#19
2391     + eor w15,w7,w7,ror#11
2392     + rev32 v3.16b,v3.16b
2393     + add w6,w6,w12
2394     + ror w11,w11,#6
2395     + eor w13,w7,w8
2396     + eor w15,w15,w7,ror#20
2397     + add v4.4s,v4.4s,v3.4s
2398     + add w6,w6,w11
2399     + ldr w12,[sp,#52]
2400     + and w14,w14,w13
2401     + ror w15,w15,#2
2402     + add w10,w10,w6
2403     + eor w14,w14,w8
2404     + add w5,w5,w12
2405     + add w6,w6,w15
2406     + and w12,w3,w10
2407     + bic w15,w4,w10
2408     + eor w11,w10,w10,ror#5
2409     + add w6,w6,w14
2410     + orr w12,w12,w15
2411     + eor w11,w11,w10,ror#19
2412     + eor w15,w6,w6,ror#11
2413     + add w5,w5,w12
2414     + ror w11,w11,#6
2415     + eor w14,w6,w7
2416     + eor w15,w15,w6,ror#20
2417     + add w5,w5,w11
2418     + ldr w12,[sp,#56]
2419     + and w13,w13,w14
2420     + ror w15,w15,#2
2421     + add w9,w9,w5
2422     + eor w13,w13,w7
2423     + add w4,w4,w12
2424     + add w5,w5,w15
2425     + and w12,w10,w9
2426     + bic w15,w3,w9
2427     + eor w11,w9,w9,ror#5
2428     + add w5,w5,w13
2429     + orr w12,w12,w15
2430     + eor w11,w11,w9,ror#19
2431     + eor w15,w5,w5,ror#11
2432     + add w4,w4,w12
2433     + ror w11,w11,#6
2434     + eor w13,w5,w6
2435     + eor w15,w15,w5,ror#20
2436     + add w4,w4,w11
2437     + ldr w12,[sp,#60]
2438     + and w14,w14,w13
2439     + ror w15,w15,#2
2440     + add w8,w8,w4
2441     + eor w14,w14,w6
2442     + add w3,w3,w12
2443     + add w4,w4,w15
2444     + and w12,w9,w8
2445     + bic w15,w10,w8
2446     + eor w11,w8,w8,ror#5
2447     + add w4,w4,w14
2448     + orr w12,w12,w15
2449     + eor w11,w11,w8,ror#19
2450     + eor w15,w4,w4,ror#11
2451     + add w3,w3,w12
2452     + ror w11,w11,#6
2453     + eor w14,w4,w5
2454     + eor w15,w15,w4,ror#20
2455     + add w3,w3,w11
2456     + and w13,w13,w14
2457     + ror w15,w15,#2
2458     + add w7,w7,w3
2459     + eor w13,w13,w5
2460     + st1 {v4.4s},[x17], #16
2461     + add w3,w3,w15 // h+=Sigma0(a) from the past
2462     + ldp w11,w12,[x0,#0]
2463     + add w3,w3,w13 // h+=Maj(a,b,c) from the past
2464     + ldp w13,w14,[x0,#8]
2465     + add w3,w3,w11 // accumulate
2466     + add w4,w4,w12
2467     + ldp w11,w12,[x0,#16]
2468     + add w5,w5,w13
2469     + add w6,w6,w14
2470     + ldp w13,w14,[x0,#24]
2471     + add w7,w7,w11
2472     + add w8,w8,w12
2473     + ldr w12,[sp,#0]
2474     + stp w3,w4,[x0,#0]
2475     + add w9,w9,w13
2476     + mov w13,wzr
2477     + stp w5,w6,[x0,#8]
2478     + add w10,w10,w14
2479     + stp w7,w8,[x0,#16]
2480     + eor w14,w4,w5
2481     + stp w9,w10,[x0,#24]
2482     + mov w15,wzr
2483     + mov x17,sp
2484     + b.ne .L_00_48
2485     +
2486     + ldr x29,[x29]
2487     + add sp,sp,#16*4+16
2488     + ret
2489     +.size sha256_block_neon,.-sha256_block_neon
2490     +#ifndef __KERNEL__
2491     +.comm OPENSSL_armcap_P,4,4
2492     +#endif
2493     diff --git a/arch/arm64/crypto/sha512-core.S b/arch/arm64/crypto/sha512-core.S
2494     new file mode 100644
2495     index 000000000000..bd0f59f06c9d
2496     --- /dev/null
2497     +++ b/arch/arm64/crypto/sha512-core.S
2498     @@ -0,0 +1,1085 @@
2499     +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
2500     +//
2501     +// Licensed under the OpenSSL license (the "License"). You may not use
2502     +// this file except in compliance with the License. You can obtain a copy
2503     +// in the file LICENSE in the source distribution or at
2504     +// https://www.openssl.org/source/license.html
2505     +
2506     +// ====================================================================
2507     +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
2508     +// project. The module is, however, dual licensed under OpenSSL and
2509     +// CRYPTOGAMS licenses depending on where you obtain it. For further
2510     +// details see http://www.openssl.org/~appro/cryptogams/.
2511     +//
2512     +// Permission to use under GPLv2 terms is granted.
2513     +// ====================================================================
2514     +//
2515     +// SHA256/512 for ARMv8.
2516     +//
2517     +// Performance in cycles per processed byte and improvement coefficient
2518     +// over code generated with "default" compiler:
2519     +//
2520     +// SHA256-hw SHA256(*) SHA512
2521     +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
2522     +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
2523     +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
2524     +// Denver 2.01 10.5 (+26%) 6.70 (+8%)
2525     +// X-Gene 20.0 (+100%) 12.8 (+300%(***))
2526     +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
2527     +//
2528     +// (*) Software SHA256 results are of lesser relevance, presented
2529     +// mostly for informational purposes.
2530     +// (**) The result is a trade-off: it's possible to improve it by
2531     +// 10% (or by 1 cycle per round), but at the cost of 20% loss
2532     +// on Cortex-A53 (or by 4 cycles per round).
2533     +// (***) Super-impressive coefficients over gcc-generated code are
2534     +// indication of some compiler "pathology", most notably code
2535     +// generated with -mgeneral-regs-only is significanty faster
2536     +// and the gap is only 40-90%.
2537     +//
2538     +// October 2016.
2539     +//
2540     +// Originally it was reckoned that it makes no sense to implement NEON
2541     +// version of SHA256 for 64-bit processors. This is because performance
2542     +// improvement on most wide-spread Cortex-A5x processors was observed
2543     +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
2544     +// observed that 32-bit NEON SHA256 performs significantly better than
2545     +// 64-bit scalar version on *some* of the more recent processors. As
2546     +// result 64-bit NEON version of SHA256 was added to provide best
2547     +// all-round performance. For example it executes ~30% faster on X-Gene
2548     +// and Mongoose. [For reference, NEON version of SHA512 is bound to
2549     +// deliver much less improvement, likely *negative* on Cortex-A5x.
2550     +// Which is why NEON support is limited to SHA256.]
2551     +
2552     +#ifndef __KERNEL__
2553     +# include "arm_arch.h"
2554     +#endif
2555     +
2556     +.text
2557     +
2558     +.extern OPENSSL_armcap_P
2559     +.globl sha512_block_data_order
2560     +.type sha512_block_data_order,%function
2561     +.align 6
2562     +sha512_block_data_order:
2563     + stp x29,x30,[sp,#-128]!
2564     + add x29,sp,#0
2565     +
2566     + stp x19,x20,[sp,#16]
2567     + stp x21,x22,[sp,#32]
2568     + stp x23,x24,[sp,#48]
2569     + stp x25,x26,[sp,#64]
2570     + stp x27,x28,[sp,#80]
2571     + sub sp,sp,#4*8
2572     +
2573     + ldp x20,x21,[x0] // load context
2574     + ldp x22,x23,[x0,#2*8]
2575     + ldp x24,x25,[x0,#4*8]
2576     + add x2,x1,x2,lsl#7 // end of input
2577     + ldp x26,x27,[x0,#6*8]
2578     + adr x30,.LK512
2579     + stp x0,x2,[x29,#96]
2580     +
2581     +.Loop:
2582     + ldp x3,x4,[x1],#2*8
2583     + ldr x19,[x30],#8 // *K++
2584     + eor x28,x21,x22 // magic seed
2585     + str x1,[x29,#112]
2586     +#ifndef __AARCH64EB__
2587     + rev x3,x3 // 0
2588     +#endif
2589     + ror x16,x24,#14
2590     + add x27,x27,x19 // h+=K[i]
2591     + eor x6,x24,x24,ror#23
2592     + and x17,x25,x24
2593     + bic x19,x26,x24
2594     + add x27,x27,x3 // h+=X[i]
2595     + orr x17,x17,x19 // Ch(e,f,g)
2596     + eor x19,x20,x21 // a^b, b^c in next round
2597     + eor x16,x16,x6,ror#18 // Sigma1(e)
2598     + ror x6,x20,#28
2599     + add x27,x27,x17 // h+=Ch(e,f,g)
2600     + eor x17,x20,x20,ror#5
2601     + add x27,x27,x16 // h+=Sigma1(e)
2602     + and x28,x28,x19 // (b^c)&=(a^b)
2603     + add x23,x23,x27 // d+=h
2604     + eor x28,x28,x21 // Maj(a,b,c)
2605     + eor x17,x6,x17,ror#34 // Sigma0(a)
2606     + add x27,x27,x28 // h+=Maj(a,b,c)
2607     + ldr x28,[x30],#8 // *K++, x19 in next round
2608     + //add x27,x27,x17 // h+=Sigma0(a)
2609     +#ifndef __AARCH64EB__
2610     + rev x4,x4 // 1
2611     +#endif
2612     + ldp x5,x6,[x1],#2*8
2613     + add x27,x27,x17 // h+=Sigma0(a)
2614     + ror x16,x23,#14
2615     + add x26,x26,x28 // h+=K[i]
2616     + eor x7,x23,x23,ror#23
2617     + and x17,x24,x23
2618     + bic x28,x25,x23
2619     + add x26,x26,x4 // h+=X[i]
2620     + orr x17,x17,x28 // Ch(e,f,g)
2621     + eor x28,x27,x20 // a^b, b^c in next round
2622     + eor x16,x16,x7,ror#18 // Sigma1(e)
2623     + ror x7,x27,#28
2624     + add x26,x26,x17 // h+=Ch(e,f,g)
2625     + eor x17,x27,x27,ror#5
2626     + add x26,x26,x16 // h+=Sigma1(e)
2627     + and x19,x19,x28 // (b^c)&=(a^b)
2628     + add x22,x22,x26 // d+=h
2629     + eor x19,x19,x20 // Maj(a,b,c)
2630     + eor x17,x7,x17,ror#34 // Sigma0(a)
2631     + add x26,x26,x19 // h+=Maj(a,b,c)
2632     + ldr x19,[x30],#8 // *K++, x28 in next round
2633     + //add x26,x26,x17 // h+=Sigma0(a)
2634     +#ifndef __AARCH64EB__
2635     + rev x5,x5 // 2
2636     +#endif
2637     + add x26,x26,x17 // h+=Sigma0(a)
2638     + ror x16,x22,#14
2639     + add x25,x25,x19 // h+=K[i]
2640     + eor x8,x22,x22,ror#23
2641     + and x17,x23,x22
2642     + bic x19,x24,x22
2643     + add x25,x25,x5 // h+=X[i]
2644     + orr x17,x17,x19 // Ch(e,f,g)
2645     + eor x19,x26,x27 // a^b, b^c in next round
2646     + eor x16,x16,x8,ror#18 // Sigma1(e)
2647     + ror x8,x26,#28
2648     + add x25,x25,x17 // h+=Ch(e,f,g)
2649     + eor x17,x26,x26,ror#5
2650     + add x25,x25,x16 // h+=Sigma1(e)
2651     + and x28,x28,x19 // (b^c)&=(a^b)
2652     + add x21,x21,x25 // d+=h
2653     + eor x28,x28,x27 // Maj(a,b,c)
2654     + eor x17,x8,x17,ror#34 // Sigma0(a)
2655     + add x25,x25,x28 // h+=Maj(a,b,c)
2656     + ldr x28,[x30],#8 // *K++, x19 in next round
2657     + //add x25,x25,x17 // h+=Sigma0(a)
2658     +#ifndef __AARCH64EB__
2659     + rev x6,x6 // 3
2660     +#endif
2661     + ldp x7,x8,[x1],#2*8
2662     + add x25,x25,x17 // h+=Sigma0(a)
2663     + ror x16,x21,#14
2664     + add x24,x24,x28 // h+=K[i]
2665     + eor x9,x21,x21,ror#23
2666     + and x17,x22,x21
2667     + bic x28,x23,x21
2668     + add x24,x24,x6 // h+=X[i]
2669     + orr x17,x17,x28 // Ch(e,f,g)
2670     + eor x28,x25,x26 // a^b, b^c in next round
2671     + eor x16,x16,x9,ror#18 // Sigma1(e)
2672     + ror x9,x25,#28
2673     + add x24,x24,x17 // h+=Ch(e,f,g)
2674     + eor x17,x25,x25,ror#5
2675     + add x24,x24,x16 // h+=Sigma1(e)
2676     + and x19,x19,x28 // (b^c)&=(a^b)
2677     + add x20,x20,x24 // d+=h
2678     + eor x19,x19,x26 // Maj(a,b,c)
2679     + eor x17,x9,x17,ror#34 // Sigma0(a)
2680     + add x24,x24,x19 // h+=Maj(a,b,c)
2681     + ldr x19,[x30],#8 // *K++, x28 in next round
2682     + //add x24,x24,x17 // h+=Sigma0(a)
2683     +#ifndef __AARCH64EB__
2684     + rev x7,x7 // 4
2685     +#endif
2686     + add x24,x24,x17 // h+=Sigma0(a)
2687     + ror x16,x20,#14
2688     + add x23,x23,x19 // h+=K[i]
2689     + eor x10,x20,x20,ror#23
2690     + and x17,x21,x20
2691     + bic x19,x22,x20
2692     + add x23,x23,x7 // h+=X[i]
2693     + orr x17,x17,x19 // Ch(e,f,g)
2694     + eor x19,x24,x25 // a^b, b^c in next round
2695     + eor x16,x16,x10,ror#18 // Sigma1(e)
2696     + ror x10,x24,#28
2697     + add x23,x23,x17 // h+=Ch(e,f,g)
2698     + eor x17,x24,x24,ror#5
2699     + add x23,x23,x16 // h+=Sigma1(e)
2700     + and x28,x28,x19 // (b^c)&=(a^b)
2701     + add x27,x27,x23 // d+=h
2702     + eor x28,x28,x25 // Maj(a,b,c)
2703     + eor x17,x10,x17,ror#34 // Sigma0(a)
2704     + add x23,x23,x28 // h+=Maj(a,b,c)
2705     + ldr x28,[x30],#8 // *K++, x19 in next round
2706     + //add x23,x23,x17 // h+=Sigma0(a)
2707     +#ifndef __AARCH64EB__
2708     + rev x8,x8 // 5
2709     +#endif
2710     + ldp x9,x10,[x1],#2*8
2711     + add x23,x23,x17 // h+=Sigma0(a)
2712     + ror x16,x27,#14
2713     + add x22,x22,x28 // h+=K[i]
2714     + eor x11,x27,x27,ror#23
2715     + and x17,x20,x27
2716     + bic x28,x21,x27
2717     + add x22,x22,x8 // h+=X[i]
2718     + orr x17,x17,x28 // Ch(e,f,g)
2719     + eor x28,x23,x24 // a^b, b^c in next round
2720     + eor x16,x16,x11,ror#18 // Sigma1(e)
2721     + ror x11,x23,#28
2722     + add x22,x22,x17 // h+=Ch(e,f,g)
2723     + eor x17,x23,x23,ror#5
2724     + add x22,x22,x16 // h+=Sigma1(e)
2725     + and x19,x19,x28 // (b^c)&=(a^b)
2726     + add x26,x26,x22 // d+=h
2727     + eor x19,x19,x24 // Maj(a,b,c)
2728     + eor x17,x11,x17,ror#34 // Sigma0(a)
2729     + add x22,x22,x19 // h+=Maj(a,b,c)
2730     + ldr x19,[x30],#8 // *K++, x28 in next round
2731     + //add x22,x22,x17 // h+=Sigma0(a)
2732     +#ifndef __AARCH64EB__
2733     + rev x9,x9 // 6
2734     +#endif
2735     + add x22,x22,x17 // h+=Sigma0(a)
2736     + ror x16,x26,#14
2737     + add x21,x21,x19 // h+=K[i]
2738     + eor x12,x26,x26,ror#23
2739     + and x17,x27,x26
2740     + bic x19,x20,x26
2741     + add x21,x21,x9 // h+=X[i]
2742     + orr x17,x17,x19 // Ch(e,f,g)
2743     + eor x19,x22,x23 // a^b, b^c in next round
2744     + eor x16,x16,x12,ror#18 // Sigma1(e)
2745     + ror x12,x22,#28
2746     + add x21,x21,x17 // h+=Ch(e,f,g)
2747     + eor x17,x22,x22,ror#5
2748     + add x21,x21,x16 // h+=Sigma1(e)
2749     + and x28,x28,x19 // (b^c)&=(a^b)
2750     + add x25,x25,x21 // d+=h
2751     + eor x28,x28,x23 // Maj(a,b,c)
2752     + eor x17,x12,x17,ror#34 // Sigma0(a)
2753     + add x21,x21,x28 // h+=Maj(a,b,c)
2754     + ldr x28,[x30],#8 // *K++, x19 in next round
2755     + //add x21,x21,x17 // h+=Sigma0(a)
2756     +#ifndef __AARCH64EB__
2757     + rev x10,x10 // 7
2758     +#endif
2759     + ldp x11,x12,[x1],#2*8
2760     + add x21,x21,x17 // h+=Sigma0(a)
2761     + ror x16,x25,#14
2762     + add x20,x20,x28 // h+=K[i]
2763     + eor x13,x25,x25,ror#23
2764     + and x17,x26,x25
2765     + bic x28,x27,x25
2766     + add x20,x20,x10 // h+=X[i]
2767     + orr x17,x17,x28 // Ch(e,f,g)
2768     + eor x28,x21,x22 // a^b, b^c in next round
2769     + eor x16,x16,x13,ror#18 // Sigma1(e)
2770     + ror x13,x21,#28
2771     + add x20,x20,x17 // h+=Ch(e,f,g)
2772     + eor x17,x21,x21,ror#5
2773     + add x20,x20,x16 // h+=Sigma1(e)
2774     + and x19,x19,x28 // (b^c)&=(a^b)
2775     + add x24,x24,x20 // d+=h
2776     + eor x19,x19,x22 // Maj(a,b,c)
2777     + eor x17,x13,x17,ror#34 // Sigma0(a)
2778     + add x20,x20,x19 // h+=Maj(a,b,c)
2779     + ldr x19,[x30],#8 // *K++, x28 in next round
2780     + //add x20,x20,x17 // h+=Sigma0(a)
2781     +#ifndef __AARCH64EB__
2782     + rev x11,x11 // 8
2783     +#endif
2784     + add x20,x20,x17 // h+=Sigma0(a)
2785     + ror x16,x24,#14
2786     + add x27,x27,x19 // h+=K[i]
2787     + eor x14,x24,x24,ror#23
2788     + and x17,x25,x24
2789     + bic x19,x26,x24
2790     + add x27,x27,x11 // h+=X[i]
2791     + orr x17,x17,x19 // Ch(e,f,g)
2792     + eor x19,x20,x21 // a^b, b^c in next round
2793     + eor x16,x16,x14,ror#18 // Sigma1(e)
2794     + ror x14,x20,#28
2795     + add x27,x27,x17 // h+=Ch(e,f,g)
2796     + eor x17,x20,x20,ror#5
2797     + add x27,x27,x16 // h+=Sigma1(e)
2798     + and x28,x28,x19 // (b^c)&=(a^b)
2799     + add x23,x23,x27 // d+=h
2800     + eor x28,x28,x21 // Maj(a,b,c)
2801     + eor x17,x14,x17,ror#34 // Sigma0(a)
2802     + add x27,x27,x28 // h+=Maj(a,b,c)
2803     + ldr x28,[x30],#8 // *K++, x19 in next round
2804     + //add x27,x27,x17 // h+=Sigma0(a)
2805     +#ifndef __AARCH64EB__
2806     + rev x12,x12 // 9
2807     +#endif
2808     + ldp x13,x14,[x1],#2*8
2809     + add x27,x27,x17 // h+=Sigma0(a)
2810     + ror x16,x23,#14
2811     + add x26,x26,x28 // h+=K[i]
2812     + eor x15,x23,x23,ror#23
2813     + and x17,x24,x23
2814     + bic x28,x25,x23
2815     + add x26,x26,x12 // h+=X[i]
2816     + orr x17,x17,x28 // Ch(e,f,g)
2817     + eor x28,x27,x20 // a^b, b^c in next round
2818     + eor x16,x16,x15,ror#18 // Sigma1(e)
2819     + ror x15,x27,#28
2820     + add x26,x26,x17 // h+=Ch(e,f,g)
2821     + eor x17,x27,x27,ror#5
2822     + add x26,x26,x16 // h+=Sigma1(e)
2823     + and x19,x19,x28 // (b^c)&=(a^b)
2824     + add x22,x22,x26 // d+=h
2825     + eor x19,x19,x20 // Maj(a,b,c)
2826     + eor x17,x15,x17,ror#34 // Sigma0(a)
2827     + add x26,x26,x19 // h+=Maj(a,b,c)
2828     + ldr x19,[x30],#8 // *K++, x28 in next round
2829     + //add x26,x26,x17 // h+=Sigma0(a)
2830     +#ifndef __AARCH64EB__
2831     + rev x13,x13 // 10
2832     +#endif
2833     + add x26,x26,x17 // h+=Sigma0(a)
2834     + ror x16,x22,#14
2835     + add x25,x25,x19 // h+=K[i]
2836     + eor x0,x22,x22,ror#23
2837     + and x17,x23,x22
2838     + bic x19,x24,x22
2839     + add x25,x25,x13 // h+=X[i]
2840     + orr x17,x17,x19 // Ch(e,f,g)
2841     + eor x19,x26,x27 // a^b, b^c in next round
2842     + eor x16,x16,x0,ror#18 // Sigma1(e)
2843     + ror x0,x26,#28
2844     + add x25,x25,x17 // h+=Ch(e,f,g)
2845     + eor x17,x26,x26,ror#5
2846     + add x25,x25,x16 // h+=Sigma1(e)
2847     + and x28,x28,x19 // (b^c)&=(a^b)
2848     + add x21,x21,x25 // d+=h
2849     + eor x28,x28,x27 // Maj(a,b,c)
2850     + eor x17,x0,x17,ror#34 // Sigma0(a)
2851     + add x25,x25,x28 // h+=Maj(a,b,c)
2852     + ldr x28,[x30],#8 // *K++, x19 in next round
2853     + //add x25,x25,x17 // h+=Sigma0(a)
2854     +#ifndef __AARCH64EB__
2855     + rev x14,x14 // 11
2856     +#endif
2857     + ldp x15,x0,[x1],#2*8
2858     + add x25,x25,x17 // h+=Sigma0(a)
2859     + str x6,[sp,#24]
2860     + ror x16,x21,#14
2861     + add x24,x24,x28 // h+=K[i]
2862     + eor x6,x21,x21,ror#23
2863     + and x17,x22,x21
2864     + bic x28,x23,x21
2865     + add x24,x24,x14 // h+=X[i]
2866     + orr x17,x17,x28 // Ch(e,f,g)
2867     + eor x28,x25,x26 // a^b, b^c in next round
2868     + eor x16,x16,x6,ror#18 // Sigma1(e)
2869     + ror x6,x25,#28
2870     + add x24,x24,x17 // h+=Ch(e,f,g)
2871     + eor x17,x25,x25,ror#5
2872     + add x24,x24,x16 // h+=Sigma1(e)
2873     + and x19,x19,x28 // (b^c)&=(a^b)
2874     + add x20,x20,x24 // d+=h
2875     + eor x19,x19,x26 // Maj(a,b,c)
2876     + eor x17,x6,x17,ror#34 // Sigma0(a)
2877     + add x24,x24,x19 // h+=Maj(a,b,c)
2878     + ldr x19,[x30],#8 // *K++, x28 in next round
2879     + //add x24,x24,x17 // h+=Sigma0(a)
2880     +#ifndef __AARCH64EB__
2881     + rev x15,x15 // 12
2882     +#endif
2883     + add x24,x24,x17 // h+=Sigma0(a)
2884     + str x7,[sp,#0]
2885     + ror x16,x20,#14
2886     + add x23,x23,x19 // h+=K[i]
2887     + eor x7,x20,x20,ror#23
2888     + and x17,x21,x20
2889     + bic x19,x22,x20
2890     + add x23,x23,x15 // h+=X[i]
2891     + orr x17,x17,x19 // Ch(e,f,g)
2892     + eor x19,x24,x25 // a^b, b^c in next round
2893     + eor x16,x16,x7,ror#18 // Sigma1(e)
2894     + ror x7,x24,#28
2895     + add x23,x23,x17 // h+=Ch(e,f,g)
2896     + eor x17,x24,x24,ror#5
2897     + add x23,x23,x16 // h+=Sigma1(e)
2898     + and x28,x28,x19 // (b^c)&=(a^b)
2899     + add x27,x27,x23 // d+=h
2900     + eor x28,x28,x25 // Maj(a,b,c)
2901     + eor x17,x7,x17,ror#34 // Sigma0(a)
2902     + add x23,x23,x28 // h+=Maj(a,b,c)
2903     + ldr x28,[x30],#8 // *K++, x19 in next round
2904     + //add x23,x23,x17 // h+=Sigma0(a)
2905     +#ifndef __AARCH64EB__
2906     + rev x0,x0 // 13
2907     +#endif
2908     + ldp x1,x2,[x1]
2909     + add x23,x23,x17 // h+=Sigma0(a)
2910     + str x8,[sp,#8]
2911     + ror x16,x27,#14
2912     + add x22,x22,x28 // h+=K[i]
2913     + eor x8,x27,x27,ror#23
2914     + and x17,x20,x27
2915     + bic x28,x21,x27
2916     + add x22,x22,x0 // h+=X[i]
2917     + orr x17,x17,x28 // Ch(e,f,g)
2918     + eor x28,x23,x24 // a^b, b^c in next round
2919     + eor x16,x16,x8,ror#18 // Sigma1(e)
2920     + ror x8,x23,#28
2921     + add x22,x22,x17 // h+=Ch(e,f,g)
2922     + eor x17,x23,x23,ror#5
2923     + add x22,x22,x16 // h+=Sigma1(e)
2924     + and x19,x19,x28 // (b^c)&=(a^b)
2925     + add x26,x26,x22 // d+=h
2926     + eor x19,x19,x24 // Maj(a,b,c)
2927     + eor x17,x8,x17,ror#34 // Sigma0(a)
2928     + add x22,x22,x19 // h+=Maj(a,b,c)
2929     + ldr x19,[x30],#8 // *K++, x28 in next round
2930     + //add x22,x22,x17 // h+=Sigma0(a)
2931     +#ifndef __AARCH64EB__
2932     + rev x1,x1 // 14
2933     +#endif
2934     + ldr x6,[sp,#24]
2935     + add x22,x22,x17 // h+=Sigma0(a)
2936     + str x9,[sp,#16]
2937     + ror x16,x26,#14
2938     + add x21,x21,x19 // h+=K[i]
2939     + eor x9,x26,x26,ror#23
2940     + and x17,x27,x26
2941     + bic x19,x20,x26
2942     + add x21,x21,x1 // h+=X[i]
2943     + orr x17,x17,x19 // Ch(e,f,g)
2944     + eor x19,x22,x23 // a^b, b^c in next round
2945     + eor x16,x16,x9,ror#18 // Sigma1(e)
2946     + ror x9,x22,#28
2947     + add x21,x21,x17 // h+=Ch(e,f,g)
2948     + eor x17,x22,x22,ror#5
2949     + add x21,x21,x16 // h+=Sigma1(e)
2950     + and x28,x28,x19 // (b^c)&=(a^b)
2951     + add x25,x25,x21 // d+=h
2952     + eor x28,x28,x23 // Maj(a,b,c)
2953     + eor x17,x9,x17,ror#34 // Sigma0(a)
2954     + add x21,x21,x28 // h+=Maj(a,b,c)
2955     + ldr x28,[x30],#8 // *K++, x19 in next round
2956     + //add x21,x21,x17 // h+=Sigma0(a)
2957     +#ifndef __AARCH64EB__
2958     + rev x2,x2 // 15
2959     +#endif
2960     + ldr x7,[sp,#0]
2961     + add x21,x21,x17 // h+=Sigma0(a)
2962     + str x10,[sp,#24]
2963     + ror x16,x25,#14
2964     + add x20,x20,x28 // h+=K[i]
2965     + ror x9,x4,#1
2966     + and x17,x26,x25
2967     + ror x8,x1,#19
2968     + bic x28,x27,x25
2969     + ror x10,x21,#28
2970     + add x20,x20,x2 // h+=X[i]
2971     + eor x16,x16,x25,ror#18
2972     + eor x9,x9,x4,ror#8
2973     + orr x17,x17,x28 // Ch(e,f,g)
2974     + eor x28,x21,x22 // a^b, b^c in next round
2975     + eor x16,x16,x25,ror#41 // Sigma1(e)
2976     + eor x10,x10,x21,ror#34
2977     + add x20,x20,x17 // h+=Ch(e,f,g)
2978     + and x19,x19,x28 // (b^c)&=(a^b)
2979     + eor x8,x8,x1,ror#61
2980     + eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
2981     + add x20,x20,x16 // h+=Sigma1(e)
2982     + eor x19,x19,x22 // Maj(a,b,c)
2983     + eor x17,x10,x21,ror#39 // Sigma0(a)
2984     + eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
2985     + add x3,x3,x12
2986     + add x24,x24,x20 // d+=h
2987     + add x20,x20,x19 // h+=Maj(a,b,c)
2988     + ldr x19,[x30],#8 // *K++, x28 in next round
2989     + add x3,x3,x9
2990     + add x20,x20,x17 // h+=Sigma0(a)
2991     + add x3,x3,x8
2992     +.Loop_16_xx:
2993     + ldr x8,[sp,#8]
2994     + str x11,[sp,#0]
2995     + ror x16,x24,#14
2996     + add x27,x27,x19 // h+=K[i]
2997     + ror x10,x5,#1
2998     + and x17,x25,x24
2999     + ror x9,x2,#19
3000     + bic x19,x26,x24
3001     + ror x11,x20,#28
3002     + add x27,x27,x3 // h+=X[i]
3003     + eor x16,x16,x24,ror#18
3004     + eor x10,x10,x5,ror#8
3005     + orr x17,x17,x19 // Ch(e,f,g)
3006     + eor x19,x20,x21 // a^b, b^c in next round
3007     + eor x16,x16,x24,ror#41 // Sigma1(e)
3008     + eor x11,x11,x20,ror#34
3009     + add x27,x27,x17 // h+=Ch(e,f,g)
3010     + and x28,x28,x19 // (b^c)&=(a^b)
3011     + eor x9,x9,x2,ror#61
3012     + eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
3013     + add x27,x27,x16 // h+=Sigma1(e)
3014     + eor x28,x28,x21 // Maj(a,b,c)
3015     + eor x17,x11,x20,ror#39 // Sigma0(a)
3016     + eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
3017     + add x4,x4,x13
3018     + add x23,x23,x27 // d+=h
3019     + add x27,x27,x28 // h+=Maj(a,b,c)
3020     + ldr x28,[x30],#8 // *K++, x19 in next round
3021     + add x4,x4,x10
3022     + add x27,x27,x17 // h+=Sigma0(a)
3023     + add x4,x4,x9
3024     + ldr x9,[sp,#16]
3025     + str x12,[sp,#8]
3026     + ror x16,x23,#14
3027     + add x26,x26,x28 // h+=K[i]
3028     + ror x11,x6,#1
3029     + and x17,x24,x23
3030     + ror x10,x3,#19
3031     + bic x28,x25,x23
3032     + ror x12,x27,#28
3033     + add x26,x26,x4 // h+=X[i]
3034     + eor x16,x16,x23,ror#18
3035     + eor x11,x11,x6,ror#8
3036     + orr x17,x17,x28 // Ch(e,f,g)
3037     + eor x28,x27,x20 // a^b, b^c in next round
3038     + eor x16,x16,x23,ror#41 // Sigma1(e)
3039     + eor x12,x12,x27,ror#34
3040     + add x26,x26,x17 // h+=Ch(e,f,g)
3041     + and x19,x19,x28 // (b^c)&=(a^b)
3042     + eor x10,x10,x3,ror#61
3043     + eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
3044     + add x26,x26,x16 // h+=Sigma1(e)
3045     + eor x19,x19,x20 // Maj(a,b,c)
3046     + eor x17,x12,x27,ror#39 // Sigma0(a)
3047     + eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
3048     + add x5,x5,x14
3049     + add x22,x22,x26 // d+=h
3050     + add x26,x26,x19 // h+=Maj(a,b,c)
3051     + ldr x19,[x30],#8 // *K++, x28 in next round
3052     + add x5,x5,x11
3053     + add x26,x26,x17 // h+=Sigma0(a)
3054     + add x5,x5,x10
3055     + ldr x10,[sp,#24]
3056     + str x13,[sp,#16]
3057     + ror x16,x22,#14
3058     + add x25,x25,x19 // h+=K[i]
3059     + ror x12,x7,#1
3060     + and x17,x23,x22
3061     + ror x11,x4,#19
3062     + bic x19,x24,x22
3063     + ror x13,x26,#28
3064     + add x25,x25,x5 // h+=X[i]
3065     + eor x16,x16,x22,ror#18
3066     + eor x12,x12,x7,ror#8
3067     + orr x17,x17,x19 // Ch(e,f,g)
3068     + eor x19,x26,x27 // a^b, b^c in next round
3069     + eor x16,x16,x22,ror#41 // Sigma1(e)
3070     + eor x13,x13,x26,ror#34
3071     + add x25,x25,x17 // h+=Ch(e,f,g)
3072     + and x28,x28,x19 // (b^c)&=(a^b)
3073     + eor x11,x11,x4,ror#61
3074     + eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
3075     + add x25,x25,x16 // h+=Sigma1(e)
3076     + eor x28,x28,x27 // Maj(a,b,c)
3077     + eor x17,x13,x26,ror#39 // Sigma0(a)
3078     + eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
3079     + add x6,x6,x15
3080     + add x21,x21,x25 // d+=h
3081     + add x25,x25,x28 // h+=Maj(a,b,c)
3082     + ldr x28,[x30],#8 // *K++, x19 in next round
3083     + add x6,x6,x12
3084     + add x25,x25,x17 // h+=Sigma0(a)
3085     + add x6,x6,x11
3086     + ldr x11,[sp,#0]
3087     + str x14,[sp,#24]
3088     + ror x16,x21,#14
3089     + add x24,x24,x28 // h+=K[i]
3090     + ror x13,x8,#1
3091     + and x17,x22,x21
3092     + ror x12,x5,#19
3093     + bic x28,x23,x21
3094     + ror x14,x25,#28
3095     + add x24,x24,x6 // h+=X[i]
3096     + eor x16,x16,x21,ror#18
3097     + eor x13,x13,x8,ror#8
3098     + orr x17,x17,x28 // Ch(e,f,g)
3099     + eor x28,x25,x26 // a^b, b^c in next round
3100     + eor x16,x16,x21,ror#41 // Sigma1(e)
3101     + eor x14,x14,x25,ror#34
3102     + add x24,x24,x17 // h+=Ch(e,f,g)
3103     + and x19,x19,x28 // (b^c)&=(a^b)
3104     + eor x12,x12,x5,ror#61
3105     + eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
3106     + add x24,x24,x16 // h+=Sigma1(e)
3107     + eor x19,x19,x26 // Maj(a,b,c)
3108     + eor x17,x14,x25,ror#39 // Sigma0(a)
3109     + eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
3110     + add x7,x7,x0
3111     + add x20,x20,x24 // d+=h
3112     + add x24,x24,x19 // h+=Maj(a,b,c)
3113     + ldr x19,[x30],#8 // *K++, x28 in next round
3114     + add x7,x7,x13
3115     + add x24,x24,x17 // h+=Sigma0(a)
3116     + add x7,x7,x12
3117     + ldr x12,[sp,#8]
3118     + str x15,[sp,#0]
3119     + ror x16,x20,#14
3120     + add x23,x23,x19 // h+=K[i]
3121     + ror x14,x9,#1
3122     + and x17,x21,x20
3123     + ror x13,x6,#19
3124     + bic x19,x22,x20
3125     + ror x15,x24,#28
3126     + add x23,x23,x7 // h+=X[i]
3127     + eor x16,x16,x20,ror#18
3128     + eor x14,x14,x9,ror#8
3129     + orr x17,x17,x19 // Ch(e,f,g)
3130     + eor x19,x24,x25 // a^b, b^c in next round
3131     + eor x16,x16,x20,ror#41 // Sigma1(e)
3132     + eor x15,x15,x24,ror#34
3133     + add x23,x23,x17 // h+=Ch(e,f,g)
3134     + and x28,x28,x19 // (b^c)&=(a^b)
3135     + eor x13,x13,x6,ror#61
3136     + eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
3137     + add x23,x23,x16 // h+=Sigma1(e)
3138     + eor x28,x28,x25 // Maj(a,b,c)
3139     + eor x17,x15,x24,ror#39 // Sigma0(a)
3140     + eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
3141     + add x8,x8,x1
3142     + add x27,x27,x23 // d+=h
3143     + add x23,x23,x28 // h+=Maj(a,b,c)
3144     + ldr x28,[x30],#8 // *K++, x19 in next round
3145     + add x8,x8,x14
3146     + add x23,x23,x17 // h+=Sigma0(a)
3147     + add x8,x8,x13
3148     + ldr x13,[sp,#16]
3149     + str x0,[sp,#8]
3150     + ror x16,x27,#14
3151     + add x22,x22,x28 // h+=K[i]
3152     + ror x15,x10,#1
3153     + and x17,x20,x27
3154     + ror x14,x7,#19
3155     + bic x28,x21,x27
3156     + ror x0,x23,#28
3157     + add x22,x22,x8 // h+=X[i]
3158     + eor x16,x16,x27,ror#18
3159     + eor x15,x15,x10,ror#8
3160     + orr x17,x17,x28 // Ch(e,f,g)
3161     + eor x28,x23,x24 // a^b, b^c in next round
3162     + eor x16,x16,x27,ror#41 // Sigma1(e)
3163     + eor x0,x0,x23,ror#34
3164     + add x22,x22,x17 // h+=Ch(e,f,g)
3165     + and x19,x19,x28 // (b^c)&=(a^b)
3166     + eor x14,x14,x7,ror#61
3167     + eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
3168     + add x22,x22,x16 // h+=Sigma1(e)
3169     + eor x19,x19,x24 // Maj(a,b,c)
3170     + eor x17,x0,x23,ror#39 // Sigma0(a)
3171     + eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
3172     + add x9,x9,x2
3173     + add x26,x26,x22 // d+=h
3174     + add x22,x22,x19 // h+=Maj(a,b,c)
3175     + ldr x19,[x30],#8 // *K++, x28 in next round
3176     + add x9,x9,x15
3177     + add x22,x22,x17 // h+=Sigma0(a)
3178     + add x9,x9,x14
3179     + ldr x14,[sp,#24]
3180     + str x1,[sp,#16]
3181     + ror x16,x26,#14
3182     + add x21,x21,x19 // h+=K[i]
3183     + ror x0,x11,#1
3184     + and x17,x27,x26
3185     + ror x15,x8,#19
3186     + bic x19,x20,x26
3187     + ror x1,x22,#28
3188     + add x21,x21,x9 // h+=X[i]
3189     + eor x16,x16,x26,ror#18
3190     + eor x0,x0,x11,ror#8
3191     + orr x17,x17,x19 // Ch(e,f,g)
3192     + eor x19,x22,x23 // a^b, b^c in next round
3193     + eor x16,x16,x26,ror#41 // Sigma1(e)
3194     + eor x1,x1,x22,ror#34
3195     + add x21,x21,x17 // h+=Ch(e,f,g)
3196     + and x28,x28,x19 // (b^c)&=(a^b)
3197     + eor x15,x15,x8,ror#61
3198     + eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
3199     + add x21,x21,x16 // h+=Sigma1(e)
3200     + eor x28,x28,x23 // Maj(a,b,c)
3201     + eor x17,x1,x22,ror#39 // Sigma0(a)
3202     + eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
3203     + add x10,x10,x3
3204     + add x25,x25,x21 // d+=h
3205     + add x21,x21,x28 // h+=Maj(a,b,c)
3206     + ldr x28,[x30],#8 // *K++, x19 in next round
3207     + add x10,x10,x0
3208     + add x21,x21,x17 // h+=Sigma0(a)
3209     + add x10,x10,x15
3210     + ldr x15,[sp,#0]
3211     + str x2,[sp,#24]
3212     + ror x16,x25,#14
3213     + add x20,x20,x28 // h+=K[i]
3214     + ror x1,x12,#1
3215     + and x17,x26,x25
3216     + ror x0,x9,#19
3217     + bic x28,x27,x25
3218     + ror x2,x21,#28
3219     + add x20,x20,x10 // h+=X[i]
3220     + eor x16,x16,x25,ror#18
3221     + eor x1,x1,x12,ror#8
3222     + orr x17,x17,x28 // Ch(e,f,g)
3223     + eor x28,x21,x22 // a^b, b^c in next round
3224     + eor x16,x16,x25,ror#41 // Sigma1(e)
3225     + eor x2,x2,x21,ror#34
3226     + add x20,x20,x17 // h+=Ch(e,f,g)
3227     + and x19,x19,x28 // (b^c)&=(a^b)
3228     + eor x0,x0,x9,ror#61
3229     + eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
3230     + add x20,x20,x16 // h+=Sigma1(e)
3231     + eor x19,x19,x22 // Maj(a,b,c)
3232     + eor x17,x2,x21,ror#39 // Sigma0(a)
3233     + eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
3234     + add x11,x11,x4
3235     + add x24,x24,x20 // d+=h
3236     + add x20,x20,x19 // h+=Maj(a,b,c)
3237     + ldr x19,[x30],#8 // *K++, x28 in next round
3238     + add x11,x11,x1
3239     + add x20,x20,x17 // h+=Sigma0(a)
3240     + add x11,x11,x0
3241     + ldr x0,[sp,#8]
3242     + str x3,[sp,#0]
3243     + ror x16,x24,#14
3244     + add x27,x27,x19 // h+=K[i]
3245     + ror x2,x13,#1
3246     + and x17,x25,x24
3247     + ror x1,x10,#19
3248     + bic x19,x26,x24
3249     + ror x3,x20,#28
3250     + add x27,x27,x11 // h+=X[i]
3251     + eor x16,x16,x24,ror#18
3252     + eor x2,x2,x13,ror#8
3253     + orr x17,x17,x19 // Ch(e,f,g)
3254     + eor x19,x20,x21 // a^b, b^c in next round
3255     + eor x16,x16,x24,ror#41 // Sigma1(e)
3256     + eor x3,x3,x20,ror#34
3257     + add x27,x27,x17 // h+=Ch(e,f,g)
3258     + and x28,x28,x19 // (b^c)&=(a^b)
3259     + eor x1,x1,x10,ror#61
3260     + eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
3261     + add x27,x27,x16 // h+=Sigma1(e)
3262     + eor x28,x28,x21 // Maj(a,b,c)
3263     + eor x17,x3,x20,ror#39 // Sigma0(a)
3264     + eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
3265     + add x12,x12,x5
3266     + add x23,x23,x27 // d+=h
3267     + add x27,x27,x28 // h+=Maj(a,b,c)
3268     + ldr x28,[x30],#8 // *K++, x19 in next round
3269     + add x12,x12,x2
3270     + add x27,x27,x17 // h+=Sigma0(a)
3271     + add x12,x12,x1
3272     + ldr x1,[sp,#16]
3273     + str x4,[sp,#8]
3274     + ror x16,x23,#14
3275     + add x26,x26,x28 // h+=K[i]
3276     + ror x3,x14,#1
3277     + and x17,x24,x23
3278     + ror x2,x11,#19
3279     + bic x28,x25,x23
3280     + ror x4,x27,#28
3281     + add x26,x26,x12 // h+=X[i]
3282     + eor x16,x16,x23,ror#18
3283     + eor x3,x3,x14,ror#8
3284     + orr x17,x17,x28 // Ch(e,f,g)
3285     + eor x28,x27,x20 // a^b, b^c in next round
3286     + eor x16,x16,x23,ror#41 // Sigma1(e)
3287     + eor x4,x4,x27,ror#34
3288     + add x26,x26,x17 // h+=Ch(e,f,g)
3289     + and x19,x19,x28 // (b^c)&=(a^b)
3290     + eor x2,x2,x11,ror#61
3291     + eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
3292     + add x26,x26,x16 // h+=Sigma1(e)
3293     + eor x19,x19,x20 // Maj(a,b,c)
3294     + eor x17,x4,x27,ror#39 // Sigma0(a)
3295     + eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
3296     + add x13,x13,x6
3297     + add x22,x22,x26 // d+=h
3298     + add x26,x26,x19 // h+=Maj(a,b,c)
3299     + ldr x19,[x30],#8 // *K++, x28 in next round
3300     + add x13,x13,x3
3301     + add x26,x26,x17 // h+=Sigma0(a)
3302     + add x13,x13,x2
3303     + ldr x2,[sp,#24]
3304     + str x5,[sp,#16]
3305     + ror x16,x22,#14
3306     + add x25,x25,x19 // h+=K[i]
3307     + ror x4,x15,#1
3308     + and x17,x23,x22
3309     + ror x3,x12,#19
3310     + bic x19,x24,x22
3311     + ror x5,x26,#28
3312     + add x25,x25,x13 // h+=X[i]
3313     + eor x16,x16,x22,ror#18
3314     + eor x4,x4,x15,ror#8
3315     + orr x17,x17,x19 // Ch(e,f,g)
3316     + eor x19,x26,x27 // a^b, b^c in next round
3317     + eor x16,x16,x22,ror#41 // Sigma1(e)
3318     + eor x5,x5,x26,ror#34
3319     + add x25,x25,x17 // h+=Ch(e,f,g)
3320     + and x28,x28,x19 // (b^c)&=(a^b)
3321     + eor x3,x3,x12,ror#61
3322     + eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
3323     + add x25,x25,x16 // h+=Sigma1(e)
3324     + eor x28,x28,x27 // Maj(a,b,c)
3325     + eor x17,x5,x26,ror#39 // Sigma0(a)
3326     + eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
3327     + add x14,x14,x7
3328     + add x21,x21,x25 // d+=h
3329     + add x25,x25,x28 // h+=Maj(a,b,c)
3330     + ldr x28,[x30],#8 // *K++, x19 in next round
3331     + add x14,x14,x4
3332     + add x25,x25,x17 // h+=Sigma0(a)
3333     + add x14,x14,x3
3334     + ldr x3,[sp,#0]
3335     + str x6,[sp,#24]
3336     + ror x16,x21,#14
3337     + add x24,x24,x28 // h+=K[i]
3338     + ror x5,x0,#1
3339     + and x17,x22,x21
3340     + ror x4,x13,#19
3341     + bic x28,x23,x21
3342     + ror x6,x25,#28
3343     + add x24,x24,x14 // h+=X[i]
3344     + eor x16,x16,x21,ror#18
3345     + eor x5,x5,x0,ror#8
3346     + orr x17,x17,x28 // Ch(e,f,g)
3347     + eor x28,x25,x26 // a^b, b^c in next round
3348     + eor x16,x16,x21,ror#41 // Sigma1(e)
3349     + eor x6,x6,x25,ror#34
3350     + add x24,x24,x17 // h+=Ch(e,f,g)
3351     + and x19,x19,x28 // (b^c)&=(a^b)
3352     + eor x4,x4,x13,ror#61
3353     + eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
3354     + add x24,x24,x16 // h+=Sigma1(e)
3355     + eor x19,x19,x26 // Maj(a,b,c)
3356     + eor x17,x6,x25,ror#39 // Sigma0(a)
3357     + eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
3358     + add x15,x15,x8
3359     + add x20,x20,x24 // d+=h
3360     + add x24,x24,x19 // h+=Maj(a,b,c)
3361     + ldr x19,[x30],#8 // *K++, x28 in next round
3362     + add x15,x15,x5
3363     + add x24,x24,x17 // h+=Sigma0(a)
3364     + add x15,x15,x4
3365     + ldr x4,[sp,#8]
3366     + str x7,[sp,#0]
3367     + ror x16,x20,#14
3368     + add x23,x23,x19 // h+=K[i]
3369     + ror x6,x1,#1
3370     + and x17,x21,x20
3371     + ror x5,x14,#19
3372     + bic x19,x22,x20
3373     + ror x7,x24,#28
3374     + add x23,x23,x15 // h+=X[i]
3375     + eor x16,x16,x20,ror#18
3376     + eor x6,x6,x1,ror#8
3377     + orr x17,x17,x19 // Ch(e,f,g)
3378     + eor x19,x24,x25 // a^b, b^c in next round
3379     + eor x16,x16,x20,ror#41 // Sigma1(e)
3380     + eor x7,x7,x24,ror#34
3381     + add x23,x23,x17 // h+=Ch(e,f,g)
3382     + and x28,x28,x19 // (b^c)&=(a^b)
3383     + eor x5,x5,x14,ror#61
3384     + eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
3385     + add x23,x23,x16 // h+=Sigma1(e)
3386     + eor x28,x28,x25 // Maj(a,b,c)
3387     + eor x17,x7,x24,ror#39 // Sigma0(a)
3388     + eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
3389     + add x0,x0,x9
3390     + add x27,x27,x23 // d+=h
3391     + add x23,x23,x28 // h+=Maj(a,b,c)
3392     + ldr x28,[x30],#8 // *K++, x19 in next round
3393     + add x0,x0,x6
3394     + add x23,x23,x17 // h+=Sigma0(a)
3395     + add x0,x0,x5
3396     + ldr x5,[sp,#16]
3397     + str x8,[sp,#8]
3398     + ror x16,x27,#14
3399     + add x22,x22,x28 // h+=K[i]
3400     + ror x7,x2,#1
3401     + and x17,x20,x27
3402     + ror x6,x15,#19
3403     + bic x28,x21,x27
3404     + ror x8,x23,#28
3405     + add x22,x22,x0 // h+=X[i]
3406     + eor x16,x16,x27,ror#18
3407     + eor x7,x7,x2,ror#8
3408     + orr x17,x17,x28 // Ch(e,f,g)
3409     + eor x28,x23,x24 // a^b, b^c in next round
3410     + eor x16,x16,x27,ror#41 // Sigma1(e)
3411     + eor x8,x8,x23,ror#34
3412     + add x22,x22,x17 // h+=Ch(e,f,g)
3413     + and x19,x19,x28 // (b^c)&=(a^b)
3414     + eor x6,x6,x15,ror#61
3415     + eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
3416     + add x22,x22,x16 // h+=Sigma1(e)
3417     + eor x19,x19,x24 // Maj(a,b,c)
3418     + eor x17,x8,x23,ror#39 // Sigma0(a)
3419     + eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
3420     + add x1,x1,x10
3421     + add x26,x26,x22 // d+=h
3422     + add x22,x22,x19 // h+=Maj(a,b,c)
3423     + ldr x19,[x30],#8 // *K++, x28 in next round
3424     + add x1,x1,x7
3425     + add x22,x22,x17 // h+=Sigma0(a)
3426     + add x1,x1,x6
3427     + ldr x6,[sp,#24]
3428     + str x9,[sp,#16]
3429     + ror x16,x26,#14
3430     + add x21,x21,x19 // h+=K[i]
3431     + ror x8,x3,#1
3432     + and x17,x27,x26
3433     + ror x7,x0,#19
3434     + bic x19,x20,x26
3435     + ror x9,x22,#28
3436     + add x21,x21,x1 // h+=X[i]
3437     + eor x16,x16,x26,ror#18
3438     + eor x8,x8,x3,ror#8
3439     + orr x17,x17,x19 // Ch(e,f,g)
3440     + eor x19,x22,x23 // a^b, b^c in next round
3441     + eor x16,x16,x26,ror#41 // Sigma1(e)
3442     + eor x9,x9,x22,ror#34
3443     + add x21,x21,x17 // h+=Ch(e,f,g)
3444     + and x28,x28,x19 // (b^c)&=(a^b)
3445     + eor x7,x7,x0,ror#61
3446     + eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
3447     + add x21,x21,x16 // h+=Sigma1(e)
3448     + eor x28,x28,x23 // Maj(a,b,c)
3449     + eor x17,x9,x22,ror#39 // Sigma0(a)
3450     + eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
3451     + add x2,x2,x11
3452     + add x25,x25,x21 // d+=h
3453     + add x21,x21,x28 // h+=Maj(a,b,c)
3454     + ldr x28,[x30],#8 // *K++, x19 in next round
3455     + add x2,x2,x8
3456     + add x21,x21,x17 // h+=Sigma0(a)
3457     + add x2,x2,x7
3458     + ldr x7,[sp,#0]
3459     + str x10,[sp,#24]
3460     + ror x16,x25,#14
3461     + add x20,x20,x28 // h+=K[i]
3462     + ror x9,x4,#1
3463     + and x17,x26,x25
3464     + ror x8,x1,#19
3465     + bic x28,x27,x25
3466     + ror x10,x21,#28
3467     + add x20,x20,x2 // h+=X[i]
3468     + eor x16,x16,x25,ror#18
3469     + eor x9,x9,x4,ror#8
3470     + orr x17,x17,x28 // Ch(e,f,g)
3471     + eor x28,x21,x22 // a^b, b^c in next round
3472     + eor x16,x16,x25,ror#41 // Sigma1(e)
3473     + eor x10,x10,x21,ror#34
3474     + add x20,x20,x17 // h+=Ch(e,f,g)
3475     + and x19,x19,x28 // (b^c)&=(a^b)
3476     + eor x8,x8,x1,ror#61
3477     + eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
3478     + add x20,x20,x16 // h+=Sigma1(e)
3479     + eor x19,x19,x22 // Maj(a,b,c)
3480     + eor x17,x10,x21,ror#39 // Sigma0(a)
3481     + eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
3482     + add x3,x3,x12
3483     + add x24,x24,x20 // d+=h
3484     + add x20,x20,x19 // h+=Maj(a,b,c)
3485     + ldr x19,[x30],#8 // *K++, x28 in next round
3486     + add x3,x3,x9
3487     + add x20,x20,x17 // h+=Sigma0(a)
3488     + add x3,x3,x8
3489     + cbnz x19,.Loop_16_xx
3490     +
3491     + ldp x0,x2,[x29,#96]
3492     + ldr x1,[x29,#112]
3493     + sub x30,x30,#648 // rewind
3494     +
3495     + ldp x3,x4,[x0]
3496     + ldp x5,x6,[x0,#2*8]
3497     + add x1,x1,#14*8 // advance input pointer
3498     + ldp x7,x8,[x0,#4*8]
3499     + add x20,x20,x3
3500     + ldp x9,x10,[x0,#6*8]
3501     + add x21,x21,x4
3502     + add x22,x22,x5
3503     + add x23,x23,x6
3504     + stp x20,x21,[x0]
3505     + add x24,x24,x7
3506     + add x25,x25,x8
3507     + stp x22,x23,[x0,#2*8]
3508     + add x26,x26,x9
3509     + add x27,x27,x10
3510     + cmp x1,x2
3511     + stp x24,x25,[x0,#4*8]
3512     + stp x26,x27,[x0,#6*8]
3513     + b.ne .Loop
3514     +
3515     + ldp x19,x20,[x29,#16]
3516     + add sp,sp,#4*8
3517     + ldp x21,x22,[x29,#32]
3518     + ldp x23,x24,[x29,#48]
3519     + ldp x25,x26,[x29,#64]
3520     + ldp x27,x28,[x29,#80]
3521     + ldp x29,x30,[sp],#128
3522     + ret
3523     +.size sha512_block_data_order,.-sha512_block_data_order
3524     +
3525     +.align 6
3526     +.type .LK512,%object
3527     +.LK512:
3528     + .quad 0x428a2f98d728ae22,0x7137449123ef65cd
3529     + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
3530     + .quad 0x3956c25bf348b538,0x59f111f1b605d019
3531     + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
3532     + .quad 0xd807aa98a3030242,0x12835b0145706fbe
3533     + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
3534     + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
3535     + .quad 0x9bdc06a725c71235,0xc19bf174cf692694
3536     + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
3537     + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
3538     + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
3539     + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
3540     + .quad 0x983e5152ee66dfab,0xa831c66d2db43210
3541     + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
3542     + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
3543     + .quad 0x06ca6351e003826f,0x142929670a0e6e70
3544     + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
3545     + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
3546     + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
3547     + .quad 0x81c2c92e47edaee6,0x92722c851482353b
3548     + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
3549     + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
3550     + .quad 0xd192e819d6ef5218,0xd69906245565a910
3551     + .quad 0xf40e35855771202a,0x106aa07032bbd1b8
3552     + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
3553     + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
3554     + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
3555     + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
3556     + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
3557     + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
3558     + .quad 0x90befffa23631e28,0xa4506cebde82bde9
3559     + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
3560     + .quad 0xca273eceea26619c,0xd186b8c721c0c207
3561     + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
3562     + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
3563     + .quad 0x113f9804bef90dae,0x1b710b35131c471b
3564     + .quad 0x28db77f523047d84,0x32caab7b40c72493
3565     + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
3566     + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
3567     + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
3568     + .quad 0 // terminator
3569     +.size .LK512,.-.LK512
3570     +#ifndef __KERNEL__
3571     +.align 3
3572     +.LOPENSSL_armcap_P:
3573     +# ifdef __ILP32__
3574     + .long OPENSSL_armcap_P-.
3575     +# else
3576     + .quad OPENSSL_armcap_P-.
3577     +# endif
3578     +#endif
3579     +.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
3580     +.align 2
3581     +#ifndef __KERNEL__
3582     +.comm OPENSSL_armcap_P,4,4
3583     +#endif
3584     diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
3585     index 7193bf97b8da..e60375ce0dd2 100644
3586     --- a/arch/arm64/include/asm/assembler.h
3587     +++ b/arch/arm64/include/asm/assembler.h
3588     @@ -86,6 +86,24 @@
3589     dmb \opt
3590     .endm
3591    
3592     +/*
3593     + * Value prediction barrier
3594     + */
3595     + .macro csdb
3596     + hint #20
3597     + .endm
3598     +
3599     +/*
3600     + * Sanitise a 64-bit bounded index wrt speculation, returning zero if out
3601     + * of bounds.
3602     + */
3603     + .macro mask_nospec64, idx, limit, tmp
3604     + sub \tmp, \idx, \limit
3605     + bic \tmp, \tmp, \idx
3606     + and \idx, \idx, \tmp, asr #63
3607     + csdb
3608     + .endm
3609     +
3610     /*
3611     * NOP sequence
3612     */
3613     @@ -416,4 +434,5 @@ alternative_endif
3614     .macro pte_to_phys, phys, pte
3615     and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
3616     .endm
3617     +
3618     #endif /* __ASM_ASSEMBLER_H */
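The mask_nospec64 macro added above clamps an already bounds-checked index to zero under speculation when it turns out to be out of range. A rough C rendering of the same three-instruction sequence, as an illustrative sketch only (it assumes the usual arithmetic right shift of negative values and leaves out the trailing csdb):

static inline unsigned long mask_nospec64_sketch(unsigned long idx,
						 unsigned long limit)
{
	unsigned long tmp = idx - limit;	/* bit 63 set when idx < limit */

	tmp &= ~idx;				/* drop bit 63 if idx itself is wild */
	idx &= (unsigned long)((long)tmp >> 63);/* all-ones mask in range, zero otherwise */
	return idx;				/* the real macro issues csdb here */
}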
3619     diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
3620     index 0fe7e43b7fbc..0b0755c961ac 100644
3621     --- a/arch/arm64/include/asm/barrier.h
3622     +++ b/arch/arm64/include/asm/barrier.h
3623     @@ -31,6 +31,8 @@
3624     #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
3625     #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
3626    
3627     +#define csdb() asm volatile("hint #20" : : : "memory")
3628     +
3629     #define mb() dsb(sy)
3630     #define rmb() dsb(ld)
3631     #define wmb() dsb(st)
3632     @@ -38,6 +40,27 @@
3633     #define dma_rmb() dmb(oshld)
3634     #define dma_wmb() dmb(oshst)
3635    
3636     +/*
3637     + * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz
3638     + * and 0 otherwise.
3639     + */
3640     +#define array_index_mask_nospec array_index_mask_nospec
3641     +static inline unsigned long array_index_mask_nospec(unsigned long idx,
3642     + unsigned long sz)
3643     +{
3644     + unsigned long mask;
3645     +
3646     + asm volatile(
3647     + " cmp %1, %2\n"
3648     + " sbc %0, xzr, xzr\n"
3649     + : "=r" (mask)
3650     + : "r" (idx), "Ir" (sz)
3651     + : "cc");
3652     +
3653     + csdb();
3654     + return mask;
3655     +}
3656     +
3657     #define __smp_mb() dmb(ish)
3658     #define __smp_rmb() dmb(ishld)
3659     #define __smp_wmb() dmb(ishst)
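array_index_mask_nospec() builds the ~0UL/0 mask with a cmp/sbc pair and then issues csdb so the mask cannot be speculated around. A minimal usage sketch in the spirit of array_index_nospec(); the table and bound names are hypothetical, and the ordinary architectural bounds check must still be performed first:

static unsigned long load_entry(const unsigned long *table,
				unsigned long nr_entries,
				unsigned long idx)
{
	unsigned long mask;

	if (idx >= nr_entries)			/* normal bounds check */
		return 0;

	mask = array_index_mask_nospec(idx, nr_entries);
	idx &= mask;				/* forced to 0 if speculatively out of range */
	return table[idx];			/* dependent load can no longer reach OOB data */
}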
3660     diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
3661     index 7ddf233f05bd..ce67bf6a0886 100644
3662     --- a/arch/arm64/include/asm/cpucaps.h
3663     +++ b/arch/arm64/include/asm/cpucaps.h
3664     @@ -35,7 +35,8 @@
3665     #define ARM64_HYP_OFFSET_LOW 14
3666     #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15
3667     #define ARM64_UNMAP_KERNEL_AT_EL0 16
3668     +#define ARM64_HARDEN_BRANCH_PREDICTOR 17
3669    
3670     -#define ARM64_NCAPS 17
3671     +#define ARM64_NCAPS 18
3672    
3673     #endif /* __ASM_CPUCAPS_H */
3674     diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
3675     index 1d47930c30dc..9ee3038a6b98 100644
3676     --- a/arch/arm64/include/asm/cputype.h
3677     +++ b/arch/arm64/include/asm/cputype.h
3678     @@ -75,7 +75,10 @@
3679     #define ARM_CPU_PART_AEM_V8 0xD0F
3680     #define ARM_CPU_PART_FOUNDATION 0xD00
3681     #define ARM_CPU_PART_CORTEX_A57 0xD07
3682     +#define ARM_CPU_PART_CORTEX_A72 0xD08
3683     #define ARM_CPU_PART_CORTEX_A53 0xD03
3684     +#define ARM_CPU_PART_CORTEX_A73 0xD09
3685     +#define ARM_CPU_PART_CORTEX_A75 0xD0A
3686    
3687     #define APM_CPU_PART_POTENZA 0x000
3688    
3689     @@ -87,6 +90,9 @@
3690    
3691     #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
3692     #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
3693     +#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
3694     +#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
3695     +#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
3696     #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
3697     #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
3698     #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
3699     diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
3700     index 20dcb196b240..4e5f36a804b4 100644
3701     --- a/arch/arm64/include/asm/futex.h
3702     +++ b/arch/arm64/include/asm/futex.h
3703     @@ -51,13 +51,14 @@
3704     : "memory")
3705    
3706     static inline int
3707     -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
3708     +futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *_uaddr)
3709     {
3710     int op = (encoded_op >> 28) & 7;
3711     int cmp = (encoded_op >> 24) & 15;
3712     int oparg = (int)(encoded_op << 8) >> 20;
3713     int cmparg = (int)(encoded_op << 20) >> 20;
3714     int oldval = 0, ret, tmp;
3715     + u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
3716    
3717     if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
3718     oparg = 1U << (oparg & 0x1f);
3719     @@ -109,15 +110,17 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
3720     }
3721    
3722     static inline int
3723     -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
3724     +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
3725     u32 oldval, u32 newval)
3726     {
3727     int ret = 0;
3728     u32 val, tmp;
3729     + u32 __user *uaddr;
3730    
3731     - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
3732     + if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32)))
3733     return -EFAULT;
3734    
3735     + uaddr = __uaccess_mask_ptr(_uaddr);
3736     asm volatile("// futex_atomic_cmpxchg_inatomic\n"
3737     ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
3738     " prfm pstl1strm, %2\n"
3739     diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
3740     index e5050388e062..37d56e85036e 100644
3741     --- a/arch/arm64/include/asm/kvm_host.h
3742     +++ b/arch/arm64/include/asm/kvm_host.h
3743     @@ -393,4 +393,9 @@ static inline void __cpu_init_stage2(void)
3744     "PARange is %d bits, unsupported configuration!", parange);
3745     }
3746    
3747     +static inline bool kvm_arm_harden_branch_predictor(void)
3748     +{
3749     + return cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
3750     +}
3751     +
3752     #endif /* __ARM64_KVM_HOST_H__ */
3753     diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
3754     index 6d22017ebbad..80bf33715ecb 100644
3755     --- a/arch/arm64/include/asm/kvm_mmu.h
3756     +++ b/arch/arm64/include/asm/kvm_mmu.h
3757     @@ -313,5 +313,43 @@ static inline unsigned int kvm_get_vmid_bits(void)
3758     return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
3759     }
3760    
3761     +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
3762     +#include <asm/mmu.h>
3763     +
3764     +static inline void *kvm_get_hyp_vector(void)
3765     +{
3766     + struct bp_hardening_data *data = arm64_get_bp_hardening_data();
3767     + void *vect = kvm_ksym_ref(__kvm_hyp_vector);
3768     +
3769     + if (data->fn) {
3770     + vect = __bp_harden_hyp_vecs_start +
3771     + data->hyp_vectors_slot * SZ_2K;
3772     +
3773     + if (!cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
3774     + vect = lm_alias(vect);
3775     + }
3776     +
3777     + return vect;
3778     +}
3779     +
3780     +static inline int kvm_map_vectors(void)
3781     +{
3782     + return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start),
3783     + kvm_ksym_ref(__bp_harden_hyp_vecs_end),
3784     + PAGE_HYP_EXEC);
3785     +}
3786     +
3787     +#else
3788     +static inline void *kvm_get_hyp_vector(void)
3789     +{
3790     + return kvm_ksym_ref(__kvm_hyp_vector);
3791     +}
3792     +
3793     +static inline int kvm_map_vectors(void)
3794     +{
3795     + return 0;
3796     +}
3797     +#endif
3798     +
3799     #endif /* __ASSEMBLY__ */
3800     #endif /* __ARM64_KVM_MMU_H__ */
3801     diff --git a/arch/arm64/include/asm/kvm_psci.h b/arch/arm64/include/asm/kvm_psci.h
3802     deleted file mode 100644
3803     index bc39e557c56c..000000000000
3804     --- a/arch/arm64/include/asm/kvm_psci.h
3805     +++ /dev/null
3806     @@ -1,27 +0,0 @@
3807     -/*
3808     - * Copyright (C) 2012,2013 - ARM Ltd
3809     - * Author: Marc Zyngier <marc.zyngier@arm.com>
3810     - *
3811     - * This program is free software; you can redistribute it and/or modify
3812     - * it under the terms of the GNU General Public License version 2 as
3813     - * published by the Free Software Foundation.
3814     - *
3815     - * This program is distributed in the hope that it will be useful,
3816     - * but WITHOUT ANY WARRANTY; without even the implied warranty of
3817     - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3818     - * GNU General Public License for more details.
3819     - *
3820     - * You should have received a copy of the GNU General Public License
3821     - * along with this program. If not, see <http://www.gnu.org/licenses/>.
3822     - */
3823     -
3824     -#ifndef __ARM64_KVM_PSCI_H__
3825     -#define __ARM64_KVM_PSCI_H__
3826     -
3827     -#define KVM_ARM_PSCI_0_1 1
3828     -#define KVM_ARM_PSCI_0_2 2
3829     -
3830     -int kvm_psci_version(struct kvm_vcpu *vcpu);
3831     -int kvm_psci_call(struct kvm_vcpu *vcpu);
3832     -
3833     -#endif /* __ARM64_KVM_PSCI_H__ */
3834     diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
3835     index 5e3faba689e0..ba917be5565a 100644
3836     --- a/arch/arm64/include/asm/memory.h
3837     +++ b/arch/arm64/include/asm/memory.h
3838     @@ -60,8 +60,6 @@
3839     * KIMAGE_VADDR - the virtual address of the start of the kernel image
3840     * VA_BITS - the maximum number of bits for virtual addresses.
3841     * VA_START - the first kernel virtual address.
3842     - * TASK_SIZE - the maximum size of a user space task.
3843     - * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
3844     */
3845     #define VA_BITS (CONFIG_ARM64_VA_BITS)
3846     #define VA_START (UL(0xffffffffffffffff) - \
3847     @@ -76,19 +74,6 @@
3848     #define PCI_IO_END (VMEMMAP_START - SZ_2M)
3849     #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
3850     #define FIXADDR_TOP (PCI_IO_START - SZ_2M)
3851     -#define TASK_SIZE_64 (UL(1) << VA_BITS)
3852     -
3853     -#ifdef CONFIG_COMPAT
3854     -#define TASK_SIZE_32 UL(0x100000000)
3855     -#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
3856     - TASK_SIZE_32 : TASK_SIZE_64)
3857     -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
3858     - TASK_SIZE_32 : TASK_SIZE_64)
3859     -#else
3860     -#define TASK_SIZE TASK_SIZE_64
3861     -#endif /* CONFIG_COMPAT */
3862     -
3863     -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4))
3864    
3865     #define KERNEL_START _text
3866     #define KERNEL_END _end
3867     diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
3868     index a813edf28737..d51158a61892 100644
3869     --- a/arch/arm64/include/asm/mmu.h
3870     +++ b/arch/arm64/include/asm/mmu.h
3871     @@ -20,6 +20,8 @@
3872    
3873     #ifndef __ASSEMBLY__
3874    
3875     +#include <linux/percpu.h>
3876     +
3877     typedef struct {
3878     atomic64_t id;
3879     void *vdso;
3880     @@ -38,6 +40,43 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
3881     cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0);
3882     }
3883    
3884     +typedef void (*bp_hardening_cb_t)(void);
3885     +
3886     +struct bp_hardening_data {
3887     + int hyp_vectors_slot;
3888     + bp_hardening_cb_t fn;
3889     +};
3890     +
3891     +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
3892     +extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[];
3893     +
3894     +DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
3895     +
3896     +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
3897     +{
3898     + return this_cpu_ptr(&bp_hardening_data);
3899     +}
3900     +
3901     +static inline void arm64_apply_bp_hardening(void)
3902     +{
3903     + struct bp_hardening_data *d;
3904     +
3905     + if (!cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
3906     + return;
3907     +
3908     + d = arm64_get_bp_hardening_data();
3909     + if (d->fn)
3910     + d->fn();
3911     +}
3912     +#else
3913     +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
3914     +{
3915     + return NULL;
3916     +}
3917     +
3918     +static inline void arm64_apply_bp_hardening(void) { }
3919     +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
3920     +
3921     extern void paging_init(void);
3922     extern void bootmem_init(void);
3923     extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
3924     diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
3925     index 60e34824e18c..5917147af0c4 100644
3926     --- a/arch/arm64/include/asm/processor.h
3927     +++ b/arch/arm64/include/asm/processor.h
3928     @@ -19,6 +19,13 @@
3929     #ifndef __ASM_PROCESSOR_H
3930     #define __ASM_PROCESSOR_H
3931    
3932     +#define TASK_SIZE_64 (UL(1) << VA_BITS)
3933     +
3934     +#define KERNEL_DS UL(-1)
3935     +#define USER_DS (TASK_SIZE_64 - 1)
3936     +
3937     +#ifndef __ASSEMBLY__
3938     +
3939     /*
3940     * Default implementation of macro that returns current
3941     * instruction pointer ("program counter").
3942     @@ -37,6 +44,22 @@
3943     #include <asm/ptrace.h>
3944     #include <asm/types.h>
3945    
3946     +/*
3947     + * TASK_SIZE - the maximum size of a user space task.
3948     + * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
3949     + */
3950     +#ifdef CONFIG_COMPAT
3951     +#define TASK_SIZE_32 UL(0x100000000)
3952     +#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
3953     + TASK_SIZE_32 : TASK_SIZE_64)
3954     +#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
3955     + TASK_SIZE_32 : TASK_SIZE_64)
3956     +#else
3957     +#define TASK_SIZE TASK_SIZE_64
3958     +#endif /* CONFIG_COMPAT */
3959     +
3960     +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4))
3961     +
3962     #define STACK_TOP_MAX TASK_SIZE_64
3963     #ifdef CONFIG_COMPAT
3964     #define AARCH32_VECTORS_BASE 0xffff0000
3965     @@ -192,4 +215,5 @@ int cpu_enable_pan(void *__unused);
3966     int cpu_enable_uao(void *__unused);
3967     int cpu_enable_cache_maint_trap(void *__unused);
3968    
3969     +#endif /* __ASSEMBLY__ */
3970     #endif /* __ASM_PROCESSOR_H */
3971     diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
3972     index 7cb7f7cdcfbc..88bbe364b6ae 100644
3973     --- a/arch/arm64/include/asm/sysreg.h
3974     +++ b/arch/arm64/include/asm/sysreg.h
3975     @@ -118,6 +118,8 @@
3976    
3977     /* id_aa64pfr0 */
3978     #define ID_AA64PFR0_CSV3_SHIFT 60
3979     +#define ID_AA64PFR0_CSV2_SHIFT 56
3980     +#define ID_AA64PFR0_SVE_SHIFT 32
3981     #define ID_AA64PFR0_GIC_SHIFT 24
3982     #define ID_AA64PFR0_ASIMD_SHIFT 20
3983     #define ID_AA64PFR0_FP_SHIFT 16
3984     diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
3985     index 811cf16a65f9..1d047d6c421b 100644
3986     --- a/arch/arm64/include/asm/uaccess.h
3987     +++ b/arch/arm64/include/asm/uaccess.h
3988     @@ -28,6 +28,7 @@
3989    
3990     #include <asm/alternative.h>
3991     #include <asm/cpufeature.h>
3992     +#include <asm/processor.h>
3993     #include <asm/ptrace.h>
3994     #include <asm/sysreg.h>
3995     #include <asm/errno.h>
3996     @@ -59,16 +60,20 @@ struct exception_table_entry
3997    
3998     extern int fixup_exception(struct pt_regs *regs);
3999    
4000     -#define KERNEL_DS (-1UL)
4001     #define get_ds() (KERNEL_DS)
4002     -
4003     -#define USER_DS TASK_SIZE_64
4004     #define get_fs() (current_thread_info()->addr_limit)
4005    
4006     static inline void set_fs(mm_segment_t fs)
4007     {
4008     current_thread_info()->addr_limit = fs;
4009    
4010     + /*
4011     + * Prevent a mispredicted conditional call to set_fs from forwarding
4012     + * the wrong address limit to access_ok under speculation.
4013     + */
4014     + dsb(nsh);
4015     + isb();
4016     +
4017     /*
4018     * Enable/disable UAO so that copy_to_user() etc can access
4019     * kernel memory with the unprivileged instructions.
4020     @@ -87,22 +92,32 @@ static inline void set_fs(mm_segment_t fs)
4021     * Returns 1 if the range is valid, 0 otherwise.
4022     *
4023     * This is equivalent to the following test:
4024     - * (u65)addr + (u65)size <= current->addr_limit
4025     - *
4026     - * This needs 65-bit arithmetic.
4027     + * (u65)addr + (u65)size <= (u65)current->addr_limit + 1
4028     */
4029     -#define __range_ok(addr, size) \
4030     -({ \
4031     - unsigned long __addr = (unsigned long __force)(addr); \
4032     - unsigned long flag, roksum; \
4033     - __chk_user_ptr(addr); \
4034     - asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls" \
4035     - : "=&r" (flag), "=&r" (roksum) \
4036     - : "1" (__addr), "Ir" (size), \
4037     - "r" (current_thread_info()->addr_limit) \
4038     - : "cc"); \
4039     - flag; \
4040     -})
4041     +static inline unsigned long __range_ok(unsigned long addr, unsigned long size)
4042     +{
4043     + unsigned long limit = current_thread_info()->addr_limit;
4044     +
4045     + __chk_user_ptr(addr);
4046     + asm volatile(
4047     + // A + B <= C + 1 for all A,B,C, in four easy steps:
4048     + // 1: X = A + B; X' = X % 2^64
4049     + " adds %0, %0, %2\n"
4050     + // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4
4051     + " csel %1, xzr, %1, hi\n"
4052     + // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X'
4053     + // to compensate for the carry flag being set in step 4. For
4054     + // X > 2^64, X' merely has to remain nonzero, which it does.
4055     + " csinv %0, %0, xzr, cc\n"
4056     + // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1
4057     + // comes from the carry in being clear. Otherwise, we are
4058     + // testing X' - C == 0, subject to the previous adjustments.
4059     + " sbcs xzr, %0, %1\n"
4060     + " cset %0, ls\n"
4061     + : "+r" (addr), "+r" (limit) : "Ir" (size) : "cc");
4062     +
4063     + return addr;
4064     +}
4065    
4066     /*
4067     * When dealing with data aborts, watchpoints, or instruction traps we may end
4068     @@ -111,7 +126,7 @@ static inline void set_fs(mm_segment_t fs)
4069     */
4070     #define untagged_addr(addr) sign_extend64(addr, 55)
4071    
4072     -#define access_ok(type, addr, size) __range_ok(addr, size)
4073     +#define access_ok(type, addr, size) __range_ok((unsigned long)(addr), size)
4074     #define user_addr_max get_fs
4075    
4076     #define _ASM_EXTABLE(from, to) \
4077     @@ -120,6 +135,26 @@ static inline void set_fs(mm_segment_t fs)
4078     " .long (" #from " - .), (" #to " - .)\n" \
4079     " .popsection\n"
4080    
4081     +/*
4082     + * Sanitise a uaccess pointer such that it becomes NULL if above the
4083     + * current addr_limit.
4084     + */
4085     +#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr)
4086     +static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
4087     +{
4088     + void __user *safe_ptr;
4089     +
4090     + asm volatile(
4091     + " bics xzr, %1, %2\n"
4092     + " csel %0, %1, xzr, eq\n"
4093     + : "=&r" (safe_ptr)
4094     + : "r" (ptr), "r" (current_thread_info()->addr_limit)
4095     + : "cc");
4096     +
4097     + csdb();
4098     + return safe_ptr;
4099     +}
4100     +
4101     /*
4102     * The "__xxx" versions of the user access functions do not verify the address
4103     * space - it must have been done previously with a separate "access_ok()"
4104     @@ -174,30 +209,35 @@ do { \
4105     CONFIG_ARM64_PAN)); \
4106     } while (0)
4107    
4108     -#define __get_user(x, ptr) \
4109     +#define __get_user_check(x, ptr, err) \
4110     ({ \
4111     - int __gu_err = 0; \
4112     - __get_user_err((x), (ptr), __gu_err); \
4113     - __gu_err; \
4114     + __typeof__(*(ptr)) __user *__p = (ptr); \
4115     + might_fault(); \
4116     + if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \
4117     + __p = uaccess_mask_ptr(__p); \
4118     + __get_user_err((x), __p, (err)); \
4119     + } else { \
4120     + (x) = 0; (err) = -EFAULT; \
4121     + } \
4122     })
4123    
4124     #define __get_user_error(x, ptr, err) \
4125     ({ \
4126     - __get_user_err((x), (ptr), (err)); \
4127     + __get_user_check((x), (ptr), (err)); \
4128     (void)0; \
4129     })
4130    
4131     -#define __get_user_unaligned __get_user
4132     -
4133     -#define get_user(x, ptr) \
4134     +#define __get_user(x, ptr) \
4135     ({ \
4136     - __typeof__(*(ptr)) __user *__p = (ptr); \
4137     - might_fault(); \
4138     - access_ok(VERIFY_READ, __p, sizeof(*__p)) ? \
4139     - __get_user((x), __p) : \
4140     - ((x) = 0, -EFAULT); \
4141     + int __gu_err = 0; \
4142     + __get_user_check((x), (ptr), __gu_err); \
4143     + __gu_err; \
4144     })
4145    
4146     +#define __get_user_unaligned __get_user
4147     +
4148     +#define get_user __get_user
4149     +
4150     #define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \
4151     asm volatile( \
4152     "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \
4153     @@ -242,47 +282,51 @@ do { \
4154     CONFIG_ARM64_PAN)); \
4155     } while (0)
4156    
4157     -#define __put_user(x, ptr) \
4158     +#define __put_user_check(x, ptr, err) \
4159     ({ \
4160     - int __pu_err = 0; \
4161     - __put_user_err((x), (ptr), __pu_err); \
4162     - __pu_err; \
4163     + __typeof__(*(ptr)) __user *__p = (ptr); \
4164     + might_fault(); \
4165     + if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \
4166     + __p = uaccess_mask_ptr(__p); \
4167     + __put_user_err((x), __p, (err)); \
4168     + } else { \
4169     + (err) = -EFAULT; \
4170     + } \
4171     })
4172    
4173     #define __put_user_error(x, ptr, err) \
4174     ({ \
4175     - __put_user_err((x), (ptr), (err)); \
4176     + __put_user_check((x), (ptr), (err)); \
4177     (void)0; \
4178     })
4179    
4180     -#define __put_user_unaligned __put_user
4181     -
4182     -#define put_user(x, ptr) \
4183     +#define __put_user(x, ptr) \
4184     ({ \
4185     - __typeof__(*(ptr)) __user *__p = (ptr); \
4186     - might_fault(); \
4187     - access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ? \
4188     - __put_user((x), __p) : \
4189     - -EFAULT; \
4190     + int __pu_err = 0; \
4191     + __put_user_check((x), (ptr), __pu_err); \
4192     + __pu_err; \
4193     })
4194    
4195     +#define __put_user_unaligned __put_user
4196     +
4197     +#define put_user __put_user
4198     +
4199     extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
4200     extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
4201     -extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
4202     -extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
4203     +extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n);
4204    
4205     static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
4206     {
4207     kasan_check_write(to, n);
4208     check_object_size(to, n, false);
4209     - return __arch_copy_from_user(to, from, n);
4210     + return __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
4211     }
4212    
4213     static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
4214     {
4215     kasan_check_read(from, n);
4216     check_object_size(from, n, true);
4217     - return __arch_copy_to_user(to, from, n);
4218     + return __arch_copy_to_user(__uaccess_mask_ptr(to), from, n);
4219     }
4220    
4221     static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
4222     @@ -310,22 +354,25 @@ static inline unsigned long __must_check copy_to_user(void __user *to, const voi
4223     return n;
4224     }
4225    
4226     -static inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n)
4227     +static inline unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n)
4228     {
4229     if (access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n))
4230     - n = __copy_in_user(to, from, n);
4231     + n = __arch_copy_in_user(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n);
4232     return n;
4233     }
4234     +#define copy_in_user __copy_in_user
4235    
4236     #define __copy_to_user_inatomic __copy_to_user
4237     #define __copy_from_user_inatomic __copy_from_user
4238    
4239     -static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
4240     +extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n);
4241     +static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n)
4242     {
4243     if (access_ok(VERIFY_WRITE, to, n))
4244     - n = __clear_user(to, n);
4245     + n = __arch_clear_user(__uaccess_mask_ptr(to), n);
4246     return n;
4247     }
4248     +#define clear_user __clear_user
4249    
4250     extern long strncpy_from_user(char *dest, const char __user *src, long count);
4251    
4252     diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
4253     index 7d66bbaafc0c..74b8fd860714 100644
4254     --- a/arch/arm64/kernel/Makefile
4255     +++ b/arch/arm64/kernel/Makefile
4256     @@ -51,6 +51,10 @@ arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
4257     arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \
4258     cpu-reset.o
4259    
4260     +ifeq ($(CONFIG_KVM),y)
4261     +arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o
4262     +endif
4263     +
4264     obj-y += $(arm64-obj-y) vdso/ probes/
4265     obj-m += $(arm64-obj-m)
4266     head-y := head.o
4267     diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
4268     index e9c4dc9e0ada..66be504edb6c 100644
4269     --- a/arch/arm64/kernel/arm64ksyms.c
4270     +++ b/arch/arm64/kernel/arm64ksyms.c
4271     @@ -37,8 +37,8 @@ EXPORT_SYMBOL(clear_page);
4272     /* user mem (segment) */
4273     EXPORT_SYMBOL(__arch_copy_from_user);
4274     EXPORT_SYMBOL(__arch_copy_to_user);
4275     -EXPORT_SYMBOL(__clear_user);
4276     -EXPORT_SYMBOL(__copy_in_user);
4277     +EXPORT_SYMBOL(__arch_clear_user);
4278     +EXPORT_SYMBOL(__arch_copy_in_user);
4279    
4280     /* physical memory */
4281     EXPORT_SYMBOL(memstart_addr);
4282     diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
4283     new file mode 100644
4284     index 000000000000..dc4eb154e33b
4285     --- /dev/null
4286     +++ b/arch/arm64/kernel/bpi.S
4287     @@ -0,0 +1,75 @@
4288     +/*
4289     + * Contains CPU specific branch predictor invalidation sequences
4290     + *
4291     + * Copyright (C) 2018 ARM Ltd.
4292     + *
4293     + * This program is free software; you can redistribute it and/or modify
4294     + * it under the terms of the GNU General Public License version 2 as
4295     + * published by the Free Software Foundation.
4296     + *
4297     + * This program is distributed in the hope that it will be useful,
4298     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4299     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4300     + * GNU General Public License for more details.
4301     + *
4302     + * You should have received a copy of the GNU General Public License
4303     + * along with this program. If not, see <http://www.gnu.org/licenses/>.
4304     + */
4305     +
4306     +#include <linux/linkage.h>
4307     +#include <linux/arm-smccc.h>
4308     +
4309     +.macro ventry target
4310     + .rept 31
4311     + nop
4312     + .endr
4313     + b \target
4314     +.endm
4315     +
4316     +.macro vectors target
4317     + ventry \target + 0x000
4318     + ventry \target + 0x080
4319     + ventry \target + 0x100
4320     + ventry \target + 0x180
4321     +
4322     + ventry \target + 0x200
4323     + ventry \target + 0x280
4324     + ventry \target + 0x300
4325     + ventry \target + 0x380
4326     +
4327     + ventry \target + 0x400
4328     + ventry \target + 0x480
4329     + ventry \target + 0x500
4330     + ventry \target + 0x580
4331     +
4332     + ventry \target + 0x600
4333     + ventry \target + 0x680
4334     + ventry \target + 0x700
4335     + ventry \target + 0x780
4336     +.endm
4337     +
4338     + .align 11
4339     +ENTRY(__bp_harden_hyp_vecs_start)
4340     + .rept 4
4341     + vectors __kvm_hyp_vector
4342     + .endr
4343     +ENTRY(__bp_harden_hyp_vecs_end)
4344     +
4345     +.macro smccc_workaround_1 inst
4346     + sub sp, sp, #(8 * 4)
4347     + stp x2, x3, [sp, #(8 * 0)]
4348     + stp x0, x1, [sp, #(8 * 2)]
4349     + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1
4350     + \inst #0
4351     + ldp x2, x3, [sp, #(8 * 0)]
4352     + ldp x0, x1, [sp, #(8 * 2)]
4353     + add sp, sp, #(8 * 4)
4354     +.endm
4355     +
4356     +ENTRY(__smccc_workaround_1_smc_start)
4357     + smccc_workaround_1 smc
4358     +ENTRY(__smccc_workaround_1_smc_end)
4359     +
4360     +ENTRY(__smccc_workaround_1_hvc_start)
4361     + smccc_workaround_1 hvc
4362     +ENTRY(__smccc_workaround_1_hvc_end)
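Size bookkeeping behind the new bpi.S: AArch64 exception vector entries are spaced 0x80 bytes apart, so ventry pads each entry to exactly 32 instructions (31 NOPs plus one branch, i.e. 32 * 4 = 0x80 bytes). One vectors expansion therefore occupies 16 * 0x80 = 0x800 bytes (2 KiB), and the .rept 4 block provides four such 2 KiB slots, 8 KiB in all, between __bp_harden_hyp_vecs_start and __bp_harden_hyp_vecs_end.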
4363     diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
4364     index b75e917aac46..74107134cc30 100644
4365     --- a/arch/arm64/kernel/cpu_errata.c
4366     +++ b/arch/arm64/kernel/cpu_errata.c
4367     @@ -46,6 +46,147 @@ static int cpu_enable_trap_ctr_access(void *__unused)
4368     return 0;
4369     }
4370    
4371     +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
4372     +#include <asm/mmu_context.h>
4373     +#include <asm/cacheflush.h>
4374     +
4375     +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
4376     +
4377     +#ifdef CONFIG_KVM
4378     +extern char __smccc_workaround_1_smc_start[];
4379     +extern char __smccc_workaround_1_smc_end[];
4380     +extern char __smccc_workaround_1_hvc_start[];
4381     +extern char __smccc_workaround_1_hvc_end[];
4382     +
4383     +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
4384     + const char *hyp_vecs_end)
4385     +{
4386     + void *dst = __bp_harden_hyp_vecs_start + slot * SZ_2K;
4387     + int i;
4388     +
4389     + for (i = 0; i < SZ_2K; i += 0x80)
4390     + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
4391     +
4392     + flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
4393     +}
4394     +
4395     +static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
4396     + const char *hyp_vecs_start,
4397     + const char *hyp_vecs_end)
4398     +{
4399     + static int last_slot = -1;
4400     + static DEFINE_SPINLOCK(bp_lock);
4401     + int cpu, slot = -1;
4402     +
4403     + spin_lock(&bp_lock);
4404     + for_each_possible_cpu(cpu) {
4405     + if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
4406     + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
4407     + break;
4408     + }
4409     + }
4410     +
4411     + if (slot == -1) {
4412     + last_slot++;
4413     + BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start)
4414     + / SZ_2K) <= last_slot);
4415     + slot = last_slot;
4416     + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
4417     + }
4418     +
4419     + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
4420     + __this_cpu_write(bp_hardening_data.fn, fn);
4421     + spin_unlock(&bp_lock);
4422     +}
4423     +#else
4424     +#define __smccc_workaround_1_smc_start NULL
4425     +#define __smccc_workaround_1_smc_end NULL
4426     +#define __smccc_workaround_1_hvc_start NULL
4427     +#define __smccc_workaround_1_hvc_end NULL
4428     +
4429     +static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
4430     + const char *hyp_vecs_start,
4431     + const char *hyp_vecs_end)
4432     +{
4433     + __this_cpu_write(bp_hardening_data.fn, fn);
4434     +}
4435     +#endif /* CONFIG_KVM */
4436     +
4437     +static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry,
4438     + bp_hardening_cb_t fn,
4439     + const char *hyp_vecs_start,
4440     + const char *hyp_vecs_end)
4441     +{
4442     + u64 pfr0;
4443     +
4444     + if (!entry->matches(entry, SCOPE_LOCAL_CPU))
4445     + return;
4446     +
4447     + pfr0 = read_cpuid(ID_AA64PFR0_EL1);
4448     + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
4449     + return;
4450     +
4451     + __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
4452     +}
4453     +
4454     +#include <uapi/linux/psci.h>
4455     +#include <linux/arm-smccc.h>
4456     +#include <linux/psci.h>
4457     +
4458     +static void call_smc_arch_workaround_1(void)
4459     +{
4460     + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
4461     +}
4462     +
4463     +static void call_hvc_arch_workaround_1(void)
4464     +{
4465     + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
4466     +}
4467     +
4468     +static int enable_smccc_arch_workaround_1(void *data)
4469     +{
4470     + const struct arm64_cpu_capabilities *entry = data;
4471     + bp_hardening_cb_t cb;
4472     + void *smccc_start, *smccc_end;
4473     + struct arm_smccc_res res;
4474     +
4475     + if (!entry->matches(entry, SCOPE_LOCAL_CPU))
4476     + return 0;
4477     +
4478     + if (psci_ops.smccc_version == SMCCC_VERSION_1_0)
4479     + return 0;
4480     +
4481     + switch (psci_ops.conduit) {
4482     + case PSCI_CONDUIT_HVC:
4483     + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
4484     + ARM_SMCCC_ARCH_WORKAROUND_1, &res);
4485     + if (res.a0)
4486     + return 0;
4487     + cb = call_hvc_arch_workaround_1;
4488     + smccc_start = __smccc_workaround_1_hvc_start;
4489     + smccc_end = __smccc_workaround_1_hvc_end;
4490     + break;
4491     +
4492     + case PSCI_CONDUIT_SMC:
4493     + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
4494     + ARM_SMCCC_ARCH_WORKAROUND_1, &res);
4495     + if (res.a0)
4496     + return 0;
4497     + cb = call_smc_arch_workaround_1;
4498     + smccc_start = __smccc_workaround_1_smc_start;
4499     + smccc_end = __smccc_workaround_1_smc_end;
4500     + break;
4501     +
4502     + default:
4503     + return 0;
4504     + }
4505     +
4506     + install_bp_hardening_cb(entry, cb, smccc_start, smccc_end);
4507     +
4508     + return 0;
4509     +}
4510     +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
4511     +
4512     #define MIDR_RANGE(model, min, max) \
4513     .def_scope = SCOPE_LOCAL_CPU, \
4514     .matches = is_affected_midr_range, \
4515     @@ -53,6 +194,13 @@ static int cpu_enable_trap_ctr_access(void *__unused)
4516     .midr_range_min = min, \
4517     .midr_range_max = max
4518    
4519     +#define MIDR_ALL_VERSIONS(model) \
4520     + .def_scope = SCOPE_LOCAL_CPU, \
4521     + .matches = is_affected_midr_range, \
4522     + .midr_model = model, \
4523     + .midr_range_min = 0, \
4524     + .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)
4525     +
4526     const struct arm64_cpu_capabilities arm64_errata[] = {
4527     #if defined(CONFIG_ARM64_ERRATUM_826319) || \
4528     defined(CONFIG_ARM64_ERRATUM_827319) || \
4529     @@ -130,6 +278,38 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
4530     .def_scope = SCOPE_LOCAL_CPU,
4531     .enable = cpu_enable_trap_ctr_access,
4532     },
4533     +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
4534     + {
4535     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4536     + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
4537     + .enable = enable_smccc_arch_workaround_1,
4538     + },
4539     + {
4540     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4541     + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
4542     + .enable = enable_smccc_arch_workaround_1,
4543     + },
4544     + {
4545     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4546     + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
4547     + .enable = enable_smccc_arch_workaround_1,
4548     + },
4549     + {
4550     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4551     + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
4552     + .enable = enable_smccc_arch_workaround_1,
4553     + },
4554     + {
4555     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4556     + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
4557     + .enable = enable_smccc_arch_workaround_1,
4558     + },
4559     + {
4560     + .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
4561     + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
4562     + .enable = enable_smccc_arch_workaround_1,
4563     + },
4564     +#endif
4565     {
4566     }
4567     };
4568     @@ -143,15 +323,18 @@ void verify_local_cpu_errata_workarounds(void)
4569     {
4570     const struct arm64_cpu_capabilities *caps = arm64_errata;
4571    
4572     - for (; caps->matches; caps++)
4573     - if (!cpus_have_cap(caps->capability) &&
4574     - caps->matches(caps, SCOPE_LOCAL_CPU)) {
4575     + for (; caps->matches; caps++) {
4576     + if (cpus_have_cap(caps->capability)) {
4577     + if (caps->enable)
4578     + caps->enable((void *)caps);
4579     + } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) {
4580     pr_crit("CPU%d: Requires work around for %s, not detected"
4581     " at boot time\n",
4582     smp_processor_id(),
4583     caps->desc ? : "an erratum");
4584     cpu_die_early();
4585     }
4586     + }
4587     }
4588    
4589     void update_cpu_errata_workarounds(void)
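A quick consistency check on the slot allocator above: bpi.S instantiates four 2 KiB vectors slots, so (__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) / SZ_2K evaluates to 4 and the BUG_ON only fires if more than four distinct hardening callbacks are ever registered. CPUs whose callback matches one found in the for_each_possible_cpu() scan reuse the existing slot; otherwise __copy_hyp_vect_bpi() copies the chosen SMCCC sequence into each 0x80-byte vector entry of a fresh slot and flushes the icache over it.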
4590     diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
4591     index 5056fc597ae9..a0ee01202503 100644
4592     --- a/arch/arm64/kernel/cpufeature.c
4593     +++ b/arch/arm64/kernel/cpufeature.c
4594     @@ -94,7 +94,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
4595    
4596     static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
4597     ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
4598     - ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 28, 0),
4599     + ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
4600     + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 24, 0),
4601     ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0),
4602     ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0),
4603     S_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
4604     @@ -1024,9 +1025,8 @@ static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array,
4605     if (WARN_ON(preemptible()))
4606     return false;
4607    
4608     - for (caps = cap_array; caps->desc; caps++)
4609     + for (caps = cap_array; caps->matches; caps++)
4610     if (caps->capability == cap &&
4611     - caps->matches &&
4612     caps->matches(caps, SCOPE_LOCAL_CPU))
4613     return true;
4614     return false;
4615     @@ -1059,7 +1059,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
4616     * uses an IPI, giving us a PSTATE that disappears when
4617     * we return.
4618     */
4619     - stop_machine(caps->enable, NULL, cpu_online_mask);
4620     + stop_machine(caps->enable, (void *)caps, cpu_online_mask);
4621     }
4622    
4623     /*
4624     @@ -1116,7 +1116,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list)
4625     cpu_die_early();
4626     }
4627     if (caps->enable)
4628     - caps->enable(NULL);
4629     + caps->enable((void *)caps);
4630     }
4631     }
4632    
4633     diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
4634     index 8d1600b18562..b79e302d2a3e 100644
4635     --- a/arch/arm64/kernel/entry.S
4636     +++ b/arch/arm64/kernel/entry.S
4637     @@ -30,6 +30,7 @@
4638     #include <asm/irq.h>
4639     #include <asm/memory.h>
4640     #include <asm/mmu.h>
4641     +#include <asm/processor.h>
4642     #include <asm/thread_info.h>
4643     #include <asm/asm-uaccess.h>
4644     #include <asm/unistd.h>
4645     @@ -125,10 +126,10 @@ alternative_else_nop_endif
4646     .else
4647     add x21, sp, #S_FRAME_SIZE
4648     get_thread_info tsk
4649     - /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
4650     + /* Save the task's original addr_limit and set USER_DS */
4651     ldr x20, [tsk, #TI_ADDR_LIMIT]
4652     str x20, [sp, #S_ORIG_ADDR_LIMIT]
4653     - mov x20, #TASK_SIZE_64
4654     + mov x20, #USER_DS
4655     str x20, [tsk, #TI_ADDR_LIMIT]
4656     /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
4657     .endif /* \el == 0 */
4658     @@ -588,13 +589,15 @@ el0_ia:
4659     * Instruction abort handling
4660     */
4661     mrs x26, far_el1
4662     - // enable interrupts before calling the main handler
4663     - enable_dbg_and_irq
4664     + msr daifclr, #(8 | 4 | 1)
4665     +#ifdef CONFIG_TRACE_IRQFLAGS
4666     + bl trace_hardirqs_off
4667     +#endif
4668     ct_user_exit
4669     mov x0, x26
4670     mov x1, x25
4671     mov x2, sp
4672     - bl do_mem_abort
4673     + bl do_el0_ia_bp_hardening
4674     b ret_to_user
4675     el0_fpsimd_acc:
4676     /*
4677     @@ -621,8 +624,10 @@ el0_sp_pc:
4678     * Stack or PC alignment exception handling
4679     */
4680     mrs x26, far_el1
4681     - // enable interrupts before calling the main handler
4682     - enable_dbg_and_irq
4683     + enable_dbg
4684     +#ifdef CONFIG_TRACE_IRQFLAGS
4685     + bl trace_hardirqs_off
4686     +#endif
4687     ct_user_exit
4688     mov x0, x26
4689     mov x1, x25
4690     @@ -681,6 +686,11 @@ el0_irq_naked:
4691     #endif
4692    
4693     ct_user_exit
4694     +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
4695     + tbz x22, #55, 1f
4696     + bl do_el0_irq_bp_hardening
4697     +1:
4698     +#endif
4699     irq_handler
4700    
4701     #ifdef CONFIG_TRACE_IRQFLAGS
4702     @@ -794,6 +804,7 @@ el0_svc_naked: // compat entry point
4703     b.ne __sys_trace
4704     cmp scno, sc_nr // check upper syscall limit
4705     b.hs ni_sys
4706     + mask_nospec64 scno, sc_nr, x19 // enforce bounds for syscall number
4707     ldr x16, [stbl, scno, lsl #3] // address in the syscall table
4708     blr x16 // call sys_* routine
4709     b ret_fast_syscall
4710     diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
4711     index 2e6e9e99977b..efe43c5f2dc1 100644
4712     --- a/arch/arm64/kvm/handle_exit.c
4713     +++ b/arch/arm64/kvm/handle_exit.c
4714     @@ -22,12 +22,15 @@
4715     #include <linux/kvm.h>
4716     #include <linux/kvm_host.h>
4717    
4718     +#include <kvm/arm_psci.h>
4719     +
4720     #include <asm/esr.h>
4721     #include <asm/kvm_asm.h>
4722     #include <asm/kvm_coproc.h>
4723     #include <asm/kvm_emulate.h>
4724     #include <asm/kvm_mmu.h>
4725     -#include <asm/kvm_psci.h>
4726     +#include <asm/debug-monitors.h>
4727     +#include <asm/traps.h>
4728    
4729     #define CREATE_TRACE_POINTS
4730     #include "trace.h"
4731     @@ -42,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
4732     kvm_vcpu_hvc_get_imm(vcpu));
4733     vcpu->stat.hvc_exit_stat++;
4734    
4735     - ret = kvm_psci_call(vcpu);
4736     + ret = kvm_hvc_call_handler(vcpu);
4737     if (ret < 0) {
4738     vcpu_set_reg(vcpu, 0, ~0UL);
4739     return 1;
4740     @@ -53,7 +56,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
4741    
4742     static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
4743     {
4744     + /*
4745     + * "If an SMC instruction executed at Non-secure EL1 is
4746     + * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
4747     + * Trap exception, not a Secure Monitor Call exception [...]"
4748     + *
4749     + * We need to advance the PC after the trap, as it would
4750     + * otherwise return to the same address...
4751     + */
4752     vcpu_set_reg(vcpu, 0, ~0UL);
4753     + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
4754     return 1;
4755     }
4756    
4757     diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
4758     index 4e92399f7105..4e9d50c3e658 100644
4759     --- a/arch/arm64/kvm/hyp/hyp-entry.S
4760     +++ b/arch/arm64/kvm/hyp/hyp-entry.S
4761     @@ -15,6 +15,7 @@
4762     * along with this program. If not, see <http://www.gnu.org/licenses/>.
4763     */
4764    
4765     +#include <linux/arm-smccc.h>
4766     #include <linux/linkage.h>
4767    
4768     #include <asm/alternative.h>
4769     @@ -79,10 +80,11 @@ alternative_endif
4770     lsr x0, x1, #ESR_ELx_EC_SHIFT
4771    
4772     cmp x0, #ESR_ELx_EC_HVC64
4773     + ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
4774     b.ne el1_trap
4775    
4776     - mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest
4777     - cbnz x1, el1_trap // called HVC
4778     + mrs x1, vttbr_el2 // If vttbr is valid, the guest
4779     + cbnz x1, el1_hvc_guest // called HVC
4780    
4781     /* Here, we're pretty sure the host called HVC. */
4782     ldp x0, x1, [sp], #16
4783     @@ -101,6 +103,20 @@ alternative_endif
4784    
4785     2: eret
4786    
4787     +el1_hvc_guest:
4788     + /*
4789     + * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
4790     + * The workaround has already been applied on the host,
4791     + * so let's quickly get back to the guest. We don't bother
4792     + * restoring x1, as it can be clobbered anyway.
4793     + */
4794     + ldr x1, [sp] // Guest's x0
4795     + eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1
4796     + cbnz w1, el1_trap
4797     + mov x0, x1
4798     + add sp, sp, #16
4799     + eret
4800     +
4801     el1_trap:
4802     /*
4803     * x0: ESR_EC
4804     diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
4805     index 9174ba917d65..c49d09387192 100644
4806     --- a/arch/arm64/kvm/hyp/switch.c
4807     +++ b/arch/arm64/kvm/hyp/switch.c
4808     @@ -17,6 +17,9 @@
4809    
4810     #include <linux/types.h>
4811     #include <linux/jump_label.h>
4812     +#include <uapi/linux/psci.h>
4813     +
4814     +#include <kvm/arm_psci.h>
4815    
4816     #include <asm/kvm_asm.h>
4817     #include <asm/kvm_emulate.h>
4818     @@ -50,7 +53,7 @@ static void __hyp_text __activate_traps_vhe(void)
4819     val &= ~CPACR_EL1_FPEN;
4820     write_sysreg(val, cpacr_el1);
4821    
4822     - write_sysreg(__kvm_hyp_vector, vbar_el1);
4823     + write_sysreg(kvm_get_hyp_vector(), vbar_el1);
4824     }
4825    
4826     static void __hyp_text __activate_traps_nvhe(void)
4827     diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
4828     index 5d1cad3ce6d6..efbf610eaf4e 100644
4829     --- a/arch/arm64/lib/clear_user.S
4830     +++ b/arch/arm64/lib/clear_user.S
4831     @@ -24,7 +24,7 @@
4832    
4833     .text
4834    
4835     -/* Prototype: int __clear_user(void *addr, size_t sz)
4836     +/* Prototype: int __arch_clear_user(void *addr, size_t sz)
4837     * Purpose : clear some user memory
4838     * Params : addr - user memory address to clear
4839     * : sz - number of bytes to clear
4840     @@ -32,7 +32,7 @@
4841     *
4842     * Alignment fixed up by hardware.
4843     */
4844     -ENTRY(__clear_user)
4845     +ENTRY(__arch_clear_user)
4846     ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
4847     CONFIG_ARM64_PAN)
4848     mov x2, x1 // save the size for fixup return
4849     @@ -57,7 +57,7 @@ uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
4850     ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
4851     CONFIG_ARM64_PAN)
4852     ret
4853     -ENDPROC(__clear_user)
4854     +ENDPROC(__arch_clear_user)
4855    
4856     .section .fixup,"ax"
4857     .align 2
4858     diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
4859     index f7292dd08c84..841bf8f7fab7 100644
4860     --- a/arch/arm64/lib/copy_in_user.S
4861     +++ b/arch/arm64/lib/copy_in_user.S
4862     @@ -67,7 +67,7 @@
4863     .endm
4864    
4865     end .req x5
4866     -ENTRY(__copy_in_user)
4867     +ENTRY(__arch_copy_in_user)
4868     ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
4869     CONFIG_ARM64_PAN)
4870     add end, x0, x2
4871     @@ -76,7 +76,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
4872     CONFIG_ARM64_PAN)
4873     mov x0, #0
4874     ret
4875     -ENDPROC(__copy_in_user)
4876     +ENDPROC(__arch_copy_in_user)
4877    
4878     .section .fixup,"ax"
4879     .align 2
4880     diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
4881     index f00f5eeb556f..62d976e843fc 100644
4882     --- a/arch/arm64/mm/context.c
4883     +++ b/arch/arm64/mm/context.c
4884     @@ -230,9 +230,21 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
4885     raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
4886    
4887     switch_mm_fastpath:
4888     +
4889     + arm64_apply_bp_hardening();
4890     +
4891     cpu_switch_mm(mm->pgd, mm);
4892     }
4893    
4894     +/* Errata workaround post TTBRx_EL1 update. */
4895     +asmlinkage void post_ttbr_update_workaround(void)
4896     +{
4897     + asm(ALTERNATIVE("nop; nop; nop",
4898     + "ic iallu; dsb nsh; isb",
4899     + ARM64_WORKAROUND_CAVIUM_27456,
4900     + CONFIG_CAVIUM_ERRATUM_27456));
4901     +}
4902     +
4903     static int asids_init(void)
4904     {
4905     asid_bits = get_cpu_asid_bits();
4906     diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
4907     index 403fe9e57135..ad49ae8f3967 100644
4908     --- a/arch/arm64/mm/fault.c
4909     +++ b/arch/arm64/mm/fault.c
4910     @@ -332,7 +332,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
4911     mm_flags |= FAULT_FLAG_WRITE;
4912     }
4913    
4914     - if (is_permission_fault(esr) && (addr < USER_DS)) {
4915     + if (is_permission_fault(esr) && (addr < TASK_SIZE)) {
4916     /* regs->orig_addr_limit may be 0 if we entered from EL0 */
4917     if (regs->orig_addr_limit == KERNEL_DS)
4918     die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
4919     @@ -590,6 +590,29 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
4920     arm64_notify_die("", regs, &info, esr);
4921     }
4922    
4923     +asmlinkage void __exception do_el0_irq_bp_hardening(void)
4924     +{
4925     + /* PC has already been checked in entry.S */
4926     + arm64_apply_bp_hardening();
4927     +}
4928     +
4929     +asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
4930     + unsigned int esr,
4931     + struct pt_regs *regs)
4932     +{
4933     + /*
4934     + * We've taken an instruction abort from userspace and not yet
4935     + * re-enabled IRQs. If the address is a kernel address, apply
4936     + * BP hardening prior to enabling IRQs and pre-emption.
4937     + */
4938     + if (addr > TASK_SIZE)
4939     + arm64_apply_bp_hardening();
4940     +
4941     + local_irq_enable();
4942     + do_mem_abort(addr, esr, regs);
4943     +}
4944     +
4945     +
4946     /*
4947     * Handle stack alignment exceptions.
4948     */
4949     @@ -600,6 +623,12 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
4950     struct siginfo info;
4951     struct task_struct *tsk = current;
4952    
4953     + if (user_mode(regs)) {
4954     + if (instruction_pointer(regs) > TASK_SIZE)
4955     + arm64_apply_bp_hardening();
4956     + local_irq_enable();
4957     + }
4958     +
4959     if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS))
4960     pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n",
4961     tsk->comm, task_pid_nr(tsk),
4962     @@ -659,6 +688,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
4963     if (interrupts_enabled(regs))
4964     trace_hardirqs_off();
4965    
4966     + if (user_mode(regs) && instruction_pointer(regs) > TASK_SIZE)
4967     + arm64_apply_bp_hardening();
4968     +
4969     if (!inf->fn(addr, esr, regs)) {
4970     rv = 1;
4971     } else {
4972     diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
4973     index c07d9cc057e6..619da1cbd32b 100644
4974     --- a/arch/arm64/mm/proc.S
4975     +++ b/arch/arm64/mm/proc.S
4976     @@ -139,12 +139,7 @@ ENTRY(cpu_do_switch_mm)
4977     isb
4978     msr ttbr0_el1, x0 // now update TTBR0
4979     isb
4980     -alternative_if ARM64_WORKAROUND_CAVIUM_27456
4981     - ic iallu
4982     - dsb nsh
4983     - isb
4984     -alternative_else_nop_endif
4985     - ret
4986     + b post_ttbr_update_workaround // Back to C code...
4987     ENDPROC(cpu_do_switch_mm)
4988    
4989     .pushsection ".idmap.text", "awx"
4990     diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
4991     index 700e2d2da096..2e68ca1fe0db 100644
4992     --- a/arch/parisc/kernel/drivers.c
4993     +++ b/arch/parisc/kernel/drivers.c
4994     @@ -648,6 +648,10 @@ static int match_pci_device(struct device *dev, int index,
4995     (modpath->mod == PCI_FUNC(devfn)));
4996     }
4997    
4998     + /* index might be out of bounds for bc[] */
4999     + if (index >= 6)
5000     + return 0;
5001     +
5002     id = PCI_SLOT(pdev->devfn) | (PCI_FUNC(pdev->devfn) << 5);
5003     return (modpath->bc[index] == id);
5004     }
5005     diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
5006     index 295bfb7124bc..39127b691b78 100644
5007     --- a/arch/s390/kernel/ipl.c
5008     +++ b/arch/s390/kernel/ipl.c
5009     @@ -798,6 +798,7 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb,
5010     /* copy and convert to ebcdic */
5011     memcpy(ipb->hdr.loadparm, buf, lp_len);
5012     ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN);
5013     + ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID;
5014     return len;
5015     }
5016    
5017     diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
5018     index b1815b20a99c..37032545c58e 100644
5019     --- a/drivers/acpi/nfit/core.c
5020     +++ b/drivers/acpi/nfit/core.c
5021     @@ -2547,15 +2547,21 @@ static void acpi_nfit_scrub(struct work_struct *work)
5022     static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
5023     {
5024     struct nfit_spa *nfit_spa;
5025     - int rc;
5026    
5027     - list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
5028     - if (nfit_spa_type(nfit_spa->spa) == NFIT_SPA_DCR) {
5029     - /* BLK regions don't need to wait for ars results */
5030     - rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
5031     - if (rc)
5032     - return rc;
5033     - }
5034     + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
5035     + int rc, type = nfit_spa_type(nfit_spa->spa);
5036     +
5037     + /* PMEM and VMEM will be registered by the ARS workqueue */
5038     + if (type == NFIT_SPA_PM || type == NFIT_SPA_VOLATILE)
5039     + continue;
5040     + /* BLK apertures belong to BLK region registration below */
5041     + if (type == NFIT_SPA_BDW)
5042     + continue;
5043     + /* BLK regions don't need to wait for ARS results */
5044     + rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
5045     + if (rc)
5046     + return rc;
5047     + }
5048    
5049     queue_work(nfit_wq, &acpi_desc->work);
5050     return 0;
5051     diff --git a/drivers/block/loop.c b/drivers/block/loop.c
5052     index dc318b9100c2..ff1c4d7aa025 100644
5053     --- a/drivers/block/loop.c
5054     +++ b/drivers/block/loop.c
5055     @@ -1110,11 +1110,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
5056     if (info->lo_encrypt_type) {
5057     unsigned int type = info->lo_encrypt_type;
5058    
5059     - if (type >= MAX_LO_CRYPT)
5060     - return -EINVAL;
5061     + if (type >= MAX_LO_CRYPT) {
5062     + err = -EINVAL;
5063     + goto exit;
5064     + }
5065     xfer = xfer_funcs[type];
5066     - if (xfer == NULL)
5067     - return -EINVAL;
5068     + if (xfer == NULL) {
5069     + err = -EINVAL;
5070     + goto exit;
5071     + }
5072     } else
5073     xfer = NULL;
5074    
5075     diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c
5076     index 8263429e21b8..79a48c37fb35 100644
5077     --- a/drivers/firmware/psci.c
5078     +++ b/drivers/firmware/psci.c
5079     @@ -59,7 +59,10 @@ bool psci_tos_resident_on(int cpu)
5080     return cpu == resident_cpu;
5081     }
5082    
5083     -struct psci_operations psci_ops;
5084     +struct psci_operations psci_ops = {
5085     + .conduit = PSCI_CONDUIT_NONE,
5086     + .smccc_version = SMCCC_VERSION_1_0,
5087     +};
5088    
5089     typedef unsigned long (psci_fn)(unsigned long, unsigned long,
5090     unsigned long, unsigned long);
5091     @@ -210,6 +213,22 @@ static unsigned long psci_migrate_info_up_cpu(void)
5092     0, 0, 0);
5093     }
5094    
5095     +static void set_conduit(enum psci_conduit conduit)
5096     +{
5097     + switch (conduit) {
5098     + case PSCI_CONDUIT_HVC:
5099     + invoke_psci_fn = __invoke_psci_fn_hvc;
5100     + break;
5101     + case PSCI_CONDUIT_SMC:
5102     + invoke_psci_fn = __invoke_psci_fn_smc;
5103     + break;
5104     + default:
5105     + WARN(1, "Unexpected PSCI conduit %d\n", conduit);
5106     + }
5107     +
5108     + psci_ops.conduit = conduit;
5109     +}
5110     +
5111     static int get_set_conduit_method(struct device_node *np)
5112     {
5113     const char *method;
5114     @@ -222,9 +241,9 @@ static int get_set_conduit_method(struct device_node *np)
5115     }
5116    
5117     if (!strcmp("hvc", method)) {
5118     - invoke_psci_fn = __invoke_psci_fn_hvc;
5119     + set_conduit(PSCI_CONDUIT_HVC);
5120     } else if (!strcmp("smc", method)) {
5121     - invoke_psci_fn = __invoke_psci_fn_smc;
5122     + set_conduit(PSCI_CONDUIT_SMC);
5123     } else {
5124     pr_warn("invalid \"method\" property: %s\n", method);
5125     return -EINVAL;
5126     @@ -493,9 +512,36 @@ static void __init psci_init_migrate(void)
5127     pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
5128     }
5129    
5130     +static void __init psci_init_smccc(void)
5131     +{
5132     + u32 ver = ARM_SMCCC_VERSION_1_0;
5133     + int feature;
5134     +
5135     + feature = psci_features(ARM_SMCCC_VERSION_FUNC_ID);
5136     +
5137     + if (feature != PSCI_RET_NOT_SUPPORTED) {
5138     + u32 ret;
5139     + ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0);
5140     + if (ret == ARM_SMCCC_VERSION_1_1) {
5141     + psci_ops.smccc_version = SMCCC_VERSION_1_1;
5142     + ver = ret;
5143     + }
5144     + }
5145     +
5146     + /*
5147     + * Conveniently, the SMCCC and PSCI versions are encoded the
5148     + * same way. No, this isn't accidental.
5149     + */
5150     + pr_info("SMC Calling Convention v%d.%d\n",
5151     + PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver));
5152     +
5153     +}
5154     +
5155     static void __init psci_0_2_set_functions(void)
5156     {
5157     pr_info("Using standard PSCI v0.2 function IDs\n");
5158     + psci_ops.get_version = psci_get_version;
5159     +
5160     psci_function_id[PSCI_FN_CPU_SUSPEND] =
5161     PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
5162     psci_ops.cpu_suspend = psci_cpu_suspend;
5163     @@ -539,6 +585,7 @@ static int __init psci_probe(void)
5164     psci_init_migrate();
5165    
5166     if (PSCI_VERSION_MAJOR(ver) >= 1) {
5167     + psci_init_smccc();
5168     psci_init_cpu_suspend();
5169     psci_init_system_suspend();
5170     }
5171     @@ -652,9 +699,9 @@ int __init psci_acpi_init(void)
5172     pr_info("probing for conduit method from ACPI.\n");
5173    
5174     if (acpi_psci_use_hvc())
5175     - invoke_psci_fn = __invoke_psci_fn_hvc;
5176     + set_conduit(PSCI_CONDUIT_HVC);
5177     else
5178     - invoke_psci_fn = __invoke_psci_fn_smc;
5179     + set_conduit(PSCI_CONDUIT_SMC);
5180    
5181     return psci_probe();
5182     }
5183     diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
5184     index 41b72ce6613f..83e1345db9e2 100644
5185     --- a/drivers/gpu/drm/radeon/radeon_object.c
5186     +++ b/drivers/gpu/drm/radeon/radeon_object.c
5187     @@ -238,9 +238,10 @@ int radeon_bo_create(struct radeon_device *rdev,
5188     * may be slow
5189     * See https://bugs.freedesktop.org/show_bug.cgi?id=88758
5190     */
5191     -
5192     +#ifndef CONFIG_COMPILE_TEST
5193     #warning Please enable CONFIG_MTRR and CONFIG_X86_PAT for better performance \
5194     thanks to write-combining
5195     +#endif
5196    
5197     if (bo->flags & RADEON_GEM_GTT_WC)
5198     DRM_INFO_ONCE("Please enable CONFIG_MTRR and CONFIG_X86_PAT for "
5199     diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
5200     index d8bc4b910192..9360cdce740e 100644
5201     --- a/drivers/hv/channel_mgmt.c
5202     +++ b/drivers/hv/channel_mgmt.c
5203     @@ -70,7 +70,7 @@ static const struct vmbus_device vmbus_devs[] = {
5204     /* PCIE */
5205     { .dev_type = HV_PCIE,
5206     HV_PCIE_GUID,
5207     - .perf_device = true,
5208     + .perf_device = false,
5209     },
5210    
5211     /* Synthetic Frame Buffer */
5212     diff --git a/drivers/hwmon/ina2xx.c b/drivers/hwmon/ina2xx.c
5213     index a629f7c130f0..ac63e562071f 100644
5214     --- a/drivers/hwmon/ina2xx.c
5215     +++ b/drivers/hwmon/ina2xx.c
5216     @@ -447,6 +447,7 @@ static int ina2xx_probe(struct i2c_client *client,
5217    
5218     /* set the device type */
5219     data->config = &ina2xx_config[id->driver_data];
5220     + mutex_init(&data->config_lock);
5221    
5222     if (of_property_read_u32(dev->of_node, "shunt-resistor", &val) < 0) {
5223     struct ina2xx_platform_data *pdata = dev_get_platdata(dev);
5224     @@ -473,8 +474,6 @@ static int ina2xx_probe(struct i2c_client *client,
5225     return -ENODEV;
5226     }
5227    
5228     - mutex_init(&data->config_lock);
5229     -
5230     data->groups[group++] = &ina2xx_group;
5231     if (id->driver_data == ina226)
5232     data->groups[group++] = &ina226_group;
5233     diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
5234     index 48a39222fdf9..a9fc64557c53 100644
5235     --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
5236     +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
5237     @@ -101,7 +101,7 @@ static int get_v4l2_window32(struct v4l2_window __user *kp,
5238     static int put_v4l2_window32(struct v4l2_window __user *kp,
5239     struct v4l2_window32 __user *up)
5240     {
5241     - struct v4l2_clip __user *kclips = kp->clips;
5242     + struct v4l2_clip __user *kclips;
5243     struct v4l2_clip32 __user *uclips;
5244     compat_caddr_t p;
5245     u32 clipcount;
5246     @@ -116,6 +116,8 @@ static int put_v4l2_window32(struct v4l2_window __user *kp,
5247     if (!clipcount)
5248     return 0;
5249    
5250     + if (get_user(kclips, &kp->clips))
5251     + return -EFAULT;
5252     if (get_user(p, &up->clips))
5253     return -EFAULT;
5254     uclips = compat_ptr(p);
5255     diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
5256     index 4da73e2c37cf..2032a6de026b 100644
5257     --- a/drivers/net/phy/micrel.c
5258     +++ b/drivers/net/phy/micrel.c
5259     @@ -268,12 +268,23 @@ static int kszphy_nand_tree_disable(struct phy_device *phydev)
5260     return ret;
5261     }
5262    
5263     -/* Some config bits need to be set again on resume, handle them here. */
5264     -static int kszphy_config_reset(struct phy_device *phydev)
5265     +static int kszphy_config_init(struct phy_device *phydev)
5266     {
5267     struct kszphy_priv *priv = phydev->priv;
5268     + const struct kszphy_type *type;
5269     int ret;
5270    
5271     + if (!priv)
5272     + return 0;
5273     +
5274     + type = priv->type;
5275     +
5276     + if (type->has_broadcast_disable)
5277     + kszphy_broadcast_disable(phydev);
5278     +
5279     + if (type->has_nand_tree_disable)
5280     + kszphy_nand_tree_disable(phydev);
5281     +
5282     if (priv->rmii_ref_clk_sel) {
5283     ret = kszphy_rmii_clk_sel(phydev, priv->rmii_ref_clk_sel_val);
5284     if (ret) {
5285     @@ -284,7 +295,7 @@ static int kszphy_config_reset(struct phy_device *phydev)
5286     }
5287    
5288     if (priv->led_mode >= 0)
5289     - kszphy_setup_led(phydev, priv->type->led_mode_reg, priv->led_mode);
5290     + kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
5291    
5292     if (phy_interrupt_is_valid(phydev)) {
5293     int ctl = phy_read(phydev, MII_BMCR);
5294     @@ -300,25 +311,6 @@ static int kszphy_config_reset(struct phy_device *phydev)
5295     return 0;
5296     }
5297    
5298     -static int kszphy_config_init(struct phy_device *phydev)
5299     -{
5300     - struct kszphy_priv *priv = phydev->priv;
5301     - const struct kszphy_type *type;
5302     -
5303     - if (!priv)
5304     - return 0;
5305     -
5306     - type = priv->type;
5307     -
5308     - if (type->has_broadcast_disable)
5309     - kszphy_broadcast_disable(phydev);
5310     -
5311     - if (type->has_nand_tree_disable)
5312     - kszphy_nand_tree_disable(phydev);
5313     -
5314     - return kszphy_config_reset(phydev);
5315     -}
5316     -
5317     static int ksz8041_config_init(struct phy_device *phydev)
5318     {
5319     struct device_node *of_node = phydev->mdio.dev.of_node;
5320     @@ -723,14 +715,8 @@ static int kszphy_suspend(struct phy_device *phydev)
5321    
5322     static int kszphy_resume(struct phy_device *phydev)
5323     {
5324     - int ret;
5325     -
5326     genphy_resume(phydev);
5327    
5328     - ret = kszphy_config_reset(phydev);
5329     - if (ret)
5330     - return ret;
5331     -
5332     /* Enable PHY Interrupts */
5333     if (phy_interrupt_is_valid(phydev)) {
5334     phydev->interrupts = PHY_INTERRUPT_ENABLED;
5335     diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c
5336     index 27ed25252aac..cfd81eb1b532 100644
5337     --- a/drivers/net/slip/slhc.c
5338     +++ b/drivers/net/slip/slhc.c
5339     @@ -509,6 +509,10 @@ slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize)
5340     if(x < 0 || x > comp->rslot_limit)
5341     goto bad;
5342    
5343     + /* Check if the cstate is initialized */
5344     + if (!comp->rstate[x].initialized)
5345     + goto bad;
5346     +
5347     comp->flags &=~ SLF_TOSS;
5348     comp->recv_current = x;
5349     } else {
5350     @@ -673,6 +677,7 @@ slhc_remember(struct slcompress *comp, unsigned char *icp, int isize)
5351     if (cs->cs_tcp.doff > 5)
5352     memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4);
5353     cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2;
5354     + cs->initialized = true;
5355     /* Put headers back on packet
5356     * Neither header checksum is recalculated
5357     */
5358     diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
5359     index 1fca0024f294..4fb468666b19 100644
5360     --- a/drivers/net/usb/cdc_ether.c
5361     +++ b/drivers/net/usb/cdc_ether.c
5362     @@ -773,6 +773,12 @@ static const struct usb_device_id products[] = {
5363     USB_CDC_SUBCLASS_ETHERNET,
5364     USB_CDC_PROTO_NONE),
5365     .driver_info = (unsigned long)&wwan_info,
5366     +}, {
5367     + /* Cinterion AHS3 modem by GEMALTO */
5368     + USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM,
5369     + USB_CDC_SUBCLASS_ETHERNET,
5370     + USB_CDC_PROTO_NONE),
5371     + .driver_info = (unsigned long)&wwan_info,
5372     }, {
5373     /* Telit modules */
5374     USB_VENDOR_AND_INTERFACE_INFO(0x1bc7, USB_CLASS_COMM,
5375     diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
5376     index c53385a0052f..f5a96678494b 100644
5377     --- a/drivers/net/usb/lan78xx.c
5378     +++ b/drivers/net/usb/lan78xx.c
5379     @@ -873,7 +873,8 @@ static int lan78xx_read_otp(struct lan78xx_net *dev, u32 offset,
5380     offset += 0x100;
5381     else
5382     ret = -EINVAL;
5383     - ret = lan78xx_read_raw_otp(dev, offset, length, data);
5384     + if (!ret)
5385     + ret = lan78xx_read_raw_otp(dev, offset, length, data);
5386     }
5387    
5388     return ret;
5389     diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
5390     index 231f84db9ab0..6113624ccec3 100644
5391     --- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
5392     +++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
5393     @@ -1454,6 +1454,7 @@ static int rtl8187_probe(struct usb_interface *intf,
5394     goto err_free_dev;
5395     }
5396     mutex_init(&priv->io_mutex);
5397     + mutex_init(&priv->conf_mutex);
5398    
5399     SET_IEEE80211_DEV(dev, &intf->dev);
5400     usb_set_intfdata(intf, dev);
5401     @@ -1627,7 +1628,6 @@ static int rtl8187_probe(struct usb_interface *intf,
5402     printk(KERN_ERR "rtl8187: Cannot register device\n");
5403     goto err_free_dmabuf;
5404     }
5405     - mutex_init(&priv->conf_mutex);
5406     skb_queue_head_init(&priv->b_tx_status.queue);
5407    
5408     wiphy_info(dev->wiphy, "hwaddr %pM, %s V%d + %s, rfkill mask %d\n",
5409     diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
5410     index 71bf9bded485..66e9bb053629 100644
5411     --- a/drivers/s390/cio/qdio_main.c
5412     +++ b/drivers/s390/cio/qdio_main.c
5413     @@ -126,7 +126,7 @@ static inline int qdio_check_ccq(struct qdio_q *q, unsigned int ccq)
5414     static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state,
5415     int start, int count, int auto_ack)
5416     {
5417     - int rc, tmp_count = count, tmp_start = start, nr = q->nr, retried = 0;
5418     + int rc, tmp_count = count, tmp_start = start, nr = q->nr;
5419     unsigned int ccq = 0;
5420    
5421     qperf_inc(q, eqbs);
5422     @@ -149,14 +149,7 @@ static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state,
5423     qperf_inc(q, eqbs_partial);
5424     DBF_DEV_EVENT(DBF_WARN, q->irq_ptr, "EQBS part:%02x",
5425     tmp_count);
5426     - /*
5427     - * Retry once, if that fails bail out and process the
5428     - * extracted buffers before trying again.
5429     - */
5430     - if (!retried++)
5431     - goto again;
5432     - else
5433     - return count - tmp_count;
5434     + return count - tmp_count;
5435     }
5436    
5437     DBF_ERROR("%4x EQBS ERROR", SCH_NO(q));
5438     @@ -212,7 +205,10 @@ static int qdio_do_sqbs(struct qdio_q *q, unsigned char state, int start,
5439     return 0;
5440     }
5441    
5442     -/* returns number of examined buffers and their common state in *state */
5443     +/*
5444     + * Returns number of examined buffers and their common state in *state.
5445     + * Requested number of buffers-to-examine must be > 0.
5446     + */
5447     static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr,
5448     unsigned char *state, unsigned int count,
5449     int auto_ack, int merge_pending)
5450     @@ -223,17 +219,23 @@ static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr,
5451     if (is_qebsm(q))
5452     return qdio_do_eqbs(q, state, bufnr, count, auto_ack);
5453    
5454     - for (i = 0; i < count; i++) {
5455     - if (!__state) {
5456     - __state = q->slsb.val[bufnr];
5457     - if (merge_pending && __state == SLSB_P_OUTPUT_PENDING)
5458     - __state = SLSB_P_OUTPUT_EMPTY;
5459     - } else if (merge_pending) {
5460     - if ((q->slsb.val[bufnr] & __state) != __state)
5461     - break;
5462     - } else if (q->slsb.val[bufnr] != __state)
5463     - break;
5464     + /* get initial state: */
5465     + __state = q->slsb.val[bufnr];
5466     + if (merge_pending && __state == SLSB_P_OUTPUT_PENDING)
5467     + __state = SLSB_P_OUTPUT_EMPTY;
5468     +
5469     + for (i = 1; i < count; i++) {
5470     bufnr = next_buf(bufnr);
5471     +
5472     + /* merge PENDING into EMPTY: */
5473     + if (merge_pending &&
5474     + q->slsb.val[bufnr] == SLSB_P_OUTPUT_PENDING &&
5475     + __state == SLSB_P_OUTPUT_EMPTY)
5476     + continue;
5477     +
5478     + /* stop if next state differs from initial state: */
5479     + if (q->slsb.val[bufnr] != __state)
5480     + break;
5481     }
5482     *state = __state;
5483     return i;
5484     diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
5485     index e2c37aeed45a..fce49ebc575d 100644
5486     --- a/drivers/vhost/vhost.c
5487     +++ b/drivers/vhost/vhost.c
5488     @@ -1175,10 +1175,12 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
5489     /* Caller should have vq mutex and device mutex */
5490     int vhost_vq_access_ok(struct vhost_virtqueue *vq)
5491     {
5492     - int ret = vq_log_access_ok(vq, vq->log_base);
5493     + if (!vq_log_access_ok(vq, vq->log_base))
5494     + return 0;
5495    
5496     - if (ret || vq->iotlb)
5497     - return ret;
5498     + /* Access validation occurs at prefetch time with IOTLB */
5499     + if (vq->iotlb)
5500     + return 1;
5501    
5502     return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
5503     }
5504     diff --git a/fs/namei.c b/fs/namei.c
5505     index 891670e0956b..85ac38b99065 100644
5506     --- a/fs/namei.c
5507     +++ b/fs/namei.c
5508     @@ -221,9 +221,10 @@ getname_kernel(const char * filename)
5509     if (len <= EMBEDDED_NAME_MAX) {
5510     result->name = (char *)result->iname;
5511     } else if (len <= PATH_MAX) {
5512     + const size_t size = offsetof(struct filename, iname[1]);
5513     struct filename *tmp;
5514    
5515     - tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
5516     + tmp = kmalloc(size, GFP_KERNEL);
5517     if (unlikely(!tmp)) {
5518     __putname(result);
5519     return ERR_PTR(-ENOMEM);
5520     diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h
5521     new file mode 100644
5522     index 000000000000..e518e4e3dfb5
5523     --- /dev/null
5524     +++ b/include/kvm/arm_psci.h
5525     @@ -0,0 +1,51 @@
5526     +/*
5527     + * Copyright (C) 2012,2013 - ARM Ltd
5528     + * Author: Marc Zyngier <marc.zyngier@arm.com>
5529     + *
5530     + * This program is free software; you can redistribute it and/or modify
5531     + * it under the terms of the GNU General Public License version 2 as
5532     + * published by the Free Software Foundation.
5533     + *
5534     + * This program is distributed in the hope that it will be useful,
5535     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
5536     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5537     + * GNU General Public License for more details.
5538     + *
5539     + * You should have received a copy of the GNU General Public License
5540     + * along with this program. If not, see <http://www.gnu.org/licenses/>.
5541     + */
5542     +
5543     +#ifndef __KVM_ARM_PSCI_H__
5544     +#define __KVM_ARM_PSCI_H__
5545     +
5546     +#include <linux/kvm_host.h>
5547     +#include <uapi/linux/psci.h>
5548     +
5549     +#define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1)
5550     +#define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2)
5551     +#define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0)
5552     +
5553     +#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_0
5554     +
5555     +/*
5556     + * We need the KVM pointer independently from the vcpu as we can call
5557     + * this from HYP, and need to apply kern_hyp_va on it...
5558     + */
5559     +static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm)
5560     +{
5561     + /*
5562     + * Our PSCI implementation stays the same across versions from
5563     + * v0.2 onward, only adding the few mandatory functions (such
5564     + * as FEATURES with 1.0) that are required by newer
5565     + * revisions. It is thus safe to return the latest.
5566     + */
5567     + if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
5568     + return KVM_ARM_PSCI_LATEST;
5569     +
5570     + return KVM_ARM_PSCI_0_1;
5571     +}
5572     +
5573     +
5574     +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
5575     +
5576     +#endif /* __KVM_ARM_PSCI_H__ */
5577     diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
5578     index 4c5bca38c653..a031897fca76 100644
5579     --- a/include/linux/arm-smccc.h
5580     +++ b/include/linux/arm-smccc.h
5581     @@ -14,14 +14,16 @@
5582     #ifndef __LINUX_ARM_SMCCC_H
5583     #define __LINUX_ARM_SMCCC_H
5584    
5585     +#include <uapi/linux/const.h>
5586     +
5587     /*
5588     * This file provides common defines for ARM SMC Calling Convention as
5589     * specified in
5590     * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html
5591     */
5592    
5593     -#define ARM_SMCCC_STD_CALL 0
5594     -#define ARM_SMCCC_FAST_CALL 1
5595     +#define ARM_SMCCC_STD_CALL _AC(0,U)
5596     +#define ARM_SMCCC_FAST_CALL _AC(1,U)
5597     #define ARM_SMCCC_TYPE_SHIFT 31
5598    
5599     #define ARM_SMCCC_SMC_32 0
5600     @@ -60,6 +62,24 @@
5601     #define ARM_SMCCC_QUIRK_NONE 0
5602     #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */
5603    
5604     +#define ARM_SMCCC_VERSION_1_0 0x10000
5605     +#define ARM_SMCCC_VERSION_1_1 0x10001
5606     +
5607     +#define ARM_SMCCC_VERSION_FUNC_ID \
5608     + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
5609     + ARM_SMCCC_SMC_32, \
5610     + 0, 0)
5611     +
5612     +#define ARM_SMCCC_ARCH_FEATURES_FUNC_ID \
5613     + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
5614     + ARM_SMCCC_SMC_32, \
5615     + 0, 1)
5616     +
5617     +#define ARM_SMCCC_ARCH_WORKAROUND_1 \
5618     + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
5619     + ARM_SMCCC_SMC_32, \
5620     + 0, 0x8000)
5621     +
5622     #ifndef __ASSEMBLY__
5623    
5624     #include <linux/linkage.h>
5625     @@ -130,5 +150,146 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
5626    
5627     #define arm_smccc_hvc_quirk(...) __arm_smccc_hvc(__VA_ARGS__)
5628    
5629     +/* SMCCC v1.1 implementation madness follows */
5630     +#ifdef CONFIG_ARM64
5631     +
5632     +#define SMCCC_SMC_INST "smc #0"
5633     +#define SMCCC_HVC_INST "hvc #0"
5634     +
5635     +#elif defined(CONFIG_ARM)
5636     +#include <asm/opcodes-sec.h>
5637     +#include <asm/opcodes-virt.h>
5638     +
5639     +#define SMCCC_SMC_INST __SMC(0)
5640     +#define SMCCC_HVC_INST __HVC(0)
5641     +
5642     +#endif
5643     +
5644     +#define ___count_args(_0, _1, _2, _3, _4, _5, _6, _7, _8, x, ...) x
5645     +
5646     +#define __count_args(...) \
5647     + ___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0)
5648     +
5649     +#define __constraint_write_0 \
5650     + "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3)
5651     +#define __constraint_write_1 \
5652     + "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3)
5653     +#define __constraint_write_2 \
5654     + "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3)
5655     +#define __constraint_write_3 \
5656     + "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3)
5657     +#define __constraint_write_4 __constraint_write_3
5658     +#define __constraint_write_5 __constraint_write_4
5659     +#define __constraint_write_6 __constraint_write_5
5660     +#define __constraint_write_7 __constraint_write_6
5661     +
5662     +#define __constraint_read_0
5663     +#define __constraint_read_1
5664     +#define __constraint_read_2
5665     +#define __constraint_read_3
5666     +#define __constraint_read_4 "r" (r4)
5667     +#define __constraint_read_5 __constraint_read_4, "r" (r5)
5668     +#define __constraint_read_6 __constraint_read_5, "r" (r6)
5669     +#define __constraint_read_7 __constraint_read_6, "r" (r7)
5670     +
5671     +#define __declare_arg_0(a0, res) \
5672     + struct arm_smccc_res *___res = res; \
5673     + register u32 r0 asm("r0") = a0; \
5674     + register unsigned long r1 asm("r1"); \
5675     + register unsigned long r2 asm("r2"); \
5676     + register unsigned long r3 asm("r3")
5677     +
5678     +#define __declare_arg_1(a0, a1, res) \
5679     + struct arm_smccc_res *___res = res; \
5680     + register u32 r0 asm("r0") = a0; \
5681     + register typeof(a1) r1 asm("r1") = a1; \
5682     + register unsigned long r2 asm("r2"); \
5683     + register unsigned long r3 asm("r3")
5684     +
5685     +#define __declare_arg_2(a0, a1, a2, res) \
5686     + struct arm_smccc_res *___res = res; \
5687     + register u32 r0 asm("r0") = a0; \
5688     + register typeof(a1) r1 asm("r1") = a1; \
5689     + register typeof(a2) r2 asm("r2") = a2; \
5690     + register unsigned long r3 asm("r3")
5691     +
5692     +#define __declare_arg_3(a0, a1, a2, a3, res) \
5693     + struct arm_smccc_res *___res = res; \
5694     + register u32 r0 asm("r0") = a0; \
5695     + register typeof(a1) r1 asm("r1") = a1; \
5696     + register typeof(a2) r2 asm("r2") = a2; \
5697     + register typeof(a3) r3 asm("r3") = a3
5698     +
5699     +#define __declare_arg_4(a0, a1, a2, a3, a4, res) \
5700     + __declare_arg_3(a0, a1, a2, a3, res); \
5701     + register typeof(a4) r4 asm("r4") = a4
5702     +
5703     +#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \
5704     + __declare_arg_4(a0, a1, a2, a3, a4, res); \
5705     + register typeof(a5) r5 asm("r5") = a5
5706     +
5707     +#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \
5708     + __declare_arg_5(a0, a1, a2, a3, a4, a5, res); \
5709     + register typeof(a6) r6 asm("r6") = a6
5710     +
5711     +#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \
5712     + __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \
5713     + register typeof(a7) r7 asm("r7") = a7
5714     +
5715     +#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
5716     +#define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__)
5717     +
5718     +#define ___constraints(count) \
5719     + : __constraint_write_ ## count \
5720     + : __constraint_read_ ## count \
5721     + : "memory"
5722     +#define __constraints(count) ___constraints(count)
5723     +
5724     +/*
5725     + * We have an output list that is not necessarily used, and GCC feels
5726     + * entitled to optimise the whole sequence away. "volatile" is what
5727     + * makes it stick.
5728     + */
5729     +#define __arm_smccc_1_1(inst, ...) \
5730     + do { \
5731     + __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
5732     + asm volatile(inst "\n" \
5733     + __constraints(__count_args(__VA_ARGS__))); \
5734     + if (___res) \
5735     + *___res = (typeof(*___res)){r0, r1, r2, r3}; \
5736     + } while (0)
5737     +
5738     +/*
5739     + * arm_smccc_1_1_smc() - make an SMCCC v1.1 compliant SMC call
5740     + *
5741     + * This is a variadic macro taking one to eight source arguments, and
5742     + * an optional return structure.
5743     + *
5744     + * @a0-a7: arguments passed in registers 0 to 7
5745     + * @res: result values from registers 0 to 3
5746     + *
5747     + * This macro is used to make SMC calls following SMC Calling Convention v1.1.
5748     + * The content of the supplied param are copied to registers 0 to 7 prior
5749     + * to the SMC instruction. The return values are updated with the content
5750     + * from register 0 to 3 on return from the SMC instruction if not NULL.
5751     + */
5752     +#define arm_smccc_1_1_smc(...) __arm_smccc_1_1(SMCCC_SMC_INST, __VA_ARGS__)
5753     +
5754     +/*
5755     + * arm_smccc_1_1_hvc() - make an SMCCC v1.1 compliant HVC call
5756     + *
5757     + * This is a variadic macro taking one to eight source arguments, and
5758     + * an optional return structure.
5759     + *
5760     + * @a0-a7: arguments passed in registers 0 to 7
5761     + * @res: result values from registers 0 to 3
5762     + *
5763     + * This macro is used to make HVC calls following SMC Calling Convention v1.1.
5764     + * The content of the supplied param are copied to registers 0 to 7 prior
5765     + * to the HVC instruction. The return values are updated with the content
5766     + * from register 0 to 3 on return from the HVC instruction if not NULL.
5767     + */
5768     +#define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__)
5769     +
5770     #endif /*__ASSEMBLY__*/
5771     #endif /*__LINUX_ARM_SMCCC_H*/
5772     diff --git a/include/linux/mm.h b/include/linux/mm.h
5773     index 8e506783631b..4a07ff4f38e1 100644
5774     --- a/include/linux/mm.h
5775     +++ b/include/linux/mm.h
5776     @@ -76,6 +76,10 @@ extern int mmap_rnd_compat_bits __read_mostly;
5777     #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
5778     #endif
5779    
5780     +#ifndef lm_alias
5781     +#define lm_alias(x) __va(__pa_symbol(x))
5782     +#endif
5783     +
5784     /*
5785     * To prevent common memory management code establishing
5786     * a zero page mapping on a read fault.
5787     diff --git a/include/linux/psci.h b/include/linux/psci.h
5788     index bdea1cb5e1db..347077cf19c6 100644
5789     --- a/include/linux/psci.h
5790     +++ b/include/linux/psci.h
5791     @@ -25,7 +25,19 @@ bool psci_tos_resident_on(int cpu);
5792     int psci_cpu_init_idle(unsigned int cpu);
5793     int psci_cpu_suspend_enter(unsigned long index);
5794    
5795     +enum psci_conduit {
5796     + PSCI_CONDUIT_NONE,
5797     + PSCI_CONDUIT_SMC,
5798     + PSCI_CONDUIT_HVC,
5799     +};
5800     +
5801     +enum smccc_version {
5802     + SMCCC_VERSION_1_0,
5803     + SMCCC_VERSION_1_1,
5804     +};
5805     +
5806     struct psci_operations {
5807     + u32 (*get_version)(void);
5808     int (*cpu_suspend)(u32 state, unsigned long entry_point);
5809     int (*cpu_off)(u32 state);
5810     int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
5811     @@ -33,6 +45,8 @@ struct psci_operations {
5812     int (*affinity_info)(unsigned long target_affinity,
5813     unsigned long lowest_affinity_level);
5814     int (*migrate_info_type)(void);
5815     + enum psci_conduit conduit;
5816     + enum smccc_version smccc_version;
5817     };
5818    
5819     extern struct psci_operations psci_ops;
5820     diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
5821     index 554671c81f4a..4931787193c3 100644
5822     --- a/include/net/bluetooth/hci_core.h
5823     +++ b/include/net/bluetooth/hci_core.h
5824     @@ -893,7 +893,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
5825     u16 conn_timeout);
5826     struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
5827     u8 dst_type, u8 sec_level, u16 conn_timeout,
5828     - u8 role);
5829     + u8 role, bdaddr_t *direct_rpa);
5830     struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
5831     u8 sec_level, u8 auth_type);
5832     struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
5833     diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h
5834     index 8716d5942b65..8fcf8908a694 100644
5835     --- a/include/net/slhc_vj.h
5836     +++ b/include/net/slhc_vj.h
5837     @@ -127,6 +127,7 @@ typedef __u32 int32;
5838     */
5839     struct cstate {
5840     byte_t cs_this; /* connection id number (xmit) */
5841     + bool initialized; /* true if initialized */
5842     struct cstate *next; /* next in ring (xmit) */
5843     struct iphdr cs_ip; /* ip/tcp hdr from most recent packet */
5844     struct tcphdr cs_tcp;
5845     diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
5846     index 3d7a0fc021a7..39930ca998cd 100644
5847     --- a/include/uapi/linux/psci.h
5848     +++ b/include/uapi/linux/psci.h
5849     @@ -87,6 +87,9 @@
5850     (((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT)
5851     #define PSCI_VERSION_MINOR(ver) \
5852     ((ver) & PSCI_VERSION_MINOR_MASK)
5853     +#define PSCI_VERSION(maj, min) \
5854     + ((((maj) << PSCI_VERSION_MAJOR_SHIFT) & PSCI_VERSION_MAJOR_MASK) | \
5855     + ((min) & PSCI_VERSION_MINOR_MASK))
5856    
5857     /* PSCI features decoding (>=1.0) */
5858     #define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT 1
5859     diff --git a/kernel/events/core.c b/kernel/events/core.c
5860     index c4100c38a467..74710fad35d5 100644
5861     --- a/kernel/events/core.c
5862     +++ b/kernel/events/core.c
5863     @@ -4091,6 +4091,9 @@ static void _free_event(struct perf_event *event)
5864     if (event->ctx)
5865     put_ctx(event->ctx);
5866    
5867     + if (event->hw.target)
5868     + put_task_struct(event->hw.target);
5869     +
5870     exclusive_event_destroy(event);
5871     module_put(event->pmu->module);
5872    
5873     @@ -9214,6 +9217,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5874     * and we cannot use the ctx information because we need the
5875     * pmu before we get a ctx.
5876     */
5877     + get_task_struct(task);
5878     event->hw.target = task;
5879     }
5880    
5881     @@ -9331,6 +9335,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5882     perf_detach_cgroup(event);
5883     if (event->ns)
5884     put_pid_ns(event->ns);
5885     + if (event->hw.target)
5886     + put_task_struct(event->hw.target);
5887     kfree(event);
5888    
5889     return ERR_PTR(err);
5890     diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
5891     index dc59eae54717..cc061495f653 100644
5892     --- a/net/bluetooth/hci_conn.c
5893     +++ b/net/bluetooth/hci_conn.c
5894     @@ -749,18 +749,31 @@ static bool conn_use_rpa(struct hci_conn *conn)
5895     }
5896    
5897     static void hci_req_add_le_create_conn(struct hci_request *req,
5898     - struct hci_conn *conn)
5899     + struct hci_conn *conn,
5900     + bdaddr_t *direct_rpa)
5901     {
5902     struct hci_cp_le_create_conn cp;
5903     struct hci_dev *hdev = conn->hdev;
5904     u8 own_addr_type;
5905    
5906     - /* Update random address, but set require_privacy to false so
5907     - * that we never connect with an non-resolvable address.
5908     + /* If direct address was provided we use it instead of current
5909     + * address.
5910     */
5911     - if (hci_update_random_address(req, false, conn_use_rpa(conn),
5912     - &own_addr_type))
5913     - return;
5914     + if (direct_rpa) {
5915     + if (bacmp(&req->hdev->random_addr, direct_rpa))
5916     + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
5917     + direct_rpa);
5918     +
5919     + /* direct address is always RPA */
5920     + own_addr_type = ADDR_LE_DEV_RANDOM;
5921     + } else {
5922     + /* Update random address, but set require_privacy to false so
5923     + * that we never connect with an non-resolvable address.
5924     + */
5925     + if (hci_update_random_address(req, false, conn_use_rpa(conn),
5926     + &own_addr_type))
5927     + return;
5928     + }
5929    
5930     memset(&cp, 0, sizeof(cp));
5931    
5932     @@ -825,7 +838,7 @@ static void hci_req_directed_advertising(struct hci_request *req,
5933    
5934     struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
5935     u8 dst_type, u8 sec_level, u16 conn_timeout,
5936     - u8 role)
5937     + u8 role, bdaddr_t *direct_rpa)
5938     {
5939     struct hci_conn_params *params;
5940     struct hci_conn *conn;
5941     @@ -940,7 +953,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
5942     hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
5943     }
5944    
5945     - hci_req_add_le_create_conn(&req, conn);
5946     + hci_req_add_le_create_conn(&req, conn, direct_rpa);
5947    
5948     create_conn:
5949     err = hci_req_run(&req, create_le_conn_complete);
5950     diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
5951     index e17aacbc5630..d2f9eb169ba8 100644
5952     --- a/net/bluetooth/hci_event.c
5953     +++ b/net/bluetooth/hci_event.c
5954     @@ -4646,7 +4646,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
5955     /* This function requires the caller holds hdev->lock */
5956     static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
5957     bdaddr_t *addr,
5958     - u8 addr_type, u8 adv_type)
5959     + u8 addr_type, u8 adv_type,
5960     + bdaddr_t *direct_rpa)
5961     {
5962     struct hci_conn *conn;
5963     struct hci_conn_params *params;
5964     @@ -4697,7 +4698,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
5965     }
5966    
5967     conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
5968     - HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER);
5969     + HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER,
5970     + direct_rpa);
5971     if (!IS_ERR(conn)) {
5972     /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned
5973     * by higher layer that tried to connect, if no then
5974     @@ -4807,8 +4809,13 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
5975     bdaddr_type = irk->addr_type;
5976     }
5977    
5978     - /* Check if we have been requested to connect to this device */
5979     - conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type);
5980     + /* Check if we have been requested to connect to this device.
5981     + *
5982     + * direct_addr is set only for directed advertising reports (it is NULL
5983     + * for advertising reports) and is already verified to be RPA above.
5984     + */
5985     + conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type,
5986     + direct_addr);
5987     if (conn && type == LE_ADV_IND) {
5988     /* Store report for later inclusion by
5989     * mgmt_device_connected
5990     diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
5991     index 2bbca23a9d05..1fc23cb4a3e0 100644
5992     --- a/net/bluetooth/l2cap_core.c
5993     +++ b/net/bluetooth/l2cap_core.c
5994     @@ -7148,7 +7148,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
5995     hcon = hci_connect_le(hdev, dst, dst_type,
5996     chan->sec_level,
5997     HCI_LE_CONN_TIMEOUT,
5998     - HCI_ROLE_SLAVE);
5999     + HCI_ROLE_SLAVE, NULL);
6000     else
6001     hcon = hci_connect_le_scan(hdev, dst, dst_type,
6002     chan->sec_level,
6003     diff --git a/net/rds/send.c b/net/rds/send.c
6004     index ef53d164e146..50241d30e16d 100644
6005     --- a/net/rds/send.c
6006     +++ b/net/rds/send.c
6007     @@ -1,5 +1,5 @@
6008     /*
6009     - * Copyright (c) 2006 Oracle. All rights reserved.
6010     + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
6011     *
6012     * This software is available to you under a choice of one of two
6013     * licenses. You may choose to be licensed under the terms of the GNU
6014     @@ -983,10 +983,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
6015     if (conn->c_npaths == 0 && hash != 0) {
6016     rds_send_ping(conn);
6017    
6018     - if (conn->c_npaths == 0) {
6019     - wait_event_interruptible(conn->c_hs_waitq,
6020     - (conn->c_npaths != 0));
6021     - }
6022     + /* The underlying connection is not up yet. Need to wait
6023     + * until it is up to be sure that the non-zero c_path can be
6024     + * used. But if we are interrupted, we have to use the zero
6025     + * c_path in case the connection ends up being non-MP capable.
6026     + */
6027     + if (conn->c_npaths == 0)
6028     + if (wait_event_interruptible(conn->c_hs_waitq,
6029     + conn->c_npaths != 0))
6030     + hash = 0;
6031     if (conn->c_npaths == 1)
6032     hash = 0;
6033     }
6034     diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
6035     index 79aec90259cd..4afd4149a632 100644
6036     --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
6037     +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
6038     @@ -237,9 +237,6 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
6039    
6040     ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
6041    
6042     - err = crypto_ahash_init(req);
6043     - if (err)
6044     - goto out;
6045     err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
6046     if (err)
6047     goto out;
6048     diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
6049     index 150334064071..ff5bc6363a79 100644
6050     --- a/tools/perf/tests/code-reading.c
6051     +++ b/tools/perf/tests/code-reading.c
6052     @@ -224,8 +224,6 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
6053     unsigned char buf2[BUFSZ];
6054     size_t ret_len;
6055     u64 objdump_addr;
6056     - const char *objdump_name;
6057     - char decomp_name[KMOD_DECOMP_LEN];
6058     int ret;
6059    
6060     pr_debug("Reading object code for memory address: %#"PRIx64"\n", addr);
6061     @@ -286,25 +284,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
6062     state->done[state->done_cnt++] = al.map->start;
6063     }
6064    
6065     - objdump_name = al.map->dso->long_name;
6066     - if (dso__needs_decompress(al.map->dso)) {
6067     - if (dso__decompress_kmodule_path(al.map->dso, objdump_name,
6068     - decomp_name,
6069     - sizeof(decomp_name)) < 0) {
6070     - pr_debug("decompression failed\n");
6071     - return -1;
6072     - }
6073     -
6074     - objdump_name = decomp_name;
6075     - }
6076     -
6077     /* Read the object code using objdump */
6078     objdump_addr = map__rip_2objdump(al.map, al.addr);
6079     - ret = read_via_objdump(objdump_name, objdump_addr, buf2, len);
6080     -
6081     - if (dso__needs_decompress(al.map->dso))
6082     - unlink(objdump_name);
6083     -
6084     + ret = read_via_objdump(al.map->dso->long_name, objdump_addr, buf2, len);
6085     if (ret > 0) {
6086     /*
6087     * The kernel maps are inaccurate - assume objdump is right in
6088     diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
6089     index 7e27207d0f45..cac39532c057 100644
6090     --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
6091     +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
6092     @@ -1300,6 +1300,7 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder)
6093     intel_pt_clear_tx_flags(decoder);
6094     decoder->have_tma = false;
6095     decoder->cbr = 0;
6096     + decoder->timestamp_insn_cnt = 0;
6097     decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC;
6098     decoder->overflow = true;
6099     return -EOVERFLOW;
6100     @@ -1522,6 +1523,7 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder)
6101     case INTEL_PT_PSBEND:
6102     intel_pt_log("ERROR: Missing TIP after FUP\n");
6103     decoder->pkt_state = INTEL_PT_STATE_ERR3;
6104     + decoder->pkt_step = 0;
6105     return -ENOENT;
6106    
6107     case INTEL_PT_OVF:
6108     @@ -2182,14 +2184,6 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
6109     return &decoder->state;
6110     }
6111    
6112     -static bool intel_pt_at_psb(unsigned char *buf, size_t len)
6113     -{
6114     - if (len < INTEL_PT_PSB_LEN)
6115     - return false;
6116     - return memmem(buf, INTEL_PT_PSB_LEN, INTEL_PT_PSB_STR,
6117     - INTEL_PT_PSB_LEN);
6118     -}
6119     -
6120     /**
6121     * intel_pt_next_psb - move buffer pointer to the start of the next PSB packet.
6122     * @buf: pointer to buffer pointer
6123     @@ -2278,6 +2272,7 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len)
6124     * @buf: buffer
6125     * @len: size of buffer
6126     * @tsc: TSC value returned
6127     + * @rem: returns remaining size when TSC is found
6128     *
6129     * Find a TSC packet in @buf and return the TSC value. This function assumes
6130     * that @buf starts at a PSB and that PSB+ will contain TSC and so stops if a
6131     @@ -2285,7 +2280,8 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len)
6132     *
6133     * Return: %true if TSC is found, false otherwise.
6134     */
6135     -static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc)
6136     +static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc,
6137     + size_t *rem)
6138     {
6139     struct intel_pt_pkt packet;
6140     int ret;
6141     @@ -2296,6 +2292,7 @@ static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc)
6142     return false;
6143     if (packet.type == INTEL_PT_TSC) {
6144     *tsc = packet.payload;
6145     + *rem = len;
6146     return true;
6147     }
6148     if (packet.type == INTEL_PT_PSBEND)
6149     @@ -2346,6 +2343,8 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2)
6150     * @len_a: size of first buffer
6151     * @buf_b: second buffer
6152     * @len_b: size of second buffer
6153     + * @consecutive: returns true if there is data in buf_b that is consecutive
6154     + * to buf_a
6155     *
6156     * If the trace contains TSC we can look at the last TSC of @buf_a and the
6157     * first TSC of @buf_b in order to determine if the buffers overlap, and then
6158     @@ -2358,33 +2357,41 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2)
6159     static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a,
6160     size_t len_a,
6161     unsigned char *buf_b,
6162     - size_t len_b)
6163     + size_t len_b, bool *consecutive)
6164     {
6165     uint64_t tsc_a, tsc_b;
6166     unsigned char *p;
6167     - size_t len;
6168     + size_t len, rem_a, rem_b;
6169    
6170     p = intel_pt_last_psb(buf_a, len_a);
6171     if (!p)
6172     return buf_b; /* No PSB in buf_a => no overlap */
6173    
6174     len = len_a - (p - buf_a);
6175     - if (!intel_pt_next_tsc(p, len, &tsc_a)) {
6176     + if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a)) {
6177     /* The last PSB+ in buf_a is incomplete, so go back one more */
6178     len_a -= len;
6179     p = intel_pt_last_psb(buf_a, len_a);
6180     if (!p)
6181     return buf_b; /* No full PSB+ => assume no overlap */
6182     len = len_a - (p - buf_a);
6183     - if (!intel_pt_next_tsc(p, len, &tsc_a))
6184     + if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a))
6185     return buf_b; /* No TSC in buf_a => assume no overlap */
6186     }
6187    
6188     while (1) {
6189     /* Ignore PSB+ with no TSC */
6190     - if (intel_pt_next_tsc(buf_b, len_b, &tsc_b) &&
6191     - intel_pt_tsc_cmp(tsc_a, tsc_b) < 0)
6192     - return buf_b; /* tsc_a < tsc_b => no overlap */
6193     + if (intel_pt_next_tsc(buf_b, len_b, &tsc_b, &rem_b)) {
6194     + int cmp = intel_pt_tsc_cmp(tsc_a, tsc_b);
6195     +
6196     + /* Same TSC, so buffers are consecutive */
6197     + if (!cmp && rem_b >= rem_a) {
6198     + *consecutive = true;
6199     + return buf_b + len_b - (rem_b - rem_a);
6200     + }
6201     + if (cmp < 0)
6202     + return buf_b; /* tsc_a < tsc_b => no overlap */
6203     + }
6204    
6205     if (!intel_pt_step_psb(&buf_b, &len_b))
6206     return buf_b + len_b; /* No PSB in buf_b => no data */
6207     @@ -2398,6 +2405,8 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a,
6208     * @buf_b: second buffer
6209     * @len_b: size of second buffer
6210     * @have_tsc: can use TSC packets to detect overlap
6211     + * @consecutive: returns true if there is data in buf_b that is consecutive
6212     + * to buf_a
6213     *
6214     * When trace samples or snapshots are recorded there is the possibility that
6215     * the data overlaps. Note that, for the purposes of decoding, data is only
6216     @@ -2408,7 +2417,7 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a,
6217     */
6218     unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a,
6219     unsigned char *buf_b, size_t len_b,
6220     - bool have_tsc)
6221     + bool have_tsc, bool *consecutive)
6222     {
6223     unsigned char *found;
6224    
6225     @@ -2420,7 +2429,8 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a,
6226     return buf_b; /* No overlap */
6227    
6228     if (have_tsc) {
6229     - found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b);
6230     + found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b,
6231     + consecutive);
6232     if (found)
6233     return found;
6234     }
6235     @@ -2435,28 +2445,16 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a,
6236     }
6237    
6238     /* Now len_b >= len_a */
6239     - if (len_b > len_a) {
6240     - /* The leftover buffer 'b' must start at a PSB */
6241     - while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) {
6242     - if (!intel_pt_step_psb(&buf_a, &len_a))
6243     - return buf_b; /* No overlap */
6244     - }
6245     - }
6246     -
6247     while (1) {
6248     /* Potential overlap so check the bytes */
6249     found = memmem(buf_a, len_a, buf_b, len_a);
6250     - if (found)
6251     + if (found) {
6252     + *consecutive = true;
6253     return buf_b + len_a;
6254     + }
6255    
6256     /* Try again at next PSB in buffer 'a' */
6257     if (!intel_pt_step_psb(&buf_a, &len_a))
6258     return buf_b; /* No overlap */
6259     -
6260     - /* The leftover buffer 'b' must start at a PSB */
6261     - while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) {
6262     - if (!intel_pt_step_psb(&buf_a, &len_a))
6263     - return buf_b; /* No overlap */
6264     - }
6265     }
6266     }
6267     diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
6268     index 89399985fa4d..9ae4df1dcedc 100644
6269     --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
6270     +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
6271     @@ -103,7 +103,7 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder);
6272    
6273     unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a,
6274     unsigned char *buf_b, size_t len_b,
6275     - bool have_tsc);
6276     + bool have_tsc, bool *consecutive);
6277    
6278     int intel_pt__strerror(int code, char *buf, size_t buflen);
6279    
6280     diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
6281     index dc041d4368c8..b1161d725ce9 100644
6282     --- a/tools/perf/util/intel-pt.c
6283     +++ b/tools/perf/util/intel-pt.c
6284     @@ -131,6 +131,7 @@ struct intel_pt_queue {
6285     bool stop;
6286     bool step_through_buffers;
6287     bool use_buffer_pid_tid;
6288     + bool sync_switch;
6289     pid_t pid, tid;
6290     int cpu;
6291     int switch_state;
6292     @@ -194,14 +195,17 @@ static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf,
6293     static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a,
6294     struct auxtrace_buffer *b)
6295     {
6296     + bool consecutive = false;
6297     void *start;
6298    
6299     start = intel_pt_find_overlap(a->data, a->size, b->data, b->size,
6300     - pt->have_tsc);
6301     + pt->have_tsc, &consecutive);
6302     if (!start)
6303     return -EINVAL;
6304     b->use_size = b->data + b->size - start;
6305     b->use_data = start;
6306     + if (b->use_size && consecutive)
6307     + b->consecutive = true;
6308     return 0;
6309     }
6310    
6311     @@ -928,10 +932,12 @@ static int intel_pt_setup_queue(struct intel_pt *pt,
6312     if (pt->timeless_decoding || !pt->have_sched_switch)
6313     ptq->use_buffer_pid_tid = true;
6314     }
6315     +
6316     + ptq->sync_switch = pt->sync_switch;
6317     }
6318    
6319     if (!ptq->on_heap &&
6320     - (!pt->sync_switch ||
6321     + (!ptq->sync_switch ||
6322     ptq->switch_state != INTEL_PT_SS_EXPECTING_SWITCH_EVENT)) {
6323     const struct intel_pt_state *state;
6324     int ret;
6325     @@ -1333,7 +1339,7 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
6326     if (pt->synth_opts.last_branch)
6327     intel_pt_update_last_branch_rb(ptq);
6328    
6329     - if (!pt->sync_switch)
6330     + if (!ptq->sync_switch)
6331     return 0;
6332    
6333     if (intel_pt_is_switch_ip(ptq, state->to_ip)) {
6334     @@ -1414,6 +1420,21 @@ static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
6335     return switch_ip;
6336     }
6337    
6338     +static void intel_pt_enable_sync_switch(struct intel_pt *pt)
6339     +{
6340     + unsigned int i;
6341     +
6342     + pt->sync_switch = true;
6343     +
6344     + for (i = 0; i < pt->queues.nr_queues; i++) {
6345     + struct auxtrace_queue *queue = &pt->queues.queue_array[i];
6346     + struct intel_pt_queue *ptq = queue->priv;
6347     +
6348     + if (ptq)
6349     + ptq->sync_switch = true;
6350     + }
6351     +}
6352     +
6353     static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
6354     {
6355     const struct intel_pt_state *state = ptq->state;
6356     @@ -1430,7 +1451,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
6357     if (pt->switch_ip) {
6358     intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
6359     pt->switch_ip, pt->ptss_ip);
6360     - pt->sync_switch = true;
6361     + intel_pt_enable_sync_switch(pt);
6362     }
6363     }
6364     }
6365     @@ -1446,9 +1467,9 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
6366     if (state->err) {
6367     if (state->err == INTEL_PT_ERR_NODATA)
6368     return 1;
6369     - if (pt->sync_switch &&
6370     + if (ptq->sync_switch &&
6371     state->from_ip >= pt->kernel_start) {
6372     - pt->sync_switch = false;
6373     + ptq->sync_switch = false;
6374     intel_pt_next_tid(pt, ptq);
6375     }
6376     if (pt->synth_opts.errors) {
6377     @@ -1474,7 +1495,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
6378     state->timestamp, state->est_timestamp);
6379     ptq->timestamp = state->est_timestamp;
6380     /* Use estimated TSC in unknown switch state */
6381     - } else if (pt->sync_switch &&
6382     + } else if (ptq->sync_switch &&
6383     ptq->switch_state == INTEL_PT_SS_UNKNOWN &&
6384     intel_pt_is_switch_ip(ptq, state->to_ip) &&
6385     ptq->next_tid == -1) {
6386     @@ -1621,7 +1642,7 @@ static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
6387     return 1;
6388    
6389     ptq = intel_pt_cpu_to_ptq(pt, cpu);
6390     - if (!ptq)
6391     + if (!ptq || !ptq->sync_switch)
6392     return 1;
6393    
6394     switch (ptq->switch_state) {