TLS/SSL and crypto library
リビジョン | 894c04aa05ba1e64735d7beb9c2a1da93d288e31 (tree) |
---|---|
日時 | 2016-06-22 06:44:54 |
作者 | Andy Polyakov <appro@openssl.org> |
コミッター | Andy Polyakov |
PowerPC assembly pack: add POWER8 support.
Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
(cherry picked from commit 4577871ca393275ac0436b2b08f1a75661ced314)
@@ -139,8 +139,8 @@ my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc | ||
139 | 139 | my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; |
140 | 140 | my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; |
141 | 141 | my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; |
142 | -my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::"; | |
143 | -my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::"; | |
142 | +my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; | |
143 | +my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; | |
144 | 144 | my $no_asm=":::::::::::::::void"; |
145 | 145 | |
146 | 146 | # As for $BSDthreads. Idea is to maintain "collective" set of flags, |
@@ -71,6 +71,8 @@ aes-sparcv9.s: asm/aes-sparcv9.pl | ||
71 | 71 | |
72 | 72 | aes-ppc.s: asm/aes-ppc.pl |
73 | 73 | $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ |
74 | +aesp8-ppc.s: asm/aesp8-ppc.pl | |
75 | + $(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@ | |
74 | 76 | |
75 | 77 | aes-parisc.s: asm/aes-parisc.pl |
76 | 78 | $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@ |
@@ -548,7 +548,7 @@ Lenc_loop: | ||
548 | 548 | xor $s2,$t2,$acc14 |
549 | 549 | xor $s3,$t3,$acc15 |
550 | 550 | addi $key,$key,16 |
551 | - bdnz- Lenc_loop | |
551 | + bdnz Lenc_loop | |
552 | 552 | |
553 | 553 | addi $Tbl2,$Tbl0,2048 |
554 | 554 | nop |
@@ -982,7 +982,7 @@ Ldec_loop: | ||
982 | 982 | xor $s2,$t2,$acc14 |
983 | 983 | xor $s3,$t3,$acc15 |
984 | 984 | addi $key,$key,16 |
985 | - bdnz- Ldec_loop | |
985 | + bdnz Ldec_loop | |
986 | 986 | |
987 | 987 | addi $Tbl2,$Tbl0,2048 |
988 | 988 | nop |
@@ -0,0 +1,3726 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | +# | |
10 | +# This module implements support for AES instructions as per PowerISA | |
11 | +# specification version 2.07, first implemented by POWER8 processor. | |
12 | +# The module is endian-agnostic in sense that it supports both big- | |
13 | +# and little-endian cases. Data alignment in parallelizable modes is | |
14 | +# handled with VSX loads and stores, which implies MSR.VSX flag being | |
15 | +# set. It should also be noted that ISA specification doesn't prohibit | |
16 | +# alignment exceptions for these instructions on page boundaries. | |
17 | +# Initially alignment was handled in pure AltiVec/VMX way [when data | |
18 | +# is aligned programmatically, which in turn guarantees exception- | |
19 | +# free execution], but it turned to hamper performance when vcipher | |
20 | +# instructions are interleaved. It's reckoned that eventual | |
21 | +# misalignment penalties at page boundaries are in average lower | |
22 | +# than additional overhead in pure AltiVec approach. | |
23 | +# | |
24 | +# May 2016 | |
25 | +# | |
26 | +# Add XTS subroutine, 9x on little- and 12x improvement on big-endian | |
27 | +# systems were measured. | |
28 | +# | |
29 | +###################################################################### | |
30 | +# Current large-block performance in cycles per byte processed with | |
31 | +# 128-bit key (less is better). | |
32 | +# | |
33 | +# CBC en-/decrypt CTR XTS | |
34 | +# POWER8[le] 3.96/0.72 0.74 1.1 | |
35 | +# POWER8[be] 3.75/0.65 0.66 1.0 | |
36 | + | |
37 | +$flavour = shift; | |
38 | + | |
39 | +if ($flavour =~ /64/) { | |
40 | + $SIZE_T =8; | |
41 | + $LRSAVE =2*$SIZE_T; | |
42 | + $STU ="stdu"; | |
43 | + $POP ="ld"; | |
44 | + $PUSH ="std"; | |
45 | + $UCMP ="cmpld"; | |
46 | + $SHL ="sldi"; | |
47 | +} elsif ($flavour =~ /32/) { | |
48 | + $SIZE_T =4; | |
49 | + $LRSAVE =$SIZE_T; | |
50 | + $STU ="stwu"; | |
51 | + $POP ="lwz"; | |
52 | + $PUSH ="stw"; | |
53 | + $UCMP ="cmplw"; | |
54 | + $SHL ="slwi"; | |
55 | +} else { die "nonsense $flavour"; } | |
56 | + | |
57 | +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; | |
58 | + | |
59 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
60 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
61 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
62 | +die "can't locate ppc-xlate.pl"; | |
63 | + | |
64 | +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | |
65 | + | |
66 | +$FRAME=8*$SIZE_T; | |
67 | +$prefix="aes_p8"; | |
68 | + | |
69 | +$sp="r1"; | |
70 | +$vrsave="r12"; | |
71 | + | |
72 | +######################################################################### | |
73 | +{{{ # Key setup procedures # | |
74 | +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); | |
75 | +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); | |
76 | +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); | |
77 | + | |
78 | +$code.=<<___; | |
79 | +.machine "any" | |
80 | + | |
81 | +.text | |
82 | + | |
83 | +.align 7 | |
84 | +rcon: | |
85 | +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev | |
86 | +.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev | |
87 | +.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev | |
88 | +.long 0,0,0,0 ?asis | |
89 | +Lconsts: | |
90 | + mflr r0 | |
91 | + bcl 20,31,\$+4 | |
92 | + mflr $ptr #vvvvv "distance between . and rcon | |
93 | + addi $ptr,$ptr,-0x48 | |
94 | + mtlr r0 | |
95 | + blr | |
96 | + .long 0 | |
97 | + .byte 0,12,0x14,0,0,0,0,0 | |
98 | +.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" | |
99 | + | |
100 | +.globl .${prefix}_set_encrypt_key | |
101 | +.align 5 | |
102 | +.${prefix}_set_encrypt_key: | |
103 | +Lset_encrypt_key: | |
104 | + mflr r11 | |
105 | + $PUSH r11,$LRSAVE($sp) | |
106 | + | |
107 | + li $ptr,-1 | |
108 | + ${UCMP}i $inp,0 | |
109 | + beq- Lenc_key_abort # if ($inp==0) return -1; | |
110 | + ${UCMP}i $out,0 | |
111 | + beq- Lenc_key_abort # if ($out==0) return -1; | |
112 | + li $ptr,-2 | |
113 | + cmpwi $bits,128 | |
114 | + blt- Lenc_key_abort | |
115 | + cmpwi $bits,256 | |
116 | + bgt- Lenc_key_abort | |
117 | + andi. r0,$bits,0x3f | |
118 | + bne- Lenc_key_abort | |
119 | + | |
120 | + lis r0,0xfff0 | |
121 | + mfspr $vrsave,256 | |
122 | + mtspr 256,r0 | |
123 | + | |
124 | + bl Lconsts | |
125 | + mtlr r11 | |
126 | + | |
127 | + neg r9,$inp | |
128 | + lvx $in0,0,$inp | |
129 | + addi $inp,$inp,15 # 15 is not typo | |
130 | + lvsr $key,0,r9 # borrow $key | |
131 | + li r8,0x20 | |
132 | + cmpwi $bits,192 | |
133 | + lvx $in1,0,$inp | |
134 | + le?vspltisb $mask,0x0f # borrow $mask | |
135 | + lvx $rcon,0,$ptr | |
136 | + le?vxor $key,$key,$mask # adjust for byte swap | |
137 | + lvx $mask,r8,$ptr | |
138 | + addi $ptr,$ptr,0x10 | |
139 | + vperm $in0,$in0,$in1,$key # align [and byte swap in LE] | |
140 | + li $cnt,8 | |
141 | + vxor $zero,$zero,$zero | |
142 | + mtctr $cnt | |
143 | + | |
144 | + ?lvsr $outperm,0,$out | |
145 | + vspltisb $outmask,-1 | |
146 | + lvx $outhead,0,$out | |
147 | + ?vperm $outmask,$zero,$outmask,$outperm | |
148 | + | |
149 | + blt Loop128 | |
150 | + addi $inp,$inp,8 | |
151 | + beq L192 | |
152 | + addi $inp,$inp,8 | |
153 | + b L256 | |
154 | + | |
155 | +.align 4 | |
156 | +Loop128: | |
157 | + vperm $key,$in0,$in0,$mask # rotate-n-splat | |
158 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
159 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
160 | + vsel $stage,$outhead,$outtail,$outmask | |
161 | + vmr $outhead,$outtail | |
162 | + vcipherlast $key,$key,$rcon | |
163 | + stvx $stage,0,$out | |
164 | + addi $out,$out,16 | |
165 | + | |
166 | + vxor $in0,$in0,$tmp | |
167 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
168 | + vxor $in0,$in0,$tmp | |
169 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
170 | + vxor $in0,$in0,$tmp | |
171 | + vadduwm $rcon,$rcon,$rcon | |
172 | + vxor $in0,$in0,$key | |
173 | + bdnz Loop128 | |
174 | + | |
175 | + lvx $rcon,0,$ptr # last two round keys | |
176 | + | |
177 | + vperm $key,$in0,$in0,$mask # rotate-n-splat | |
178 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
179 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
180 | + vsel $stage,$outhead,$outtail,$outmask | |
181 | + vmr $outhead,$outtail | |
182 | + vcipherlast $key,$key,$rcon | |
183 | + stvx $stage,0,$out | |
184 | + addi $out,$out,16 | |
185 | + | |
186 | + vxor $in0,$in0,$tmp | |
187 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
188 | + vxor $in0,$in0,$tmp | |
189 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
190 | + vxor $in0,$in0,$tmp | |
191 | + vadduwm $rcon,$rcon,$rcon | |
192 | + vxor $in0,$in0,$key | |
193 | + | |
194 | + vperm $key,$in0,$in0,$mask # rotate-n-splat | |
195 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
196 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
197 | + vsel $stage,$outhead,$outtail,$outmask | |
198 | + vmr $outhead,$outtail | |
199 | + vcipherlast $key,$key,$rcon | |
200 | + stvx $stage,0,$out | |
201 | + addi $out,$out,16 | |
202 | + | |
203 | + vxor $in0,$in0,$tmp | |
204 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
205 | + vxor $in0,$in0,$tmp | |
206 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
207 | + vxor $in0,$in0,$tmp | |
208 | + vxor $in0,$in0,$key | |
209 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
210 | + vsel $stage,$outhead,$outtail,$outmask | |
211 | + vmr $outhead,$outtail | |
212 | + stvx $stage,0,$out | |
213 | + | |
214 | + addi $inp,$out,15 # 15 is not typo | |
215 | + addi $out,$out,0x50 | |
216 | + | |
217 | + li $rounds,10 | |
218 | + b Ldone | |
219 | + | |
220 | +.align 4 | |
221 | +L192: | |
222 | + lvx $tmp,0,$inp | |
223 | + li $cnt,4 | |
224 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
225 | + vsel $stage,$outhead,$outtail,$outmask | |
226 | + vmr $outhead,$outtail | |
227 | + stvx $stage,0,$out | |
228 | + addi $out,$out,16 | |
229 | + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] | |
230 | + vspltisb $key,8 # borrow $key | |
231 | + mtctr $cnt | |
232 | + vsububm $mask,$mask,$key # adjust the mask | |
233 | + | |
234 | +Loop192: | |
235 | + vperm $key,$in1,$in1,$mask # rotate-n-splat | |
236 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
237 | + vcipherlast $key,$key,$rcon | |
238 | + | |
239 | + vxor $in0,$in0,$tmp | |
240 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
241 | + vxor $in0,$in0,$tmp | |
242 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
243 | + vxor $in0,$in0,$tmp | |
244 | + | |
245 | + vsldoi $stage,$zero,$in1,8 | |
246 | + vspltw $tmp,$in0,3 | |
247 | + vxor $tmp,$tmp,$in1 | |
248 | + vsldoi $in1,$zero,$in1,12 # >>32 | |
249 | + vadduwm $rcon,$rcon,$rcon | |
250 | + vxor $in1,$in1,$tmp | |
251 | + vxor $in0,$in0,$key | |
252 | + vxor $in1,$in1,$key | |
253 | + vsldoi $stage,$stage,$in0,8 | |
254 | + | |
255 | + vperm $key,$in1,$in1,$mask # rotate-n-splat | |
256 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
257 | + vperm $outtail,$stage,$stage,$outperm # rotate | |
258 | + vsel $stage,$outhead,$outtail,$outmask | |
259 | + vmr $outhead,$outtail | |
260 | + vcipherlast $key,$key,$rcon | |
261 | + stvx $stage,0,$out | |
262 | + addi $out,$out,16 | |
263 | + | |
264 | + vsldoi $stage,$in0,$in1,8 | |
265 | + vxor $in0,$in0,$tmp | |
266 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
267 | + vperm $outtail,$stage,$stage,$outperm # rotate | |
268 | + vsel $stage,$outhead,$outtail,$outmask | |
269 | + vmr $outhead,$outtail | |
270 | + vxor $in0,$in0,$tmp | |
271 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
272 | + vxor $in0,$in0,$tmp | |
273 | + stvx $stage,0,$out | |
274 | + addi $out,$out,16 | |
275 | + | |
276 | + vspltw $tmp,$in0,3 | |
277 | + vxor $tmp,$tmp,$in1 | |
278 | + vsldoi $in1,$zero,$in1,12 # >>32 | |
279 | + vadduwm $rcon,$rcon,$rcon | |
280 | + vxor $in1,$in1,$tmp | |
281 | + vxor $in0,$in0,$key | |
282 | + vxor $in1,$in1,$key | |
283 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
284 | + vsel $stage,$outhead,$outtail,$outmask | |
285 | + vmr $outhead,$outtail | |
286 | + stvx $stage,0,$out | |
287 | + addi $inp,$out,15 # 15 is not typo | |
288 | + addi $out,$out,16 | |
289 | + bdnz Loop192 | |
290 | + | |
291 | + li $rounds,12 | |
292 | + addi $out,$out,0x20 | |
293 | + b Ldone | |
294 | + | |
295 | +.align 4 | |
296 | +L256: | |
297 | + lvx $tmp,0,$inp | |
298 | + li $cnt,7 | |
299 | + li $rounds,14 | |
300 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
301 | + vsel $stage,$outhead,$outtail,$outmask | |
302 | + vmr $outhead,$outtail | |
303 | + stvx $stage,0,$out | |
304 | + addi $out,$out,16 | |
305 | + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] | |
306 | + mtctr $cnt | |
307 | + | |
308 | +Loop256: | |
309 | + vperm $key,$in1,$in1,$mask # rotate-n-splat | |
310 | + vsldoi $tmp,$zero,$in0,12 # >>32 | |
311 | + vperm $outtail,$in1,$in1,$outperm # rotate | |
312 | + vsel $stage,$outhead,$outtail,$outmask | |
313 | + vmr $outhead,$outtail | |
314 | + vcipherlast $key,$key,$rcon | |
315 | + stvx $stage,0,$out | |
316 | + addi $out,$out,16 | |
317 | + | |
318 | + vxor $in0,$in0,$tmp | |
319 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
320 | + vxor $in0,$in0,$tmp | |
321 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
322 | + vxor $in0,$in0,$tmp | |
323 | + vadduwm $rcon,$rcon,$rcon | |
324 | + vxor $in0,$in0,$key | |
325 | + vperm $outtail,$in0,$in0,$outperm # rotate | |
326 | + vsel $stage,$outhead,$outtail,$outmask | |
327 | + vmr $outhead,$outtail | |
328 | + stvx $stage,0,$out | |
329 | + addi $inp,$out,15 # 15 is not typo | |
330 | + addi $out,$out,16 | |
331 | + bdz Ldone | |
332 | + | |
333 | + vspltw $key,$in0,3 # just splat | |
334 | + vsldoi $tmp,$zero,$in1,12 # >>32 | |
335 | + vsbox $key,$key | |
336 | + | |
337 | + vxor $in1,$in1,$tmp | |
338 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
339 | + vxor $in1,$in1,$tmp | |
340 | + vsldoi $tmp,$zero,$tmp,12 # >>32 | |
341 | + vxor $in1,$in1,$tmp | |
342 | + | |
343 | + vxor $in1,$in1,$key | |
344 | + b Loop256 | |
345 | + | |
346 | +.align 4 | |
347 | +Ldone: | |
348 | + lvx $in1,0,$inp # redundant in aligned case | |
349 | + vsel $in1,$outhead,$in1,$outmask | |
350 | + stvx $in1,0,$inp | |
351 | + li $ptr,0 | |
352 | + mtspr 256,$vrsave | |
353 | + stw $rounds,0($out) | |
354 | + | |
355 | +Lenc_key_abort: | |
356 | + mr r3,$ptr | |
357 | + blr | |
358 | + .long 0 | |
359 | + .byte 0,12,0x14,1,0,0,3,0 | |
360 | + .long 0 | |
361 | +.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key | |
362 | + | |
363 | +.globl .${prefix}_set_decrypt_key | |
364 | +.align 5 | |
365 | +.${prefix}_set_decrypt_key: | |
366 | + $STU $sp,-$FRAME($sp) | |
367 | + mflr r10 | |
368 | + $PUSH r10,$FRAME+$LRSAVE($sp) | |
369 | + bl Lset_encrypt_key | |
370 | + mtlr r10 | |
371 | + | |
372 | + cmpwi r3,0 | |
373 | + bne- Ldec_key_abort | |
374 | + | |
375 | + slwi $cnt,$rounds,4 | |
376 | + subi $inp,$out,240 # first round key | |
377 | + srwi $rounds,$rounds,1 | |
378 | + add $out,$inp,$cnt # last round key | |
379 | + mtctr $rounds | |
380 | + | |
381 | +Ldeckey: | |
382 | + lwz r0, 0($inp) | |
383 | + lwz r6, 4($inp) | |
384 | + lwz r7, 8($inp) | |
385 | + lwz r8, 12($inp) | |
386 | + addi $inp,$inp,16 | |
387 | + lwz r9, 0($out) | |
388 | + lwz r10,4($out) | |
389 | + lwz r11,8($out) | |
390 | + lwz r12,12($out) | |
391 | + stw r0, 0($out) | |
392 | + stw r6, 4($out) | |
393 | + stw r7, 8($out) | |
394 | + stw r8, 12($out) | |
395 | + subi $out,$out,16 | |
396 | + stw r9, -16($inp) | |
397 | + stw r10,-12($inp) | |
398 | + stw r11,-8($inp) | |
399 | + stw r12,-4($inp) | |
400 | + bdnz Ldeckey | |
401 | + | |
402 | + xor r3,r3,r3 # return value | |
403 | +Ldec_key_abort: | |
404 | + addi $sp,$sp,$FRAME | |
405 | + blr | |
406 | + .long 0 | |
407 | + .byte 0,12,4,1,0x80,0,3,0 | |
408 | + .long 0 | |
409 | +.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key | |
410 | +___ | |
411 | +}}} | |
412 | +######################################################################### | |
413 | +{{{ # Single block en- and decrypt procedures # | |
414 | +sub gen_block () { | |
415 | +my $dir = shift; | |
416 | +my $n = $dir eq "de" ? "n" : ""; | |
417 | +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); | |
418 | + | |
419 | +$code.=<<___; | |
420 | +.globl .${prefix}_${dir}crypt | |
421 | +.align 5 | |
422 | +.${prefix}_${dir}crypt: | |
423 | + lwz $rounds,240($key) | |
424 | + lis r0,0xfc00 | |
425 | + mfspr $vrsave,256 | |
426 | + li $idx,15 # 15 is not typo | |
427 | + mtspr 256,r0 | |
428 | + | |
429 | + lvx v0,0,$inp | |
430 | + neg r11,$out | |
431 | + lvx v1,$idx,$inp | |
432 | + lvsl v2,0,$inp # inpperm | |
433 | + le?vspltisb v4,0x0f | |
434 | + ?lvsl v3,0,r11 # outperm | |
435 | + le?vxor v2,v2,v4 | |
436 | + li $idx,16 | |
437 | + vperm v0,v0,v1,v2 # align [and byte swap in LE] | |
438 | + lvx v1,0,$key | |
439 | + ?lvsl v5,0,$key # keyperm | |
440 | + srwi $rounds,$rounds,1 | |
441 | + lvx v2,$idx,$key | |
442 | + addi $idx,$idx,16 | |
443 | + subi $rounds,$rounds,1 | |
444 | + ?vperm v1,v1,v2,v5 # align round key | |
445 | + | |
446 | + vxor v0,v0,v1 | |
447 | + lvx v1,$idx,$key | |
448 | + addi $idx,$idx,16 | |
449 | + mtctr $rounds | |
450 | + | |
451 | +Loop_${dir}c: | |
452 | + ?vperm v2,v2,v1,v5 | |
453 | + v${n}cipher v0,v0,v2 | |
454 | + lvx v2,$idx,$key | |
455 | + addi $idx,$idx,16 | |
456 | + ?vperm v1,v1,v2,v5 | |
457 | + v${n}cipher v0,v0,v1 | |
458 | + lvx v1,$idx,$key | |
459 | + addi $idx,$idx,16 | |
460 | + bdnz Loop_${dir}c | |
461 | + | |
462 | + ?vperm v2,v2,v1,v5 | |
463 | + v${n}cipher v0,v0,v2 | |
464 | + lvx v2,$idx,$key | |
465 | + ?vperm v1,v1,v2,v5 | |
466 | + v${n}cipherlast v0,v0,v1 | |
467 | + | |
468 | + vspltisb v2,-1 | |
469 | + vxor v1,v1,v1 | |
470 | + li $idx,15 # 15 is not typo | |
471 | + ?vperm v2,v1,v2,v3 # outmask | |
472 | + le?vxor v3,v3,v4 | |
473 | + lvx v1,0,$out # outhead | |
474 | + vperm v0,v0,v0,v3 # rotate [and byte swap in LE] | |
475 | + vsel v1,v1,v0,v2 | |
476 | + lvx v4,$idx,$out | |
477 | + stvx v1,0,$out | |
478 | + vsel v0,v0,v4,v2 | |
479 | + stvx v0,$idx,$out | |
480 | + | |
481 | + mtspr 256,$vrsave | |
482 | + blr | |
483 | + .long 0 | |
484 | + .byte 0,12,0x14,0,0,0,3,0 | |
485 | + .long 0 | |
486 | +.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt | |
487 | +___ | |
488 | +} | |
489 | +&gen_block("en"); | |
490 | +&gen_block("de"); | |
491 | +}}} | |
492 | +######################################################################### | |
493 | +{{{ # CBC en- and decrypt procedures # | |
494 | +my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); | |
495 | +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); | |
496 | +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= | |
497 | + map("v$_",(4..10)); | |
498 | +$code.=<<___; | |
499 | +.globl .${prefix}_cbc_encrypt | |
500 | +.align 5 | |
501 | +.${prefix}_cbc_encrypt: | |
502 | + ${UCMP}i $len,16 | |
503 | + bltlr- | |
504 | + | |
505 | + cmpwi $enc,0 # test direction | |
506 | + lis r0,0xffe0 | |
507 | + mfspr $vrsave,256 | |
508 | + mtspr 256,r0 | |
509 | + | |
510 | + li $idx,15 | |
511 | + vxor $rndkey0,$rndkey0,$rndkey0 | |
512 | + le?vspltisb $tmp,0x0f | |
513 | + | |
514 | + lvx $ivec,0,$ivp # load [unaligned] iv | |
515 | + lvsl $inpperm,0,$ivp | |
516 | + lvx $inptail,$idx,$ivp | |
517 | + le?vxor $inpperm,$inpperm,$tmp | |
518 | + vperm $ivec,$ivec,$inptail,$inpperm | |
519 | + | |
520 | + neg r11,$inp | |
521 | + ?lvsl $keyperm,0,$key # prepare for unaligned key | |
522 | + lwz $rounds,240($key) | |
523 | + | |
524 | + lvsr $inpperm,0,r11 # prepare for unaligned load | |
525 | + lvx $inptail,0,$inp | |
526 | + addi $inp,$inp,15 # 15 is not typo | |
527 | + le?vxor $inpperm,$inpperm,$tmp | |
528 | + | |
529 | + ?lvsr $outperm,0,$out # prepare for unaligned store | |
530 | + vspltisb $outmask,-1 | |
531 | + lvx $outhead,0,$out | |
532 | + ?vperm $outmask,$rndkey0,$outmask,$outperm | |
533 | + le?vxor $outperm,$outperm,$tmp | |
534 | + | |
535 | + srwi $rounds,$rounds,1 | |
536 | + li $idx,16 | |
537 | + subi $rounds,$rounds,1 | |
538 | + beq Lcbc_dec | |
539 | + | |
540 | +Lcbc_enc: | |
541 | + vmr $inout,$inptail | |
542 | + lvx $inptail,0,$inp | |
543 | + addi $inp,$inp,16 | |
544 | + mtctr $rounds | |
545 | + subi $len,$len,16 # len-=16 | |
546 | + | |
547 | + lvx $rndkey0,0,$key | |
548 | + vperm $inout,$inout,$inptail,$inpperm | |
549 | + lvx $rndkey1,$idx,$key | |
550 | + addi $idx,$idx,16 | |
551 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
552 | + vxor $inout,$inout,$rndkey0 | |
553 | + lvx $rndkey0,$idx,$key | |
554 | + addi $idx,$idx,16 | |
555 | + vxor $inout,$inout,$ivec | |
556 | + | |
557 | +Loop_cbc_enc: | |
558 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
559 | + vcipher $inout,$inout,$rndkey1 | |
560 | + lvx $rndkey1,$idx,$key | |
561 | + addi $idx,$idx,16 | |
562 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
563 | + vcipher $inout,$inout,$rndkey0 | |
564 | + lvx $rndkey0,$idx,$key | |
565 | + addi $idx,$idx,16 | |
566 | + bdnz Loop_cbc_enc | |
567 | + | |
568 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
569 | + vcipher $inout,$inout,$rndkey1 | |
570 | + lvx $rndkey1,$idx,$key | |
571 | + li $idx,16 | |
572 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
573 | + vcipherlast $ivec,$inout,$rndkey0 | |
574 | + ${UCMP}i $len,16 | |
575 | + | |
576 | + vperm $tmp,$ivec,$ivec,$outperm | |
577 | + vsel $inout,$outhead,$tmp,$outmask | |
578 | + vmr $outhead,$tmp | |
579 | + stvx $inout,0,$out | |
580 | + addi $out,$out,16 | |
581 | + bge Lcbc_enc | |
582 | + | |
583 | + b Lcbc_done | |
584 | + | |
585 | +.align 4 | |
586 | +Lcbc_dec: | |
587 | + ${UCMP}i $len,128 | |
588 | + bge _aesp8_cbc_decrypt8x | |
589 | + vmr $tmp,$inptail | |
590 | + lvx $inptail,0,$inp | |
591 | + addi $inp,$inp,16 | |
592 | + mtctr $rounds | |
593 | + subi $len,$len,16 # len-=16 | |
594 | + | |
595 | + lvx $rndkey0,0,$key | |
596 | + vperm $tmp,$tmp,$inptail,$inpperm | |
597 | + lvx $rndkey1,$idx,$key | |
598 | + addi $idx,$idx,16 | |
599 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
600 | + vxor $inout,$tmp,$rndkey0 | |
601 | + lvx $rndkey0,$idx,$key | |
602 | + addi $idx,$idx,16 | |
603 | + | |
604 | +Loop_cbc_dec: | |
605 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
606 | + vncipher $inout,$inout,$rndkey1 | |
607 | + lvx $rndkey1,$idx,$key | |
608 | + addi $idx,$idx,16 | |
609 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
610 | + vncipher $inout,$inout,$rndkey0 | |
611 | + lvx $rndkey0,$idx,$key | |
612 | + addi $idx,$idx,16 | |
613 | + bdnz Loop_cbc_dec | |
614 | + | |
615 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
616 | + vncipher $inout,$inout,$rndkey1 | |
617 | + lvx $rndkey1,$idx,$key | |
618 | + li $idx,16 | |
619 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
620 | + vncipherlast $inout,$inout,$rndkey0 | |
621 | + ${UCMP}i $len,16 | |
622 | + | |
623 | + vxor $inout,$inout,$ivec | |
624 | + vmr $ivec,$tmp | |
625 | + vperm $tmp,$inout,$inout,$outperm | |
626 | + vsel $inout,$outhead,$tmp,$outmask | |
627 | + vmr $outhead,$tmp | |
628 | + stvx $inout,0,$out | |
629 | + addi $out,$out,16 | |
630 | + bge Lcbc_dec | |
631 | + | |
632 | +Lcbc_done: | |
633 | + addi $out,$out,-1 | |
634 | + lvx $inout,0,$out # redundant in aligned case | |
635 | + vsel $inout,$outhead,$inout,$outmask | |
636 | + stvx $inout,0,$out | |
637 | + | |
638 | + neg $enc,$ivp # write [unaligned] iv | |
639 | + li $idx,15 # 15 is not typo | |
640 | + vxor $rndkey0,$rndkey0,$rndkey0 | |
641 | + vspltisb $outmask,-1 | |
642 | + le?vspltisb $tmp,0x0f | |
643 | + ?lvsl $outperm,0,$enc | |
644 | + ?vperm $outmask,$rndkey0,$outmask,$outperm | |
645 | + le?vxor $outperm,$outperm,$tmp | |
646 | + lvx $outhead,0,$ivp | |
647 | + vperm $ivec,$ivec,$ivec,$outperm | |
648 | + vsel $inout,$outhead,$ivec,$outmask | |
649 | + lvx $inptail,$idx,$ivp | |
650 | + stvx $inout,0,$ivp | |
651 | + vsel $inout,$ivec,$inptail,$outmask | |
652 | + stvx $inout,$idx,$ivp | |
653 | + | |
654 | + mtspr 256,$vrsave | |
655 | + blr | |
656 | + .long 0 | |
657 | + .byte 0,12,0x14,0,0,0,6,0 | |
658 | + .long 0 | |
659 | +___ | |
660 | +######################################################################### | |
661 | +{{ # Optimized CBC decrypt procedure # | |
662 | +my $key_="r11"; | |
663 | +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); | |
664 | + $x00=0 if ($flavour =~ /osx/); | |
665 | +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); | |
666 | +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); | |
667 | +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys | |
668 | + # v26-v31 last 6 round keys | |
669 | +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment | |
670 | + | |
671 | +$code.=<<___; | |
672 | +.align 5 | |
673 | +_aesp8_cbc_decrypt8x: | |
674 | + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) | |
675 | + li r10,`$FRAME+8*16+15` | |
676 | + li r11,`$FRAME+8*16+31` | |
677 | + stvx v20,r10,$sp # ABI says so | |
678 | + addi r10,r10,32 | |
679 | + stvx v21,r11,$sp | |
680 | + addi r11,r11,32 | |
681 | + stvx v22,r10,$sp | |
682 | + addi r10,r10,32 | |
683 | + stvx v23,r11,$sp | |
684 | + addi r11,r11,32 | |
685 | + stvx v24,r10,$sp | |
686 | + addi r10,r10,32 | |
687 | + stvx v25,r11,$sp | |
688 | + addi r11,r11,32 | |
689 | + stvx v26,r10,$sp | |
690 | + addi r10,r10,32 | |
691 | + stvx v27,r11,$sp | |
692 | + addi r11,r11,32 | |
693 | + stvx v28,r10,$sp | |
694 | + addi r10,r10,32 | |
695 | + stvx v29,r11,$sp | |
696 | + addi r11,r11,32 | |
697 | + stvx v30,r10,$sp | |
698 | + stvx v31,r11,$sp | |
699 | + li r0,-1 | |
700 | + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave | |
701 | + li $x10,0x10 | |
702 | + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
703 | + li $x20,0x20 | |
704 | + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
705 | + li $x30,0x30 | |
706 | + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
707 | + li $x40,0x40 | |
708 | + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
709 | + li $x50,0x50 | |
710 | + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
711 | + li $x60,0x60 | |
712 | + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
713 | + li $x70,0x70 | |
714 | + mtspr 256,r0 | |
715 | + | |
716 | + subi $rounds,$rounds,3 # -4 in total | |
717 | + subi $len,$len,128 # bias | |
718 | + | |
719 | + lvx $rndkey0,$x00,$key # load key schedule | |
720 | + lvx v30,$x10,$key | |
721 | + addi $key,$key,0x20 | |
722 | + lvx v31,$x00,$key | |
723 | + ?vperm $rndkey0,$rndkey0,v30,$keyperm | |
724 | + addi $key_,$sp,$FRAME+15 | |
725 | + mtctr $rounds | |
726 | + | |
727 | +Load_cbc_dec_key: | |
728 | + ?vperm v24,v30,v31,$keyperm | |
729 | + lvx v30,$x10,$key | |
730 | + addi $key,$key,0x20 | |
731 | + stvx v24,$x00,$key_ # off-load round[1] | |
732 | + ?vperm v25,v31,v30,$keyperm | |
733 | + lvx v31,$x00,$key | |
734 | + stvx v25,$x10,$key_ # off-load round[2] | |
735 | + addi $key_,$key_,0x20 | |
736 | + bdnz Load_cbc_dec_key | |
737 | + | |
738 | + lvx v26,$x10,$key | |
739 | + ?vperm v24,v30,v31,$keyperm | |
740 | + lvx v27,$x20,$key | |
741 | + stvx v24,$x00,$key_ # off-load round[3] | |
742 | + ?vperm v25,v31,v26,$keyperm | |
743 | + lvx v28,$x30,$key | |
744 | + stvx v25,$x10,$key_ # off-load round[4] | |
745 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
746 | + ?vperm v26,v26,v27,$keyperm | |
747 | + lvx v29,$x40,$key | |
748 | + ?vperm v27,v27,v28,$keyperm | |
749 | + lvx v30,$x50,$key | |
750 | + ?vperm v28,v28,v29,$keyperm | |
751 | + lvx v31,$x60,$key | |
752 | + ?vperm v29,v29,v30,$keyperm | |
753 | + lvx $out0,$x70,$key # borrow $out0 | |
754 | + ?vperm v30,v30,v31,$keyperm | |
755 | + lvx v24,$x00,$key_ # pre-load round[1] | |
756 | + ?vperm v31,v31,$out0,$keyperm | |
757 | + lvx v25,$x10,$key_ # pre-load round[2] | |
758 | + | |
759 | + #lvx $inptail,0,$inp # "caller" already did this | |
760 | + #addi $inp,$inp,15 # 15 is not typo | |
761 | + subi $inp,$inp,15 # undo "caller" | |
762 | + | |
763 | + le?li $idx,8 | |
764 | + lvx_u $in0,$x00,$inp # load first 8 "words" | |
765 | + le?lvsl $inpperm,0,$idx | |
766 | + le?vspltisb $tmp,0x0f | |
767 | + lvx_u $in1,$x10,$inp | |
768 | + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u | |
769 | + lvx_u $in2,$x20,$inp | |
770 | + le?vperm $in0,$in0,$in0,$inpperm | |
771 | + lvx_u $in3,$x30,$inp | |
772 | + le?vperm $in1,$in1,$in1,$inpperm | |
773 | + lvx_u $in4,$x40,$inp | |
774 | + le?vperm $in2,$in2,$in2,$inpperm | |
775 | + vxor $out0,$in0,$rndkey0 | |
776 | + lvx_u $in5,$x50,$inp | |
777 | + le?vperm $in3,$in3,$in3,$inpperm | |
778 | + vxor $out1,$in1,$rndkey0 | |
779 | + lvx_u $in6,$x60,$inp | |
780 | + le?vperm $in4,$in4,$in4,$inpperm | |
781 | + vxor $out2,$in2,$rndkey0 | |
782 | + lvx_u $in7,$x70,$inp | |
783 | + addi $inp,$inp,0x80 | |
784 | + le?vperm $in5,$in5,$in5,$inpperm | |
785 | + vxor $out3,$in3,$rndkey0 | |
786 | + le?vperm $in6,$in6,$in6,$inpperm | |
787 | + vxor $out4,$in4,$rndkey0 | |
788 | + le?vperm $in7,$in7,$in7,$inpperm | |
789 | + vxor $out5,$in5,$rndkey0 | |
790 | + vxor $out6,$in6,$rndkey0 | |
791 | + vxor $out7,$in7,$rndkey0 | |
792 | + | |
793 | + mtctr $rounds | |
794 | + b Loop_cbc_dec8x | |
795 | +.align 5 | |
796 | +Loop_cbc_dec8x: | |
797 | + vncipher $out0,$out0,v24 | |
798 | + vncipher $out1,$out1,v24 | |
799 | + vncipher $out2,$out2,v24 | |
800 | + vncipher $out3,$out3,v24 | |
801 | + vncipher $out4,$out4,v24 | |
802 | + vncipher $out5,$out5,v24 | |
803 | + vncipher $out6,$out6,v24 | |
804 | + vncipher $out7,$out7,v24 | |
805 | + lvx v24,$x20,$key_ # round[3] | |
806 | + addi $key_,$key_,0x20 | |
807 | + | |
808 | + vncipher $out0,$out0,v25 | |
809 | + vncipher $out1,$out1,v25 | |
810 | + vncipher $out2,$out2,v25 | |
811 | + vncipher $out3,$out3,v25 | |
812 | + vncipher $out4,$out4,v25 | |
813 | + vncipher $out5,$out5,v25 | |
814 | + vncipher $out6,$out6,v25 | |
815 | + vncipher $out7,$out7,v25 | |
816 | + lvx v25,$x10,$key_ # round[4] | |
817 | + bdnz Loop_cbc_dec8x | |
818 | + | |
819 | + subic $len,$len,128 # $len-=128 | |
820 | + vncipher $out0,$out0,v24 | |
821 | + vncipher $out1,$out1,v24 | |
822 | + vncipher $out2,$out2,v24 | |
823 | + vncipher $out3,$out3,v24 | |
824 | + vncipher $out4,$out4,v24 | |
825 | + vncipher $out5,$out5,v24 | |
826 | + vncipher $out6,$out6,v24 | |
827 | + vncipher $out7,$out7,v24 | |
828 | + | |
829 | + subfe. r0,r0,r0 # borrow?-1:0 | |
830 | + vncipher $out0,$out0,v25 | |
831 | + vncipher $out1,$out1,v25 | |
832 | + vncipher $out2,$out2,v25 | |
833 | + vncipher $out3,$out3,v25 | |
834 | + vncipher $out4,$out4,v25 | |
835 | + vncipher $out5,$out5,v25 | |
836 | + vncipher $out6,$out6,v25 | |
837 | + vncipher $out7,$out7,v25 | |
838 | + | |
839 | + and r0,r0,$len | |
840 | + vncipher $out0,$out0,v26 | |
841 | + vncipher $out1,$out1,v26 | |
842 | + vncipher $out2,$out2,v26 | |
843 | + vncipher $out3,$out3,v26 | |
844 | + vncipher $out4,$out4,v26 | |
845 | + vncipher $out5,$out5,v26 | |
846 | + vncipher $out6,$out6,v26 | |
847 | + vncipher $out7,$out7,v26 | |
848 | + | |
849 | + add $inp,$inp,r0 # $inp is adjusted in such | |
850 | + # way that at exit from the | |
851 | + # loop inX-in7 are loaded | |
852 | + # with last "words" | |
853 | + vncipher $out0,$out0,v27 | |
854 | + vncipher $out1,$out1,v27 | |
855 | + vncipher $out2,$out2,v27 | |
856 | + vncipher $out3,$out3,v27 | |
857 | + vncipher $out4,$out4,v27 | |
858 | + vncipher $out5,$out5,v27 | |
859 | + vncipher $out6,$out6,v27 | |
860 | + vncipher $out7,$out7,v27 | |
861 | + | |
862 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
863 | + vncipher $out0,$out0,v28 | |
864 | + vncipher $out1,$out1,v28 | |
865 | + vncipher $out2,$out2,v28 | |
866 | + vncipher $out3,$out3,v28 | |
867 | + vncipher $out4,$out4,v28 | |
868 | + vncipher $out5,$out5,v28 | |
869 | + vncipher $out6,$out6,v28 | |
870 | + vncipher $out7,$out7,v28 | |
871 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
872 | + | |
873 | + vncipher $out0,$out0,v29 | |
874 | + vncipher $out1,$out1,v29 | |
875 | + vncipher $out2,$out2,v29 | |
876 | + vncipher $out3,$out3,v29 | |
877 | + vncipher $out4,$out4,v29 | |
878 | + vncipher $out5,$out5,v29 | |
879 | + vncipher $out6,$out6,v29 | |
880 | + vncipher $out7,$out7,v29 | |
881 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
882 | + | |
883 | + vncipher $out0,$out0,v30 | |
884 | + vxor $ivec,$ivec,v31 # xor with last round key | |
885 | + vncipher $out1,$out1,v30 | |
886 | + vxor $in0,$in0,v31 | |
887 | + vncipher $out2,$out2,v30 | |
888 | + vxor $in1,$in1,v31 | |
889 | + vncipher $out3,$out3,v30 | |
890 | + vxor $in2,$in2,v31 | |
891 | + vncipher $out4,$out4,v30 | |
892 | + vxor $in3,$in3,v31 | |
893 | + vncipher $out5,$out5,v30 | |
894 | + vxor $in4,$in4,v31 | |
895 | + vncipher $out6,$out6,v30 | |
896 | + vxor $in5,$in5,v31 | |
897 | + vncipher $out7,$out7,v30 | |
898 | + vxor $in6,$in6,v31 | |
899 | + | |
900 | + vncipherlast $out0,$out0,$ivec | |
901 | + vncipherlast $out1,$out1,$in0 | |
902 | + lvx_u $in0,$x00,$inp # load next input block | |
903 | + vncipherlast $out2,$out2,$in1 | |
904 | + lvx_u $in1,$x10,$inp | |
905 | + vncipherlast $out3,$out3,$in2 | |
906 | + le?vperm $in0,$in0,$in0,$inpperm | |
907 | + lvx_u $in2,$x20,$inp | |
908 | + vncipherlast $out4,$out4,$in3 | |
909 | + le?vperm $in1,$in1,$in1,$inpperm | |
910 | + lvx_u $in3,$x30,$inp | |
911 | + vncipherlast $out5,$out5,$in4 | |
912 | + le?vperm $in2,$in2,$in2,$inpperm | |
913 | + lvx_u $in4,$x40,$inp | |
914 | + vncipherlast $out6,$out6,$in5 | |
915 | + le?vperm $in3,$in3,$in3,$inpperm | |
916 | + lvx_u $in5,$x50,$inp | |
917 | + vncipherlast $out7,$out7,$in6 | |
918 | + le?vperm $in4,$in4,$in4,$inpperm | |
919 | + lvx_u $in6,$x60,$inp | |
920 | + vmr $ivec,$in7 | |
921 | + le?vperm $in5,$in5,$in5,$inpperm | |
922 | + lvx_u $in7,$x70,$inp | |
923 | + addi $inp,$inp,0x80 | |
924 | + | |
925 | + le?vperm $out0,$out0,$out0,$inpperm | |
926 | + le?vperm $out1,$out1,$out1,$inpperm | |
927 | + stvx_u $out0,$x00,$out | |
928 | + le?vperm $in6,$in6,$in6,$inpperm | |
929 | + vxor $out0,$in0,$rndkey0 | |
930 | + le?vperm $out2,$out2,$out2,$inpperm | |
931 | + stvx_u $out1,$x10,$out | |
932 | + le?vperm $in7,$in7,$in7,$inpperm | |
933 | + vxor $out1,$in1,$rndkey0 | |
934 | + le?vperm $out3,$out3,$out3,$inpperm | |
935 | + stvx_u $out2,$x20,$out | |
936 | + vxor $out2,$in2,$rndkey0 | |
937 | + le?vperm $out4,$out4,$out4,$inpperm | |
938 | + stvx_u $out3,$x30,$out | |
939 | + vxor $out3,$in3,$rndkey0 | |
940 | + le?vperm $out5,$out5,$out5,$inpperm | |
941 | + stvx_u $out4,$x40,$out | |
942 | + vxor $out4,$in4,$rndkey0 | |
943 | + le?vperm $out6,$out6,$out6,$inpperm | |
944 | + stvx_u $out5,$x50,$out | |
945 | + vxor $out5,$in5,$rndkey0 | |
946 | + le?vperm $out7,$out7,$out7,$inpperm | |
947 | + stvx_u $out6,$x60,$out | |
948 | + vxor $out6,$in6,$rndkey0 | |
949 | + stvx_u $out7,$x70,$out | |
950 | + addi $out,$out,0x80 | |
951 | + vxor $out7,$in7,$rndkey0 | |
952 | + | |
953 | + mtctr $rounds | |
954 | + beq Loop_cbc_dec8x # did $len-=128 borrow? | |
955 | + | |
956 | + addic. $len,$len,128 | |
957 | + beq Lcbc_dec8x_done | |
958 | + nop | |
959 | + nop | |
960 | + | |
961 | +Loop_cbc_dec8x_tail: # up to 7 "words" tail... | |
962 | + vncipher $out1,$out1,v24 | |
963 | + vncipher $out2,$out2,v24 | |
964 | + vncipher $out3,$out3,v24 | |
965 | + vncipher $out4,$out4,v24 | |
966 | + vncipher $out5,$out5,v24 | |
967 | + vncipher $out6,$out6,v24 | |
968 | + vncipher $out7,$out7,v24 | |
969 | + lvx v24,$x20,$key_ # round[3] | |
970 | + addi $key_,$key_,0x20 | |
971 | + | |
972 | + vncipher $out1,$out1,v25 | |
973 | + vncipher $out2,$out2,v25 | |
974 | + vncipher $out3,$out3,v25 | |
975 | + vncipher $out4,$out4,v25 | |
976 | + vncipher $out5,$out5,v25 | |
977 | + vncipher $out6,$out6,v25 | |
978 | + vncipher $out7,$out7,v25 | |
979 | + lvx v25,$x10,$key_ # round[4] | |
980 | + bdnz Loop_cbc_dec8x_tail | |
981 | + | |
982 | + vncipher $out1,$out1,v24 | |
983 | + vncipher $out2,$out2,v24 | |
984 | + vncipher $out3,$out3,v24 | |
985 | + vncipher $out4,$out4,v24 | |
986 | + vncipher $out5,$out5,v24 | |
987 | + vncipher $out6,$out6,v24 | |
988 | + vncipher $out7,$out7,v24 | |
989 | + | |
990 | + vncipher $out1,$out1,v25 | |
991 | + vncipher $out2,$out2,v25 | |
992 | + vncipher $out3,$out3,v25 | |
993 | + vncipher $out4,$out4,v25 | |
994 | + vncipher $out5,$out5,v25 | |
995 | + vncipher $out6,$out6,v25 | |
996 | + vncipher $out7,$out7,v25 | |
997 | + | |
998 | + vncipher $out1,$out1,v26 | |
999 | + vncipher $out2,$out2,v26 | |
1000 | + vncipher $out3,$out3,v26 | |
1001 | + vncipher $out4,$out4,v26 | |
1002 | + vncipher $out5,$out5,v26 | |
1003 | + vncipher $out6,$out6,v26 | |
1004 | + vncipher $out7,$out7,v26 | |
1005 | + | |
1006 | + vncipher $out1,$out1,v27 | |
1007 | + vncipher $out2,$out2,v27 | |
1008 | + vncipher $out3,$out3,v27 | |
1009 | + vncipher $out4,$out4,v27 | |
1010 | + vncipher $out5,$out5,v27 | |
1011 | + vncipher $out6,$out6,v27 | |
1012 | + vncipher $out7,$out7,v27 | |
1013 | + | |
1014 | + vncipher $out1,$out1,v28 | |
1015 | + vncipher $out2,$out2,v28 | |
1016 | + vncipher $out3,$out3,v28 | |
1017 | + vncipher $out4,$out4,v28 | |
1018 | + vncipher $out5,$out5,v28 | |
1019 | + vncipher $out6,$out6,v28 | |
1020 | + vncipher $out7,$out7,v28 | |
1021 | + | |
1022 | + vncipher $out1,$out1,v29 | |
1023 | + vncipher $out2,$out2,v29 | |
1024 | + vncipher $out3,$out3,v29 | |
1025 | + vncipher $out4,$out4,v29 | |
1026 | + vncipher $out5,$out5,v29 | |
1027 | + vncipher $out6,$out6,v29 | |
1028 | + vncipher $out7,$out7,v29 | |
1029 | + | |
1030 | + vncipher $out1,$out1,v30 | |
1031 | + vxor $ivec,$ivec,v31 # last round key | |
1032 | + vncipher $out2,$out2,v30 | |
1033 | + vxor $in1,$in1,v31 | |
1034 | + vncipher $out3,$out3,v30 | |
1035 | + vxor $in2,$in2,v31 | |
1036 | + vncipher $out4,$out4,v30 | |
1037 | + vxor $in3,$in3,v31 | |
1038 | + vncipher $out5,$out5,v30 | |
1039 | + vxor $in4,$in4,v31 | |
1040 | + vncipher $out6,$out6,v30 | |
1041 | + vxor $in5,$in5,v31 | |
1042 | + vncipher $out7,$out7,v30 | |
1043 | + vxor $in6,$in6,v31 | |
1044 | + | |
1045 | + cmplwi $len,32 # switch($len) | |
1046 | + blt Lcbc_dec8x_one | |
1047 | + nop | |
1048 | + beq Lcbc_dec8x_two | |
1049 | + cmplwi $len,64 | |
1050 | + blt Lcbc_dec8x_three | |
1051 | + nop | |
1052 | + beq Lcbc_dec8x_four | |
1053 | + cmplwi $len,96 | |
1054 | + blt Lcbc_dec8x_five | |
1055 | + nop | |
1056 | + beq Lcbc_dec8x_six | |
1057 | + | |
1058 | +Lcbc_dec8x_seven: | |
1059 | + vncipherlast $out1,$out1,$ivec | |
1060 | + vncipherlast $out2,$out2,$in1 | |
1061 | + vncipherlast $out3,$out3,$in2 | |
1062 | + vncipherlast $out4,$out4,$in3 | |
1063 | + vncipherlast $out5,$out5,$in4 | |
1064 | + vncipherlast $out6,$out6,$in5 | |
1065 | + vncipherlast $out7,$out7,$in6 | |
1066 | + vmr $ivec,$in7 | |
1067 | + | |
1068 | + le?vperm $out1,$out1,$out1,$inpperm | |
1069 | + le?vperm $out2,$out2,$out2,$inpperm | |
1070 | + stvx_u $out1,$x00,$out | |
1071 | + le?vperm $out3,$out3,$out3,$inpperm | |
1072 | + stvx_u $out2,$x10,$out | |
1073 | + le?vperm $out4,$out4,$out4,$inpperm | |
1074 | + stvx_u $out3,$x20,$out | |
1075 | + le?vperm $out5,$out5,$out5,$inpperm | |
1076 | + stvx_u $out4,$x30,$out | |
1077 | + le?vperm $out6,$out6,$out6,$inpperm | |
1078 | + stvx_u $out5,$x40,$out | |
1079 | + le?vperm $out7,$out7,$out7,$inpperm | |
1080 | + stvx_u $out6,$x50,$out | |
1081 | + stvx_u $out7,$x60,$out | |
1082 | + addi $out,$out,0x70 | |
1083 | + b Lcbc_dec8x_done | |
1084 | + | |
1085 | +.align 5 | |
1086 | +Lcbc_dec8x_six: | |
1087 | + vncipherlast $out2,$out2,$ivec | |
1088 | + vncipherlast $out3,$out3,$in2 | |
1089 | + vncipherlast $out4,$out4,$in3 | |
1090 | + vncipherlast $out5,$out5,$in4 | |
1091 | + vncipherlast $out6,$out6,$in5 | |
1092 | + vncipherlast $out7,$out7,$in6 | |
1093 | + vmr $ivec,$in7 | |
1094 | + | |
1095 | + le?vperm $out2,$out2,$out2,$inpperm | |
1096 | + le?vperm $out3,$out3,$out3,$inpperm | |
1097 | + stvx_u $out2,$x00,$out | |
1098 | + le?vperm $out4,$out4,$out4,$inpperm | |
1099 | + stvx_u $out3,$x10,$out | |
1100 | + le?vperm $out5,$out5,$out5,$inpperm | |
1101 | + stvx_u $out4,$x20,$out | |
1102 | + le?vperm $out6,$out6,$out6,$inpperm | |
1103 | + stvx_u $out5,$x30,$out | |
1104 | + le?vperm $out7,$out7,$out7,$inpperm | |
1105 | + stvx_u $out6,$x40,$out | |
1106 | + stvx_u $out7,$x50,$out | |
1107 | + addi $out,$out,0x60 | |
1108 | + b Lcbc_dec8x_done | |
1109 | + | |
1110 | +.align 5 | |
1111 | +Lcbc_dec8x_five: | |
1112 | + vncipherlast $out3,$out3,$ivec | |
1113 | + vncipherlast $out4,$out4,$in3 | |
1114 | + vncipherlast $out5,$out5,$in4 | |
1115 | + vncipherlast $out6,$out6,$in5 | |
1116 | + vncipherlast $out7,$out7,$in6 | |
1117 | + vmr $ivec,$in7 | |
1118 | + | |
1119 | + le?vperm $out3,$out3,$out3,$inpperm | |
1120 | + le?vperm $out4,$out4,$out4,$inpperm | |
1121 | + stvx_u $out3,$x00,$out | |
1122 | + le?vperm $out5,$out5,$out5,$inpperm | |
1123 | + stvx_u $out4,$x10,$out | |
1124 | + le?vperm $out6,$out6,$out6,$inpperm | |
1125 | + stvx_u $out5,$x20,$out | |
1126 | + le?vperm $out7,$out7,$out7,$inpperm | |
1127 | + stvx_u $out6,$x30,$out | |
1128 | + stvx_u $out7,$x40,$out | |
1129 | + addi $out,$out,0x50 | |
1130 | + b Lcbc_dec8x_done | |
1131 | + | |
1132 | +.align 5 | |
1133 | +Lcbc_dec8x_four: | |
1134 | + vncipherlast $out4,$out4,$ivec | |
1135 | + vncipherlast $out5,$out5,$in4 | |
1136 | + vncipherlast $out6,$out6,$in5 | |
1137 | + vncipherlast $out7,$out7,$in6 | |
1138 | + vmr $ivec,$in7 | |
1139 | + | |
1140 | + le?vperm $out4,$out4,$out4,$inpperm | |
1141 | + le?vperm $out5,$out5,$out5,$inpperm | |
1142 | + stvx_u $out4,$x00,$out | |
1143 | + le?vperm $out6,$out6,$out6,$inpperm | |
1144 | + stvx_u $out5,$x10,$out | |
1145 | + le?vperm $out7,$out7,$out7,$inpperm | |
1146 | + stvx_u $out6,$x20,$out | |
1147 | + stvx_u $out7,$x30,$out | |
1148 | + addi $out,$out,0x40 | |
1149 | + b Lcbc_dec8x_done | |
1150 | + | |
1151 | +.align 5 | |
1152 | +Lcbc_dec8x_three: | |
1153 | + vncipherlast $out5,$out5,$ivec | |
1154 | + vncipherlast $out6,$out6,$in5 | |
1155 | + vncipherlast $out7,$out7,$in6 | |
1156 | + vmr $ivec,$in7 | |
1157 | + | |
1158 | + le?vperm $out5,$out5,$out5,$inpperm | |
1159 | + le?vperm $out6,$out6,$out6,$inpperm | |
1160 | + stvx_u $out5,$x00,$out | |
1161 | + le?vperm $out7,$out7,$out7,$inpperm | |
1162 | + stvx_u $out6,$x10,$out | |
1163 | + stvx_u $out7,$x20,$out | |
1164 | + addi $out,$out,0x30 | |
1165 | + b Lcbc_dec8x_done | |
1166 | + | |
1167 | +.align 5 | |
1168 | +Lcbc_dec8x_two: | |
1169 | + vncipherlast $out6,$out6,$ivec | |
1170 | + vncipherlast $out7,$out7,$in6 | |
1171 | + vmr $ivec,$in7 | |
1172 | + | |
1173 | + le?vperm $out6,$out6,$out6,$inpperm | |
1174 | + le?vperm $out7,$out7,$out7,$inpperm | |
1175 | + stvx_u $out6,$x00,$out | |
1176 | + stvx_u $out7,$x10,$out | |
1177 | + addi $out,$out,0x20 | |
1178 | + b Lcbc_dec8x_done | |
1179 | + | |
1180 | +.align 5 | |
1181 | +Lcbc_dec8x_one: | |
1182 | + vncipherlast $out7,$out7,$ivec | |
1183 | + vmr $ivec,$in7 | |
1184 | + | |
1185 | + le?vperm $out7,$out7,$out7,$inpperm | |
1186 | + stvx_u $out7,0,$out | |
1187 | + addi $out,$out,0x10 | |
1188 | + | |
1189 | +Lcbc_dec8x_done: | |
1190 | + le?vperm $ivec,$ivec,$ivec,$inpperm | |
1191 | + stvx_u $ivec,0,$ivp # write [unaligned] iv | |
1192 | + | |
1193 | + li r10,`$FRAME+15` | |
1194 | + li r11,`$FRAME+31` | |
1195 | + stvx $inpperm,r10,$sp # wipe copies of round keys | |
1196 | + addi r10,r10,32 | |
1197 | + stvx $inpperm,r11,$sp | |
1198 | + addi r11,r11,32 | |
1199 | + stvx $inpperm,r10,$sp | |
1200 | + addi r10,r10,32 | |
1201 | + stvx $inpperm,r11,$sp | |
1202 | + addi r11,r11,32 | |
1203 | + stvx $inpperm,r10,$sp | |
1204 | + addi r10,r10,32 | |
1205 | + stvx $inpperm,r11,$sp | |
1206 | + addi r11,r11,32 | |
1207 | + stvx $inpperm,r10,$sp | |
1208 | + addi r10,r10,32 | |
1209 | + stvx $inpperm,r11,$sp | |
1210 | + addi r11,r11,32 | |
1211 | + | |
1212 | + mtspr 256,$vrsave | |
1213 | + lvx v20,r10,$sp # ABI says so | |
1214 | + addi r10,r10,32 | |
1215 | + lvx v21,r11,$sp | |
1216 | + addi r11,r11,32 | |
1217 | + lvx v22,r10,$sp | |
1218 | + addi r10,r10,32 | |
1219 | + lvx v23,r11,$sp | |
1220 | + addi r11,r11,32 | |
1221 | + lvx v24,r10,$sp | |
1222 | + addi r10,r10,32 | |
1223 | + lvx v25,r11,$sp | |
1224 | + addi r11,r11,32 | |
1225 | + lvx v26,r10,$sp | |
1226 | + addi r10,r10,32 | |
1227 | + lvx v27,r11,$sp | |
1228 | + addi r11,r11,32 | |
1229 | + lvx v28,r10,$sp | |
1230 | + addi r10,r10,32 | |
1231 | + lvx v29,r11,$sp | |
1232 | + addi r11,r11,32 | |
1233 | + lvx v30,r10,$sp | |
1234 | + lvx v31,r11,$sp | |
1235 | + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
1236 | + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
1237 | + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
1238 | + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
1239 | + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
1240 | + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
1241 | + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` | |
1242 | + blr | |
1243 | + .long 0 | |
1244 | + .byte 0,12,0x04,0,0x80,6,6,0 | |
1245 | + .long 0 | |
1246 | +.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt | |
1247 | +___ | |
1248 | +}} }}} | |
1249 | + | |
1250 | +######################################################################### | |
1251 | +{{{ # CTR procedure[s] # | |
1252 | +my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); | |
1253 | +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); | |
1254 | +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= | |
1255 | + map("v$_",(4..11)); | |
1256 | +my $dat=$tmp; | |
1257 | + | |
1258 | +$code.=<<___; | |
1259 | +.globl .${prefix}_ctr32_encrypt_blocks | |
1260 | +.align 5 | |
1261 | +.${prefix}_ctr32_encrypt_blocks: | |
1262 | + ${UCMP}i $len,1 | |
1263 | + bltlr- | |
1264 | + | |
1265 | + lis r0,0xfff0 | |
1266 | + mfspr $vrsave,256 | |
1267 | + mtspr 256,r0 | |
1268 | + | |
1269 | + li $idx,15 | |
1270 | + vxor $rndkey0,$rndkey0,$rndkey0 | |
1271 | + le?vspltisb $tmp,0x0f | |
1272 | + | |
1273 | + lvx $ivec,0,$ivp # load [unaligned] iv | |
1274 | + lvsl $inpperm,0,$ivp | |
1275 | + lvx $inptail,$idx,$ivp | |
1276 | + vspltisb $one,1 | |
1277 | + le?vxor $inpperm,$inpperm,$tmp | |
1278 | + vperm $ivec,$ivec,$inptail,$inpperm | |
1279 | + vsldoi $one,$rndkey0,$one,1 | |
1280 | + | |
1281 | + neg r11,$inp | |
1282 | + ?lvsl $keyperm,0,$key # prepare for unaligned key | |
1283 | + lwz $rounds,240($key) | |
1284 | + | |
1285 | + lvsr $inpperm,0,r11 # prepare for unaligned load | |
1286 | + lvx $inptail,0,$inp | |
1287 | + addi $inp,$inp,15 # 15 is not typo | |
1288 | + le?vxor $inpperm,$inpperm,$tmp | |
1289 | + | |
1290 | + srwi $rounds,$rounds,1 | |
1291 | + li $idx,16 | |
1292 | + subi $rounds,$rounds,1 | |
1293 | + | |
1294 | + ${UCMP}i $len,8 | |
1295 | + bge _aesp8_ctr32_encrypt8x | |
1296 | + | |
1297 | + ?lvsr $outperm,0,$out # prepare for unaligned store | |
1298 | + vspltisb $outmask,-1 | |
1299 | + lvx $outhead,0,$out | |
1300 | + ?vperm $outmask,$rndkey0,$outmask,$outperm | |
1301 | + le?vxor $outperm,$outperm,$tmp | |
1302 | + | |
1303 | + lvx $rndkey0,0,$key | |
1304 | + mtctr $rounds | |
1305 | + lvx $rndkey1,$idx,$key | |
1306 | + addi $idx,$idx,16 | |
1307 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1308 | + vxor $inout,$ivec,$rndkey0 | |
1309 | + lvx $rndkey0,$idx,$key | |
1310 | + addi $idx,$idx,16 | |
1311 | + b Loop_ctr32_enc | |
1312 | + | |
1313 | +.align 5 | |
1314 | +Loop_ctr32_enc: | |
1315 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
1316 | + vcipher $inout,$inout,$rndkey1 | |
1317 | + lvx $rndkey1,$idx,$key | |
1318 | + addi $idx,$idx,16 | |
1319 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1320 | + vcipher $inout,$inout,$rndkey0 | |
1321 | + lvx $rndkey0,$idx,$key | |
1322 | + addi $idx,$idx,16 | |
1323 | + bdnz Loop_ctr32_enc | |
1324 | + | |
1325 | + vadduwm $ivec,$ivec,$one | |
1326 | + vmr $dat,$inptail | |
1327 | + lvx $inptail,0,$inp | |
1328 | + addi $inp,$inp,16 | |
1329 | + subic. $len,$len,1 # blocks-- | |
1330 | + | |
1331 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
1332 | + vcipher $inout,$inout,$rndkey1 | |
1333 | + lvx $rndkey1,$idx,$key | |
1334 | + vperm $dat,$dat,$inptail,$inpperm | |
1335 | + li $idx,16 | |
1336 | + ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm | |
1337 | + lvx $rndkey0,0,$key | |
1338 | + vxor $dat,$dat,$rndkey1 # last round key | |
1339 | + vcipherlast $inout,$inout,$dat | |
1340 | + | |
1341 | + lvx $rndkey1,$idx,$key | |
1342 | + addi $idx,$idx,16 | |
1343 | + vperm $inout,$inout,$inout,$outperm | |
1344 | + vsel $dat,$outhead,$inout,$outmask | |
1345 | + mtctr $rounds | |
1346 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1347 | + vmr $outhead,$inout | |
1348 | + vxor $inout,$ivec,$rndkey0 | |
1349 | + lvx $rndkey0,$idx,$key | |
1350 | + addi $idx,$idx,16 | |
1351 | + stvx $dat,0,$out | |
1352 | + addi $out,$out,16 | |
1353 | + bne Loop_ctr32_enc | |
1354 | + | |
1355 | + addi $out,$out,-1 | |
1356 | + lvx $inout,0,$out # redundant in aligned case | |
1357 | + vsel $inout,$outhead,$inout,$outmask | |
1358 | + stvx $inout,0,$out | |
1359 | + | |
1360 | + mtspr 256,$vrsave | |
1361 | + blr | |
1362 | + .long 0 | |
1363 | + .byte 0,12,0x14,0,0,0,6,0 | |
1364 | + .long 0 | |
1365 | +___ | |
1366 | +######################################################################### | |
1367 | +{{ # Optimized CTR procedure # | |
1368 | +my $key_="r11"; | |
1369 | +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); | |
1370 | + $x00=0 if ($flavour =~ /osx/); | |
1371 | +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); | |
1372 | +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); | |
1373 | +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys | |
1374 | + # v26-v31 last 6 round keys | |
1375 | +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment | |
1376 | +my ($two,$three,$four)=($outhead,$outperm,$outmask); | |
1377 | + | |
1378 | +$code.=<<___; | |
1379 | +.align 5 | |
1380 | +_aesp8_ctr32_encrypt8x: | |
1381 | + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) | |
1382 | + li r10,`$FRAME+8*16+15` | |
1383 | + li r11,`$FRAME+8*16+31` | |
1384 | + stvx v20,r10,$sp # ABI says so | |
1385 | + addi r10,r10,32 | |
1386 | + stvx v21,r11,$sp | |
1387 | + addi r11,r11,32 | |
1388 | + stvx v22,r10,$sp | |
1389 | + addi r10,r10,32 | |
1390 | + stvx v23,r11,$sp | |
1391 | + addi r11,r11,32 | |
1392 | + stvx v24,r10,$sp | |
1393 | + addi r10,r10,32 | |
1394 | + stvx v25,r11,$sp | |
1395 | + addi r11,r11,32 | |
1396 | + stvx v26,r10,$sp | |
1397 | + addi r10,r10,32 | |
1398 | + stvx v27,r11,$sp | |
1399 | + addi r11,r11,32 | |
1400 | + stvx v28,r10,$sp | |
1401 | + addi r10,r10,32 | |
1402 | + stvx v29,r11,$sp | |
1403 | + addi r11,r11,32 | |
1404 | + stvx v30,r10,$sp | |
1405 | + stvx v31,r11,$sp | |
1406 | + li r0,-1 | |
1407 | + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave | |
1408 | + li $x10,0x10 | |
1409 | + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
1410 | + li $x20,0x20 | |
1411 | + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
1412 | + li $x30,0x30 | |
1413 | + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
1414 | + li $x40,0x40 | |
1415 | + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
1416 | + li $x50,0x50 | |
1417 | + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
1418 | + li $x60,0x60 | |
1419 | + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
1420 | + li $x70,0x70 | |
1421 | + mtspr 256,r0 | |
1422 | + | |
1423 | + subi $rounds,$rounds,3 # -4 in total | |
1424 | + | |
1425 | + lvx $rndkey0,$x00,$key # load key schedule | |
1426 | + lvx v30,$x10,$key | |
1427 | + addi $key,$key,0x20 | |
1428 | + lvx v31,$x00,$key | |
1429 | + ?vperm $rndkey0,$rndkey0,v30,$keyperm | |
1430 | + addi $key_,$sp,$FRAME+15 | |
1431 | + mtctr $rounds | |
1432 | + | |
1433 | +Load_ctr32_enc_key: | |
1434 | + ?vperm v24,v30,v31,$keyperm | |
1435 | + lvx v30,$x10,$key | |
1436 | + addi $key,$key,0x20 | |
1437 | + stvx v24,$x00,$key_ # off-load round[1] | |
1438 | + ?vperm v25,v31,v30,$keyperm | |
1439 | + lvx v31,$x00,$key | |
1440 | + stvx v25,$x10,$key_ # off-load round[2] | |
1441 | + addi $key_,$key_,0x20 | |
1442 | + bdnz Load_ctr32_enc_key | |
1443 | + | |
1444 | + lvx v26,$x10,$key | |
1445 | + ?vperm v24,v30,v31,$keyperm | |
1446 | + lvx v27,$x20,$key | |
1447 | + stvx v24,$x00,$key_ # off-load round[3] | |
1448 | + ?vperm v25,v31,v26,$keyperm | |
1449 | + lvx v28,$x30,$key | |
1450 | + stvx v25,$x10,$key_ # off-load round[4] | |
1451 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
1452 | + ?vperm v26,v26,v27,$keyperm | |
1453 | + lvx v29,$x40,$key | |
1454 | + ?vperm v27,v27,v28,$keyperm | |
1455 | + lvx v30,$x50,$key | |
1456 | + ?vperm v28,v28,v29,$keyperm | |
1457 | + lvx v31,$x60,$key | |
1458 | + ?vperm v29,v29,v30,$keyperm | |
1459 | + lvx $out0,$x70,$key # borrow $out0 | |
1460 | + ?vperm v30,v30,v31,$keyperm | |
1461 | + lvx v24,$x00,$key_ # pre-load round[1] | |
1462 | + ?vperm v31,v31,$out0,$keyperm | |
1463 | + lvx v25,$x10,$key_ # pre-load round[2] | |
1464 | + | |
1465 | + vadduwm $two,$one,$one | |
1466 | + subi $inp,$inp,15 # undo "caller" | |
1467 | + $SHL $len,$len,4 | |
1468 | + | |
1469 | + vadduwm $out1,$ivec,$one # counter values ... | |
1470 | + vadduwm $out2,$ivec,$two | |
1471 | + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] | |
1472 | + le?li $idx,8 | |
1473 | + vadduwm $out3,$out1,$two | |
1474 | + vxor $out1,$out1,$rndkey0 | |
1475 | + le?lvsl $inpperm,0,$idx | |
1476 | + vadduwm $out4,$out2,$two | |
1477 | + vxor $out2,$out2,$rndkey0 | |
1478 | + le?vspltisb $tmp,0x0f | |
1479 | + vadduwm $out5,$out3,$two | |
1480 | + vxor $out3,$out3,$rndkey0 | |
1481 | + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u | |
1482 | + vadduwm $out6,$out4,$two | |
1483 | + vxor $out4,$out4,$rndkey0 | |
1484 | + vadduwm $out7,$out5,$two | |
1485 | + vxor $out5,$out5,$rndkey0 | |
1486 | + vadduwm $ivec,$out6,$two # next counter value | |
1487 | + vxor $out6,$out6,$rndkey0 | |
1488 | + vxor $out7,$out7,$rndkey0 | |
1489 | + | |
1490 | + mtctr $rounds | |
1491 | + b Loop_ctr32_enc8x | |
1492 | +.align 5 | |
1493 | +Loop_ctr32_enc8x: | |
1494 | + vcipher $out0,$out0,v24 | |
1495 | + vcipher $out1,$out1,v24 | |
1496 | + vcipher $out2,$out2,v24 | |
1497 | + vcipher $out3,$out3,v24 | |
1498 | + vcipher $out4,$out4,v24 | |
1499 | + vcipher $out5,$out5,v24 | |
1500 | + vcipher $out6,$out6,v24 | |
1501 | + vcipher $out7,$out7,v24 | |
1502 | +Loop_ctr32_enc8x_middle: | |
1503 | + lvx v24,$x20,$key_ # round[3] | |
1504 | + addi $key_,$key_,0x20 | |
1505 | + | |
1506 | + vcipher $out0,$out0,v25 | |
1507 | + vcipher $out1,$out1,v25 | |
1508 | + vcipher $out2,$out2,v25 | |
1509 | + vcipher $out3,$out3,v25 | |
1510 | + vcipher $out4,$out4,v25 | |
1511 | + vcipher $out5,$out5,v25 | |
1512 | + vcipher $out6,$out6,v25 | |
1513 | + vcipher $out7,$out7,v25 | |
1514 | + lvx v25,$x10,$key_ # round[4] | |
1515 | + bdnz Loop_ctr32_enc8x | |
1516 | + | |
1517 | + subic r11,$len,256 # $len-256, borrow $key_ | |
1518 | + vcipher $out0,$out0,v24 | |
1519 | + vcipher $out1,$out1,v24 | |
1520 | + vcipher $out2,$out2,v24 | |
1521 | + vcipher $out3,$out3,v24 | |
1522 | + vcipher $out4,$out4,v24 | |
1523 | + vcipher $out5,$out5,v24 | |
1524 | + vcipher $out6,$out6,v24 | |
1525 | + vcipher $out7,$out7,v24 | |
1526 | + | |
1527 | + subfe r0,r0,r0 # borrow?-1:0 | |
1528 | + vcipher $out0,$out0,v25 | |
1529 | + vcipher $out1,$out1,v25 | |
1530 | + vcipher $out2,$out2,v25 | |
1531 | + vcipher $out3,$out3,v25 | |
1532 | + vcipher $out4,$out4,v25 | |
1533 | + vcipher $out5,$out5,v25 | |
1534 | + vcipher $out6,$out6,v25 | |
1535 | + vcipher $out7,$out7,v25 | |
1536 | + | |
1537 | + and r0,r0,r11 | |
1538 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
1539 | + vcipher $out0,$out0,v26 | |
1540 | + vcipher $out1,$out1,v26 | |
1541 | + vcipher $out2,$out2,v26 | |
1542 | + vcipher $out3,$out3,v26 | |
1543 | + vcipher $out4,$out4,v26 | |
1544 | + vcipher $out5,$out5,v26 | |
1545 | + vcipher $out6,$out6,v26 | |
1546 | + vcipher $out7,$out7,v26 | |
1547 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
1548 | + | |
1549 | + subic $len,$len,129 # $len-=129 | |
1550 | + vcipher $out0,$out0,v27 | |
1551 | + addi $len,$len,1 # $len-=128 really | |
1552 | + vcipher $out1,$out1,v27 | |
1553 | + vcipher $out2,$out2,v27 | |
1554 | + vcipher $out3,$out3,v27 | |
1555 | + vcipher $out4,$out4,v27 | |
1556 | + vcipher $out5,$out5,v27 | |
1557 | + vcipher $out6,$out6,v27 | |
1558 | + vcipher $out7,$out7,v27 | |
1559 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
1560 | + | |
1561 | + vcipher $out0,$out0,v28 | |
1562 | + lvx_u $in0,$x00,$inp # load input | |
1563 | + vcipher $out1,$out1,v28 | |
1564 | + lvx_u $in1,$x10,$inp | |
1565 | + vcipher $out2,$out2,v28 | |
1566 | + lvx_u $in2,$x20,$inp | |
1567 | + vcipher $out3,$out3,v28 | |
1568 | + lvx_u $in3,$x30,$inp | |
1569 | + vcipher $out4,$out4,v28 | |
1570 | + lvx_u $in4,$x40,$inp | |
1571 | + vcipher $out5,$out5,v28 | |
1572 | + lvx_u $in5,$x50,$inp | |
1573 | + vcipher $out6,$out6,v28 | |
1574 | + lvx_u $in6,$x60,$inp | |
1575 | + vcipher $out7,$out7,v28 | |
1576 | + lvx_u $in7,$x70,$inp | |
1577 | + addi $inp,$inp,0x80 | |
1578 | + | |
1579 | + vcipher $out0,$out0,v29 | |
1580 | + le?vperm $in0,$in0,$in0,$inpperm | |
1581 | + vcipher $out1,$out1,v29 | |
1582 | + le?vperm $in1,$in1,$in1,$inpperm | |
1583 | + vcipher $out2,$out2,v29 | |
1584 | + le?vperm $in2,$in2,$in2,$inpperm | |
1585 | + vcipher $out3,$out3,v29 | |
1586 | + le?vperm $in3,$in3,$in3,$inpperm | |
1587 | + vcipher $out4,$out4,v29 | |
1588 | + le?vperm $in4,$in4,$in4,$inpperm | |
1589 | + vcipher $out5,$out5,v29 | |
1590 | + le?vperm $in5,$in5,$in5,$inpperm | |
1591 | + vcipher $out6,$out6,v29 | |
1592 | + le?vperm $in6,$in6,$in6,$inpperm | |
1593 | + vcipher $out7,$out7,v29 | |
1594 | + le?vperm $in7,$in7,$in7,$inpperm | |
1595 | + | |
1596 | + add $inp,$inp,r0 # $inp is adjusted in such | |
1597 | + # way that at exit from the | |
1598 | + # loop inX-in7 are loaded | |
1599 | + # with last "words" | |
1600 | + subfe. r0,r0,r0 # borrow?-1:0 | |
1601 | + vcipher $out0,$out0,v30 | |
1602 | + vxor $in0,$in0,v31 # xor with last round key | |
1603 | + vcipher $out1,$out1,v30 | |
1604 | + vxor $in1,$in1,v31 | |
1605 | + vcipher $out2,$out2,v30 | |
1606 | + vxor $in2,$in2,v31 | |
1607 | + vcipher $out3,$out3,v30 | |
1608 | + vxor $in3,$in3,v31 | |
1609 | + vcipher $out4,$out4,v30 | |
1610 | + vxor $in4,$in4,v31 | |
1611 | + vcipher $out5,$out5,v30 | |
1612 | + vxor $in5,$in5,v31 | |
1613 | + vcipher $out6,$out6,v30 | |
1614 | + vxor $in6,$in6,v31 | |
1615 | + vcipher $out7,$out7,v30 | |
1616 | + vxor $in7,$in7,v31 | |
1617 | + | |
1618 | + bne Lctr32_enc8x_break # did $len-129 borrow? | |
1619 | + | |
1620 | + vcipherlast $in0,$out0,$in0 | |
1621 | + vcipherlast $in1,$out1,$in1 | |
1622 | + vadduwm $out1,$ivec,$one # counter values ... | |
1623 | + vcipherlast $in2,$out2,$in2 | |
1624 | + vadduwm $out2,$ivec,$two | |
1625 | + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] | |
1626 | + vcipherlast $in3,$out3,$in3 | |
1627 | + vadduwm $out3,$out1,$two | |
1628 | + vxor $out1,$out1,$rndkey0 | |
1629 | + vcipherlast $in4,$out4,$in4 | |
1630 | + vadduwm $out4,$out2,$two | |
1631 | + vxor $out2,$out2,$rndkey0 | |
1632 | + vcipherlast $in5,$out5,$in5 | |
1633 | + vadduwm $out5,$out3,$two | |
1634 | + vxor $out3,$out3,$rndkey0 | |
1635 | + vcipherlast $in6,$out6,$in6 | |
1636 | + vadduwm $out6,$out4,$two | |
1637 | + vxor $out4,$out4,$rndkey0 | |
1638 | + vcipherlast $in7,$out7,$in7 | |
1639 | + vadduwm $out7,$out5,$two | |
1640 | + vxor $out5,$out5,$rndkey0 | |
1641 | + le?vperm $in0,$in0,$in0,$inpperm | |
1642 | + vadduwm $ivec,$out6,$two # next counter value | |
1643 | + vxor $out6,$out6,$rndkey0 | |
1644 | + le?vperm $in1,$in1,$in1,$inpperm | |
1645 | + vxor $out7,$out7,$rndkey0 | |
1646 | + mtctr $rounds | |
1647 | + | |
1648 | + vcipher $out0,$out0,v24 | |
1649 | + stvx_u $in0,$x00,$out | |
1650 | + le?vperm $in2,$in2,$in2,$inpperm | |
1651 | + vcipher $out1,$out1,v24 | |
1652 | + stvx_u $in1,$x10,$out | |
1653 | + le?vperm $in3,$in3,$in3,$inpperm | |
1654 | + vcipher $out2,$out2,v24 | |
1655 | + stvx_u $in2,$x20,$out | |
1656 | + le?vperm $in4,$in4,$in4,$inpperm | |
1657 | + vcipher $out3,$out3,v24 | |
1658 | + stvx_u $in3,$x30,$out | |
1659 | + le?vperm $in5,$in5,$in5,$inpperm | |
1660 | + vcipher $out4,$out4,v24 | |
1661 | + stvx_u $in4,$x40,$out | |
1662 | + le?vperm $in6,$in6,$in6,$inpperm | |
1663 | + vcipher $out5,$out5,v24 | |
1664 | + stvx_u $in5,$x50,$out | |
1665 | + le?vperm $in7,$in7,$in7,$inpperm | |
1666 | + vcipher $out6,$out6,v24 | |
1667 | + stvx_u $in6,$x60,$out | |
1668 | + vcipher $out7,$out7,v24 | |
1669 | + stvx_u $in7,$x70,$out | |
1670 | + addi $out,$out,0x80 | |
1671 | + | |
1672 | + b Loop_ctr32_enc8x_middle | |
1673 | + | |
1674 | +.align 5 | |
1675 | +Lctr32_enc8x_break: | |
1676 | + cmpwi $len,-0x60 | |
1677 | + blt Lctr32_enc8x_one | |
1678 | + nop | |
1679 | + beq Lctr32_enc8x_two | |
1680 | + cmpwi $len,-0x40 | |
1681 | + blt Lctr32_enc8x_three | |
1682 | + nop | |
1683 | + beq Lctr32_enc8x_four | |
1684 | + cmpwi $len,-0x20 | |
1685 | + blt Lctr32_enc8x_five | |
1686 | + nop | |
1687 | + beq Lctr32_enc8x_six | |
1688 | + cmpwi $len,0x00 | |
1689 | + blt Lctr32_enc8x_seven | |
1690 | + | |
1691 | +Lctr32_enc8x_eight: | |
1692 | + vcipherlast $out0,$out0,$in0 | |
1693 | + vcipherlast $out1,$out1,$in1 | |
1694 | + vcipherlast $out2,$out2,$in2 | |
1695 | + vcipherlast $out3,$out3,$in3 | |
1696 | + vcipherlast $out4,$out4,$in4 | |
1697 | + vcipherlast $out5,$out5,$in5 | |
1698 | + vcipherlast $out6,$out6,$in6 | |
1699 | + vcipherlast $out7,$out7,$in7 | |
1700 | + | |
1701 | + le?vperm $out0,$out0,$out0,$inpperm | |
1702 | + le?vperm $out1,$out1,$out1,$inpperm | |
1703 | + stvx_u $out0,$x00,$out | |
1704 | + le?vperm $out2,$out2,$out2,$inpperm | |
1705 | + stvx_u $out1,$x10,$out | |
1706 | + le?vperm $out3,$out3,$out3,$inpperm | |
1707 | + stvx_u $out2,$x20,$out | |
1708 | + le?vperm $out4,$out4,$out4,$inpperm | |
1709 | + stvx_u $out3,$x30,$out | |
1710 | + le?vperm $out5,$out5,$out5,$inpperm | |
1711 | + stvx_u $out4,$x40,$out | |
1712 | + le?vperm $out6,$out6,$out6,$inpperm | |
1713 | + stvx_u $out5,$x50,$out | |
1714 | + le?vperm $out7,$out7,$out7,$inpperm | |
1715 | + stvx_u $out6,$x60,$out | |
1716 | + stvx_u $out7,$x70,$out | |
1717 | + addi $out,$out,0x80 | |
1718 | + b Lctr32_enc8x_done | |
1719 | + | |
1720 | +.align 5 | |
1721 | +Lctr32_enc8x_seven: | |
1722 | + vcipherlast $out0,$out0,$in1 | |
1723 | + vcipherlast $out1,$out1,$in2 | |
1724 | + vcipherlast $out2,$out2,$in3 | |
1725 | + vcipherlast $out3,$out3,$in4 | |
1726 | + vcipherlast $out4,$out4,$in5 | |
1727 | + vcipherlast $out5,$out5,$in6 | |
1728 | + vcipherlast $out6,$out6,$in7 | |
1729 | + | |
1730 | + le?vperm $out0,$out0,$out0,$inpperm | |
1731 | + le?vperm $out1,$out1,$out1,$inpperm | |
1732 | + stvx_u $out0,$x00,$out | |
1733 | + le?vperm $out2,$out2,$out2,$inpperm | |
1734 | + stvx_u $out1,$x10,$out | |
1735 | + le?vperm $out3,$out3,$out3,$inpperm | |
1736 | + stvx_u $out2,$x20,$out | |
1737 | + le?vperm $out4,$out4,$out4,$inpperm | |
1738 | + stvx_u $out3,$x30,$out | |
1739 | + le?vperm $out5,$out5,$out5,$inpperm | |
1740 | + stvx_u $out4,$x40,$out | |
1741 | + le?vperm $out6,$out6,$out6,$inpperm | |
1742 | + stvx_u $out5,$x50,$out | |
1743 | + stvx_u $out6,$x60,$out | |
1744 | + addi $out,$out,0x70 | |
1745 | + b Lctr32_enc8x_done | |
1746 | + | |
1747 | +.align 5 | |
1748 | +Lctr32_enc8x_six: | |
1749 | + vcipherlast $out0,$out0,$in2 | |
1750 | + vcipherlast $out1,$out1,$in3 | |
1751 | + vcipherlast $out2,$out2,$in4 | |
1752 | + vcipherlast $out3,$out3,$in5 | |
1753 | + vcipherlast $out4,$out4,$in6 | |
1754 | + vcipherlast $out5,$out5,$in7 | |
1755 | + | |
1756 | + le?vperm $out0,$out0,$out0,$inpperm | |
1757 | + le?vperm $out1,$out1,$out1,$inpperm | |
1758 | + stvx_u $out0,$x00,$out | |
1759 | + le?vperm $out2,$out2,$out2,$inpperm | |
1760 | + stvx_u $out1,$x10,$out | |
1761 | + le?vperm $out3,$out3,$out3,$inpperm | |
1762 | + stvx_u $out2,$x20,$out | |
1763 | + le?vperm $out4,$out4,$out4,$inpperm | |
1764 | + stvx_u $out3,$x30,$out | |
1765 | + le?vperm $out5,$out5,$out5,$inpperm | |
1766 | + stvx_u $out4,$x40,$out | |
1767 | + stvx_u $out5,$x50,$out | |
1768 | + addi $out,$out,0x60 | |
1769 | + b Lctr32_enc8x_done | |
1770 | + | |
1771 | +.align 5 | |
1772 | +Lctr32_enc8x_five: | |
1773 | + vcipherlast $out0,$out0,$in3 | |
1774 | + vcipherlast $out1,$out1,$in4 | |
1775 | + vcipherlast $out2,$out2,$in5 | |
1776 | + vcipherlast $out3,$out3,$in6 | |
1777 | + vcipherlast $out4,$out4,$in7 | |
1778 | + | |
1779 | + le?vperm $out0,$out0,$out0,$inpperm | |
1780 | + le?vperm $out1,$out1,$out1,$inpperm | |
1781 | + stvx_u $out0,$x00,$out | |
1782 | + le?vperm $out2,$out2,$out2,$inpperm | |
1783 | + stvx_u $out1,$x10,$out | |
1784 | + le?vperm $out3,$out3,$out3,$inpperm | |
1785 | + stvx_u $out2,$x20,$out | |
1786 | + le?vperm $out4,$out4,$out4,$inpperm | |
1787 | + stvx_u $out3,$x30,$out | |
1788 | + stvx_u $out4,$x40,$out | |
1789 | + addi $out,$out,0x50 | |
1790 | + b Lctr32_enc8x_done | |
1791 | + | |
1792 | +.align 5 | |
1793 | +Lctr32_enc8x_four: | |
1794 | + vcipherlast $out0,$out0,$in4 | |
1795 | + vcipherlast $out1,$out1,$in5 | |
1796 | + vcipherlast $out2,$out2,$in6 | |
1797 | + vcipherlast $out3,$out3,$in7 | |
1798 | + | |
1799 | + le?vperm $out0,$out0,$out0,$inpperm | |
1800 | + le?vperm $out1,$out1,$out1,$inpperm | |
1801 | + stvx_u $out0,$x00,$out | |
1802 | + le?vperm $out2,$out2,$out2,$inpperm | |
1803 | + stvx_u $out1,$x10,$out | |
1804 | + le?vperm $out3,$out3,$out3,$inpperm | |
1805 | + stvx_u $out2,$x20,$out | |
1806 | + stvx_u $out3,$x30,$out | |
1807 | + addi $out,$out,0x40 | |
1808 | + b Lctr32_enc8x_done | |
1809 | + | |
1810 | +.align 5 | |
1811 | +Lctr32_enc8x_three: | |
1812 | + vcipherlast $out0,$out0,$in5 | |
1813 | + vcipherlast $out1,$out1,$in6 | |
1814 | + vcipherlast $out2,$out2,$in7 | |
1815 | + | |
1816 | + le?vperm $out0,$out0,$out0,$inpperm | |
1817 | + le?vperm $out1,$out1,$out1,$inpperm | |
1818 | + stvx_u $out0,$x00,$out | |
1819 | + le?vperm $out2,$out2,$out2,$inpperm | |
1820 | + stvx_u $out1,$x10,$out | |
1821 | + stvx_u $out2,$x20,$out | |
1822 | + addi $out,$out,0x30 | |
1823 | + b Lctr32_enc8x_done | |
1824 | + | |
1825 | +.align 5 | |
1826 | +Lctr32_enc8x_two: | |
1827 | + vcipherlast $out0,$out0,$in6 | |
1828 | + vcipherlast $out1,$out1,$in7 | |
1829 | + | |
1830 | + le?vperm $out0,$out0,$out0,$inpperm | |
1831 | + le?vperm $out1,$out1,$out1,$inpperm | |
1832 | + stvx_u $out0,$x00,$out | |
1833 | + stvx_u $out1,$x10,$out | |
1834 | + addi $out,$out,0x20 | |
1835 | + b Lctr32_enc8x_done | |
1836 | + | |
1837 | +.align 5 | |
1838 | +Lctr32_enc8x_one: | |
1839 | + vcipherlast $out0,$out0,$in7 | |
1840 | + | |
1841 | + le?vperm $out0,$out0,$out0,$inpperm | |
1842 | + stvx_u $out0,0,$out | |
1843 | + addi $out,$out,0x10 | |
1844 | + | |
1845 | +Lctr32_enc8x_done: | |
1846 | + li r10,`$FRAME+15` | |
1847 | + li r11,`$FRAME+31` | |
1848 | + stvx $inpperm,r10,$sp # wipe copies of round keys | |
1849 | + addi r10,r10,32 | |
1850 | + stvx $inpperm,r11,$sp | |
1851 | + addi r11,r11,32 | |
1852 | + stvx $inpperm,r10,$sp | |
1853 | + addi r10,r10,32 | |
1854 | + stvx $inpperm,r11,$sp | |
1855 | + addi r11,r11,32 | |
1856 | + stvx $inpperm,r10,$sp | |
1857 | + addi r10,r10,32 | |
1858 | + stvx $inpperm,r11,$sp | |
1859 | + addi r11,r11,32 | |
1860 | + stvx $inpperm,r10,$sp | |
1861 | + addi r10,r10,32 | |
1862 | + stvx $inpperm,r11,$sp | |
1863 | + addi r11,r11,32 | |
1864 | + | |
1865 | + mtspr 256,$vrsave | |
1866 | + lvx v20,r10,$sp # ABI says so | |
1867 | + addi r10,r10,32 | |
1868 | + lvx v21,r11,$sp | |
1869 | + addi r11,r11,32 | |
1870 | + lvx v22,r10,$sp | |
1871 | + addi r10,r10,32 | |
1872 | + lvx v23,r11,$sp | |
1873 | + addi r11,r11,32 | |
1874 | + lvx v24,r10,$sp | |
1875 | + addi r10,r10,32 | |
1876 | + lvx v25,r11,$sp | |
1877 | + addi r11,r11,32 | |
1878 | + lvx v26,r10,$sp | |
1879 | + addi r10,r10,32 | |
1880 | + lvx v27,r11,$sp | |
1881 | + addi r11,r11,32 | |
1882 | + lvx v28,r10,$sp | |
1883 | + addi r10,r10,32 | |
1884 | + lvx v29,r11,$sp | |
1885 | + addi r11,r11,32 | |
1886 | + lvx v30,r10,$sp | |
1887 | + lvx v31,r11,$sp | |
1888 | + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
1889 | + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
1890 | + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
1891 | + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
1892 | + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
1893 | + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
1894 | + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` | |
1895 | + blr | |
1896 | + .long 0 | |
1897 | + .byte 0,12,0x04,0,0x80,6,6,0 | |
1898 | + .long 0 | |
1899 | +.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks | |
1900 | +___ | |
1901 | +}} }}} | |
1902 | + | |
1903 | +######################################################################### | |
1904 | +{{{ # XTS procedures # | |
1905 | +my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); | |
1906 | +my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); | |
1907 | +my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); | |
1908 | +my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); | |
1909 | +my $taillen = $key2; | |
1910 | + | |
1911 | + ($inp,$idx) = ($idx,$inp); # reassign | |
1912 | + | |
1913 | +$code.=<<___; | |
1914 | +.globl .${prefix}_xts_encrypt | |
1915 | +.align 5 | |
1916 | +.${prefix}_xts_encrypt: | |
1917 | + mr $inp,r3 # reassign | |
1918 | + li r3,-1 | |
1919 | + ${UCMP}i $len,16 | |
1920 | + bltlr- | |
1921 | + | |
1922 | + lis r0,0xfff0 | |
1923 | + mfspr r12,256 # save vrsave | |
1924 | + li r11,0 | |
1925 | + mtspr 256,r0 | |
1926 | + | |
1927 | + vspltisb $seven,0x07 # 0x070707..07 | |
1928 | + le?lvsl $leperm,r11,r11 | |
1929 | + le?vspltisb $tmp,0x0f | |
1930 | + le?vxor $leperm,$leperm,$seven | |
1931 | + | |
1932 | + li $idx,15 | |
1933 | + lvx $tweak,0,$ivp # load [unaligned] iv | |
1934 | + lvsl $inpperm,0,$ivp | |
1935 | + lvx $inptail,$idx,$ivp | |
1936 | + le?vxor $inpperm,$inpperm,$tmp | |
1937 | + vperm $tweak,$tweak,$inptail,$inpperm | |
1938 | + | |
1939 | + ?lvsl $keyperm,0,$key2 # prepare for unaligned key | |
1940 | + lwz $rounds,240($key2) | |
1941 | + srwi $rounds,$rounds,1 | |
1942 | + subi $rounds,$rounds,1 | |
1943 | + li $idx,16 | |
1944 | + | |
1945 | + neg r11,$inp | |
1946 | + lvsr $inpperm,0,r11 # prepare for unaligned load | |
1947 | + lvx $inout,0,$inp | |
1948 | + addi $inp,$inp,15 # 15 is not typo | |
1949 | + le?vxor $inpperm,$inpperm,$tmp | |
1950 | + | |
1951 | + lvx $rndkey0,0,$key2 | |
1952 | + lvx $rndkey1,$idx,$key2 | |
1953 | + addi $idx,$idx,16 | |
1954 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1955 | + vxor $tweak,$tweak,$rndkey0 | |
1956 | + lvx $rndkey0,$idx,$key2 | |
1957 | + addi $idx,$idx,16 | |
1958 | + mtctr $rounds | |
1959 | + | |
1960 | +Ltweak_xts_enc: | |
1961 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
1962 | + vcipher $tweak,$tweak,$rndkey1 | |
1963 | + lvx $rndkey1,$idx,$key2 | |
1964 | + addi $idx,$idx,16 | |
1965 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1966 | + vcipher $tweak,$tweak,$rndkey0 | |
1967 | + lvx $rndkey0,$idx,$key2 | |
1968 | + addi $idx,$idx,16 | |
1969 | + bdnz Ltweak_xts_enc | |
1970 | + | |
1971 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
1972 | + vcipher $tweak,$tweak,$rndkey1 | |
1973 | + lvx $rndkey1,$idx,$key2 | |
1974 | + li $idx,16 | |
1975 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
1976 | + vcipherlast $tweak,$tweak,$rndkey0 | |
1977 | + | |
1978 | + lvx $inptail,0,$inp | |
1979 | + addi $inp,$inp,16 | |
1980 | + | |
1981 | + ?lvsl $keyperm,0,$key1 # prepare for unaligned key | |
1982 | + lwz $rounds,240($key1) | |
1983 | + srwi $rounds,$rounds,1 | |
1984 | + subi $rounds,$rounds,1 | |
1985 | + li $idx,16 | |
1986 | + | |
1987 | + vslb $eighty7,$seven,$seven # 0x808080..80 | |
1988 | + vor $eighty7,$eighty7,$seven # 0x878787..87 | |
1989 | + vspltisb $tmp,1 # 0x010101..01 | |
1990 | + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 | |
1991 | + | |
1992 | + ${UCMP}i $len,96 | |
1993 | + bge _aesp8_xts_encrypt6x | |
1994 | + | |
1995 | + andi. $taillen,$len,15 | |
1996 | + subic r0,$len,32 | |
1997 | + subi $taillen,$taillen,16 | |
1998 | + subfe r0,r0,r0 | |
1999 | + and r0,r0,$taillen | |
2000 | + add $inp,$inp,r0 | |
2001 | + | |
2002 | + lvx $rndkey0,0,$key1 | |
2003 | + lvx $rndkey1,$idx,$key1 | |
2004 | + addi $idx,$idx,16 | |
2005 | + vperm $inout,$inout,$inptail,$inpperm | |
2006 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2007 | + vxor $inout,$inout,$tweak | |
2008 | + vxor $inout,$inout,$rndkey0 | |
2009 | + lvx $rndkey0,$idx,$key1 | |
2010 | + addi $idx,$idx,16 | |
2011 | + mtctr $rounds | |
2012 | + b Loop_xts_enc | |
2013 | + | |
2014 | +.align 5 | |
2015 | +Loop_xts_enc: | |
2016 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2017 | + vcipher $inout,$inout,$rndkey1 | |
2018 | + lvx $rndkey1,$idx,$key1 | |
2019 | + addi $idx,$idx,16 | |
2020 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2021 | + vcipher $inout,$inout,$rndkey0 | |
2022 | + lvx $rndkey0,$idx,$key1 | |
2023 | + addi $idx,$idx,16 | |
2024 | + bdnz Loop_xts_enc | |
2025 | + | |
2026 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2027 | + vcipher $inout,$inout,$rndkey1 | |
2028 | + lvx $rndkey1,$idx,$key1 | |
2029 | + li $idx,16 | |
2030 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2031 | + vxor $rndkey0,$rndkey0,$tweak | |
2032 | + vcipherlast $output,$inout,$rndkey0 | |
2033 | + | |
2034 | + le?vperm $tmp,$output,$output,$leperm | |
2035 | + be?nop | |
2036 | + le?stvx_u $tmp,0,$out | |
2037 | + be?stvx_u $output,0,$out | |
2038 | + addi $out,$out,16 | |
2039 | + | |
2040 | + subic. $len,$len,16 | |
2041 | + beq Lxts_enc_done | |
2042 | + | |
2043 | + vmr $inout,$inptail | |
2044 | + lvx $inptail,0,$inp | |
2045 | + addi $inp,$inp,16 | |
2046 | + lvx $rndkey0,0,$key1 | |
2047 | + lvx $rndkey1,$idx,$key1 | |
2048 | + addi $idx,$idx,16 | |
2049 | + | |
2050 | + subic r0,$len,32 | |
2051 | + subfe r0,r0,r0 | |
2052 | + and r0,r0,$taillen | |
2053 | + add $inp,$inp,r0 | |
2054 | + | |
2055 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2056 | + vaddubm $tweak,$tweak,$tweak | |
2057 | + vsldoi $tmp,$tmp,$tmp,15 | |
2058 | + vand $tmp,$tmp,$eighty7 | |
2059 | + vxor $tweak,$tweak,$tmp | |
2060 | + | |
2061 | + vperm $inout,$inout,$inptail,$inpperm | |
2062 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2063 | + vxor $inout,$inout,$tweak | |
2064 | + vxor $output,$output,$rndkey0 # just in case $len<16 | |
2065 | + vxor $inout,$inout,$rndkey0 | |
2066 | + lvx $rndkey0,$idx,$key1 | |
2067 | + addi $idx,$idx,16 | |
2068 | + | |
2069 | + mtctr $rounds | |
2070 | + ${UCMP}i $len,16 | |
2071 | + bge Loop_xts_enc | |
2072 | + | |
2073 | + vxor $output,$output,$tweak | |
2074 | + lvsr $inpperm,0,$len # $inpperm is no longer needed | |
2075 | + vxor $inptail,$inptail,$inptail # $inptail is no longer needed | |
2076 | + vspltisb $tmp,-1 | |
2077 | + vperm $inptail,$inptail,$tmp,$inpperm | |
2078 | + vsel $inout,$inout,$output,$inptail | |
2079 | + | |
2080 | + subi r11,$out,17 | |
2081 | + subi $out,$out,16 | |
2082 | + mtctr $len | |
2083 | + li $len,16 | |
2084 | +Loop_xts_enc_steal: | |
2085 | + lbzu r0,1(r11) | |
2086 | + stb r0,16(r11) | |
2087 | + bdnz Loop_xts_enc_steal | |
2088 | + | |
2089 | + mtctr $rounds | |
2090 | + b Loop_xts_enc # one more time... | |
2091 | + | |
2092 | +Lxts_enc_done: | |
2093 | + mtspr 256,r12 # restore vrsave | |
2094 | + li r3,0 | |
2095 | + blr | |
2096 | + .long 0 | |
2097 | + .byte 0,12,0x04,0,0x80,6,6,0 | |
2098 | + .long 0 | |
2099 | +.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt | |
2100 | + | |
2101 | +.globl .${prefix}_xts_decrypt | |
2102 | +.align 5 | |
2103 | +.${prefix}_xts_decrypt: | |
2104 | + mr $inp,r3 # reassign | |
2105 | + li r3,-1 | |
2106 | + ${UCMP}i $len,16 | |
2107 | + bltlr- | |
2108 | + | |
2109 | + lis r0,0xfff8 | |
2110 | + mfspr r12,256 # save vrsave | |
2111 | + li r11,0 | |
2112 | + mtspr 256,r0 | |
2113 | + | |
2114 | + andi. r0,$len,15 | |
2115 | + neg r0,r0 | |
2116 | + andi. r0,r0,16 | |
2117 | + sub $len,$len,r0 | |
2118 | + | |
2119 | + vspltisb $seven,0x07 # 0x070707..07 | |
2120 | + le?lvsl $leperm,r11,r11 | |
2121 | + le?vspltisb $tmp,0x0f | |
2122 | + le?vxor $leperm,$leperm,$seven | |
2123 | + | |
2124 | + li $idx,15 | |
2125 | + lvx $tweak,0,$ivp # load [unaligned] iv | |
2126 | + lvsl $inpperm,0,$ivp | |
2127 | + lvx $inptail,$idx,$ivp | |
2128 | + le?vxor $inpperm,$inpperm,$tmp | |
2129 | + vperm $tweak,$tweak,$inptail,$inpperm | |
2130 | + | |
2131 | + ?lvsl $keyperm,0,$key2 # prepare for unaligned key | |
2132 | + lwz $rounds,240($key2) | |
2133 | + srwi $rounds,$rounds,1 | |
2134 | + subi $rounds,$rounds,1 | |
2135 | + li $idx,16 | |
2136 | + | |
2137 | + neg r11,$inp | |
2138 | + lvsr $inpperm,0,r11 # prepare for unaligned load | |
2139 | + lvx $inout,0,$inp | |
2140 | + addi $inp,$inp,15 # 15 is not typo | |
2141 | + le?vxor $inpperm,$inpperm,$tmp | |
2142 | + | |
2143 | + lvx $rndkey0,0,$key2 | |
2144 | + lvx $rndkey1,$idx,$key2 | |
2145 | + addi $idx,$idx,16 | |
2146 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2147 | + vxor $tweak,$tweak,$rndkey0 | |
2148 | + lvx $rndkey0,$idx,$key2 | |
2149 | + addi $idx,$idx,16 | |
2150 | + mtctr $rounds | |
2151 | + | |
2152 | +Ltweak_xts_dec: | |
2153 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2154 | + vcipher $tweak,$tweak,$rndkey1 | |
2155 | + lvx $rndkey1,$idx,$key2 | |
2156 | + addi $idx,$idx,16 | |
2157 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2158 | + vcipher $tweak,$tweak,$rndkey0 | |
2159 | + lvx $rndkey0,$idx,$key2 | |
2160 | + addi $idx,$idx,16 | |
2161 | + bdnz Ltweak_xts_dec | |
2162 | + | |
2163 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2164 | + vcipher $tweak,$tweak,$rndkey1 | |
2165 | + lvx $rndkey1,$idx,$key2 | |
2166 | + li $idx,16 | |
2167 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2168 | + vcipherlast $tweak,$tweak,$rndkey0 | |
2169 | + | |
2170 | + lvx $inptail,0,$inp | |
2171 | + addi $inp,$inp,16 | |
2172 | + | |
2173 | + ?lvsl $keyperm,0,$key1 # prepare for unaligned key | |
2174 | + lwz $rounds,240($key1) | |
2175 | + srwi $rounds,$rounds,1 | |
2176 | + subi $rounds,$rounds,1 | |
2177 | + li $idx,16 | |
2178 | + | |
2179 | + vslb $eighty7,$seven,$seven # 0x808080..80 | |
2180 | + vor $eighty7,$eighty7,$seven # 0x878787..87 | |
2181 | + vspltisb $tmp,1 # 0x010101..01 | |
2182 | + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 | |
2183 | + | |
2184 | + ${UCMP}i $len,96 | |
2185 | + bge _aesp8_xts_decrypt6x | |
2186 | + | |
2187 | + lvx $rndkey0,0,$key1 | |
2188 | + lvx $rndkey1,$idx,$key1 | |
2189 | + addi $idx,$idx,16 | |
2190 | + vperm $inout,$inout,$inptail,$inpperm | |
2191 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2192 | + vxor $inout,$inout,$tweak | |
2193 | + vxor $inout,$inout,$rndkey0 | |
2194 | + lvx $rndkey0,$idx,$key1 | |
2195 | + addi $idx,$idx,16 | |
2196 | + mtctr $rounds | |
2197 | + | |
2198 | + ${UCMP}i $len,16 | |
2199 | + blt Ltail_xts_dec | |
2200 | + be?b Loop_xts_dec | |
2201 | + | |
2202 | +.align 5 | |
2203 | +Loop_xts_dec: | |
2204 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2205 | + vncipher $inout,$inout,$rndkey1 | |
2206 | + lvx $rndkey1,$idx,$key1 | |
2207 | + addi $idx,$idx,16 | |
2208 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2209 | + vncipher $inout,$inout,$rndkey0 | |
2210 | + lvx $rndkey0,$idx,$key1 | |
2211 | + addi $idx,$idx,16 | |
2212 | + bdnz Loop_xts_dec | |
2213 | + | |
2214 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2215 | + vncipher $inout,$inout,$rndkey1 | |
2216 | + lvx $rndkey1,$idx,$key1 | |
2217 | + li $idx,16 | |
2218 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2219 | + vxor $rndkey0,$rndkey0,$tweak | |
2220 | + vncipherlast $output,$inout,$rndkey0 | |
2221 | + | |
2222 | + le?vperm $tmp,$output,$output,$leperm | |
2223 | + be?nop | |
2224 | + le?stvx_u $tmp,0,$out | |
2225 | + be?stvx_u $output,0,$out | |
2226 | + addi $out,$out,16 | |
2227 | + | |
2228 | + subic. $len,$len,16 | |
2229 | + beq Lxts_dec_done | |
2230 | + | |
2231 | + vmr $inout,$inptail | |
2232 | + lvx $inptail,0,$inp | |
2233 | + addi $inp,$inp,16 | |
2234 | + lvx $rndkey0,0,$key1 | |
2235 | + lvx $rndkey1,$idx,$key1 | |
2236 | + addi $idx,$idx,16 | |
2237 | + | |
2238 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2239 | + vaddubm $tweak,$tweak,$tweak | |
2240 | + vsldoi $tmp,$tmp,$tmp,15 | |
2241 | + vand $tmp,$tmp,$eighty7 | |
2242 | + vxor $tweak,$tweak,$tmp | |
2243 | + | |
2244 | + vperm $inout,$inout,$inptail,$inpperm | |
2245 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2246 | + vxor $inout,$inout,$tweak | |
2247 | + vxor $inout,$inout,$rndkey0 | |
2248 | + lvx $rndkey0,$idx,$key1 | |
2249 | + addi $idx,$idx,16 | |
2250 | + | |
2251 | + mtctr $rounds | |
2252 | + ${UCMP}i $len,16 | |
2253 | + bge Loop_xts_dec | |
2254 | + | |
2255 | +Ltail_xts_dec: | |
2256 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2257 | + vaddubm $tweak1,$tweak,$tweak | |
2258 | + vsldoi $tmp,$tmp,$tmp,15 | |
2259 | + vand $tmp,$tmp,$eighty7 | |
2260 | + vxor $tweak1,$tweak1,$tmp | |
2261 | + | |
2262 | + subi $inp,$inp,16 | |
2263 | + add $inp,$inp,$len | |
2264 | + | |
2265 | + vxor $inout,$inout,$tweak # :-( | |
2266 | + vxor $inout,$inout,$tweak1 # :-) | |
2267 | + | |
2268 | +Loop_xts_dec_short: | |
2269 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2270 | + vncipher $inout,$inout,$rndkey1 | |
2271 | + lvx $rndkey1,$idx,$key1 | |
2272 | + addi $idx,$idx,16 | |
2273 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2274 | + vncipher $inout,$inout,$rndkey0 | |
2275 | + lvx $rndkey0,$idx,$key1 | |
2276 | + addi $idx,$idx,16 | |
2277 | + bdnz Loop_xts_dec_short | |
2278 | + | |
2279 | + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm | |
2280 | + vncipher $inout,$inout,$rndkey1 | |
2281 | + lvx $rndkey1,$idx,$key1 | |
2282 | + li $idx,16 | |
2283 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2284 | + vxor $rndkey0,$rndkey0,$tweak1 | |
2285 | + vncipherlast $output,$inout,$rndkey0 | |
2286 | + | |
2287 | + le?vperm $tmp,$output,$output,$leperm | |
2288 | + be?nop | |
2289 | + le?stvx_u $tmp,0,$out | |
2290 | + be?stvx_u $output,0,$out | |
2291 | + | |
2292 | + vmr $inout,$inptail | |
2293 | + lvx $inptail,0,$inp | |
2294 | + #addi $inp,$inp,16 | |
2295 | + lvx $rndkey0,0,$key1 | |
2296 | + lvx $rndkey1,$idx,$key1 | |
2297 | + addi $idx,$idx,16 | |
2298 | + vperm $inout,$inout,$inptail,$inpperm | |
2299 | + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm | |
2300 | + | |
2301 | + lvsr $inpperm,0,$len # $inpperm is no longer needed | |
2302 | + vxor $inptail,$inptail,$inptail # $inptail is no longer needed | |
2303 | + vspltisb $tmp,-1 | |
2304 | + vperm $inptail,$inptail,$tmp,$inpperm | |
2305 | + vsel $inout,$inout,$output,$inptail | |
2306 | + | |
2307 | + vxor $rndkey0,$rndkey0,$tweak | |
2308 | + vxor $inout,$inout,$rndkey0 | |
2309 | + lvx $rndkey0,$idx,$key1 | |
2310 | + addi $idx,$idx,16 | |
2311 | + | |
2312 | + subi r11,$out,1 | |
2313 | + mtctr $len | |
2314 | + li $len,16 | |
2315 | +Loop_xts_dec_steal: | |
2316 | + lbzu r0,1(r11) | |
2317 | + stb r0,16(r11) | |
2318 | + bdnz Loop_xts_dec_steal | |
2319 | + | |
2320 | + mtctr $rounds | |
2321 | + b Loop_xts_dec # one more time... | |
2322 | + | |
2323 | +Lxts_dec_done: | |
2324 | + mtspr 256,r12 # restore vrsave | |
2325 | + li r3,0 | |
2326 | + blr | |
2327 | + .long 0 | |
2328 | + .byte 0,12,0x04,0,0x80,6,6,0 | |
2329 | + .long 0 | |
2330 | +.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt | |
2331 | +___ | |
2332 | +######################################################################### | |
2333 | +{{ # Optimized XTS procedures # | |
2334 | +my $key_="r11"; | |
2335 | +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); | |
2336 | + $x00=0 if ($flavour =~ /osx/); | |
2337 | +my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); | |
2338 | +my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); | |
2339 | +my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); | |
2340 | +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys | |
2341 | + # v26-v31 last 6 round keys | |
2342 | +my ($keyperm)=($out0); # aliases with "caller", redundant assignment | |
2343 | +my $taillen=$x70; | |
2344 | + | |
2345 | +$code.=<<___; | |
2346 | +.align 5 | |
2347 | +_aesp8_xts_encrypt6x: | |
2348 | + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) | |
2349 | + mflr r0 | |
2350 | + li r7,`$FRAME+8*16+15` | |
2351 | + li r8,`$FRAME+8*16+31` | |
2352 | + $PUSH r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) | |
2353 | + stvx v20,r7,$sp # ABI says so | |
2354 | + addi r7,r7,32 | |
2355 | + stvx v21,r8,$sp | |
2356 | + addi r8,r8,32 | |
2357 | + stvx v22,r7,$sp | |
2358 | + addi r7,r7,32 | |
2359 | + stvx v23,r8,$sp | |
2360 | + addi r8,r8,32 | |
2361 | + stvx v24,r7,$sp | |
2362 | + addi r7,r7,32 | |
2363 | + stvx v25,r8,$sp | |
2364 | + addi r8,r8,32 | |
2365 | + stvx v26,r7,$sp | |
2366 | + addi r7,r7,32 | |
2367 | + stvx v27,r8,$sp | |
2368 | + addi r8,r8,32 | |
2369 | + stvx v28,r7,$sp | |
2370 | + addi r7,r7,32 | |
2371 | + stvx v29,r8,$sp | |
2372 | + addi r8,r8,32 | |
2373 | + stvx v30,r7,$sp | |
2374 | + stvx v31,r8,$sp | |
2375 | + mr r7,r0 | |
2376 | + li r0,-1 | |
2377 | + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave | |
2378 | + li $x10,0x10 | |
2379 | + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
2380 | + li $x20,0x20 | |
2381 | + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
2382 | + li $x30,0x30 | |
2383 | + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
2384 | + li $x40,0x40 | |
2385 | + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
2386 | + li $x50,0x50 | |
2387 | + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
2388 | + li $x60,0x60 | |
2389 | + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
2390 | + li $x70,0x70 | |
2391 | + mtspr 256,r0 | |
2392 | + | |
2393 | + subi $rounds,$rounds,3 # -4 in total | |
2394 | + | |
2395 | + lvx $rndkey0,$x00,$key1 # load key schedule | |
2396 | + lvx v30,$x10,$key1 | |
2397 | + addi $key1,$key1,0x20 | |
2398 | + lvx v31,$x00,$key1 | |
2399 | + ?vperm $rndkey0,$rndkey0,v30,$keyperm | |
2400 | + addi $key_,$sp,$FRAME+15 | |
2401 | + mtctr $rounds | |
2402 | + | |
2403 | +Load_xts_enc_key: | |
2404 | + ?vperm v24,v30,v31,$keyperm | |
2405 | + lvx v30,$x10,$key1 | |
2406 | + addi $key1,$key1,0x20 | |
2407 | + stvx v24,$x00,$key_ # off-load round[1] | |
2408 | + ?vperm v25,v31,v30,$keyperm | |
2409 | + lvx v31,$x00,$key1 | |
2410 | + stvx v25,$x10,$key_ # off-load round[2] | |
2411 | + addi $key_,$key_,0x20 | |
2412 | + bdnz Load_xts_enc_key | |
2413 | + | |
2414 | + lvx v26,$x10,$key1 | |
2415 | + ?vperm v24,v30,v31,$keyperm | |
2416 | + lvx v27,$x20,$key1 | |
2417 | + stvx v24,$x00,$key_ # off-load round[3] | |
2418 | + ?vperm v25,v31,v26,$keyperm | |
2419 | + lvx v28,$x30,$key1 | |
2420 | + stvx v25,$x10,$key_ # off-load round[4] | |
2421 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
2422 | + ?vperm v26,v26,v27,$keyperm | |
2423 | + lvx v29,$x40,$key1 | |
2424 | + ?vperm v27,v27,v28,$keyperm | |
2425 | + lvx v30,$x50,$key1 | |
2426 | + ?vperm v28,v28,v29,$keyperm | |
2427 | + lvx v31,$x60,$key1 | |
2428 | + ?vperm v29,v29,v30,$keyperm | |
2429 | + lvx $twk5,$x70,$key1 # borrow $twk5 | |
2430 | + ?vperm v30,v30,v31,$keyperm | |
2431 | + lvx v24,$x00,$key_ # pre-load round[1] | |
2432 | + ?vperm v31,v31,$twk5,$keyperm | |
2433 | + lvx v25,$x10,$key_ # pre-load round[2] | |
2434 | + | |
2435 | + vperm $in0,$inout,$inptail,$inpperm | |
2436 | + subi $inp,$inp,31 # undo "caller" | |
2437 | + vxor $twk0,$tweak,$rndkey0 | |
2438 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2439 | + vaddubm $tweak,$tweak,$tweak | |
2440 | + vsldoi $tmp,$tmp,$tmp,15 | |
2441 | + vand $tmp,$tmp,$eighty7 | |
2442 | + vxor $out0,$in0,$twk0 | |
2443 | + vxor $tweak,$tweak,$tmp | |
2444 | + | |
2445 | + lvx_u $in1,$x10,$inp | |
2446 | + vxor $twk1,$tweak,$rndkey0 | |
2447 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2448 | + vaddubm $tweak,$tweak,$tweak | |
2449 | + vsldoi $tmp,$tmp,$tmp,15 | |
2450 | + le?vperm $in1,$in1,$in1,$leperm | |
2451 | + vand $tmp,$tmp,$eighty7 | |
2452 | + vxor $out1,$in1,$twk1 | |
2453 | + vxor $tweak,$tweak,$tmp | |
2454 | + | |
2455 | + lvx_u $in2,$x20,$inp | |
2456 | + andi. $taillen,$len,15 | |
2457 | + vxor $twk2,$tweak,$rndkey0 | |
2458 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2459 | + vaddubm $tweak,$tweak,$tweak | |
2460 | + vsldoi $tmp,$tmp,$tmp,15 | |
2461 | + le?vperm $in2,$in2,$in2,$leperm | |
2462 | + vand $tmp,$tmp,$eighty7 | |
2463 | + vxor $out2,$in2,$twk2 | |
2464 | + vxor $tweak,$tweak,$tmp | |
2465 | + | |
2466 | + lvx_u $in3,$x30,$inp | |
2467 | + sub $len,$len,$taillen | |
2468 | + vxor $twk3,$tweak,$rndkey0 | |
2469 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2470 | + vaddubm $tweak,$tweak,$tweak | |
2471 | + vsldoi $tmp,$tmp,$tmp,15 | |
2472 | + le?vperm $in3,$in3,$in3,$leperm | |
2473 | + vand $tmp,$tmp,$eighty7 | |
2474 | + vxor $out3,$in3,$twk3 | |
2475 | + vxor $tweak,$tweak,$tmp | |
2476 | + | |
2477 | + lvx_u $in4,$x40,$inp | |
2478 | + subi $len,$len,0x60 | |
2479 | + vxor $twk4,$tweak,$rndkey0 | |
2480 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2481 | + vaddubm $tweak,$tweak,$tweak | |
2482 | + vsldoi $tmp,$tmp,$tmp,15 | |
2483 | + le?vperm $in4,$in4,$in4,$leperm | |
2484 | + vand $tmp,$tmp,$eighty7 | |
2485 | + vxor $out4,$in4,$twk4 | |
2486 | + vxor $tweak,$tweak,$tmp | |
2487 | + | |
2488 | + lvx_u $in5,$x50,$inp | |
2489 | + addi $inp,$inp,0x60 | |
2490 | + vxor $twk5,$tweak,$rndkey0 | |
2491 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2492 | + vaddubm $tweak,$tweak,$tweak | |
2493 | + vsldoi $tmp,$tmp,$tmp,15 | |
2494 | + le?vperm $in5,$in5,$in5,$leperm | |
2495 | + vand $tmp,$tmp,$eighty7 | |
2496 | + vxor $out5,$in5,$twk5 | |
2497 | + vxor $tweak,$tweak,$tmp | |
2498 | + | |
2499 | + vxor v31,v31,$rndkey0 | |
2500 | + mtctr $rounds | |
2501 | + b Loop_xts_enc6x | |
2502 | + | |
2503 | +.align 5 | |
2504 | +Loop_xts_enc6x: | |
2505 | + vcipher $out0,$out0,v24 | |
2506 | + vcipher $out1,$out1,v24 | |
2507 | + vcipher $out2,$out2,v24 | |
2508 | + vcipher $out3,$out3,v24 | |
2509 | + vcipher $out4,$out4,v24 | |
2510 | + vcipher $out5,$out5,v24 | |
2511 | + lvx v24,$x20,$key_ # round[3] | |
2512 | + addi $key_,$key_,0x20 | |
2513 | + | |
2514 | + vcipher $out0,$out0,v25 | |
2515 | + vcipher $out1,$out1,v25 | |
2516 | + vcipher $out2,$out2,v25 | |
2517 | + vcipher $out3,$out3,v25 | |
2518 | + vcipher $out4,$out4,v25 | |
2519 | + vcipher $out5,$out5,v25 | |
2520 | + lvx v25,$x10,$key_ # round[4] | |
2521 | + bdnz Loop_xts_enc6x | |
2522 | + | |
2523 | + subic $len,$len,96 # $len-=96 | |
2524 | + vxor $in0,$twk0,v31 # xor with last round key | |
2525 | + vcipher $out0,$out0,v24 | |
2526 | + vcipher $out1,$out1,v24 | |
2527 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2528 | + vxor $twk0,$tweak,$rndkey0 | |
2529 | + vaddubm $tweak,$tweak,$tweak | |
2530 | + vcipher $out2,$out2,v24 | |
2531 | + vcipher $out3,$out3,v24 | |
2532 | + vsldoi $tmp,$tmp,$tmp,15 | |
2533 | + vcipher $out4,$out4,v24 | |
2534 | + vcipher $out5,$out5,v24 | |
2535 | + | |
2536 | + subfe. r0,r0,r0 # borrow?-1:0 | |
2537 | + vand $tmp,$tmp,$eighty7 | |
2538 | + vcipher $out0,$out0,v25 | |
2539 | + vcipher $out1,$out1,v25 | |
2540 | + vxor $tweak,$tweak,$tmp | |
2541 | + vcipher $out2,$out2,v25 | |
2542 | + vcipher $out3,$out3,v25 | |
2543 | + vxor $in1,$twk1,v31 | |
2544 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2545 | + vxor $twk1,$tweak,$rndkey0 | |
2546 | + vcipher $out4,$out4,v25 | |
2547 | + vcipher $out5,$out5,v25 | |
2548 | + | |
2549 | + and r0,r0,$len | |
2550 | + vaddubm $tweak,$tweak,$tweak | |
2551 | + vsldoi $tmp,$tmp,$tmp,15 | |
2552 | + vcipher $out0,$out0,v26 | |
2553 | + vcipher $out1,$out1,v26 | |
2554 | + vand $tmp,$tmp,$eighty7 | |
2555 | + vcipher $out2,$out2,v26 | |
2556 | + vcipher $out3,$out3,v26 | |
2557 | + vxor $tweak,$tweak,$tmp | |
2558 | + vcipher $out4,$out4,v26 | |
2559 | + vcipher $out5,$out5,v26 | |
2560 | + | |
2561 | + add $inp,$inp,r0 # $inp is adjusted in such | |
2562 | + # way that at exit from the | |
2563 | + # loop inX-in5 are loaded | |
2564 | + # with last "words" | |
2565 | + vxor $in2,$twk2,v31 | |
2566 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2567 | + vxor $twk2,$tweak,$rndkey0 | |
2568 | + vaddubm $tweak,$tweak,$tweak | |
2569 | + vcipher $out0,$out0,v27 | |
2570 | + vcipher $out1,$out1,v27 | |
2571 | + vsldoi $tmp,$tmp,$tmp,15 | |
2572 | + vcipher $out2,$out2,v27 | |
2573 | + vcipher $out3,$out3,v27 | |
2574 | + vand $tmp,$tmp,$eighty7 | |
2575 | + vcipher $out4,$out4,v27 | |
2576 | + vcipher $out5,$out5,v27 | |
2577 | + | |
2578 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
2579 | + vxor $tweak,$tweak,$tmp | |
2580 | + vcipher $out0,$out0,v28 | |
2581 | + vcipher $out1,$out1,v28 | |
2582 | + vxor $in3,$twk3,v31 | |
2583 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2584 | + vxor $twk3,$tweak,$rndkey0 | |
2585 | + vcipher $out2,$out2,v28 | |
2586 | + vcipher $out3,$out3,v28 | |
2587 | + vaddubm $tweak,$tweak,$tweak | |
2588 | + vsldoi $tmp,$tmp,$tmp,15 | |
2589 | + vcipher $out4,$out4,v28 | |
2590 | + vcipher $out5,$out5,v28 | |
2591 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
2592 | + vand $tmp,$tmp,$eighty7 | |
2593 | + | |
2594 | + vcipher $out0,$out0,v29 | |
2595 | + vcipher $out1,$out1,v29 | |
2596 | + vxor $tweak,$tweak,$tmp | |
2597 | + vcipher $out2,$out2,v29 | |
2598 | + vcipher $out3,$out3,v29 | |
2599 | + vxor $in4,$twk4,v31 | |
2600 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2601 | + vxor $twk4,$tweak,$rndkey0 | |
2602 | + vcipher $out4,$out4,v29 | |
2603 | + vcipher $out5,$out5,v29 | |
2604 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
2605 | + vaddubm $tweak,$tweak,$tweak | |
2606 | + vsldoi $tmp,$tmp,$tmp,15 | |
2607 | + | |
2608 | + vcipher $out0,$out0,v30 | |
2609 | + vcipher $out1,$out1,v30 | |
2610 | + vand $tmp,$tmp,$eighty7 | |
2611 | + vcipher $out2,$out2,v30 | |
2612 | + vcipher $out3,$out3,v30 | |
2613 | + vxor $tweak,$tweak,$tmp | |
2614 | + vcipher $out4,$out4,v30 | |
2615 | + vcipher $out5,$out5,v30 | |
2616 | + vxor $in5,$twk5,v31 | |
2617 | + vsrab $tmp,$tweak,$seven # next tweak value | |
2618 | + vxor $twk5,$tweak,$rndkey0 | |
2619 | + | |
2620 | + vcipherlast $out0,$out0,$in0 | |
2621 | + lvx_u $in0,$x00,$inp # load next input block | |
2622 | + vaddubm $tweak,$tweak,$tweak | |
2623 | + vsldoi $tmp,$tmp,$tmp,15 | |
2624 | + vcipherlast $out1,$out1,$in1 | |
2625 | + lvx_u $in1,$x10,$inp | |
2626 | + vcipherlast $out2,$out2,$in2 | |
2627 | + le?vperm $in0,$in0,$in0,$leperm | |
2628 | + lvx_u $in2,$x20,$inp | |
2629 | + vand $tmp,$tmp,$eighty7 | |
2630 | + vcipherlast $out3,$out3,$in3 | |
2631 | + le?vperm $in1,$in1,$in1,$leperm | |
2632 | + lvx_u $in3,$x30,$inp | |
2633 | + vcipherlast $out4,$out4,$in4 | |
2634 | + le?vperm $in2,$in2,$in2,$leperm | |
2635 | + lvx_u $in4,$x40,$inp | |
2636 | + vxor $tweak,$tweak,$tmp | |
2637 | + vcipherlast $tmp,$out5,$in5 # last block might be needed | |
2638 | + # in stealing mode | |
2639 | + le?vperm $in3,$in3,$in3,$leperm | |
2640 | + lvx_u $in5,$x50,$inp | |
2641 | + addi $inp,$inp,0x60 | |
2642 | + le?vperm $in4,$in4,$in4,$leperm | |
2643 | + le?vperm $in5,$in5,$in5,$leperm | |
2644 | + | |
2645 | + le?vperm $out0,$out0,$out0,$leperm | |
2646 | + le?vperm $out1,$out1,$out1,$leperm | |
2647 | + stvx_u $out0,$x00,$out # store output | |
2648 | + vxor $out0,$in0,$twk0 | |
2649 | + le?vperm $out2,$out2,$out2,$leperm | |
2650 | + stvx_u $out1,$x10,$out | |
2651 | + vxor $out1,$in1,$twk1 | |
2652 | + le?vperm $out3,$out3,$out3,$leperm | |
2653 | + stvx_u $out2,$x20,$out | |
2654 | + vxor $out2,$in2,$twk2 | |
2655 | + le?vperm $out4,$out4,$out4,$leperm | |
2656 | + stvx_u $out3,$x30,$out | |
2657 | + vxor $out3,$in3,$twk3 | |
2658 | + le?vperm $out5,$tmp,$tmp,$leperm | |
2659 | + stvx_u $out4,$x40,$out | |
2660 | + vxor $out4,$in4,$twk4 | |
2661 | + le?stvx_u $out5,$x50,$out | |
2662 | + be?stvx_u $tmp, $x50,$out | |
2663 | + vxor $out5,$in5,$twk5 | |
2664 | + addi $out,$out,0x60 | |
2665 | + | |
2666 | + mtctr $rounds | |
2667 | + beq Loop_xts_enc6x # did $len-=96 borrow? | |
2668 | + | |
2669 | + addic. $len,$len,0x60 | |
2670 | + beq Lxts_enc6x_zero | |
2671 | + cmpwi $len,0x20 | |
2672 | + blt Lxts_enc6x_one | |
2673 | + nop | |
2674 | + beq Lxts_enc6x_two | |
2675 | + cmpwi $len,0x40 | |
2676 | + blt Lxts_enc6x_three | |
2677 | + nop | |
2678 | + beq Lxts_enc6x_four | |
2679 | + | |
2680 | +Lxts_enc6x_five: | |
2681 | + vxor $out0,$in1,$twk0 | |
2682 | + vxor $out1,$in2,$twk1 | |
2683 | + vxor $out2,$in3,$twk2 | |
2684 | + vxor $out3,$in4,$twk3 | |
2685 | + vxor $out4,$in5,$twk4 | |
2686 | + | |
2687 | + bl _aesp8_xts_enc5x | |
2688 | + | |
2689 | + le?vperm $out0,$out0,$out0,$leperm | |
2690 | + vmr $twk0,$twk5 # unused tweak | |
2691 | + le?vperm $out1,$out1,$out1,$leperm | |
2692 | + stvx_u $out0,$x00,$out # store output | |
2693 | + le?vperm $out2,$out2,$out2,$leperm | |
2694 | + stvx_u $out1,$x10,$out | |
2695 | + le?vperm $out3,$out3,$out3,$leperm | |
2696 | + stvx_u $out2,$x20,$out | |
2697 | + vxor $tmp,$out4,$twk5 # last block prep for stealing | |
2698 | + le?vperm $out4,$out4,$out4,$leperm | |
2699 | + stvx_u $out3,$x30,$out | |
2700 | + stvx_u $out4,$x40,$out | |
2701 | + addi $out,$out,0x50 | |
2702 | + bne Lxts_enc6x_steal | |
2703 | + b Lxts_enc6x_done | |
2704 | + | |
2705 | +.align 4 | |
2706 | +Lxts_enc6x_four: | |
2707 | + vxor $out0,$in2,$twk0 | |
2708 | + vxor $out1,$in3,$twk1 | |
2709 | + vxor $out2,$in4,$twk2 | |
2710 | + vxor $out3,$in5,$twk3 | |
2711 | + vxor $out4,$out4,$out4 | |
2712 | + | |
2713 | + bl _aesp8_xts_enc5x | |
2714 | + | |
2715 | + le?vperm $out0,$out0,$out0,$leperm | |
2716 | + vmr $twk0,$twk4 # unused tweak | |
2717 | + le?vperm $out1,$out1,$out1,$leperm | |
2718 | + stvx_u $out0,$x00,$out # store output | |
2719 | + le?vperm $out2,$out2,$out2,$leperm | |
2720 | + stvx_u $out1,$x10,$out | |
2721 | + vxor $tmp,$out3,$twk4 # last block prep for stealing | |
2722 | + le?vperm $out3,$out3,$out3,$leperm | |
2723 | + stvx_u $out2,$x20,$out | |
2724 | + stvx_u $out3,$x30,$out | |
2725 | + addi $out,$out,0x40 | |
2726 | + bne Lxts_enc6x_steal | |
2727 | + b Lxts_enc6x_done | |
2728 | + | |
2729 | +.align 4 | |
2730 | +Lxts_enc6x_three: | |
2731 | + vxor $out0,$in3,$twk0 | |
2732 | + vxor $out1,$in4,$twk1 | |
2733 | + vxor $out2,$in5,$twk2 | |
2734 | + vxor $out3,$out3,$out3 | |
2735 | + vxor $out4,$out4,$out4 | |
2736 | + | |
2737 | + bl _aesp8_xts_enc5x | |
2738 | + | |
2739 | + le?vperm $out0,$out0,$out0,$leperm | |
2740 | + vmr $twk0,$twk3 # unused tweak | |
2741 | + le?vperm $out1,$out1,$out1,$leperm | |
2742 | + stvx_u $out0,$x00,$out # store output | |
2743 | + vxor $tmp,$out2,$twk3 # last block prep for stealing | |
2744 | + le?vperm $out2,$out2,$out2,$leperm | |
2745 | + stvx_u $out1,$x10,$out | |
2746 | + stvx_u $out2,$x20,$out | |
2747 | + addi $out,$out,0x30 | |
2748 | + bne Lxts_enc6x_steal | |
2749 | + b Lxts_enc6x_done | |
2750 | + | |
2751 | +.align 4 | |
2752 | +Lxts_enc6x_two: | |
2753 | + vxor $out0,$in4,$twk0 | |
2754 | + vxor $out1,$in5,$twk1 | |
2755 | + vxor $out2,$out2,$out2 | |
2756 | + vxor $out3,$out3,$out3 | |
2757 | + vxor $out4,$out4,$out4 | |
2758 | + | |
2759 | + bl _aesp8_xts_enc5x | |
2760 | + | |
2761 | + le?vperm $out0,$out0,$out0,$leperm | |
2762 | + vmr $twk0,$twk2 # unused tweak | |
2763 | + vxor $tmp,$out1,$twk2 # last block prep for stealing | |
2764 | + le?vperm $out1,$out1,$out1,$leperm | |
2765 | + stvx_u $out0,$x00,$out # store output | |
2766 | + stvx_u $out1,$x10,$out | |
2767 | + addi $out,$out,0x20 | |
2768 | + bne Lxts_enc6x_steal | |
2769 | + b Lxts_enc6x_done | |
2770 | + | |
2771 | +.align 4 | |
2772 | +Lxts_enc6x_one: | |
2773 | + vxor $out0,$in5,$twk0 | |
2774 | + nop | |
2775 | +Loop_xts_enc1x: | |
2776 | + vcipher $out0,$out0,v24 | |
2777 | + lvx v24,$x20,$key_ # round[3] | |
2778 | + addi $key_,$key_,0x20 | |
2779 | + | |
2780 | + vcipher $out0,$out0,v25 | |
2781 | + lvx v25,$x10,$key_ # round[4] | |
2782 | + bdnz Loop_xts_enc1x | |
2783 | + | |
2784 | + add $inp,$inp,$taillen | |
2785 | + cmpwi $taillen,0 | |
2786 | + vcipher $out0,$out0,v24 | |
2787 | + | |
2788 | + subi $inp,$inp,16 | |
2789 | + vcipher $out0,$out0,v25 | |
2790 | + | |
2791 | + lvsr $inpperm,0,$taillen | |
2792 | + vcipher $out0,$out0,v26 | |
2793 | + | |
2794 | + lvx_u $in0,0,$inp | |
2795 | + vcipher $out0,$out0,v27 | |
2796 | + | |
2797 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
2798 | + vcipher $out0,$out0,v28 | |
2799 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
2800 | + | |
2801 | + vcipher $out0,$out0,v29 | |
2802 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
2803 | + vxor $twk0,$twk0,v31 | |
2804 | + | |
2805 | + le?vperm $in0,$in0,$in0,$leperm | |
2806 | + vcipher $out0,$out0,v30 | |
2807 | + | |
2808 | + vperm $in0,$in0,$in0,$inpperm | |
2809 | + vcipherlast $out0,$out0,$twk0 | |
2810 | + | |
2811 | + vmr $twk0,$twk1 # unused tweak | |
2812 | + vxor $tmp,$out0,$twk1 # last block prep for stealing | |
2813 | + le?vperm $out0,$out0,$out0,$leperm | |
2814 | + stvx_u $out0,$x00,$out # store output | |
2815 | + addi $out,$out,0x10 | |
2816 | + bne Lxts_enc6x_steal | |
2817 | + b Lxts_enc6x_done | |
2818 | + | |
2819 | +.align 4 | |
2820 | +Lxts_enc6x_zero: | |
2821 | + cmpwi $taillen,0 | |
2822 | + beq Lxts_enc6x_done | |
2823 | + | |
2824 | + add $inp,$inp,$taillen | |
2825 | + subi $inp,$inp,16 | |
2826 | + lvx_u $in0,0,$inp | |
2827 | + lvsr $inpperm,0,$taillen # $in5 is no more | |
2828 | + le?vperm $in0,$in0,$in0,$leperm | |
2829 | + vperm $in0,$in0,$in0,$inpperm | |
2830 | + vxor $tmp,$tmp,$twk0 | |
2831 | +Lxts_enc6x_steal: | |
2832 | + vxor $in0,$in0,$twk0 | |
2833 | + vxor $out0,$out0,$out0 | |
2834 | + vspltisb $out1,-1 | |
2835 | + vperm $out0,$out0,$out1,$inpperm | |
2836 | + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? | |
2837 | + | |
2838 | + subi r3,$out,17 | |
2839 | + subi $out,$out,16 | |
2840 | + mtctr $taillen | |
2841 | +Loop_xts_enc6x_steal: | |
2842 | + lbzu r0,1(r3) | |
2843 | + stb r0,16(r3) | |
2844 | + bdnz Loop_xts_enc6x_steal | |
2845 | + | |
2846 | + li $taillen,0 | |
2847 | + mtctr $rounds | |
2848 | + b Loop_xts_enc1x # one more time... | |
2849 | + | |
2850 | +.align 4 | |
2851 | +Lxts_enc6x_done: | |
2852 | + mtlr r7 | |
2853 | + li r10,`$FRAME+15` | |
2854 | + li r11,`$FRAME+31` | |
2855 | + stvx $seven,r10,$sp # wipe copies of round keys | |
2856 | + addi r10,r10,32 | |
2857 | + stvx $seven,r11,$sp | |
2858 | + addi r11,r11,32 | |
2859 | + stvx $seven,r10,$sp | |
2860 | + addi r10,r10,32 | |
2861 | + stvx $seven,r11,$sp | |
2862 | + addi r11,r11,32 | |
2863 | + stvx $seven,r10,$sp | |
2864 | + addi r10,r10,32 | |
2865 | + stvx $seven,r11,$sp | |
2866 | + addi r11,r11,32 | |
2867 | + stvx $seven,r10,$sp | |
2868 | + addi r10,r10,32 | |
2869 | + stvx $seven,r11,$sp | |
2870 | + addi r11,r11,32 | |
2871 | + | |
2872 | + mtspr 256,$vrsave | |
2873 | + lvx v20,r10,$sp # ABI says so | |
2874 | + addi r10,r10,32 | |
2875 | + lvx v21,r11,$sp | |
2876 | + addi r11,r11,32 | |
2877 | + lvx v22,r10,$sp | |
2878 | + addi r10,r10,32 | |
2879 | + lvx v23,r11,$sp | |
2880 | + addi r11,r11,32 | |
2881 | + lvx v24,r10,$sp | |
2882 | + addi r10,r10,32 | |
2883 | + lvx v25,r11,$sp | |
2884 | + addi r11,r11,32 | |
2885 | + lvx v26,r10,$sp | |
2886 | + addi r10,r10,32 | |
2887 | + lvx v27,r11,$sp | |
2888 | + addi r11,r11,32 | |
2889 | + lvx v28,r10,$sp | |
2890 | + addi r10,r10,32 | |
2891 | + lvx v29,r11,$sp | |
2892 | + addi r11,r11,32 | |
2893 | + lvx v30,r10,$sp | |
2894 | + lvx v31,r11,$sp | |
2895 | + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
2896 | + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
2897 | + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
2898 | + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
2899 | + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
2900 | + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
2901 | + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` | |
2902 | + blr | |
2903 | + .long 0 | |
2904 | + .byte 0,12,0x04,1,0x80,6,6,0 | |
2905 | + .long 0 | |
2906 | + | |
2907 | +.align 5 | |
2908 | +_aesp8_xts_enc5x: | |
2909 | + vcipher $out0,$out0,v24 | |
2910 | + vcipher $out1,$out1,v24 | |
2911 | + vcipher $out2,$out2,v24 | |
2912 | + vcipher $out3,$out3,v24 | |
2913 | + vcipher $out4,$out4,v24 | |
2914 | + lvx v24,$x20,$key_ # round[3] | |
2915 | + addi $key_,$key_,0x20 | |
2916 | + | |
2917 | + vcipher $out0,$out0,v25 | |
2918 | + vcipher $out1,$out1,v25 | |
2919 | + vcipher $out2,$out2,v25 | |
2920 | + vcipher $out3,$out3,v25 | |
2921 | + vcipher $out4,$out4,v25 | |
2922 | + lvx v25,$x10,$key_ # round[4] | |
2923 | + bdnz _aesp8_xts_enc5x | |
2924 | + | |
2925 | + add $inp,$inp,$taillen | |
2926 | + cmpwi $taillen,0 | |
2927 | + vcipher $out0,$out0,v24 | |
2928 | + vcipher $out1,$out1,v24 | |
2929 | + vcipher $out2,$out2,v24 | |
2930 | + vcipher $out3,$out3,v24 | |
2931 | + vcipher $out4,$out4,v24 | |
2932 | + | |
2933 | + subi $inp,$inp,16 | |
2934 | + vcipher $out0,$out0,v25 | |
2935 | + vcipher $out1,$out1,v25 | |
2936 | + vcipher $out2,$out2,v25 | |
2937 | + vcipher $out3,$out3,v25 | |
2938 | + vcipher $out4,$out4,v25 | |
2939 | + vxor $twk0,$twk0,v31 | |
2940 | + | |
2941 | + vcipher $out0,$out0,v26 | |
2942 | + lvsr $inpperm,r0,$taillen # $in5 is no more | |
2943 | + vcipher $out1,$out1,v26 | |
2944 | + vcipher $out2,$out2,v26 | |
2945 | + vcipher $out3,$out3,v26 | |
2946 | + vcipher $out4,$out4,v26 | |
2947 | + vxor $in1,$twk1,v31 | |
2948 | + | |
2949 | + vcipher $out0,$out0,v27 | |
2950 | + lvx_u $in0,0,$inp | |
2951 | + vcipher $out1,$out1,v27 | |
2952 | + vcipher $out2,$out2,v27 | |
2953 | + vcipher $out3,$out3,v27 | |
2954 | + vcipher $out4,$out4,v27 | |
2955 | + vxor $in2,$twk2,v31 | |
2956 | + | |
2957 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
2958 | + vcipher $out0,$out0,v28 | |
2959 | + vcipher $out1,$out1,v28 | |
2960 | + vcipher $out2,$out2,v28 | |
2961 | + vcipher $out3,$out3,v28 | |
2962 | + vcipher $out4,$out4,v28 | |
2963 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
2964 | + vxor $in3,$twk3,v31 | |
2965 | + | |
2966 | + vcipher $out0,$out0,v29 | |
2967 | + le?vperm $in0,$in0,$in0,$leperm | |
2968 | + vcipher $out1,$out1,v29 | |
2969 | + vcipher $out2,$out2,v29 | |
2970 | + vcipher $out3,$out3,v29 | |
2971 | + vcipher $out4,$out4,v29 | |
2972 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
2973 | + vxor $in4,$twk4,v31 | |
2974 | + | |
2975 | + vcipher $out0,$out0,v30 | |
2976 | + vperm $in0,$in0,$in0,$inpperm | |
2977 | + vcipher $out1,$out1,v30 | |
2978 | + vcipher $out2,$out2,v30 | |
2979 | + vcipher $out3,$out3,v30 | |
2980 | + vcipher $out4,$out4,v30 | |
2981 | + | |
2982 | + vcipherlast $out0,$out0,$twk0 | |
2983 | + vcipherlast $out1,$out1,$in1 | |
2984 | + vcipherlast $out2,$out2,$in2 | |
2985 | + vcipherlast $out3,$out3,$in3 | |
2986 | + vcipherlast $out4,$out4,$in4 | |
2987 | + blr | |
2988 | + .long 0 | |
2989 | + .byte 0,12,0x14,0,0,0,0,0 | |
2990 | + | |
2991 | +.align 5 | |
2992 | +_aesp8_xts_decrypt6x: | |
2993 | + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) | |
2994 | + mflr r0 | |
2995 | + li r7,`$FRAME+8*16+15` | |
2996 | + li r8,`$FRAME+8*16+31` | |
2997 | + $PUSH r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) | |
2998 | + stvx v20,r7,$sp # ABI says so | |
2999 | + addi r7,r7,32 | |
3000 | + stvx v21,r8,$sp | |
3001 | + addi r8,r8,32 | |
3002 | + stvx v22,r7,$sp | |
3003 | + addi r7,r7,32 | |
3004 | + stvx v23,r8,$sp | |
3005 | + addi r8,r8,32 | |
3006 | + stvx v24,r7,$sp | |
3007 | + addi r7,r7,32 | |
3008 | + stvx v25,r8,$sp | |
3009 | + addi r8,r8,32 | |
3010 | + stvx v26,r7,$sp | |
3011 | + addi r7,r7,32 | |
3012 | + stvx v27,r8,$sp | |
3013 | + addi r8,r8,32 | |
3014 | + stvx v28,r7,$sp | |
3015 | + addi r7,r7,32 | |
3016 | + stvx v29,r8,$sp | |
3017 | + addi r8,r8,32 | |
3018 | + stvx v30,r7,$sp | |
3019 | + stvx v31,r8,$sp | |
3020 | + mr r7,r0 | |
3021 | + li r0,-1 | |
3022 | + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave | |
3023 | + li $x10,0x10 | |
3024 | + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
3025 | + li $x20,0x20 | |
3026 | + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
3027 | + li $x30,0x30 | |
3028 | + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
3029 | + li $x40,0x40 | |
3030 | + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
3031 | + li $x50,0x50 | |
3032 | + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
3033 | + li $x60,0x60 | |
3034 | + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
3035 | + li $x70,0x70 | |
3036 | + mtspr 256,r0 | |
3037 | + | |
3038 | + subi $rounds,$rounds,3 # -4 in total | |
3039 | + | |
3040 | + lvx $rndkey0,$x00,$key1 # load key schedule | |
3041 | + lvx v30,$x10,$key1 | |
3042 | + addi $key1,$key1,0x20 | |
3043 | + lvx v31,$x00,$key1 | |
3044 | + ?vperm $rndkey0,$rndkey0,v30,$keyperm | |
3045 | + addi $key_,$sp,$FRAME+15 | |
3046 | + mtctr $rounds | |
3047 | + | |
3048 | +Load_xts_dec_key: | |
3049 | + ?vperm v24,v30,v31,$keyperm | |
3050 | + lvx v30,$x10,$key1 | |
3051 | + addi $key1,$key1,0x20 | |
3052 | + stvx v24,$x00,$key_ # off-load round[1] | |
3053 | + ?vperm v25,v31,v30,$keyperm | |
3054 | + lvx v31,$x00,$key1 | |
3055 | + stvx v25,$x10,$key_ # off-load round[2] | |
3056 | + addi $key_,$key_,0x20 | |
3057 | + bdnz Load_xts_dec_key | |
3058 | + | |
3059 | + lvx v26,$x10,$key1 | |
3060 | + ?vperm v24,v30,v31,$keyperm | |
3061 | + lvx v27,$x20,$key1 | |
3062 | + stvx v24,$x00,$key_ # off-load round[3] | |
3063 | + ?vperm v25,v31,v26,$keyperm | |
3064 | + lvx v28,$x30,$key1 | |
3065 | + stvx v25,$x10,$key_ # off-load round[4] | |
3066 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
3067 | + ?vperm v26,v26,v27,$keyperm | |
3068 | + lvx v29,$x40,$key1 | |
3069 | + ?vperm v27,v27,v28,$keyperm | |
3070 | + lvx v30,$x50,$key1 | |
3071 | + ?vperm v28,v28,v29,$keyperm | |
3072 | + lvx v31,$x60,$key1 | |
3073 | + ?vperm v29,v29,v30,$keyperm | |
3074 | + lvx $twk5,$x70,$key1 # borrow $twk5 | |
3075 | + ?vperm v30,v30,v31,$keyperm | |
3076 | + lvx v24,$x00,$key_ # pre-load round[1] | |
3077 | + ?vperm v31,v31,$twk5,$keyperm | |
3078 | + lvx v25,$x10,$key_ # pre-load round[2] | |
3079 | + | |
3080 | + vperm $in0,$inout,$inptail,$inpperm | |
3081 | + subi $inp,$inp,31 # undo "caller" | |
3082 | + vxor $twk0,$tweak,$rndkey0 | |
3083 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3084 | + vaddubm $tweak,$tweak,$tweak | |
3085 | + vsldoi $tmp,$tmp,$tmp,15 | |
3086 | + vand $tmp,$tmp,$eighty7 | |
3087 | + vxor $out0,$in0,$twk0 | |
3088 | + vxor $tweak,$tweak,$tmp | |
3089 | + | |
3090 | + lvx_u $in1,$x10,$inp | |
3091 | + vxor $twk1,$tweak,$rndkey0 | |
3092 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3093 | + vaddubm $tweak,$tweak,$tweak | |
3094 | + vsldoi $tmp,$tmp,$tmp,15 | |
3095 | + le?vperm $in1,$in1,$in1,$leperm | |
3096 | + vand $tmp,$tmp,$eighty7 | |
3097 | + vxor $out1,$in1,$twk1 | |
3098 | + vxor $tweak,$tweak,$tmp | |
3099 | + | |
3100 | + lvx_u $in2,$x20,$inp | |
3101 | + andi. $taillen,$len,15 | |
3102 | + vxor $twk2,$tweak,$rndkey0 | |
3103 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3104 | + vaddubm $tweak,$tweak,$tweak | |
3105 | + vsldoi $tmp,$tmp,$tmp,15 | |
3106 | + le?vperm $in2,$in2,$in2,$leperm | |
3107 | + vand $tmp,$tmp,$eighty7 | |
3108 | + vxor $out2,$in2,$twk2 | |
3109 | + vxor $tweak,$tweak,$tmp | |
3110 | + | |
3111 | + lvx_u $in3,$x30,$inp | |
3112 | + sub $len,$len,$taillen | |
3113 | + vxor $twk3,$tweak,$rndkey0 | |
3114 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3115 | + vaddubm $tweak,$tweak,$tweak | |
3116 | + vsldoi $tmp,$tmp,$tmp,15 | |
3117 | + le?vperm $in3,$in3,$in3,$leperm | |
3118 | + vand $tmp,$tmp,$eighty7 | |
3119 | + vxor $out3,$in3,$twk3 | |
3120 | + vxor $tweak,$tweak,$tmp | |
3121 | + | |
3122 | + lvx_u $in4,$x40,$inp | |
3123 | + subi $len,$len,0x60 | |
3124 | + vxor $twk4,$tweak,$rndkey0 | |
3125 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3126 | + vaddubm $tweak,$tweak,$tweak | |
3127 | + vsldoi $tmp,$tmp,$tmp,15 | |
3128 | + le?vperm $in4,$in4,$in4,$leperm | |
3129 | + vand $tmp,$tmp,$eighty7 | |
3130 | + vxor $out4,$in4,$twk4 | |
3131 | + vxor $tweak,$tweak,$tmp | |
3132 | + | |
3133 | + lvx_u $in5,$x50,$inp | |
3134 | + addi $inp,$inp,0x60 | |
3135 | + vxor $twk5,$tweak,$rndkey0 | |
3136 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3137 | + vaddubm $tweak,$tweak,$tweak | |
3138 | + vsldoi $tmp,$tmp,$tmp,15 | |
3139 | + le?vperm $in5,$in5,$in5,$leperm | |
3140 | + vand $tmp,$tmp,$eighty7 | |
3141 | + vxor $out5,$in5,$twk5 | |
3142 | + vxor $tweak,$tweak,$tmp | |
3143 | + | |
3144 | + vxor v31,v31,$rndkey0 | |
3145 | + mtctr $rounds | |
3146 | + b Loop_xts_dec6x | |
3147 | + | |
3148 | +.align 5 | |
3149 | +Loop_xts_dec6x: | |
3150 | + vncipher $out0,$out0,v24 | |
3151 | + vncipher $out1,$out1,v24 | |
3152 | + vncipher $out2,$out2,v24 | |
3153 | + vncipher $out3,$out3,v24 | |
3154 | + vncipher $out4,$out4,v24 | |
3155 | + vncipher $out5,$out5,v24 | |
3156 | + lvx v24,$x20,$key_ # round[3] | |
3157 | + addi $key_,$key_,0x20 | |
3158 | + | |
3159 | + vncipher $out0,$out0,v25 | |
3160 | + vncipher $out1,$out1,v25 | |
3161 | + vncipher $out2,$out2,v25 | |
3162 | + vncipher $out3,$out3,v25 | |
3163 | + vncipher $out4,$out4,v25 | |
3164 | + vncipher $out5,$out5,v25 | |
3165 | + lvx v25,$x10,$key_ # round[4] | |
3166 | + bdnz Loop_xts_dec6x | |
3167 | + | |
3168 | + subic $len,$len,96 # $len-=96 | |
3169 | + vxor $in0,$twk0,v31 # xor with last round key | |
3170 | + vncipher $out0,$out0,v24 | |
3171 | + vncipher $out1,$out1,v24 | |
3172 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3173 | + vxor $twk0,$tweak,$rndkey0 | |
3174 | + vaddubm $tweak,$tweak,$tweak | |
3175 | + vncipher $out2,$out2,v24 | |
3176 | + vncipher $out3,$out3,v24 | |
3177 | + vsldoi $tmp,$tmp,$tmp,15 | |
3178 | + vncipher $out4,$out4,v24 | |
3179 | + vncipher $out5,$out5,v24 | |
3180 | + | |
3181 | + subfe. r0,r0,r0 # borrow?-1:0 | |
3182 | + vand $tmp,$tmp,$eighty7 | |
3183 | + vncipher $out0,$out0,v25 | |
3184 | + vncipher $out1,$out1,v25 | |
3185 | + vxor $tweak,$tweak,$tmp | |
3186 | + vncipher $out2,$out2,v25 | |
3187 | + vncipher $out3,$out3,v25 | |
3188 | + vxor $in1,$twk1,v31 | |
3189 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3190 | + vxor $twk1,$tweak,$rndkey0 | |
3191 | + vncipher $out4,$out4,v25 | |
3192 | + vncipher $out5,$out5,v25 | |
3193 | + | |
3194 | + and r0,r0,$len | |
3195 | + vaddubm $tweak,$tweak,$tweak | |
3196 | + vsldoi $tmp,$tmp,$tmp,15 | |
3197 | + vncipher $out0,$out0,v26 | |
3198 | + vncipher $out1,$out1,v26 | |
3199 | + vand $tmp,$tmp,$eighty7 | |
3200 | + vncipher $out2,$out2,v26 | |
3201 | + vncipher $out3,$out3,v26 | |
3202 | + vxor $tweak,$tweak,$tmp | |
3203 | + vncipher $out4,$out4,v26 | |
3204 | + vncipher $out5,$out5,v26 | |
3205 | + | |
3206 | + add $inp,$inp,r0 # $inp is adjusted in such | |
3207 | + # way that at exit from the | |
3208 | + # loop inX-in5 are loaded | |
3209 | + # with last "words" | |
3210 | + vxor $in2,$twk2,v31 | |
3211 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3212 | + vxor $twk2,$tweak,$rndkey0 | |
3213 | + vaddubm $tweak,$tweak,$tweak | |
3214 | + vncipher $out0,$out0,v27 | |
3215 | + vncipher $out1,$out1,v27 | |
3216 | + vsldoi $tmp,$tmp,$tmp,15 | |
3217 | + vncipher $out2,$out2,v27 | |
3218 | + vncipher $out3,$out3,v27 | |
3219 | + vand $tmp,$tmp,$eighty7 | |
3220 | + vncipher $out4,$out4,v27 | |
3221 | + vncipher $out5,$out5,v27 | |
3222 | + | |
3223 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
3224 | + vxor $tweak,$tweak,$tmp | |
3225 | + vncipher $out0,$out0,v28 | |
3226 | + vncipher $out1,$out1,v28 | |
3227 | + vxor $in3,$twk3,v31 | |
3228 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3229 | + vxor $twk3,$tweak,$rndkey0 | |
3230 | + vncipher $out2,$out2,v28 | |
3231 | + vncipher $out3,$out3,v28 | |
3232 | + vaddubm $tweak,$tweak,$tweak | |
3233 | + vsldoi $tmp,$tmp,$tmp,15 | |
3234 | + vncipher $out4,$out4,v28 | |
3235 | + vncipher $out5,$out5,v28 | |
3236 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
3237 | + vand $tmp,$tmp,$eighty7 | |
3238 | + | |
3239 | + vncipher $out0,$out0,v29 | |
3240 | + vncipher $out1,$out1,v29 | |
3241 | + vxor $tweak,$tweak,$tmp | |
3242 | + vncipher $out2,$out2,v29 | |
3243 | + vncipher $out3,$out3,v29 | |
3244 | + vxor $in4,$twk4,v31 | |
3245 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3246 | + vxor $twk4,$tweak,$rndkey0 | |
3247 | + vncipher $out4,$out4,v29 | |
3248 | + vncipher $out5,$out5,v29 | |
3249 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
3250 | + vaddubm $tweak,$tweak,$tweak | |
3251 | + vsldoi $tmp,$tmp,$tmp,15 | |
3252 | + | |
3253 | + vncipher $out0,$out0,v30 | |
3254 | + vncipher $out1,$out1,v30 | |
3255 | + vand $tmp,$tmp,$eighty7 | |
3256 | + vncipher $out2,$out2,v30 | |
3257 | + vncipher $out3,$out3,v30 | |
3258 | + vxor $tweak,$tweak,$tmp | |
3259 | + vncipher $out4,$out4,v30 | |
3260 | + vncipher $out5,$out5,v30 | |
3261 | + vxor $in5,$twk5,v31 | |
3262 | + vsrab $tmp,$tweak,$seven # next tweak value | |
3263 | + vxor $twk5,$tweak,$rndkey0 | |
3264 | + | |
3265 | + vncipherlast $out0,$out0,$in0 | |
3266 | + lvx_u $in0,$x00,$inp # load next input block | |
3267 | + vaddubm $tweak,$tweak,$tweak | |
3268 | + vsldoi $tmp,$tmp,$tmp,15 | |
3269 | + vncipherlast $out1,$out1,$in1 | |
3270 | + lvx_u $in1,$x10,$inp | |
3271 | + vncipherlast $out2,$out2,$in2 | |
3272 | + le?vperm $in0,$in0,$in0,$leperm | |
3273 | + lvx_u $in2,$x20,$inp | |
3274 | + vand $tmp,$tmp,$eighty7 | |
3275 | + vncipherlast $out3,$out3,$in3 | |
3276 | + le?vperm $in1,$in1,$in1,$leperm | |
3277 | + lvx_u $in3,$x30,$inp | |
3278 | + vncipherlast $out4,$out4,$in4 | |
3279 | + le?vperm $in2,$in2,$in2,$leperm | |
3280 | + lvx_u $in4,$x40,$inp | |
3281 | + vxor $tweak,$tweak,$tmp | |
3282 | + vncipherlast $out5,$out5,$in5 | |
3283 | + le?vperm $in3,$in3,$in3,$leperm | |
3284 | + lvx_u $in5,$x50,$inp | |
3285 | + addi $inp,$inp,0x60 | |
3286 | + le?vperm $in4,$in4,$in4,$leperm | |
3287 | + le?vperm $in5,$in5,$in5,$leperm | |
3288 | + | |
3289 | + le?vperm $out0,$out0,$out0,$leperm | |
3290 | + le?vperm $out1,$out1,$out1,$leperm | |
3291 | + stvx_u $out0,$x00,$out # store output | |
3292 | + vxor $out0,$in0,$twk0 | |
3293 | + le?vperm $out2,$out2,$out2,$leperm | |
3294 | + stvx_u $out1,$x10,$out | |
3295 | + vxor $out1,$in1,$twk1 | |
3296 | + le?vperm $out3,$out3,$out3,$leperm | |
3297 | + stvx_u $out2,$x20,$out | |
3298 | + vxor $out2,$in2,$twk2 | |
3299 | + le?vperm $out4,$out4,$out4,$leperm | |
3300 | + stvx_u $out3,$x30,$out | |
3301 | + vxor $out3,$in3,$twk3 | |
3302 | + le?vperm $out5,$out5,$out5,$leperm | |
3303 | + stvx_u $out4,$x40,$out | |
3304 | + vxor $out4,$in4,$twk4 | |
3305 | + stvx_u $out5,$x50,$out | |
3306 | + vxor $out5,$in5,$twk5 | |
3307 | + addi $out,$out,0x60 | |
3308 | + | |
3309 | + mtctr $rounds | |
3310 | + beq Loop_xts_dec6x # did $len-=96 borrow? | |
3311 | + | |
3312 | + addic. $len,$len,0x60 | |
3313 | + beq Lxts_dec6x_zero | |
3314 | + cmpwi $len,0x20 | |
3315 | + blt Lxts_dec6x_one | |
3316 | + nop | |
3317 | + beq Lxts_dec6x_two | |
3318 | + cmpwi $len,0x40 | |
3319 | + blt Lxts_dec6x_three | |
3320 | + nop | |
3321 | + beq Lxts_dec6x_four | |
3322 | + | |
3323 | +Lxts_dec6x_five: | |
3324 | + vxor $out0,$in1,$twk0 | |
3325 | + vxor $out1,$in2,$twk1 | |
3326 | + vxor $out2,$in3,$twk2 | |
3327 | + vxor $out3,$in4,$twk3 | |
3328 | + vxor $out4,$in5,$twk4 | |
3329 | + | |
3330 | + bl _aesp8_xts_dec5x | |
3331 | + | |
3332 | + le?vperm $out0,$out0,$out0,$leperm | |
3333 | + vmr $twk0,$twk5 # unused tweak | |
3334 | + vxor $twk1,$tweak,$rndkey0 | |
3335 | + le?vperm $out1,$out1,$out1,$leperm | |
3336 | + stvx_u $out0,$x00,$out # store output | |
3337 | + vxor $out0,$in0,$twk1 | |
3338 | + le?vperm $out2,$out2,$out2,$leperm | |
3339 | + stvx_u $out1,$x10,$out | |
3340 | + le?vperm $out3,$out3,$out3,$leperm | |
3341 | + stvx_u $out2,$x20,$out | |
3342 | + le?vperm $out4,$out4,$out4,$leperm | |
3343 | + stvx_u $out3,$x30,$out | |
3344 | + stvx_u $out4,$x40,$out | |
3345 | + addi $out,$out,0x50 | |
3346 | + bne Lxts_dec6x_steal | |
3347 | + b Lxts_dec6x_done | |
3348 | + | |
3349 | +.align 4 | |
3350 | +Lxts_dec6x_four: | |
3351 | + vxor $out0,$in2,$twk0 | |
3352 | + vxor $out1,$in3,$twk1 | |
3353 | + vxor $out2,$in4,$twk2 | |
3354 | + vxor $out3,$in5,$twk3 | |
3355 | + vxor $out4,$out4,$out4 | |
3356 | + | |
3357 | + bl _aesp8_xts_dec5x | |
3358 | + | |
3359 | + le?vperm $out0,$out0,$out0,$leperm | |
3360 | + vmr $twk0,$twk4 # unused tweak | |
3361 | + vmr $twk1,$twk5 | |
3362 | + le?vperm $out1,$out1,$out1,$leperm | |
3363 | + stvx_u $out0,$x00,$out # store output | |
3364 | + vxor $out0,$in0,$twk5 | |
3365 | + le?vperm $out2,$out2,$out2,$leperm | |
3366 | + stvx_u $out1,$x10,$out | |
3367 | + le?vperm $out3,$out3,$out3,$leperm | |
3368 | + stvx_u $out2,$x20,$out | |
3369 | + stvx_u $out3,$x30,$out | |
3370 | + addi $out,$out,0x40 | |
3371 | + bne Lxts_dec6x_steal | |
3372 | + b Lxts_dec6x_done | |
3373 | + | |
3374 | +.align 4 | |
3375 | +Lxts_dec6x_three: | |
3376 | + vxor $out0,$in3,$twk0 | |
3377 | + vxor $out1,$in4,$twk1 | |
3378 | + vxor $out2,$in5,$twk2 | |
3379 | + vxor $out3,$out3,$out3 | |
3380 | + vxor $out4,$out4,$out4 | |
3381 | + | |
3382 | + bl _aesp8_xts_dec5x | |
3383 | + | |
3384 | + le?vperm $out0,$out0,$out0,$leperm | |
3385 | + vmr $twk0,$twk3 # unused tweak | |
3386 | + vmr $twk1,$twk4 | |
3387 | + le?vperm $out1,$out1,$out1,$leperm | |
3388 | + stvx_u $out0,$x00,$out # store output | |
3389 | + vxor $out0,$in0,$twk4 | |
3390 | + le?vperm $out2,$out2,$out2,$leperm | |
3391 | + stvx_u $out1,$x10,$out | |
3392 | + stvx_u $out2,$x20,$out | |
3393 | + addi $out,$out,0x30 | |
3394 | + bne Lxts_dec6x_steal | |
3395 | + b Lxts_dec6x_done | |
3396 | + | |
3397 | +.align 4 | |
3398 | +Lxts_dec6x_two: | |
3399 | + vxor $out0,$in4,$twk0 | |
3400 | + vxor $out1,$in5,$twk1 | |
3401 | + vxor $out2,$out2,$out2 | |
3402 | + vxor $out3,$out3,$out3 | |
3403 | + vxor $out4,$out4,$out4 | |
3404 | + | |
3405 | + bl _aesp8_xts_dec5x | |
3406 | + | |
3407 | + le?vperm $out0,$out0,$out0,$leperm | |
3408 | + vmr $twk0,$twk2 # unused tweak | |
3409 | + vmr $twk1,$twk3 | |
3410 | + le?vperm $out1,$out1,$out1,$leperm | |
3411 | + stvx_u $out0,$x00,$out # store output | |
3412 | + vxor $out0,$in0,$twk3 | |
3413 | + stvx_u $out1,$x10,$out | |
3414 | + addi $out,$out,0x20 | |
3415 | + bne Lxts_dec6x_steal | |
3416 | + b Lxts_dec6x_done | |
3417 | + | |
3418 | +.align 4 | |
3419 | +Lxts_dec6x_one: | |
3420 | + vxor $out0,$in5,$twk0 | |
3421 | + nop | |
3422 | +Loop_xts_dec1x: | |
3423 | + vncipher $out0,$out0,v24 | |
3424 | + lvx v24,$x20,$key_ # round[3] | |
3425 | + addi $key_,$key_,0x20 | |
3426 | + | |
3427 | + vncipher $out0,$out0,v25 | |
3428 | + lvx v25,$x10,$key_ # round[4] | |
3429 | + bdnz Loop_xts_dec1x | |
3430 | + | |
3431 | + subi r0,$taillen,1 | |
3432 | + vncipher $out0,$out0,v24 | |
3433 | + | |
3434 | + andi. r0,r0,16 | |
3435 | + cmpwi $taillen,0 | |
3436 | + vncipher $out0,$out0,v25 | |
3437 | + | |
3438 | + sub $inp,$inp,r0 | |
3439 | + vncipher $out0,$out0,v26 | |
3440 | + | |
3441 | + lvx_u $in0,0,$inp | |
3442 | + vncipher $out0,$out0,v27 | |
3443 | + | |
3444 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
3445 | + vncipher $out0,$out0,v28 | |
3446 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
3447 | + | |
3448 | + vncipher $out0,$out0,v29 | |
3449 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
3450 | + vxor $twk0,$twk0,v31 | |
3451 | + | |
3452 | + le?vperm $in0,$in0,$in0,$leperm | |
3453 | + vncipher $out0,$out0,v30 | |
3454 | + | |
3455 | + mtctr $rounds | |
3456 | + vncipherlast $out0,$out0,$twk0 | |
3457 | + | |
3458 | + vmr $twk0,$twk1 # unused tweak | |
3459 | + vmr $twk1,$twk2 | |
3460 | + le?vperm $out0,$out0,$out0,$leperm | |
3461 | + stvx_u $out0,$x00,$out # store output | |
3462 | + addi $out,$out,0x10 | |
3463 | + vxor $out0,$in0,$twk2 | |
3464 | + bne Lxts_dec6x_steal | |
3465 | + b Lxts_dec6x_done | |
3466 | + | |
3467 | +.align 4 | |
3468 | +Lxts_dec6x_zero: | |
3469 | + cmpwi $taillen,0 | |
3470 | + beq Lxts_dec6x_done | |
3471 | + | |
3472 | + lvx_u $in0,0,$inp | |
3473 | + le?vperm $in0,$in0,$in0,$leperm | |
3474 | + vxor $out0,$in0,$twk1 | |
3475 | +Lxts_dec6x_steal: | |
3476 | + vncipher $out0,$out0,v24 | |
3477 | + lvx v24,$x20,$key_ # round[3] | |
3478 | + addi $key_,$key_,0x20 | |
3479 | + | |
3480 | + vncipher $out0,$out0,v25 | |
3481 | + lvx v25,$x10,$key_ # round[4] | |
3482 | + bdnz Lxts_dec6x_steal | |
3483 | + | |
3484 | + add $inp,$inp,$taillen | |
3485 | + vncipher $out0,$out0,v24 | |
3486 | + | |
3487 | + cmpwi $taillen,0 | |
3488 | + vncipher $out0,$out0,v25 | |
3489 | + | |
3490 | + lvx_u $in0,0,$inp | |
3491 | + vncipher $out0,$out0,v26 | |
3492 | + | |
3493 | + lvsr $inpperm,0,$taillen # $in5 is no more | |
3494 | + vncipher $out0,$out0,v27 | |
3495 | + | |
3496 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
3497 | + vncipher $out0,$out0,v28 | |
3498 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
3499 | + | |
3500 | + vncipher $out0,$out0,v29 | |
3501 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
3502 | + vxor $twk1,$twk1,v31 | |
3503 | + | |
3504 | + le?vperm $in0,$in0,$in0,$leperm | |
3505 | + vncipher $out0,$out0,v30 | |
3506 | + | |
3507 | + vperm $in0,$in0,$in0,$inpperm | |
3508 | + vncipherlast $tmp,$out0,$twk1 | |
3509 | + | |
3510 | + le?vperm $out0,$tmp,$tmp,$leperm | |
3511 | + le?stvx_u $out0,0,$out | |
3512 | + be?stvx_u $tmp,0,$out | |
3513 | + | |
3514 | + vxor $out0,$out0,$out0 | |
3515 | + vspltisb $out1,-1 | |
3516 | + vperm $out0,$out0,$out1,$inpperm | |
3517 | + vsel $out0,$in0,$tmp,$out0 | |
3518 | + vxor $out0,$out0,$twk0 | |
3519 | + | |
3520 | + subi r3,$out,1 | |
3521 | + mtctr $taillen | |
3522 | +Loop_xts_dec6x_steal: | |
3523 | + lbzu r0,1(r3) | |
3524 | + stb r0,16(r3) | |
3525 | + bdnz Loop_xts_dec6x_steal | |
3526 | + | |
3527 | + li $taillen,0 | |
3528 | + mtctr $rounds | |
3529 | + b Loop_xts_dec1x # one more time... | |
3530 | + | |
3531 | +.align 4 | |
3532 | +Lxts_dec6x_done: | |
3533 | + mtlr r7 | |
3534 | + li r10,`$FRAME+15` | |
3535 | + li r11,`$FRAME+31` | |
3536 | + stvx $seven,r10,$sp # wipe copies of round keys | |
3537 | + addi r10,r10,32 | |
3538 | + stvx $seven,r11,$sp | |
3539 | + addi r11,r11,32 | |
3540 | + stvx $seven,r10,$sp | |
3541 | + addi r10,r10,32 | |
3542 | + stvx $seven,r11,$sp | |
3543 | + addi r11,r11,32 | |
3544 | + stvx $seven,r10,$sp | |
3545 | + addi r10,r10,32 | |
3546 | + stvx $seven,r11,$sp | |
3547 | + addi r11,r11,32 | |
3548 | + stvx $seven,r10,$sp | |
3549 | + addi r10,r10,32 | |
3550 | + stvx $seven,r11,$sp | |
3551 | + addi r11,r11,32 | |
3552 | + | |
3553 | + mtspr 256,$vrsave | |
3554 | + lvx v20,r10,$sp # ABI says so | |
3555 | + addi r10,r10,32 | |
3556 | + lvx v21,r11,$sp | |
3557 | + addi r11,r11,32 | |
3558 | + lvx v22,r10,$sp | |
3559 | + addi r10,r10,32 | |
3560 | + lvx v23,r11,$sp | |
3561 | + addi r11,r11,32 | |
3562 | + lvx v24,r10,$sp | |
3563 | + addi r10,r10,32 | |
3564 | + lvx v25,r11,$sp | |
3565 | + addi r11,r11,32 | |
3566 | + lvx v26,r10,$sp | |
3567 | + addi r10,r10,32 | |
3568 | + lvx v27,r11,$sp | |
3569 | + addi r11,r11,32 | |
3570 | + lvx v28,r10,$sp | |
3571 | + addi r10,r10,32 | |
3572 | + lvx v29,r11,$sp | |
3573 | + addi r11,r11,32 | |
3574 | + lvx v30,r10,$sp | |
3575 | + lvx v31,r11,$sp | |
3576 | + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
3577 | + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
3578 | + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
3579 | + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
3580 | + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
3581 | + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
3582 | + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` | |
3583 | + blr | |
3584 | + .long 0 | |
3585 | + .byte 0,12,0x04,1,0x80,6,6,0 | |
3586 | + .long 0 | |
3587 | + | |
3588 | +.align 5 | |
3589 | +_aesp8_xts_dec5x: | |
3590 | + vncipher $out0,$out0,v24 | |
3591 | + vncipher $out1,$out1,v24 | |
3592 | + vncipher $out2,$out2,v24 | |
3593 | + vncipher $out3,$out3,v24 | |
3594 | + vncipher $out4,$out4,v24 | |
3595 | + lvx v24,$x20,$key_ # round[3] | |
3596 | + addi $key_,$key_,0x20 | |
3597 | + | |
3598 | + vncipher $out0,$out0,v25 | |
3599 | + vncipher $out1,$out1,v25 | |
3600 | + vncipher $out2,$out2,v25 | |
3601 | + vncipher $out3,$out3,v25 | |
3602 | + vncipher $out4,$out4,v25 | |
3603 | + lvx v25,$x10,$key_ # round[4] | |
3604 | + bdnz _aesp8_xts_dec5x | |
3605 | + | |
3606 | + subi r0,$taillen,1 | |
3607 | + vncipher $out0,$out0,v24 | |
3608 | + vncipher $out1,$out1,v24 | |
3609 | + vncipher $out2,$out2,v24 | |
3610 | + vncipher $out3,$out3,v24 | |
3611 | + vncipher $out4,$out4,v24 | |
3612 | + | |
3613 | + andi. r0,r0,16 | |
3614 | + cmpwi $taillen,0 | |
3615 | + vncipher $out0,$out0,v25 | |
3616 | + vncipher $out1,$out1,v25 | |
3617 | + vncipher $out2,$out2,v25 | |
3618 | + vncipher $out3,$out3,v25 | |
3619 | + vncipher $out4,$out4,v25 | |
3620 | + vxor $twk0,$twk0,v31 | |
3621 | + | |
3622 | + sub $inp,$inp,r0 | |
3623 | + vncipher $out0,$out0,v26 | |
3624 | + vncipher $out1,$out1,v26 | |
3625 | + vncipher $out2,$out2,v26 | |
3626 | + vncipher $out3,$out3,v26 | |
3627 | + vncipher $out4,$out4,v26 | |
3628 | + vxor $in1,$twk1,v31 | |
3629 | + | |
3630 | + vncipher $out0,$out0,v27 | |
3631 | + lvx_u $in0,0,$inp | |
3632 | + vncipher $out1,$out1,v27 | |
3633 | + vncipher $out2,$out2,v27 | |
3634 | + vncipher $out3,$out3,v27 | |
3635 | + vncipher $out4,$out4,v27 | |
3636 | + vxor $in2,$twk2,v31 | |
3637 | + | |
3638 | + addi $key_,$sp,$FRAME+15 # rewind $key_ | |
3639 | + vncipher $out0,$out0,v28 | |
3640 | + vncipher $out1,$out1,v28 | |
3641 | + vncipher $out2,$out2,v28 | |
3642 | + vncipher $out3,$out3,v28 | |
3643 | + vncipher $out4,$out4,v28 | |
3644 | + lvx v24,$x00,$key_ # re-pre-load round[1] | |
3645 | + vxor $in3,$twk3,v31 | |
3646 | + | |
3647 | + vncipher $out0,$out0,v29 | |
3648 | + le?vperm $in0,$in0,$in0,$leperm | |
3649 | + vncipher $out1,$out1,v29 | |
3650 | + vncipher $out2,$out2,v29 | |
3651 | + vncipher $out3,$out3,v29 | |
3652 | + vncipher $out4,$out4,v29 | |
3653 | + lvx v25,$x10,$key_ # re-pre-load round[2] | |
3654 | + vxor $in4,$twk4,v31 | |
3655 | + | |
3656 | + vncipher $out0,$out0,v30 | |
3657 | + vncipher $out1,$out1,v30 | |
3658 | + vncipher $out2,$out2,v30 | |
3659 | + vncipher $out3,$out3,v30 | |
3660 | + vncipher $out4,$out4,v30 | |
3661 | + | |
3662 | + vncipherlast $out0,$out0,$twk0 | |
3663 | + vncipherlast $out1,$out1,$in1 | |
3664 | + vncipherlast $out2,$out2,$in2 | |
3665 | + vncipherlast $out3,$out3,$in3 | |
3666 | + vncipherlast $out4,$out4,$in4 | |
3667 | + mtctr $rounds | |
3668 | + blr | |
3669 | + .long 0 | |
3670 | + .byte 0,12,0x14,0,0,0,0,0 | |
3671 | +___ | |
3672 | +}} }}} | |
3673 | + | |
3674 | +my $consts=1; | |
3675 | +foreach(split("\n",$code)) { | |
3676 | + s/\`([^\`]*)\`/eval($1)/geo; | |
3677 | + | |
3678 | + # constants table endian-specific conversion | |
3679 | + if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { | |
3680 | + my $conv=$3; | |
3681 | + my @bytes=(); | |
3682 | + | |
3683 | + # convert to endian-agnostic format | |
3684 | + if ($1 eq "long") { | |
3685 | + foreach (split(/,\s*/,$2)) { | |
3686 | + my $l = /^0/?oct:int; | |
3687 | + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; | |
3688 | + } | |
3689 | + } else { | |
3690 | + @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); | |
3691 | + } | |
3692 | + | |
3693 | + # little-endian conversion | |
3694 | + if ($flavour =~ /le$/o) { | |
3695 | + SWITCH: for($conv) { | |
3696 | + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; | |
3697 | + /\?rev/ && do { @bytes=reverse(@bytes); last; }; | |
3698 | + } | |
3699 | + } | |
3700 | + | |
3701 | + #emit | |
3702 | + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; | |
3703 | + next; | |
3704 | + } | |
3705 | + $consts=0 if (m/Lconsts:/o); # end of table | |
3706 | + | |
3707 | + # instructions prefixed with '?' are endian-specific and need | |
3708 | + # to be adjusted accordingly... | |
3709 | + if ($flavour =~ /le$/o) { # little-endian | |
3710 | + s/le\?//o or | |
3711 | + s/be\?/#be#/o or | |
3712 | + s/\?lvsr/lvsl/o or | |
3713 | + s/\?lvsl/lvsr/o or | |
3714 | + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or | |
3715 | + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or | |
3716 | + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; | |
3717 | + } else { # big-endian | |
3718 | + s/le\?/#le#/o or | |
3719 | + s/be\?//o or | |
3720 | + s/\?([a-z]+)/$1/o; | |
3721 | + } | |
3722 | + | |
3723 | + print $_,"\n"; | |
3724 | +} | |
3725 | + | |
3726 | +close STDOUT; |
@@ -191,7 +191,7 @@ L1st: | ||
191 | 191 | |
192 | 192 | addi $j,$j,$BNSZ ; j++ |
193 | 193 | addi $tp,$tp,$BNSZ ; tp++ |
194 | - bdnz- L1st | |
194 | + bdnz L1st | |
195 | 195 | ;L1st |
196 | 196 | addc $lo0,$alo,$hi0 |
197 | 197 | addze $hi0,$ahi |
@@ -253,7 +253,7 @@ Linner: | ||
253 | 253 | addze $hi1,$hi1 |
254 | 254 | $ST $lo1,0($tp) ; tp[j-1] |
255 | 255 | addi $tp,$tp,$BNSZ ; tp++ |
256 | - bdnz- Linner | |
256 | + bdnz Linner | |
257 | 257 | ;Linner |
258 | 258 | $LD $tj,$BNSZ($tp) ; tp[j] |
259 | 259 | addc $lo0,$alo,$hi0 |
@@ -276,7 +276,7 @@ Linner: | ||
276 | 276 | slwi $tj,$num,`log($BNSZ)/log(2)` |
277 | 277 | $UCMP $i,$tj |
278 | 278 | addi $i,$i,$BNSZ |
279 | - ble- Louter | |
279 | + ble Louter | |
280 | 280 | |
281 | 281 | addi $num,$num,2 ; restore $num |
282 | 282 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] |
@@ -289,7 +289,7 @@ Lsub: $LDX $tj,$tp,$j | ||
289 | 289 | subfe $aj,$nj,$tj ; tp[j]-np[j] |
290 | 290 | $STX $aj,$rp,$j |
291 | 291 | addi $j,$j,$BNSZ |
292 | - bdnz- Lsub | |
292 | + bdnz Lsub | |
293 | 293 | |
294 | 294 | li $j,0 |
295 | 295 | mtctr $num |
@@ -304,7 +304,7 @@ Lcopy: ; copy or in-place refresh | ||
304 | 304 | $STX $tj,$rp,$j |
305 | 305 | $STX $j,$tp,$j ; zap at once |
306 | 306 | addi $j,$j,$BNSZ |
307 | - bdnz- Lcopy | |
307 | + bdnz Lcopy | |
308 | 308 | |
309 | 309 | $POP $tj,0($sp) |
310 | 310 | li r3,1 |
@@ -1552,7 +1552,7 @@ Lppcasm_sub_mainloop: | ||
1552 | 1552 | # if carry = 1 this is r7-r8. Else it |
1553 | 1553 | # is r7-r8 -1 as we need. |
1554 | 1554 | $STU r6,$BNSZ(r3) |
1555 | - bdnz- Lppcasm_sub_mainloop | |
1555 | + bdnz Lppcasm_sub_mainloop | |
1556 | 1556 | Lppcasm_sub_adios: |
1557 | 1557 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
1558 | 1558 | andi. r3,r3,1 # keep only last bit. |
@@ -1598,7 +1598,7 @@ Lppcasm_add_mainloop: | ||
1598 | 1598 | $LDU r8,$BNSZ(r5) |
1599 | 1599 | adde r8,r7,r8 |
1600 | 1600 | $STU r8,$BNSZ(r3) |
1601 | - bdnz- Lppcasm_add_mainloop | |
1601 | + bdnz Lppcasm_add_mainloop | |
1602 | 1602 | Lppcasm_add_adios: |
1603 | 1603 | addze r3,r0 #return carry bit. |
1604 | 1604 | blr |
@@ -1755,7 +1755,7 @@ Lppcasm_sqr_mainloop: | ||
1755 | 1755 | $UMULH r8,r6,r6 |
1756 | 1756 | $STU r7,$BNSZ(r3) |
1757 | 1757 | $STU r8,$BNSZ(r3) |
1758 | - bdnz- Lppcasm_sqr_mainloop | |
1758 | + bdnz Lppcasm_sqr_mainloop | |
1759 | 1759 | Lppcasm_sqr_adios: |
1760 | 1760 | blr |
1761 | 1761 | .long 0 |
@@ -1819,7 +1819,7 @@ Lppcasm_mw_LOOP: | ||
1819 | 1819 | |
1820 | 1820 | addi r3,r3,`4*$BNSZ` |
1821 | 1821 | addi r4,r4,`4*$BNSZ` |
1822 | - bdnz- Lppcasm_mw_LOOP | |
1822 | + bdnz Lppcasm_mw_LOOP | |
1823 | 1823 | |
1824 | 1824 | Lppcasm_mw_REM: |
1825 | 1825 | andi. r5,r5,0x3 |
@@ -561,7 +561,7 @@ $code.=<<___; | ||
561 | 561 | stfd $T3b,`$FRAME+56`($sp) |
562 | 562 | std $t0,8($tp) ; tp[j-1] |
563 | 563 | stdu $t4,16($tp) ; tp[j] |
564 | - bdnz- L1st | |
564 | + bdnz L1st | |
565 | 565 | |
566 | 566 | fctid $dota,$dota |
567 | 567 | fctid $dotb,$dotb |
@@ -856,7 +856,7 @@ $code.=<<___; | ||
856 | 856 | addze $carry,$carry |
857 | 857 | std $t3,-16($tp) ; tp[j-1] |
858 | 858 | std $t5,-8($tp) ; tp[j] |
859 | - bdnz- Linner | |
859 | + bdnz Linner | |
860 | 860 | |
861 | 861 | fctid $dota,$dota |
862 | 862 | fctid $dotb,$dotb |
@@ -954,7 +954,7 @@ Lsub: ldx $t0,$tp,$i | ||
954 | 954 | stdx $t0,$rp,$i |
955 | 955 | stdx $t2,$t6,$i |
956 | 956 | addi $i,$i,16 |
957 | - bdnz- Lsub | |
957 | + bdnz Lsub | |
958 | 958 | |
959 | 959 | li $i,0 |
960 | 960 | subfe $ovf,$i,$ovf ; handle upmost overflow bit |
@@ -981,7 +981,7 @@ Lcopy: ; copy or in-place refresh | ||
981 | 981 | stdx $i,$tp,$i ; zap tp at once |
982 | 982 | stdx $i,$t4,$i |
983 | 983 | addi $i,$i,16 |
984 | - bdnz- Lcopy | |
984 | + bdnz Lcopy | |
985 | 985 | ___ |
986 | 986 | $code.=<<___ if ($SIZE_T==4); |
987 | 987 | subf $np,$num,$np ; rewind np |
@@ -1014,7 +1014,7 @@ Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order | ||
1014 | 1014 | stw $t5,8($rp) |
1015 | 1015 | stw $t6,12($rp) |
1016 | 1016 | stwu $t7,16($rp) |
1017 | - bdnz- Lsub | |
1017 | + bdnz Lsub | |
1018 | 1018 | |
1019 | 1019 | li $i,0 |
1020 | 1020 | subfe $ovf,$i,$ovf ; handle upmost overflow bit |
@@ -1046,7 +1046,7 @@ Lcopy: ; copy or in-place refresh | ||
1046 | 1046 | stwu $t3,16($rp) |
1047 | 1047 | std $i,8($tp) ; zap tp at once |
1048 | 1048 | stdu $i,16($tp) |
1049 | - bdnz- Lcopy | |
1049 | + bdnz Lcopy | |
1050 | 1050 | ___ |
1051 | 1051 | |
1052 | 1052 | $code.=<<___; |
@@ -140,6 +140,19 @@ void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, | ||
140 | 140 | const unsigned char ivec[AES_BLOCK_SIZE]); |
141 | 141 | #endif |
142 | 142 | |
143 | +#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) | |
144 | +extern int OPENSSL_ppccap_P; | |
145 | +# define HWAES_CAPABLE (OPENSSL_ppccap_P & (1<<2)) | |
146 | +# define HWAES_set_encrypt_key aes_p8_set_encrypt_key | |
147 | +# define HWAES_set_decrypt_key aes_p8_set_decrypt_key | |
148 | +# define HWAES_encrypt aes_p8_encrypt | |
149 | +# define HWAES_decrypt aes_p8_decrypt | |
150 | +# define HWAES_cbc_encrypt aes_p8_cbc_encrypt | |
151 | +# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks | |
152 | +# define HWAES_xts_encrypt aes_p8_xts_encrypt | |
153 | +# define HWAES_xts_decrypt aes_p8_xts_decrypt | |
154 | +#endif | |
155 | + | |
143 | 156 | #if defined(AES_ASM) && !defined(I386_ONLY) && ( \ |
144 | 157 | ((defined(__i386) || defined(__i386__) || \ |
145 | 158 | defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \ |
@@ -498,6 +511,13 @@ void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out, | ||
498 | 511 | unsigned char *ivec, const int enc); |
499 | 512 | void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, |
500 | 513 | size_t len, const AES_KEY *key, const unsigned char ivec[16]); |
514 | +void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out, | |
515 | + size_t len, const AES_KEY *key1, | |
516 | + const AES_KEY *key2, const unsigned char iv[16]); | |
517 | +void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out, | |
518 | + size_t len, const AES_KEY *key1, | |
519 | + const AES_KEY *key2, const unsigned char iv[16]); | |
520 | + | |
501 | 521 | #endif |
502 | 522 | |
503 | 523 | #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \ |
@@ -1131,11 +1151,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, | ||
1131 | 1151 | { |
1132 | 1152 | HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
1133 | 1153 | xctx->xts.block1 = (block128_f)HWAES_encrypt; |
1154 | +#ifdef HWAES_xts_encrypt | |
1155 | + xctx->stream = HWAES_xts_encrypt; | |
1156 | +#endif | |
1134 | 1157 | } |
1135 | 1158 | else |
1136 | 1159 | { |
1137 | 1160 | HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
1138 | 1161 | xctx->xts.block1 = (block128_f)HWAES_decrypt; |
1162 | +#ifdef HWAES_xts_decrypt | |
1163 | + xctx->stream = HWAES_xts_decrypt; | |
1164 | +#endif | |
1139 | 1165 | } |
1140 | 1166 | |
1141 | 1167 | HWAES_set_encrypt_key(key + ctx->key_len/2, |
@@ -58,6 +58,8 @@ ghash-parisc.s: asm/ghash-parisc.pl | ||
58 | 58 | $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ |
59 | 59 | ghashv8-armx.S: asm/ghashv8-armx.pl |
60 | 60 | $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@ |
61 | +ghashp8-ppc.s: asm/ghashp8-ppc.pl | |
62 | + $(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@ | |
61 | 63 | |
62 | 64 | # GNU make "catch all" |
63 | 65 | ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
@@ -0,0 +1,663 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | +# | |
10 | +# GHASH for for PowerISA v2.07. | |
11 | +# | |
12 | +# July 2014 | |
13 | +# | |
14 | +# Accurate performance measurements are problematic, because it's | |
15 | +# always virtualized setup with possibly throttled processor. | |
16 | +# Relative comparison is therefore more informative. This initial | |
17 | +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x | |
18 | +# faster than "4-bit" integer-only compiler-generated 64-bit code. | |
19 | +# "Initial version" means that there is room for futher improvement. | |
20 | + | |
21 | +# May 2016 | |
22 | +# | |
23 | +# 2x aggregated reduction improves performance by 50% (resulting | |
24 | +# performance on POWER8 is 1 cycle per processed byte), and 4x | |
25 | +# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). | |
26 | + | |
27 | +$flavour=shift; | |
28 | +$output =shift; | |
29 | + | |
30 | +if ($flavour =~ /64/) { | |
31 | + $SIZE_T=8; | |
32 | + $LRSAVE=2*$SIZE_T; | |
33 | + $STU="stdu"; | |
34 | + $POP="ld"; | |
35 | + $PUSH="std"; | |
36 | + $UCMP="cmpld"; | |
37 | + $SHRI="srdi"; | |
38 | +} elsif ($flavour =~ /32/) { | |
39 | + $SIZE_T=4; | |
40 | + $LRSAVE=$SIZE_T; | |
41 | + $STU="stwu"; | |
42 | + $POP="lwz"; | |
43 | + $PUSH="stw"; | |
44 | + $UCMP="cmplw"; | |
45 | + $SHRI="srwi"; | |
46 | +} else { die "nonsense $flavour"; } | |
47 | + | |
48 | +$sp="r1"; | |
49 | +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload | |
50 | + | |
51 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
52 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
53 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
54 | +die "can't locate ppc-xlate.pl"; | |
55 | + | |
56 | +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; | |
57 | + | |
58 | +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block | |
59 | + | |
60 | +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); | |
61 | +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); | |
62 | +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); | |
63 | +my $vrsave="r12"; | |
64 | + | |
65 | +$code=<<___; | |
66 | +.machine "any" | |
67 | + | |
68 | +.text | |
69 | + | |
70 | +.globl .gcm_init_p8 | |
71 | +.align 5 | |
72 | +.gcm_init_p8: | |
73 | + li r0,-4096 | |
74 | + li r8,0x10 | |
75 | + mfspr $vrsave,256 | |
76 | + li r9,0x20 | |
77 | + mtspr 256,r0 | |
78 | + li r10,0x30 | |
79 | + lvx_u $H,0,r4 # load H | |
80 | + | |
81 | + vspltisb $xC2,-16 # 0xf0 | |
82 | + vspltisb $t0,1 # one | |
83 | + vaddubm $xC2,$xC2,$xC2 # 0xe0 | |
84 | + vxor $zero,$zero,$zero | |
85 | + vor $xC2,$xC2,$t0 # 0xe1 | |
86 | + vsldoi $xC2,$xC2,$zero,15 # 0xe1... | |
87 | + vsldoi $t1,$zero,$t0,1 # ...1 | |
88 | + vaddubm $xC2,$xC2,$xC2 # 0xc2... | |
89 | + vspltisb $t2,7 | |
90 | + vor $xC2,$xC2,$t1 # 0xc2....01 | |
91 | + vspltb $t1,$H,0 # most significant byte | |
92 | + vsl $H,$H,$t0 # H<<=1 | |
93 | + vsrab $t1,$t1,$t2 # broadcast carry bit | |
94 | + vand $t1,$t1,$xC2 | |
95 | + vxor $IN,$H,$t1 # twisted H | |
96 | + | |
97 | + vsldoi $H,$IN,$IN,8 # twist even more ... | |
98 | + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 | |
99 | + vsldoi $Hl,$zero,$H,8 # ... and split | |
100 | + vsldoi $Hh,$H,$zero,8 | |
101 | + | |
102 | + stvx_u $xC2,0,r3 # save pre-computed table | |
103 | + stvx_u $Hl,r8,r3 | |
104 | + li r8,0x40 | |
105 | + stvx_u $H, r9,r3 | |
106 | + li r9,0x50 | |
107 | + stvx_u $Hh,r10,r3 | |
108 | + li r10,0x60 | |
109 | + | |
110 | + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo | |
111 | + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi | |
112 | + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi | |
113 | + | |
114 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
115 | + | |
116 | + vsldoi $t0,$Xm,$zero,8 | |
117 | + vsldoi $t1,$zero,$Xm,8 | |
118 | + vxor $Xl,$Xl,$t0 | |
119 | + vxor $Xh,$Xh,$t1 | |
120 | + | |
121 | + vsldoi $Xl,$Xl,$Xl,8 | |
122 | + vxor $Xl,$Xl,$t2 | |
123 | + | |
124 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
125 | + vpmsumd $Xl,$Xl,$xC2 | |
126 | + vxor $t1,$t1,$Xh | |
127 | + vxor $IN1,$Xl,$t1 | |
128 | + | |
129 | + vsldoi $H2,$IN1,$IN1,8 | |
130 | + vsldoi $H2l,$zero,$H2,8 | |
131 | + vsldoi $H2h,$H2,$zero,8 | |
132 | + | |
133 | + stvx_u $H2l,r8,r3 # save H^2 | |
134 | + li r8,0x70 | |
135 | + stvx_u $H2,r9,r3 | |
136 | + li r9,0x80 | |
137 | + stvx_u $H2h,r10,r3 | |
138 | + li r10,0x90 | |
139 | +___ | |
140 | +{ | |
141 | +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); | |
142 | +$code.=<<___; | |
143 | + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo | |
144 | + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo | |
145 | + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi | |
146 | + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi | |
147 | + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi | |
148 | + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi | |
149 | + | |
150 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
151 | + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase | |
152 | + | |
153 | + vsldoi $t0,$Xm,$zero,8 | |
154 | + vsldoi $t1,$zero,$Xm,8 | |
155 | + vsldoi $t4,$Xm1,$zero,8 | |
156 | + vsldoi $t5,$zero,$Xm1,8 | |
157 | + vxor $Xl,$Xl,$t0 | |
158 | + vxor $Xh,$Xh,$t1 | |
159 | + vxor $Xl1,$Xl1,$t4 | |
160 | + vxor $Xh1,$Xh1,$t5 | |
161 | + | |
162 | + vsldoi $Xl,$Xl,$Xl,8 | |
163 | + vsldoi $Xl1,$Xl1,$Xl1,8 | |
164 | + vxor $Xl,$Xl,$t2 | |
165 | + vxor $Xl1,$Xl1,$t6 | |
166 | + | |
167 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
168 | + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase | |
169 | + vpmsumd $Xl,$Xl,$xC2 | |
170 | + vpmsumd $Xl1,$Xl1,$xC2 | |
171 | + vxor $t1,$t1,$Xh | |
172 | + vxor $t5,$t5,$Xh1 | |
173 | + vxor $Xl,$Xl,$t1 | |
174 | + vxor $Xl1,$Xl1,$t5 | |
175 | + | |
176 | + vsldoi $H,$Xl,$Xl,8 | |
177 | + vsldoi $H2,$Xl1,$Xl1,8 | |
178 | + vsldoi $Hl,$zero,$H,8 | |
179 | + vsldoi $Hh,$H,$zero,8 | |
180 | + vsldoi $H2l,$zero,$H2,8 | |
181 | + vsldoi $H2h,$H2,$zero,8 | |
182 | + | |
183 | + stvx_u $Hl,r8,r3 # save H^3 | |
184 | + li r8,0xa0 | |
185 | + stvx_u $H,r9,r3 | |
186 | + li r9,0xb0 | |
187 | + stvx_u $Hh,r10,r3 | |
188 | + li r10,0xc0 | |
189 | + stvx_u $H2l,r8,r3 # save H^4 | |
190 | + stvx_u $H2,r9,r3 | |
191 | + stvx_u $H2h,r10,r3 | |
192 | + | |
193 | + mtspr 256,$vrsave | |
194 | + blr | |
195 | + .long 0 | |
196 | + .byte 0,12,0x14,0,0,0,2,0 | |
197 | + .long 0 | |
198 | +.size .gcm_init_p8,.-.gcm_init_p8 | |
199 | +___ | |
200 | +} | |
201 | +$code.=<<___; | |
202 | +.globl .gcm_gmult_p8 | |
203 | +.align 5 | |
204 | +.gcm_gmult_p8: | |
205 | + lis r0,0xfff8 | |
206 | + li r8,0x10 | |
207 | + mfspr $vrsave,256 | |
208 | + li r9,0x20 | |
209 | + mtspr 256,r0 | |
210 | + li r10,0x30 | |
211 | + lvx_u $IN,0,$Xip # load Xi | |
212 | + | |
213 | + lvx_u $Hl,r8,$Htbl # load pre-computed table | |
214 | + le?lvsl $lemask,r0,r0 | |
215 | + lvx_u $H, r9,$Htbl | |
216 | + le?vspltisb $t0,0x07 | |
217 | + lvx_u $Hh,r10,$Htbl | |
218 | + le?vxor $lemask,$lemask,$t0 | |
219 | + lvx_u $xC2,0,$Htbl | |
220 | + le?vperm $IN,$IN,$IN,$lemask | |
221 | + vxor $zero,$zero,$zero | |
222 | + | |
223 | + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo | |
224 | + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi | |
225 | + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi | |
226 | + | |
227 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
228 | + | |
229 | + vsldoi $t0,$Xm,$zero,8 | |
230 | + vsldoi $t1,$zero,$Xm,8 | |
231 | + vxor $Xl,$Xl,$t0 | |
232 | + vxor $Xh,$Xh,$t1 | |
233 | + | |
234 | + vsldoi $Xl,$Xl,$Xl,8 | |
235 | + vxor $Xl,$Xl,$t2 | |
236 | + | |
237 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
238 | + vpmsumd $Xl,$Xl,$xC2 | |
239 | + vxor $t1,$t1,$Xh | |
240 | + vxor $Xl,$Xl,$t1 | |
241 | + | |
242 | + le?vperm $Xl,$Xl,$Xl,$lemask | |
243 | + stvx_u $Xl,0,$Xip # write out Xi | |
244 | + | |
245 | + mtspr 256,$vrsave | |
246 | + blr | |
247 | + .long 0 | |
248 | + .byte 0,12,0x14,0,0,0,2,0 | |
249 | + .long 0 | |
250 | +.size .gcm_gmult_p8,.-.gcm_gmult_p8 | |
251 | + | |
252 | +.globl .gcm_ghash_p8 | |
253 | +.align 5 | |
254 | +.gcm_ghash_p8: | |
255 | + li r0,-4096 | |
256 | + li r8,0x10 | |
257 | + mfspr $vrsave,256 | |
258 | + li r9,0x20 | |
259 | + mtspr 256,r0 | |
260 | + li r10,0x30 | |
261 | + lvx_u $Xl,0,$Xip # load Xi | |
262 | + | |
263 | + lvx_u $Hl,r8,$Htbl # load pre-computed table | |
264 | + li r8,0x40 | |
265 | + le?lvsl $lemask,r0,r0 | |
266 | + lvx_u $H, r9,$Htbl | |
267 | + li r9,0x50 | |
268 | + le?vspltisb $t0,0x07 | |
269 | + lvx_u $Hh,r10,$Htbl | |
270 | + li r10,0x60 | |
271 | + le?vxor $lemask,$lemask,$t0 | |
272 | + lvx_u $xC2,0,$Htbl | |
273 | + le?vperm $Xl,$Xl,$Xl,$lemask | |
274 | + vxor $zero,$zero,$zero | |
275 | + | |
276 | + ${UCMP}i $len,64 | |
277 | + bge Lgcm_ghash_p8_4x | |
278 | + | |
279 | + lvx_u $IN,0,$inp | |
280 | + addi $inp,$inp,16 | |
281 | + subic. $len,$len,16 | |
282 | + le?vperm $IN,$IN,$IN,$lemask | |
283 | + vxor $IN,$IN,$Xl | |
284 | + beq Lshort | |
285 | + | |
286 | + lvx_u $H2l,r8,$Htbl # load H^2 | |
287 | + li r8,16 | |
288 | + lvx_u $H2, r9,$Htbl | |
289 | + add r9,$inp,$len # end of input | |
290 | + lvx_u $H2h,r10,$Htbl | |
291 | + be?b Loop_2x | |
292 | + | |
293 | +.align 5 | |
294 | +Loop_2x: | |
295 | + lvx_u $IN1,0,$inp | |
296 | + le?vperm $IN1,$IN1,$IN1,$lemask | |
297 | + | |
298 | + subic $len,$len,32 | |
299 | + vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo | |
300 | + vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo | |
301 | + subfe r0,r0,r0 # borrow?-1:0 | |
302 | + vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi | |
303 | + vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi | |
304 | + and r0,r0,$len | |
305 | + vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi | |
306 | + vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi | |
307 | + add $inp,$inp,r0 | |
308 | + | |
309 | + vxor $Xl,$Xl,$Xl1 | |
310 | + vxor $Xm,$Xm,$Xm1 | |
311 | + | |
312 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
313 | + | |
314 | + vsldoi $t0,$Xm,$zero,8 | |
315 | + vsldoi $t1,$zero,$Xm,8 | |
316 | + vxor $Xh,$Xh,$Xh1 | |
317 | + vxor $Xl,$Xl,$t0 | |
318 | + vxor $Xh,$Xh,$t1 | |
319 | + | |
320 | + vsldoi $Xl,$Xl,$Xl,8 | |
321 | + vxor $Xl,$Xl,$t2 | |
322 | + lvx_u $IN,r8,$inp | |
323 | + addi $inp,$inp,32 | |
324 | + | |
325 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
326 | + vpmsumd $Xl,$Xl,$xC2 | |
327 | + le?vperm $IN,$IN,$IN,$lemask | |
328 | + vxor $t1,$t1,$Xh | |
329 | + vxor $IN,$IN,$t1 | |
330 | + vxor $IN,$IN,$Xl | |
331 | + $UCMP r9,$inp | |
332 | + bgt Loop_2x # done yet? | |
333 | + | |
334 | + cmplwi $len,0 | |
335 | + bne Leven | |
336 | + | |
337 | +Lshort: | |
338 | + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo | |
339 | + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi | |
340 | + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi | |
341 | + | |
342 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
343 | + | |
344 | + vsldoi $t0,$Xm,$zero,8 | |
345 | + vsldoi $t1,$zero,$Xm,8 | |
346 | + vxor $Xl,$Xl,$t0 | |
347 | + vxor $Xh,$Xh,$t1 | |
348 | + | |
349 | + vsldoi $Xl,$Xl,$Xl,8 | |
350 | + vxor $Xl,$Xl,$t2 | |
351 | + | |
352 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
353 | + vpmsumd $Xl,$Xl,$xC2 | |
354 | + vxor $t1,$t1,$Xh | |
355 | + | |
356 | +Leven: | |
357 | + vxor $Xl,$Xl,$t1 | |
358 | + le?vperm $Xl,$Xl,$Xl,$lemask | |
359 | + stvx_u $Xl,0,$Xip # write out Xi | |
360 | + | |
361 | + mtspr 256,$vrsave | |
362 | + blr | |
363 | + .long 0 | |
364 | + .byte 0,12,0x14,0,0,0,4,0 | |
365 | + .long 0 | |
366 | +___ | |
367 | +{ | |
368 | +my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h, | |
369 | + $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31)); | |
370 | +my $IN0=$IN; | |
371 | +my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h); | |
372 | + | |
373 | +$code.=<<___; | |
374 | +.align 5 | |
375 | +.gcm_ghash_p8_4x: | |
376 | +Lgcm_ghash_p8_4x: | |
377 | + $STU $sp,-$FRAME($sp) | |
378 | + li r10,`15+6*$SIZE_T` | |
379 | + li r11,`31+6*$SIZE_T` | |
380 | + stvx v20,r10,$sp | |
381 | + addi r10,r10,32 | |
382 | + stvx v21,r11,$sp | |
383 | + addi r11,r11,32 | |
384 | + stvx v22,r10,$sp | |
385 | + addi r10,r10,32 | |
386 | + stvx v23,r11,$sp | |
387 | + addi r11,r11,32 | |
388 | + stvx v24,r10,$sp | |
389 | + addi r10,r10,32 | |
390 | + stvx v25,r11,$sp | |
391 | + addi r11,r11,32 | |
392 | + stvx v26,r10,$sp | |
393 | + addi r10,r10,32 | |
394 | + stvx v27,r11,$sp | |
395 | + addi r11,r11,32 | |
396 | + stvx v28,r10,$sp | |
397 | + addi r10,r10,32 | |
398 | + stvx v29,r11,$sp | |
399 | + addi r11,r11,32 | |
400 | + stvx v30,r10,$sp | |
401 | + li r10,0x60 | |
402 | + stvx v31,r11,$sp | |
403 | + li r0,-1 | |
404 | + stw $vrsave,`$FRAME-4`($sp) # save vrsave | |
405 | + mtspr 256,r0 # preserve all AltiVec registers | |
406 | + | |
407 | + lvsl $t0,0,r8 # 0x0001..0e0f | |
408 | + #lvx_u $H2l,r8,$Htbl # load H^2 | |
409 | + li r8,0x70 | |
410 | + lvx_u $H2, r9,$Htbl | |
411 | + li r9,0x80 | |
412 | + vspltisb $t1,8 # 0x0808..0808 | |
413 | + #lvx_u $H2h,r10,$Htbl | |
414 | + li r10,0x90 | |
415 | + lvx_u $H3l,r8,$Htbl # load H^3 | |
416 | + li r8,0xa0 | |
417 | + lvx_u $H3, r9,$Htbl | |
418 | + li r9,0xb0 | |
419 | + lvx_u $H3h,r10,$Htbl | |
420 | + li r10,0xc0 | |
421 | + lvx_u $H4l,r8,$Htbl # load H^4 | |
422 | + li r8,0x10 | |
423 | + lvx_u $H4, r9,$Htbl | |
424 | + li r9,0x20 | |
425 | + lvx_u $H4h,r10,$Htbl | |
426 | + li r10,0x30 | |
427 | + | |
428 | + vsldoi $t2,$zero,$t1,8 # 0x0000..0808 | |
429 | + vaddubm $hiperm,$t0,$t2 # 0x0001..1617 | |
430 | + vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f | |
431 | + | |
432 | + $SHRI $len,$len,4 # this allows to use sign bit | |
433 | + # as carry | |
434 | + lvx_u $IN0,0,$inp # load input | |
435 | + lvx_u $IN1,r8,$inp | |
436 | + subic. $len,$len,8 | |
437 | + lvx_u $IN2,r9,$inp | |
438 | + lvx_u $IN3,r10,$inp | |
439 | + addi $inp,$inp,0x40 | |
440 | + le?vperm $IN0,$IN0,$IN0,$lemask | |
441 | + le?vperm $IN1,$IN1,$IN1,$lemask | |
442 | + le?vperm $IN2,$IN2,$IN2,$lemask | |
443 | + le?vperm $IN3,$IN3,$IN3,$lemask | |
444 | + | |
445 | + vxor $Xh,$IN0,$Xl | |
446 | + | |
447 | + vpmsumd $Xl1,$IN1,$H3l | |
448 | + vpmsumd $Xm1,$IN1,$H3 | |
449 | + vpmsumd $Xh1,$IN1,$H3h | |
450 | + | |
451 | + vperm $H21l,$H2,$H,$hiperm | |
452 | + vperm $t0,$IN2,$IN3,$loperm | |
453 | + vperm $H21h,$H2,$H,$loperm | |
454 | + vperm $t1,$IN2,$IN3,$hiperm | |
455 | + vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo | |
456 | + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo | |
457 | + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi | |
458 | + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi | |
459 | + | |
460 | + vxor $Xm2,$Xm2,$Xm1 | |
461 | + vxor $Xl3,$Xl3,$Xl1 | |
462 | + vxor $Xm3,$Xm3,$Xm2 | |
463 | + vxor $Xh3,$Xh3,$Xh1 | |
464 | + | |
465 | + blt Ltail_4x | |
466 | + | |
467 | +Loop_4x: | |
468 | + lvx_u $IN0,0,$inp | |
469 | + lvx_u $IN1,r8,$inp | |
470 | + subic. $len,$len,4 | |
471 | + lvx_u $IN2,r9,$inp | |
472 | + lvx_u $IN3,r10,$inp | |
473 | + addi $inp,$inp,0x40 | |
474 | + le?vperm $IN1,$IN1,$IN1,$lemask | |
475 | + le?vperm $IN2,$IN2,$IN2,$lemask | |
476 | + le?vperm $IN3,$IN3,$IN3,$lemask | |
477 | + le?vperm $IN0,$IN0,$IN0,$lemask | |
478 | + | |
479 | + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo | |
480 | + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi | |
481 | + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi | |
482 | + vpmsumd $Xl1,$IN1,$H3l | |
483 | + vpmsumd $Xm1,$IN1,$H3 | |
484 | + vpmsumd $Xh1,$IN1,$H3h | |
485 | + | |
486 | + vxor $Xl,$Xl,$Xl3 | |
487 | + vxor $Xm,$Xm,$Xm3 | |
488 | + vxor $Xh,$Xh,$Xh3 | |
489 | + vperm $t0,$IN2,$IN3,$loperm | |
490 | + vperm $t1,$IN2,$IN3,$hiperm | |
491 | + | |
492 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
493 | + vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo | |
494 | + vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi | |
495 | + | |
496 | + vsldoi $t0,$Xm,$zero,8 | |
497 | + vsldoi $t1,$zero,$Xm,8 | |
498 | + vxor $Xl,$Xl,$t0 | |
499 | + vxor $Xh,$Xh,$t1 | |
500 | + | |
501 | + vsldoi $Xl,$Xl,$Xl,8 | |
502 | + vxor $Xl,$Xl,$t2 | |
503 | + | |
504 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
505 | + vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi | |
506 | + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi | |
507 | + vpmsumd $Xl,$Xl,$xC2 | |
508 | + | |
509 | + vxor $Xl3,$Xl3,$Xl1 | |
510 | + vxor $Xh3,$Xh3,$Xh1 | |
511 | + vxor $Xh,$Xh,$IN0 | |
512 | + vxor $Xm2,$Xm2,$Xm1 | |
513 | + vxor $Xh,$Xh,$t1 | |
514 | + vxor $Xm3,$Xm3,$Xm2 | |
515 | + vxor $Xh,$Xh,$Xl | |
516 | + bge Loop_4x | |
517 | + | |
518 | +Ltail_4x: | |
519 | + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo | |
520 | + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi | |
521 | + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi | |
522 | + | |
523 | + vxor $Xl,$Xl,$Xl3 | |
524 | + vxor $Xm,$Xm,$Xm3 | |
525 | + | |
526 | + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase | |
527 | + | |
528 | + vsldoi $t0,$Xm,$zero,8 | |
529 | + vsldoi $t1,$zero,$Xm,8 | |
530 | + vxor $Xh,$Xh,$Xh3 | |
531 | + vxor $Xl,$Xl,$t0 | |
532 | + vxor $Xh,$Xh,$t1 | |
533 | + | |
534 | + vsldoi $Xl,$Xl,$Xl,8 | |
535 | + vxor $Xl,$Xl,$t2 | |
536 | + | |
537 | + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase | |
538 | + vpmsumd $Xl,$Xl,$xC2 | |
539 | + vxor $t1,$t1,$Xh | |
540 | + vxor $Xl,$Xl,$t1 | |
541 | + | |
542 | + addic. $len,$len,4 | |
543 | + beq Ldone_4x | |
544 | + | |
545 | + lvx_u $IN0,0,$inp | |
546 | + ${UCMP}i $len,2 | |
547 | + li $len,-4 | |
548 | + blt Lone | |
549 | + lvx_u $IN1,r8,$inp | |
550 | + beq Ltwo | |
551 | + | |
552 | +Lthree: | |
553 | + lvx_u $IN2,r9,$inp | |
554 | + le?vperm $IN0,$IN0,$IN0,$lemask | |
555 | + le?vperm $IN1,$IN1,$IN1,$lemask | |
556 | + le?vperm $IN2,$IN2,$IN2,$lemask | |
557 | + | |
558 | + vxor $Xh,$IN0,$Xl | |
559 | + vmr $H4l,$H3l | |
560 | + vmr $H4, $H3 | |
561 | + vmr $H4h,$H3h | |
562 | + | |
563 | + vperm $t0,$IN1,$IN2,$loperm | |
564 | + vperm $t1,$IN1,$IN2,$hiperm | |
565 | + vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo | |
566 | + vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi | |
567 | + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo | |
568 | + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi | |
569 | + | |
570 | + vxor $Xm3,$Xm3,$Xm2 | |
571 | + b Ltail_4x | |
572 | + | |
573 | +.align 4 | |
574 | +Ltwo: | |
575 | + le?vperm $IN0,$IN0,$IN0,$lemask | |
576 | + le?vperm $IN1,$IN1,$IN1,$lemask | |
577 | + | |
578 | + vxor $Xh,$IN0,$Xl | |
579 | + vperm $t0,$zero,$IN1,$loperm | |
580 | + vperm $t1,$zero,$IN1,$hiperm | |
581 | + | |
582 | + vsldoi $H4l,$zero,$H2,8 | |
583 | + vmr $H4, $H2 | |
584 | + vsldoi $H4h,$H2,$zero,8 | |
585 | + | |
586 | + vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo | |
587 | + vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi | |
588 | + vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi | |
589 | + | |
590 | + b Ltail_4x | |
591 | + | |
592 | +.align 4 | |
593 | +Lone: | |
594 | + le?vperm $IN0,$IN0,$IN0,$lemask | |
595 | + | |
596 | + vsldoi $H4l,$zero,$H,8 | |
597 | + vmr $H4, $H | |
598 | + vsldoi $H4h,$H,$zero,8 | |
599 | + | |
600 | + vxor $Xh,$IN0,$Xl | |
601 | + vxor $Xl3,$Xl3,$Xl3 | |
602 | + vxor $Xm3,$Xm3,$Xm3 | |
603 | + vxor $Xh3,$Xh3,$Xh3 | |
604 | + | |
605 | + b Ltail_4x | |
606 | + | |
607 | +Ldone_4x: | |
608 | + le?vperm $Xl,$Xl,$Xl,$lemask | |
609 | + stvx_u $Xl,0,$Xip # write out Xi | |
610 | + | |
611 | + li r10,`15+6*$SIZE_T` | |
612 | + li r11,`31+6*$SIZE_T` | |
613 | + mtspr 256,$vrsave | |
614 | + lvx v20,r10,$sp | |
615 | + addi r10,r10,32 | |
616 | + lvx v21,r11,$sp | |
617 | + addi r11,r11,32 | |
618 | + lvx v22,r10,$sp | |
619 | + addi r10,r10,32 | |
620 | + lvx v23,r11,$sp | |
621 | + addi r11,r11,32 | |
622 | + lvx v24,r10,$sp | |
623 | + addi r10,r10,32 | |
624 | + lvx v25,r11,$sp | |
625 | + addi r11,r11,32 | |
626 | + lvx v26,r10,$sp | |
627 | + addi r10,r10,32 | |
628 | + lvx v27,r11,$sp | |
629 | + addi r11,r11,32 | |
630 | + lvx v28,r10,$sp | |
631 | + addi r10,r10,32 | |
632 | + lvx v29,r11,$sp | |
633 | + addi r11,r11,32 | |
634 | + lvx v30,r10,$sp | |
635 | + lvx v31,r11,$sp | |
636 | + addi $sp,$sp,$FRAME | |
637 | + blr | |
638 | + .long 0 | |
639 | + .byte 0,12,0x04,0,0x80,0,4,0 | |
640 | + .long 0 | |
641 | +___ | |
642 | +} | |
643 | +$code.=<<___; | |
644 | +.size .gcm_ghash_p8,.-.gcm_ghash_p8 | |
645 | + | |
646 | +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" | |
647 | +.align 2 | |
648 | +___ | |
649 | + | |
650 | +foreach (split("\n",$code)) { | |
651 | + s/\`([^\`]*)\`/eval $1/geo; | |
652 | + | |
653 | + if ($flavour =~ /le$/o) { # little-endian | |
654 | + s/le\?//o or | |
655 | + s/be\?/#be#/o; | |
656 | + } else { | |
657 | + s/le\?/#le#/o or | |
658 | + s/be\?//o; | |
659 | + } | |
660 | + print $_,"\n"; | |
661 | +} | |
662 | + | |
663 | +close STDOUT; # enforce flush |
@@ -683,6 +683,14 @@ void gcm_init_v8(u128 Htable[16],const u64 Xi[2]); | ||
683 | 683 | void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); |
684 | 684 | void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); |
685 | 685 | # endif |
686 | +# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) | |
687 | +# define GHASH_ASM_PPC | |
688 | +# define GCM_FUNCREF_4BIT | |
689 | +extern int OPENSSL_ppccap_P; | |
690 | +void gcm_init_p8(u128 Htable[16], const u64 Xi[2]); | |
691 | +void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]); | |
692 | +void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp, | |
693 | + size_t len); | |
686 | 694 | # elif defined(_TMS320C6400_PLUS) |
687 | 695 | # define GHASH_ASM_C64Xplus |
688 | 696 | # endif |
@@ -767,6 +775,16 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) | ||
767 | 775 | ctx->gmult = gcm_gmult_4bit; |
768 | 776 | ctx->ghash = gcm_ghash_4bit; |
769 | 777 | } |
778 | +# elif defined(GHASH_ASM_PPC) | |
779 | + if (OPENSSL_ppccap_P & (1<<2)) { | |
780 | + gcm_init_p8(ctx->Htable, ctx->H.u); | |
781 | + ctx->gmult = gcm_gmult_p8; | |
782 | + ctx->ghash = gcm_ghash_p8; | |
783 | + } else { | |
784 | + gcm_init_4bit(ctx->Htable, ctx->H.u); | |
785 | + ctx->gmult = gcm_gmult_4bit; | |
786 | + ctx->ghash = gcm_ghash_4bit; | |
787 | + } | |
770 | 788 | # elif defined(GHASH_ASM_C64Xplus) |
771 | 789 | /* C64x+ assembler doesn't use tables, skip gcm_init_4bit. |
772 | 790 | * This is likely to trigger "function never referenced" |
@@ -27,7 +27,8 @@ my $globl = sub { | ||
27 | 27 | /osx/ && do { $name = "_$name"; |
28 | 28 | last; |
29 | 29 | }; |
30 | - /linux.*32/ && do { $ret .= ".globl $name\n"; | |
30 | + /linux.*(32|64le)/ | |
31 | + && do { $ret .= ".globl $name\n"; | |
31 | 32 | $ret .= ".type $name,\@function"; |
32 | 33 | last; |
33 | 34 | }; |
@@ -37,7 +38,6 @@ my $globl = sub { | ||
37 | 38 | $ret .= ".align 3\n"; |
38 | 39 | $ret .= "$name:\n"; |
39 | 40 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; |
40 | - $ret .= ".size $name,24\n"; | |
41 | 41 | $ret .= ".previous\n"; |
42 | 42 | |
43 | 43 | $name = ".$name"; |
@@ -50,7 +50,9 @@ my $globl = sub { | ||
50 | 50 | $ret; |
51 | 51 | }; |
52 | 52 | my $text = sub { |
53 | - ($flavour =~ /aix/) ? ".csect" : ".text"; | |
53 | + my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; | |
54 | + $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); | |
55 | + $ret; | |
54 | 56 | }; |
55 | 57 | my $machine = sub { |
56 | 58 | my $junk = shift; |
@@ -62,9 +64,12 @@ my $machine = sub { | ||
62 | 64 | ".machine $arch"; |
63 | 65 | }; |
64 | 66 | my $size = sub { |
65 | - if ($flavour =~ /linux.*32/) | |
67 | + if ($flavour =~ /linux/) | |
66 | 68 | { shift; |
67 | - ".size " . join(",",@_); | |
69 | + my $name = shift; $name =~ s|^[\.\_]||; | |
70 | + my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name; | |
71 | + $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/); | |
72 | + $ret; | |
68 | 73 | } |
69 | 74 | else |
70 | 75 | { ""; } |
@@ -77,6 +82,25 @@ my $asciz = sub { | ||
77 | 82 | else |
78 | 83 | { ""; } |
79 | 84 | }; |
85 | +my $quad = sub { | |
86 | + shift; | |
87 | + my @ret; | |
88 | + my ($hi,$lo); | |
89 | + for (@_) { | |
90 | + if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) | |
91 | + { $hi=$1?"0x$1":"0"; $lo="0x$2"; } | |
92 | + elsif (/^([0-9]+)$/o) | |
93 | + { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl | |
94 | + else | |
95 | + { $hi=undef; $lo=$_; } | |
96 | + | |
97 | + if (defined($hi)) | |
98 | + { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } | |
99 | + else | |
100 | + { push(@ret,".quad $lo"); } | |
101 | + } | |
102 | + join("\n",@ret); | |
103 | +}; | |
80 | 104 | |
81 | 105 | ################################################################ |
82 | 106 | # simplified mnemonics not handled by at least one assembler |
@@ -122,6 +146,66 @@ my $extrdi = sub { | ||
122 | 146 | $b = ($b+$n)&63; $n = 64-$n; |
123 | 147 | " rldicl $ra,$rs,$b,$n"; |
124 | 148 | }; |
149 | +my $vmr = sub { | |
150 | + my ($f,$vx,$vy) = @_; | |
151 | + " vor $vx,$vy,$vy"; | |
152 | +}; | |
153 | + | |
154 | +# Some ABIs specify vrsave, special-purpose register #256, as reserved | |
155 | +# for system use. | |
156 | +my $no_vrsave = ($flavour =~ /aix|linux64le/); | |
157 | +my $mtspr = sub { | |
158 | + my ($f,$idx,$ra) = @_; | |
159 | + if ($idx == 256 && $no_vrsave) { | |
160 | + " or $ra,$ra,$ra"; | |
161 | + } else { | |
162 | + " mtspr $idx,$ra"; | |
163 | + } | |
164 | +}; | |
165 | +my $mfspr = sub { | |
166 | + my ($f,$rd,$idx) = @_; | |
167 | + if ($idx == 256 && $no_vrsave) { | |
168 | + " li $rd,-1"; | |
169 | + } else { | |
170 | + " mfspr $rd,$idx"; | |
171 | + } | |
172 | +}; | |
173 | + | |
174 | +# PowerISA 2.06 stuff | |
175 | +sub vsxmem_op { | |
176 | + my ($f, $vrt, $ra, $rb, $op) = @_; | |
177 | + " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); | |
178 | +} | |
179 | +# made-up unaligned memory reference AltiVec/VMX instructions | |
180 | +my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x | |
181 | +my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x | |
182 | +my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx | |
183 | +my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx | |
184 | +my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x | |
185 | +my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x | |
186 | + | |
187 | +# PowerISA 2.07 stuff | |
188 | +sub vcrypto_op { | |
189 | + my ($f, $vrt, $vra, $vrb, $op) = @_; | |
190 | + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; | |
191 | +} | |
192 | +my $vcipher = sub { vcrypto_op(@_, 1288); }; | |
193 | +my $vcipherlast = sub { vcrypto_op(@_, 1289); }; | |
194 | +my $vncipher = sub { vcrypto_op(@_, 1352); }; | |
195 | +my $vncipherlast= sub { vcrypto_op(@_, 1353); }; | |
196 | +my $vsbox = sub { vcrypto_op(@_, 0, 1480); }; | |
197 | +my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; | |
198 | +my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; | |
199 | +my $vpmsumb = sub { vcrypto_op(@_, 1032); }; | |
200 | +my $vpmsumd = sub { vcrypto_op(@_, 1224); }; | |
201 | +my $vpmsumh = sub { vcrypto_op(@_, 1096); }; | 
202 | +my $vpmsumw = sub { vcrypto_op(@_, 1160); }; | |
203 | +my $vaddudm = sub { vcrypto_op(@_, 192); }; | |
204 | + | |
205 | +my $mtsle = sub { | |
206 | + my ($f, $arg) = @_; | |
207 | + " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); | |
208 | +}; | |
125 | 209 | |
126 | 210 | while($line=<>) { |
127 | 211 |
@@ -138,7 +222,10 @@ while($line=<>) { | ||
138 | 222 | { |
139 | 223 | $line =~ s|(^[\.\w]+)\:\s*||; |
140 | 224 | my $label = $1; |
141 | - printf "%s:",($GLOBALS{$label} or $label) if ($label); | |
225 | + if ($label) { | |
226 | + printf "%s:",($GLOBALS{$label} or $label); | |
227 | + printf "\n.localentry\t$GLOBALS{$label},0" if ($GLOBALS{$label} && $flavour =~ /linux.*64le/); | |
228 | + } | |
142 | 229 | } |
143 | 230 | |
144 | 231 | { |
@@ -147,7 +234,7 @@ while($line=<>) { | ||
147 | 234 | my $mnemonic = $2; |
148 | 235 | my $f = $3; |
149 | 236 | my $opcode = eval("\$$mnemonic"); |
150 | - $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/); | |
237 | + $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/); | |
151 | 238 | if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } |
152 | 239 | elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } |
153 | 240 | } |
@@ -3,13 +3,24 @@ | ||
3 | 3 | #include <string.h> |
4 | 4 | #include <setjmp.h> |
5 | 5 | #include <signal.h> |
6 | +#include <unistd.h> | |
7 | +#if defined(__linux) || defined(_AIX) | |
8 | +# include <sys/utsname.h> | |
9 | +#endif | |
10 | +#if defined(_AIX53) /* defined even on post-5.3 */ | |
11 | +# include <sys/systemcfg.h> | |
12 | +# if !defined(__power_set) | |
13 | +# define __power_set(a) (_system_configuration.implementation & (a)) | |
14 | +# endif | |
15 | +#endif | |
6 | 16 | #include <crypto.h> |
7 | 17 | #include <openssl/bn.h> |
8 | 18 | |
9 | 19 | #define PPC_FPU64 (1<<0) |
10 | 20 | #define PPC_ALTIVEC (1<<1) |
21 | +#define PPC_CRYPTO207 (1<<2) | |
11 | 22 | |
12 | -static int OPENSSL_ppccap_P = 0; | |
23 | +int OPENSSL_ppccap_P = 0; | |
13 | 24 | |
14 | 25 | static sigset_t all_masked; |
15 | 26 |
@@ -49,10 +60,28 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U | ||
49 | 60 | } |
50 | 61 | #endif |
51 | 62 | |
63 | +void sha256_block_p8(void *ctx, const void *inp, size_t len); | |
64 | +void sha256_block_ppc(void *ctx, const void *inp, size_t len); | |
65 | +void sha256_block_data_order(void *ctx, const void *inp, size_t len) | |
66 | +{ | |
67 | + OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha256_block_p8(ctx, inp, len) : | |
68 | + sha256_block_ppc(ctx, inp, len); | |
69 | +} | |
70 | + | |
71 | +void sha512_block_p8(void *ctx, const void *inp, size_t len); | |
72 | +void sha512_block_ppc(void *ctx, const void *inp, size_t len); | |
73 | +void sha512_block_data_order(void *ctx, const void *inp, size_t len) | |
74 | +{ | |
75 | + OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha512_block_p8(ctx, inp, len) : | |
76 | + sha512_block_ppc(ctx, inp, len); | |
77 | +} | |
78 | + | |
52 | 79 | static sigjmp_buf ill_jmp; |
53 | 80 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } |
54 | 81 | |
55 | 82 | void OPENSSL_ppc64_probe(void); |
83 | +void OPENSSL_altivec_probe(void); | |
84 | +void OPENSSL_crypto207_probe(void); | |
56 | 85 | |
57 | 86 | void OPENSSL_cpuid_setup(void) |
58 | 87 | { |
@@ -82,6 +111,45 @@ void OPENSSL_cpuid_setup(void) | ||
82 | 111 | |
83 | 112 | OPENSSL_ppccap_P = 0; |
84 | 113 | |
114 | +#if defined(_AIX) | |
115 | + if (sizeof(size_t) == 4) { | |
116 | + struct utsname uts; | |
117 | +# if defined(_SC_AIX_KERNEL_BITMODE) | |
118 | + if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64) | |
119 | + return; | |
120 | +# endif | |
121 | + if (uname(&uts) != 0 || atoi(uts.version) < 6) | |
122 | + return; | |
123 | + } | |
124 | + | |
125 | +# if defined(__power_set) | |
126 | + /* | |
127 | + * Value used in __power_set is a single-bit 1<<n one denoting | |
128 | + * specific processor class. Incidentally 0xffffffff<<n can be | |
129 | + * used to denote specific processor and its successors. | |
130 | + */ | |
131 | + if (sizeof(size_t) == 4) { | |
132 | + /* In 32-bit case PPC_FPU64 is always fastest [if option] */ | |
133 | + if (__power_set(0xffffffffU<<13)) /* POWER5 and later */ | |
134 | + OPENSSL_ppccap_P |= PPC_FPU64; | |
135 | + } else { | |
136 | + /* In 64-bit case PPC_FPU64 is fastest only on POWER6 */ | |
137 | +# if 0 /* to keep compatibility with previous validations */ | |
138 | + if (__power_set(0x1U<<14)) /* POWER6 */ | |
139 | + OPENSSL_ppccap_P |= PPC_FPU64; | |
140 | +# endif | |
141 | + } | |
142 | + | |
143 | + if (__power_set(0xffffffffU<<14)) /* POWER6 and later */ | |
144 | + OPENSSL_ppccap_P |= PPC_ALTIVEC; | |
145 | + | |
146 | + if (__power_set(0xffffffffU<<16)) /* POWER8 and later */ | |
147 | + OPENSSL_ppccap_P |= PPC_CRYPTO207; | |
148 | + | |
149 | + return; | |
150 | +# endif | |
151 | +#endif | |
152 | + | |
85 | 153 | memset(&ill_act,0,sizeof(ill_act)); |
86 | 154 | ill_act.sa_handler = ill_handler; |
87 | 155 | ill_act.sa_mask = all_masked; |
@@ -108,6 +176,11 @@ void OPENSSL_cpuid_setup(void) | ||
108 | 176 | { |
109 | 177 | OPENSSL_altivec_probe(); |
110 | 178 | OPENSSL_ppccap_P |= PPC_ALTIVEC; |
179 | + if (sigsetjmp(ill_jmp, 1) == 0) | |
180 | + { | |
181 | + OPENSSL_crypto207_probe(); | |
182 | + OPENSSL_ppccap_P |= PPC_CRYPTO207; | |
183 | + } | |
111 | 184 | } |
112 | 185 | |
113 | 186 | sigaction (SIGILL,&ill_oact,NULL); |
@@ -40,6 +40,16 @@ $code=<<___; | ||
40 | 40 | .long 0 |
41 | 41 | .byte 0,12,0x14,0,0,0,0,0 |
42 | 42 | |
43 | +.globl .OPENSSL_crypto207_probe | |
44 | +.align 4 | |
45 | +.OPENSSL_crypto207_probe: | |
46 | + .long 0x7C000E99 # lvx_u v0,0,r1 | |
47 | + .long 0x10000508 # vcipher v0,v0,v0 | |
48 | + blr | |
49 | + .long 0 | |
50 | + .byte 0,12,0x14,0,0,0,0,0 | |
51 | +.size .OPENSSL_crypto207_probe,.-.OPENSSL_crypto207_probe | |
52 | + | |
43 | 53 | .globl .OPENSSL_wipe_cpu |
44 | 54 | .align 4 |
45 | 55 | .OPENSSL_wipe_cpu: |
@@ -73,6 +73,8 @@ sha512-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAG | ||
73 | 73 | sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@ |
74 | 74 | sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ |
75 | 75 | sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ |
76 | +sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ | |
77 | +sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ | |
76 | 78 | |
77 | 79 | sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@ |
78 | 80 | sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ |
@@ -210,7 +210,7 @@ Lunaligned: | ||
210 | 210 | srwi. $t1,$t1,6 ; t1/=64 |
211 | 211 | beq Lcross_page |
212 | 212 | $UCMP $num,$t1 |
213 | - ble- Laligned ; didn't cross the page boundary | |
213 | + ble Laligned ; didn't cross the page boundary | |
214 | 214 | mtctr $t1 |
215 | 215 | subfc $num,$t1,$num |
216 | 216 | bl Lsha1_block_private |
@@ -238,7 +238,7 @@ Lmemcpy: | ||
238 | 238 | bl Lsha1_block_private |
239 | 239 | $POP $inp,`$FRAME-$SIZE_T*18`($sp) |
240 | 240 | addic. $num,$num,-1 |
241 | - bne- Lunaligned | |
241 | + bne Lunaligned | |
242 | 242 | |
243 | 243 | Ldone: |
244 | 244 | $POP r0,`$FRAME+$LRSAVE`($sp) |
@@ -312,7 +312,7 @@ $code.=<<___; | ||
312 | 312 | stw r20,16($ctx) |
313 | 313 | mr $E,r20 |
314 | 314 | addi $inp,$inp,`16*4` |
315 | - bdnz- Lsha1_block_private | |
315 | + bdnz Lsha1_block_private | |
316 | 316 | blr |
317 | 317 | .long 0 |
318 | 318 | .byte 0,12,0x14,0,0,0,0,0 |
@@ -64,7 +64,7 @@ die "can't locate ppc-xlate.pl"; | ||
64 | 64 | open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; |
65 | 65 | |
66 | 66 | if ($output =~ /512/) { |
67 | - $func="sha512_block_data_order"; | |
67 | + $func="sha512_block_ppc"; | |
68 | 68 | $SZ=8; |
69 | 69 | @Sigma0=(28,34,39); |
70 | 70 | @Sigma1=(14,18,41); |
@@ -76,7 +76,7 @@ if ($output =~ /512/) { | ||
76 | 76 | $ROR="rotrdi"; |
77 | 77 | $SHR="srdi"; |
78 | 78 | } else { |
79 | - $func="sha256_block_data_order"; | |
79 | + $func="sha256_block_ppc"; | |
80 | 80 | $SZ=4; |
81 | 81 | @Sigma0=( 2,13,22); |
82 | 82 | @Sigma1=( 6,11,25); |
@@ -243,7 +243,7 @@ Lunaligned: | ||
243 | 243 | andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary |
244 | 244 | beq Lcross_page |
245 | 245 | $UCMP $num,$t1 |
246 | - ble- Laligned ; didn't cross the page boundary | |
246 | + ble Laligned ; didn't cross the page boundary | |
247 | 247 | subfc $num,$t1,$num |
248 | 248 | add $t1,$inp,$t1 |
249 | 249 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num |
@@ -279,7 +279,7 @@ Lmemcpy: | ||
279 | 279 | $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp |
280 | 280 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num |
281 | 281 | addic. $num,$num,`-16*$SZ` ; num-- |
282 | - bne- Lunaligned | |
282 | + bne Lunaligned | |
283 | 283 | |
284 | 284 | Ldone: |
285 | 285 | $POP r0,`$FRAME+$LRSAVE`($sp) |
@@ -339,7 +339,7 @@ for(;$i<32;$i++) { | ||
339 | 339 | unshift(@V,pop(@V)); |
340 | 340 | } |
341 | 341 | $code.=<<___; |
342 | - bdnz- Lrounds | |
342 | + bdnz Lrounds | |
343 | 343 | |
344 | 344 | $POP $ctx,`$FRAME-$SIZE_T*22`($sp) |
345 | 345 | $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
@@ -0,0 +1,431 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# SHA256/512 for PowerISA v2.07. | |
11 | +# | |
12 | +# Accurate performance measurements are problematic, because it's | |
13 | +# always virtualized setup with possibly throttled processor. | |
14 | +# Relative comparison is therefore more informative. This module is | |
15 | +# ~60% faster than integer-only sha512-ppc.pl. To anchor to something | |
16 | +# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than | |
17 | +# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than | |
18 | +# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting | |
19 | +# result is degree of computational resources' utilization. POWER8 is | |
20 | +# "massively multi-threaded chip" and difference between single- and | |
21 | +# maximum multi-process benchmark results tells that utilization is | 
22 | +# whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and | 
23 | +# for sha1-ppc.pl - 73%. 100% means that multi-process result equals | |
24 | +# to single-process one, given that all threads end up on the same | |
25 | +# physical core. | |
26 | +# | |
27 | +####################################################################### | |
28 | +# | |
29 | +# SHA256/pre-2.07(*) SHA512/pre-2.07(*) SHA1(*) | |
30 | +# POWER8 9.3 /14.8 5.8 /9.5 7.1 | |
31 | +# | |
32 | +# (*) presented for reference/comparison purposes; | |
33 | + | |
34 | +$flavour=shift; | |
35 | +$output =shift; | |
36 | + | |
37 | +if ($flavour =~ /64/) { | |
38 | + $SIZE_T=8; | |
39 | + $LRSAVE=2*$SIZE_T; | |
40 | + $STU="stdu"; | |
41 | + $POP="ld"; | |
42 | + $PUSH="std"; | |
43 | +} elsif ($flavour =~ /32/) { | |
44 | + $SIZE_T=4; | |
45 | + $LRSAVE=$SIZE_T; | |
46 | + $STU="stwu"; | |
47 | + $POP="lwz"; | |
48 | + $PUSH="stw"; | |
49 | +} else { die "nonsense $flavour"; } | |
50 | + | |
51 | +$LENDIAN=($flavour=~/le/); | |
52 | + | |
53 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
54 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
55 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
56 | +die "can't locate ppc-xlate.pl"; | |
57 | + | |
58 | +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; | |
59 | + | |
60 | +if ($output =~ /512/) { | |
61 | + $bits=512; | |
62 | + $SZ=8; | |
63 | + $sz="d"; | |
64 | + $rounds=80; | |
65 | +} else { | |
66 | + $bits=256; | |
67 | + $SZ=4; | |
68 | + $sz="w"; | |
69 | + $rounds=64; | |
70 | +} | |
71 | + | |
72 | +$func="sha${bits}_block_p8"; | |
73 | +$FRAME=8*$SIZE_T; | |
74 | + | |
75 | +$sp ="r1"; | |
76 | +$toc="r2"; | |
77 | +$ctx="r3"; | |
78 | +$inp="r4"; | |
79 | +$num="r5"; | |
80 | +$Tbl="r6"; | |
81 | +$idx="r7"; | |
82 | +$lrsave="r8"; | |
83 | +$offload="r11"; | |
84 | +$vrsave="r12"; | |
85 | +($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); | |
86 | + $x00=0 if ($flavour =~ /osx/); | |
87 | + | |
88 | +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7)); | |
89 | +@X=map("v$_",(8..23)); | |
90 | +($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31)); | |
91 | + | |
92 | +sub ROUND { | |
93 | +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | |
94 | +my $j=($i+1)%16; | |
95 | + | |
96 | +$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1)); | |
97 | + lvx_u @X[$i+1],0,$inp ; load X[i] in advance | |
98 | + addi $inp,$inp,16 | |
99 | +___ | |
100 | +$code.=<<___ if ($i<16 && ($i%(16/$SZ))); | |
101 | + vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ | |
102 | +___ | |
103 | +$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0); | |
104 | + vperm @X[$i],@X[$i],@X[$i],$lemask | |
105 | +___ | |
106 | +$code.=<<___; | |
107 | + `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)` | |
108 | + vsel $Func,$g,$f,$e ; Ch(e,f,g) | |
109 | + vshasigma${sz} $S1,$e,1,15 ; Sigma1(e) | |
110 | + vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i] | |
111 | + vshasigma${sz} $S0,$a,1,0 ; Sigma0(a) | |
112 | + `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)` | |
113 | + vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g) | |
114 | + vxor $Func,$a,$b | |
115 | + `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)` | |
116 | + vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e) | |
117 | + vsel $Func,$b,$c,$Func ; Maj(a,b,c) | |
118 | + vaddu${sz}m $g,$g,$Ki ; future h+=K[i] | |
119 | + vaddu${sz}m $d,$d,$h ; d+=h | |
120 | + vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c) | |
121 | + `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)` | |
122 | + lvx $Ki,$idx,$Tbl ; load next K[i] | |
123 | + addi $idx,$idx,16 | |
124 | + vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c) | |
125 | + `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)` | |
126 | +___ | |
127 | +} | |
128 | + | |
129 | +$code=<<___; | |
130 | +.machine "any" | |
131 | +.text | |
132 | + | |
133 | +.globl $func | |
134 | +.align 6 | |
135 | +$func: | |
136 | + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) | |
137 | + mflr $lrsave | |
138 | + li r10,`$FRAME+8*16+15` | |
139 | + li r11,`$FRAME+8*16+31` | |
140 | + stvx v20,r10,$sp # ABI says so | |
141 | + addi r10,r10,32 | |
142 | + mfspr $vrsave,256 | |
143 | + stvx v21,r11,$sp | |
144 | + addi r11,r11,32 | |
145 | + stvx v22,r10,$sp | |
146 | + addi r10,r10,32 | |
147 | + stvx v23,r11,$sp | |
148 | + addi r11,r11,32 | |
149 | + stvx v24,r10,$sp | |
150 | + addi r10,r10,32 | |
151 | + stvx v25,r11,$sp | |
152 | + addi r11,r11,32 | |
153 | + stvx v26,r10,$sp | |
154 | + addi r10,r10,32 | |
155 | + stvx v27,r11,$sp | |
156 | + addi r11,r11,32 | |
157 | + stvx v28,r10,$sp | |
158 | + addi r10,r10,32 | |
159 | + stvx v29,r11,$sp | |
160 | + addi r11,r11,32 | |
161 | + stvx v30,r10,$sp | |
162 | + stvx v31,r11,$sp | |
163 | + li r11,-1 | |
164 | + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave | |
165 | + li $x10,0x10 | |
166 | + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
167 | + li $x20,0x20 | |
168 | + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
169 | + li $x30,0x30 | |
170 | + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
171 | + li $x40,0x40 | |
172 | + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
173 | + li $x50,0x50 | |
174 | + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
175 | + li $x60,0x60 | |
176 | + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
177 | + li $x70,0x70 | |
178 | + $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) | |
179 | + mtspr 256,r11 | |
180 | + | |
181 | + bl LPICmeup | |
182 | + addi $offload,$sp,$FRAME+15 | |
183 | +___ | |
184 | +$code.=<<___ if ($LENDIAN); | |
185 | + li $idx,8 | |
186 | + lvsl $lemask,0,$idx | |
187 | + vspltisb $Ki,0x0f | |
188 | + vxor $lemask,$lemask,$Ki | |
189 | +___ | |
190 | +$code.=<<___ if ($SZ==4); | |
191 | + lvx_4w $A,$x00,$ctx | |
192 | + lvx_4w $E,$x10,$ctx | |
193 | + vsldoi $B,$A,$A,4 # unpack | |
194 | + vsldoi $C,$A,$A,8 | |
195 | + vsldoi $D,$A,$A,12 | |
196 | + vsldoi $F,$E,$E,4 | |
197 | + vsldoi $G,$E,$E,8 | |
198 | + vsldoi $H,$E,$E,12 | |
199 | +___ | |
200 | +$code.=<<___ if ($SZ==8); | |
201 | + lvx_u $A,$x00,$ctx | |
202 | + lvx_u $C,$x10,$ctx | |
203 | + lvx_u $E,$x20,$ctx | |
204 | + vsldoi $B,$A,$A,8 # unpack | |
205 | + lvx_u $G,$x30,$ctx | |
206 | + vsldoi $D,$C,$C,8 | |
207 | + vsldoi $F,$E,$E,8 | |
208 | + vsldoi $H,$G,$G,8 | |
209 | +___ | |
210 | +$code.=<<___; | |
211 | + li r0,`($rounds-16)/16` # inner loop counter | |
212 | + b Loop | |
213 | +.align 5 | |
214 | +Loop: | |
215 | + lvx $Ki,$x00,$Tbl | |
216 | + li $idx,16 | |
217 | + lvx_u @X[0],0,$inp | |
218 | + addi $inp,$inp,16 | |
219 | + stvx $A,$x00,$offload # offload $A-$H | |
220 | + stvx $B,$x10,$offload | |
221 | + stvx $C,$x20,$offload | |
222 | + stvx $D,$x30,$offload | |
223 | + stvx $E,$x40,$offload | |
224 | + stvx $F,$x50,$offload | |
225 | + stvx $G,$x60,$offload | |
226 | + stvx $H,$x70,$offload | |
227 | + vaddu${sz}m $H,$H,$Ki # h+K[i] | |
228 | + lvx $Ki,$idx,$Tbl | |
229 | + addi $idx,$idx,16 | |
230 | +___ | |
231 | +for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } | |
232 | +$code.=<<___; | |
233 | + mtctr r0 | |
234 | + b L16_xx | |
235 | +.align 5 | |
236 | +L16_xx: | |
237 | +___ | |
238 | +for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } | |
239 | +$code.=<<___; | |
240 | + bdnz L16_xx | |
241 | + | |
242 | + lvx @X[2],$x00,$offload | |
243 | + subic. $num,$num,1 | |
244 | + lvx @X[3],$x10,$offload | |
245 | + vaddu${sz}m $A,$A,@X[2] | |
246 | + lvx @X[4],$x20,$offload | |
247 | + vaddu${sz}m $B,$B,@X[3] | |
248 | + lvx @X[5],$x30,$offload | |
249 | + vaddu${sz}m $C,$C,@X[4] | |
250 | + lvx @X[6],$x40,$offload | |
251 | + vaddu${sz}m $D,$D,@X[5] | |
252 | + lvx @X[7],$x50,$offload | |
253 | + vaddu${sz}m $E,$E,@X[6] | |
254 | + lvx @X[8],$x60,$offload | |
255 | + vaddu${sz}m $F,$F,@X[7] | |
256 | + lvx @X[9],$x70,$offload | |
257 | + vaddu${sz}m $G,$G,@X[8] | |
258 | + vaddu${sz}m $H,$H,@X[9] | |
259 | + bne Loop | |
260 | +___ | |
261 | +$code.=<<___ if ($SZ==4); | |
262 | + lvx @X[0],$idx,$Tbl | |
263 | + addi $idx,$idx,16 | |
264 | + vperm $A,$A,$B,$Ki # pack the answer | |
265 | + lvx @X[1],$idx,$Tbl | |
266 | + vperm $E,$E,$F,$Ki | |
267 | + vperm $A,$A,$C,@X[0] | |
268 | + vperm $E,$E,$G,@X[0] | |
269 | + vperm $A,$A,$D,@X[1] | |
270 | + vperm $E,$E,$H,@X[1] | |
271 | + stvx_4w $A,$x00,$ctx | |
272 | + stvx_4w $E,$x10,$ctx | |
273 | +___ | |
274 | +$code.=<<___ if ($SZ==8); | |
275 | + vperm $A,$A,$B,$Ki # pack the answer | |
276 | + vperm $C,$C,$D,$Ki | |
277 | + vperm $E,$E,$F,$Ki | |
278 | + vperm $G,$G,$H,$Ki | |
279 | + stvx_u $A,$x00,$ctx | |
280 | + stvx_u $C,$x10,$ctx | |
281 | + stvx_u $E,$x20,$ctx | |
282 | + stvx_u $G,$x30,$ctx | |
283 | +___ | |
284 | +$code.=<<___; | |
285 | + li r10,`$FRAME+8*16+15` | |
286 | + mtlr $lrsave | |
287 | + li r11,`$FRAME+8*16+31` | |
288 | + mtspr 256,$vrsave | |
289 | + lvx v20,r10,$sp # ABI says so | |
290 | + addi r10,r10,32 | |
291 | + lvx v21,r11,$sp | |
292 | + addi r11,r11,32 | |
293 | + lvx v22,r10,$sp | |
294 | + addi r10,r10,32 | |
295 | + lvx v23,r11,$sp | |
296 | + addi r11,r11,32 | |
297 | + lvx v24,r10,$sp | |
298 | + addi r10,r10,32 | |
299 | + lvx v25,r11,$sp | |
300 | + addi r11,r11,32 | |
301 | + lvx v26,r10,$sp | |
302 | + addi r10,r10,32 | |
303 | + lvx v27,r11,$sp | |
304 | + addi r11,r11,32 | |
305 | + lvx v28,r10,$sp | |
306 | + addi r10,r10,32 | |
307 | + lvx v29,r11,$sp | |
308 | + addi r11,r11,32 | |
309 | + lvx v30,r10,$sp | |
310 | + lvx v31,r11,$sp | |
311 | + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) | |
312 | + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) | |
313 | + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) | |
314 | + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) | |
315 | + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) | |
316 | + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) | |
317 | + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` | |
318 | + blr | |
319 | + .long 0 | |
320 | + .byte 0,12,4,1,0x80,6,3,0 | |
321 | + .long 0 | |
322 | +.size $func,.-$func | |
323 | +___ | |
324 | + | |
325 | +# Ugly hack here, because PPC assembler syntax seem to vary too | |
326 | +# much from platforms to platform... | |
327 | +$code.=<<___; | |
328 | +.align 6 | |
329 | +LPICmeup: | |
330 | + mflr r0 | |
331 | + bcl 20,31,\$+4 | |
332 | + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry | |
333 | + addi $Tbl,$Tbl,`64-8` | |
334 | + mtlr r0 | |
335 | + blr | |
336 | + .long 0 | |
337 | + .byte 0,12,0x14,0,0,0,0,0 | |
338 | + .space `64-9*4` | |
339 | +___ | |
340 | + | |
341 | +if ($SZ==8) { | |
342 | + local *table = sub { | |
343 | + foreach(@_) { $code.=".quad $_,$_\n"; } | |
344 | + }; | |
345 | + table( | |
346 | + "0x428a2f98d728ae22","0x7137449123ef65cd", | |
347 | + "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc", | |
348 | + "0x3956c25bf348b538","0x59f111f1b605d019", | |
349 | + "0x923f82a4af194f9b","0xab1c5ed5da6d8118", | |
350 | + "0xd807aa98a3030242","0x12835b0145706fbe", | |
351 | + "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2", | |
352 | + "0x72be5d74f27b896f","0x80deb1fe3b1696b1", | |
353 | + "0x9bdc06a725c71235","0xc19bf174cf692694", | |
354 | + "0xe49b69c19ef14ad2","0xefbe4786384f25e3", | |
355 | + "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65", | |
356 | + "0x2de92c6f592b0275","0x4a7484aa6ea6e483", | |
357 | + "0x5cb0a9dcbd41fbd4","0x76f988da831153b5", | |
358 | + "0x983e5152ee66dfab","0xa831c66d2db43210", | |
359 | + "0xb00327c898fb213f","0xbf597fc7beef0ee4", | |
360 | + "0xc6e00bf33da88fc2","0xd5a79147930aa725", | |
361 | + "0x06ca6351e003826f","0x142929670a0e6e70", | |
362 | + "0x27b70a8546d22ffc","0x2e1b21385c26c926", | |
363 | + "0x4d2c6dfc5ac42aed","0x53380d139d95b3df", | |
364 | + "0x650a73548baf63de","0x766a0abb3c77b2a8", | |
365 | + "0x81c2c92e47edaee6","0x92722c851482353b", | |
366 | + "0xa2bfe8a14cf10364","0xa81a664bbc423001", | |
367 | + "0xc24b8b70d0f89791","0xc76c51a30654be30", | |
368 | + "0xd192e819d6ef5218","0xd69906245565a910", | |
369 | + "0xf40e35855771202a","0x106aa07032bbd1b8", | |
370 | + "0x19a4c116b8d2d0c8","0x1e376c085141ab53", | |
371 | + "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8", | |
372 | + "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb", | |
373 | + "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3", | |
374 | + "0x748f82ee5defb2fc","0x78a5636f43172f60", | |
375 | + "0x84c87814a1f0ab72","0x8cc702081a6439ec", | |
376 | + "0x90befffa23631e28","0xa4506cebde82bde9", | |
377 | + "0xbef9a3f7b2c67915","0xc67178f2e372532b", | |
378 | + "0xca273eceea26619c","0xd186b8c721c0c207", | |
379 | + "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178", | |
380 | + "0x06f067aa72176fba","0x0a637dc5a2c898a6", | |
381 | + "0x113f9804bef90dae","0x1b710b35131c471b", | |
382 | + "0x28db77f523047d84","0x32caab7b40c72493", | |
383 | + "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c", | |
384 | + "0x4cc5d4becb3e42b6","0x597f299cfc657e2a", | |
385 | + "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0"); | |
386 | +$code.=<<___ if (!$LENDIAN); | |
387 | +.quad 0x0001020304050607,0x1011121314151617 | |
388 | +___ | |
389 | +$code.=<<___ if ($LENDIAN); # quad-swapped | |
390 | +.quad 0x1011121314151617,0x0001020304050607 | |
391 | +___ | |
392 | +} else { | |
393 | + local *table = sub { | |
394 | + foreach(@_) { $code.=".long $_,$_,$_,$_\n"; } | |
395 | + }; | |
396 | + table( | |
397 | + "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5", | |
398 | + "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5", | |
399 | + "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3", | |
400 | + "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174", | |
401 | + "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc", | |
402 | + "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da", | |
403 | + "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7", | |
404 | + "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967", | |
405 | + "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13", | |
406 | + "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85", | |
407 | + "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3", | |
408 | + "0xd192e819","0xd6990624","0xf40e3585","0x106aa070", | |
409 | + "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5", | |
410 | + "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3", | |
411 | + "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208", | |
412 | + "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0"); | |
413 | +$code.=<<___ if (!$LENDIAN); | |
414 | +.long 0x00010203,0x10111213,0x10111213,0x10111213 | |
415 | +.long 0x00010203,0x04050607,0x10111213,0x10111213 | |
416 | +.long 0x00010203,0x04050607,0x08090a0b,0x10111213 | |
417 | +___ | |
418 | +$code.=<<___ if ($LENDIAN); # word-swapped | |
419 | +.long 0x10111213,0x10111213,0x10111213,0x00010203 | |
420 | +.long 0x10111213,0x10111213,0x04050607,0x00010203 | |
421 | +.long 0x10111213,0x08090a0b,0x04050607,0x00010203 | |
422 | +___ | |
423 | +} | |
424 | +$code.=<<___; | |
425 | +.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" | |
426 | +.align 2 | |
427 | +___ | |
428 | + | |
429 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; | |
430 | +print $code; | |
431 | +close STDOUT; |
@@ -140,6 +140,9 @@ void FINGERPRINT_premain(void) | ||
140 | 140 | } |
141 | 141 | #endif |
142 | 142 | } while(0); |
143 | +#if defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC) | |
144 | + fips_openssl_cpuid_setup(); | |
145 | +#endif | |
143 | 146 | } |
144 | 147 | |
145 | 148 | #else |
@@ -1 +1 @@ | ||
1 | -HMAC-SHA1(fips_premain.c)= 65b20c3cec235cec85af848e1cd2dfdfa101804a | |
1 | +HMAC-SHA1(fips_premain.c)= 2bfb57ef540bdba29220a45d65e1b4080de9adc1 |
@@ -712,6 +712,23 @@ | ||
712 | 712 | #define _bn_GF2m_mul_2x2 _fips_bn_GF2m_mul_2x2 |
713 | 713 | #define _OPENSSL_cleanse _FIPS_openssl_cleanse |
714 | 714 | #endif |
715 | +#define aes_p8_encrypt fips_aes_p8_encrypt | |
716 | +#define aes_p8_decrypt fips_aes_p8_decrypt | |
717 | +#define aes_p8_set_encrypt_key fips_aes_p8_set_encrypt_key | |
718 | +#define aes_p8_set_decrypt_key fips_aes_p8_set_decrypt_key | |
719 | +#define aes_p8_cbc_encrypt fips_aes_p8_cbc_encrypt | |
720 | +#define aes_p8_ctr32_encrypt_blocks fips_aes_p8_ctr32_encrypt_blocks | |
721 | +#define aes_p8_xts_encrypt fips_aes_p8_xts_encrypt | |
722 | +#define aes_p8_xts_decrypt fips_aes_p8_xts_decrypt | |
723 | +#define gcm_init_p8 fips_gcm_init_p8 | |
724 | +#define gcm_gmult_p8 fips_gcm_gmult_p8 | |
725 | +#define gcm_ghash_p8 fips_gcm_ghash_p8 | |
726 | +#define sha256_block_p8 fips_sha256_block_p8 | |
727 | +#define sha512_block_p8 fips_sha512_block_p8 | |
728 | +#define sha256_block_ppc fips_sha256_block_ppc | |
729 | +#define sha512_block_ppc fips_sha512_block_ppc | |
730 | +#define OPENSSL_ppccap_P fips_openssl_ppccap_p | |
731 | +#define OPENSSL_crypto207_probe fips_openssl_crypto207_probe | |
715 | 732 | |
716 | 733 | #if defined(_MSC_VER) |
717 | 734 | # pragma const_seg("fipsro$b") |