TLS/SSL and crypto library


Commit metadata

Revision: 894c04aa05ba1e64735d7beb9c2a1da93d288e31 (tree)
Date: 2016-06-22 06:44:54
Author: Andy Polyakov <appro@open...>
Committer: Andy Polyakov

Log message

PowerPC assembly pack: add POWER8 support.

Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
(cherry picked from commit 4577871ca393275ac0436b2b08f1a75661ced314)
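
For orientation, here is a C-side sketch of the entry points that the new aesp8-ppc.pl module exports (the .globl symbols visible in the diff below, with $prefix set to "aes_p8"). The argument order is inferred from the register mappings in the Perl source (r3, r4, r5, ... bound to $inp, $bits/$out, $len, $key, $ivp, $enc and so on); the concrete C types, the AES_P8_KEY struct and the cbc_demo() helper are illustrative assumptions for this sketch, not part of the commit, and the actual OpenSSL glue code is not shown in this diff.

    /* Hypothetical prototypes for the routines generated by aesp8-ppc.pl.
     * Symbol names come from the .globl lines below ($prefix = "aes_p8");
     * argument order follows the r3..r10 register assignments in the script.
     * The key-schedule layout assumed here (round keys followed by the round
     * count at byte offset 240, cf. "lwz $rounds,240($key)") is what the
     * assembly reads; the struct name is made up for this sketch. Linking
     * would require the generated aesp8-ppc.s, so this compiles but is a
     * sketch only. */
    #include <stddef.h>

    typedef struct {
        unsigned int rd_key[60];   /* up to 15 round keys of 4 words each */
        int rounds;                /* stored at byte offset 240 */
    } AES_P8_KEY;

    int  aes_p8_set_encrypt_key(const unsigned char *userKey, int bits, AES_P8_KEY *key);
    int  aes_p8_set_decrypt_key(const unsigned char *userKey, int bits, AES_P8_KEY *key);
    void aes_p8_encrypt(const unsigned char *in, unsigned char *out, const AES_P8_KEY *key);
    void aes_p8_decrypt(const unsigned char *in, unsigned char *out, const AES_P8_KEY *key);
    void aes_p8_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t len,
                            const AES_P8_KEY *key, unsigned char *ivec, int enc);
    void aes_p8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                     size_t blocks, const AES_P8_KEY *key,
                                     const unsigned char ivec[16]);
    void aes_p8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t len,
                            const AES_P8_KEY *key1, const AES_P8_KEY *key2,
                            const unsigned char iv[16]);

    /* Minimal CBC usage pattern (sketch): len must be a multiple of 16 bytes;
     * enc=1 encrypts, enc=0 decrypts. For decryption the assembly switches to
     * the 8x-interleaved path once 128 or more bytes remain (see Lcbc_dec). */
    static int cbc_demo(const unsigned char userkey[16], unsigned char iv[16],
                        const unsigned char *in, unsigned char *out, size_t len)
    {
        AES_P8_KEY ks;
        if (aes_p8_set_encrypt_key(userkey, 128, &ks) != 0)
            return -1;                      /* -1/-2 on bad arguments */
        aes_p8_cbc_encrypt(in, out, len, &ks, iv, 1);
        return 0;
    }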

Change summary

Diff

--- a/Configure
+++ b/Configure
@@ -139,8 +139,8 @@ my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc
139139 my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
140140 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
141141 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
142-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
143-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
142+my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
143+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
144144 my $no_asm=":::::::::::::::void";
145145
146146 # As for $BSDthreads. Idea is to maintain "collective" set of flags,
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -71,6 +71,8 @@ aes-sparcv9.s: asm/aes-sparcv9.pl
7171
7272 aes-ppc.s: asm/aes-ppc.pl
7373 $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
74+aesp8-ppc.s: asm/aesp8-ppc.pl
75+ $(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
7476
7577 aes-parisc.s: asm/aes-parisc.pl
7678 $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -548,7 +548,7 @@ Lenc_loop:
548548 xor $s2,$t2,$acc14
549549 xor $s3,$t3,$acc15
550550 addi $key,$key,16
551- bdnz- Lenc_loop
551+ bdnz Lenc_loop
552552
553553 addi $Tbl2,$Tbl0,2048
554554 nop
@@ -982,7 +982,7 @@ Ldec_loop:
982982 xor $s2,$t2,$acc14
983983 xor $s3,$t3,$acc15
984984 addi $key,$key,16
985- bdnz- Ldec_loop
985+ bdnz Ldec_loop
986986
987987 addi $Tbl2,$Tbl0,2048
988988 nop
--- /dev/null
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -0,0 +1,3726 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+#
10+# This module implements support for AES instructions as per PowerISA
11+# specification version 2.07, first implemented by POWER8 processor.
12+# The module is endian-agnostic in sense that it supports both big-
13+# and little-endian cases. Data alignment in parallelizable modes is
14+# handled with VSX loads and stores, which implies MSR.VSX flag being
15+# set. It should also be noted that ISA specification doesn't prohibit
16+# alignment exceptions for these instructions on page boundaries.
17+# Initially alignment was handled in pure AltiVec/VMX way [when data
18+# is aligned programmatically, which in turn guarantees exception-
19+# free execution], but it turned to hamper performance when vcipher
20+# instructions are interleaved. It's reckoned that eventual
21+# misalignment penalties at page boundaries are in average lower
22+# than additional overhead in pure AltiVec approach.
23+#
24+# May 2016
25+#
26+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
27+# systems were measured.
28+#
29+######################################################################
30+# Current large-block performance in cycles per byte processed with
31+# 128-bit key (less is better).
32+#
33+# CBC en-/decrypt CTR XTS
34+# POWER8[le] 3.96/0.72 0.74 1.1
35+# POWER8[be] 3.75/0.65 0.66 1.0
36+
37+$flavour = shift;
38+
39+if ($flavour =~ /64/) {
40+ $SIZE_T =8;
41+ $LRSAVE =2*$SIZE_T;
42+ $STU ="stdu";
43+ $POP ="ld";
44+ $PUSH ="std";
45+ $UCMP ="cmpld";
46+ $SHL ="sldi";
47+} elsif ($flavour =~ /32/) {
48+ $SIZE_T =4;
49+ $LRSAVE =$SIZE_T;
50+ $STU ="stwu";
51+ $POP ="lwz";
52+ $PUSH ="stw";
53+ $UCMP ="cmplw";
54+ $SHL ="slwi";
55+} else { die "nonsense $flavour"; }
56+
57+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
58+
59+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62+die "can't locate ppc-xlate.pl";
63+
64+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
65+
66+$FRAME=8*$SIZE_T;
67+$prefix="aes_p8";
68+
69+$sp="r1";
70+$vrsave="r12";
71+
72+#########################################################################
73+{{{ # Key setup procedures #
74+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
75+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
76+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
77+
78+$code.=<<___;
79+.machine "any"
80+
81+.text
82+
83+.align 7
84+rcon:
85+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
86+.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
87+.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
88+.long 0,0,0,0 ?asis
89+Lconsts:
90+ mflr r0
91+ bcl 20,31,\$+4
92+ mflr $ptr #vvvvv "distance between . and rcon
93+ addi $ptr,$ptr,-0x48
94+ mtlr r0
95+ blr
96+ .long 0
97+ .byte 0,12,0x14,0,0,0,0,0
98+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
99+
100+.globl .${prefix}_set_encrypt_key
101+.align 5
102+.${prefix}_set_encrypt_key:
103+Lset_encrypt_key:
104+ mflr r11
105+ $PUSH r11,$LRSAVE($sp)
106+
107+ li $ptr,-1
108+ ${UCMP}i $inp,0
109+ beq- Lenc_key_abort # if ($inp==0) return -1;
110+ ${UCMP}i $out,0
111+ beq- Lenc_key_abort # if ($out==0) return -1;
112+ li $ptr,-2
113+ cmpwi $bits,128
114+ blt- Lenc_key_abort
115+ cmpwi $bits,256
116+ bgt- Lenc_key_abort
117+ andi. r0,$bits,0x3f
118+ bne- Lenc_key_abort
119+
120+ lis r0,0xfff0
121+ mfspr $vrsave,256
122+ mtspr 256,r0
123+
124+ bl Lconsts
125+ mtlr r11
126+
127+ neg r9,$inp
128+ lvx $in0,0,$inp
129+ addi $inp,$inp,15 # 15 is not typo
130+ lvsr $key,0,r9 # borrow $key
131+ li r8,0x20
132+ cmpwi $bits,192
133+ lvx $in1,0,$inp
134+ le?vspltisb $mask,0x0f # borrow $mask
135+ lvx $rcon,0,$ptr
136+ le?vxor $key,$key,$mask # adjust for byte swap
137+ lvx $mask,r8,$ptr
138+ addi $ptr,$ptr,0x10
139+ vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
140+ li $cnt,8
141+ vxor $zero,$zero,$zero
142+ mtctr $cnt
143+
144+ ?lvsr $outperm,0,$out
145+ vspltisb $outmask,-1
146+ lvx $outhead,0,$out
147+ ?vperm $outmask,$zero,$outmask,$outperm
148+
149+ blt Loop128
150+ addi $inp,$inp,8
151+ beq L192
152+ addi $inp,$inp,8
153+ b L256
154+
155+.align 4
156+Loop128:
157+ vperm $key,$in0,$in0,$mask # rotate-n-splat
158+ vsldoi $tmp,$zero,$in0,12 # >>32
159+ vperm $outtail,$in0,$in0,$outperm # rotate
160+ vsel $stage,$outhead,$outtail,$outmask
161+ vmr $outhead,$outtail
162+ vcipherlast $key,$key,$rcon
163+ stvx $stage,0,$out
164+ addi $out,$out,16
165+
166+ vxor $in0,$in0,$tmp
167+ vsldoi $tmp,$zero,$tmp,12 # >>32
168+ vxor $in0,$in0,$tmp
169+ vsldoi $tmp,$zero,$tmp,12 # >>32
170+ vxor $in0,$in0,$tmp
171+ vadduwm $rcon,$rcon,$rcon
172+ vxor $in0,$in0,$key
173+ bdnz Loop128
174+
175+ lvx $rcon,0,$ptr # last two round keys
176+
177+ vperm $key,$in0,$in0,$mask # rotate-n-splat
178+ vsldoi $tmp,$zero,$in0,12 # >>32
179+ vperm $outtail,$in0,$in0,$outperm # rotate
180+ vsel $stage,$outhead,$outtail,$outmask
181+ vmr $outhead,$outtail
182+ vcipherlast $key,$key,$rcon
183+ stvx $stage,0,$out
184+ addi $out,$out,16
185+
186+ vxor $in0,$in0,$tmp
187+ vsldoi $tmp,$zero,$tmp,12 # >>32
188+ vxor $in0,$in0,$tmp
189+ vsldoi $tmp,$zero,$tmp,12 # >>32
190+ vxor $in0,$in0,$tmp
191+ vadduwm $rcon,$rcon,$rcon
192+ vxor $in0,$in0,$key
193+
194+ vperm $key,$in0,$in0,$mask # rotate-n-splat
195+ vsldoi $tmp,$zero,$in0,12 # >>32
196+ vperm $outtail,$in0,$in0,$outperm # rotate
197+ vsel $stage,$outhead,$outtail,$outmask
198+ vmr $outhead,$outtail
199+ vcipherlast $key,$key,$rcon
200+ stvx $stage,0,$out
201+ addi $out,$out,16
202+
203+ vxor $in0,$in0,$tmp
204+ vsldoi $tmp,$zero,$tmp,12 # >>32
205+ vxor $in0,$in0,$tmp
206+ vsldoi $tmp,$zero,$tmp,12 # >>32
207+ vxor $in0,$in0,$tmp
208+ vxor $in0,$in0,$key
209+ vperm $outtail,$in0,$in0,$outperm # rotate
210+ vsel $stage,$outhead,$outtail,$outmask
211+ vmr $outhead,$outtail
212+ stvx $stage,0,$out
213+
214+ addi $inp,$out,15 # 15 is not typo
215+ addi $out,$out,0x50
216+
217+ li $rounds,10
218+ b Ldone
219+
220+.align 4
221+L192:
222+ lvx $tmp,0,$inp
223+ li $cnt,4
224+ vperm $outtail,$in0,$in0,$outperm # rotate
225+ vsel $stage,$outhead,$outtail,$outmask
226+ vmr $outhead,$outtail
227+ stvx $stage,0,$out
228+ addi $out,$out,16
229+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
230+ vspltisb $key,8 # borrow $key
231+ mtctr $cnt
232+ vsububm $mask,$mask,$key # adjust the mask
233+
234+Loop192:
235+ vperm $key,$in1,$in1,$mask # roate-n-splat
236+ vsldoi $tmp,$zero,$in0,12 # >>32
237+ vcipherlast $key,$key,$rcon
238+
239+ vxor $in0,$in0,$tmp
240+ vsldoi $tmp,$zero,$tmp,12 # >>32
241+ vxor $in0,$in0,$tmp
242+ vsldoi $tmp,$zero,$tmp,12 # >>32
243+ vxor $in0,$in0,$tmp
244+
245+ vsldoi $stage,$zero,$in1,8
246+ vspltw $tmp,$in0,3
247+ vxor $tmp,$tmp,$in1
248+ vsldoi $in1,$zero,$in1,12 # >>32
249+ vadduwm $rcon,$rcon,$rcon
250+ vxor $in1,$in1,$tmp
251+ vxor $in0,$in0,$key
252+ vxor $in1,$in1,$key
253+ vsldoi $stage,$stage,$in0,8
254+
255+ vperm $key,$in1,$in1,$mask # rotate-n-splat
256+ vsldoi $tmp,$zero,$in0,12 # >>32
257+ vperm $outtail,$stage,$stage,$outperm # rotate
258+ vsel $stage,$outhead,$outtail,$outmask
259+ vmr $outhead,$outtail
260+ vcipherlast $key,$key,$rcon
261+ stvx $stage,0,$out
262+ addi $out,$out,16
263+
264+ vsldoi $stage,$in0,$in1,8
265+ vxor $in0,$in0,$tmp
266+ vsldoi $tmp,$zero,$tmp,12 # >>32
267+ vperm $outtail,$stage,$stage,$outperm # rotate
268+ vsel $stage,$outhead,$outtail,$outmask
269+ vmr $outhead,$outtail
270+ vxor $in0,$in0,$tmp
271+ vsldoi $tmp,$zero,$tmp,12 # >>32
272+ vxor $in0,$in0,$tmp
273+ stvx $stage,0,$out
274+ addi $out,$out,16
275+
276+ vspltw $tmp,$in0,3
277+ vxor $tmp,$tmp,$in1
278+ vsldoi $in1,$zero,$in1,12 # >>32
279+ vadduwm $rcon,$rcon,$rcon
280+ vxor $in1,$in1,$tmp
281+ vxor $in0,$in0,$key
282+ vxor $in1,$in1,$key
283+ vperm $outtail,$in0,$in0,$outperm # rotate
284+ vsel $stage,$outhead,$outtail,$outmask
285+ vmr $outhead,$outtail
286+ stvx $stage,0,$out
287+ addi $inp,$out,15 # 15 is not typo
288+ addi $out,$out,16
289+ bdnz Loop192
290+
291+ li $rounds,12
292+ addi $out,$out,0x20
293+ b Ldone
294+
295+.align 4
296+L256:
297+ lvx $tmp,0,$inp
298+ li $cnt,7
299+ li $rounds,14
300+ vperm $outtail,$in0,$in0,$outperm # rotate
301+ vsel $stage,$outhead,$outtail,$outmask
302+ vmr $outhead,$outtail
303+ stvx $stage,0,$out
304+ addi $out,$out,16
305+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
306+ mtctr $cnt
307+
308+Loop256:
309+ vperm $key,$in1,$in1,$mask # rotate-n-splat
310+ vsldoi $tmp,$zero,$in0,12 # >>32
311+ vperm $outtail,$in1,$in1,$outperm # rotate
312+ vsel $stage,$outhead,$outtail,$outmask
313+ vmr $outhead,$outtail
314+ vcipherlast $key,$key,$rcon
315+ stvx $stage,0,$out
316+ addi $out,$out,16
317+
318+ vxor $in0,$in0,$tmp
319+ vsldoi $tmp,$zero,$tmp,12 # >>32
320+ vxor $in0,$in0,$tmp
321+ vsldoi $tmp,$zero,$tmp,12 # >>32
322+ vxor $in0,$in0,$tmp
323+ vadduwm $rcon,$rcon,$rcon
324+ vxor $in0,$in0,$key
325+ vperm $outtail,$in0,$in0,$outperm # rotate
326+ vsel $stage,$outhead,$outtail,$outmask
327+ vmr $outhead,$outtail
328+ stvx $stage,0,$out
329+ addi $inp,$out,15 # 15 is not typo
330+ addi $out,$out,16
331+ bdz Ldone
332+
333+ vspltw $key,$in0,3 # just splat
334+ vsldoi $tmp,$zero,$in1,12 # >>32
335+ vsbox $key,$key
336+
337+ vxor $in1,$in1,$tmp
338+ vsldoi $tmp,$zero,$tmp,12 # >>32
339+ vxor $in1,$in1,$tmp
340+ vsldoi $tmp,$zero,$tmp,12 # >>32
341+ vxor $in1,$in1,$tmp
342+
343+ vxor $in1,$in1,$key
344+ b Loop256
345+
346+.align 4
347+Ldone:
348+ lvx $in1,0,$inp # redundant in aligned case
349+ vsel $in1,$outhead,$in1,$outmask
350+ stvx $in1,0,$inp
351+ li $ptr,0
352+ mtspr 256,$vrsave
353+ stw $rounds,0($out)
354+
355+Lenc_key_abort:
356+ mr r3,$ptr
357+ blr
358+ .long 0
359+ .byte 0,12,0x14,1,0,0,3,0
360+ .long 0
361+.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
362+
363+.globl .${prefix}_set_decrypt_key
364+.align 5
365+.${prefix}_set_decrypt_key:
366+ $STU $sp,-$FRAME($sp)
367+ mflr r10
368+ $PUSH r10,$FRAME+$LRSAVE($sp)
369+ bl Lset_encrypt_key
370+ mtlr r10
371+
372+ cmpwi r3,0
373+ bne- Ldec_key_abort
374+
375+ slwi $cnt,$rounds,4
376+ subi $inp,$out,240 # first round key
377+ srwi $rounds,$rounds,1
378+ add $out,$inp,$cnt # last round key
379+ mtctr $rounds
380+
381+Ldeckey:
382+ lwz r0, 0($inp)
383+ lwz r6, 4($inp)
384+ lwz r7, 8($inp)
385+ lwz r8, 12($inp)
386+ addi $inp,$inp,16
387+ lwz r9, 0($out)
388+ lwz r10,4($out)
389+ lwz r11,8($out)
390+ lwz r12,12($out)
391+ stw r0, 0($out)
392+ stw r6, 4($out)
393+ stw r7, 8($out)
394+ stw r8, 12($out)
395+ subi $out,$out,16
396+ stw r9, -16($inp)
397+ stw r10,-12($inp)
398+ stw r11,-8($inp)
399+ stw r12,-4($inp)
400+ bdnz Ldeckey
401+
402+ xor r3,r3,r3 # return value
403+Ldec_key_abort:
404+ addi $sp,$sp,$FRAME
405+ blr
406+ .long 0
407+ .byte 0,12,4,1,0x80,0,3,0
408+ .long 0
409+.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
410+___
411+}}}
412+#########################################################################
413+{{{ # Single block en- and decrypt procedures #
414+sub gen_block () {
415+my $dir = shift;
416+my $n = $dir eq "de" ? "n" : "";
417+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
418+
419+$code.=<<___;
420+.globl .${prefix}_${dir}crypt
421+.align 5
422+.${prefix}_${dir}crypt:
423+ lwz $rounds,240($key)
424+ lis r0,0xfc00
425+ mfspr $vrsave,256
426+ li $idx,15 # 15 is not typo
427+ mtspr 256,r0
428+
429+ lvx v0,0,$inp
430+ neg r11,$out
431+ lvx v1,$idx,$inp
432+ lvsl v2,0,$inp # inpperm
433+ le?vspltisb v4,0x0f
434+ ?lvsl v3,0,r11 # outperm
435+ le?vxor v2,v2,v4
436+ li $idx,16
437+ vperm v0,v0,v1,v2 # align [and byte swap in LE]
438+ lvx v1,0,$key
439+ ?lvsl v5,0,$key # keyperm
440+ srwi $rounds,$rounds,1
441+ lvx v2,$idx,$key
442+ addi $idx,$idx,16
443+ subi $rounds,$rounds,1
444+ ?vperm v1,v1,v2,v5 # align round key
445+
446+ vxor v0,v0,v1
447+ lvx v1,$idx,$key
448+ addi $idx,$idx,16
449+ mtctr $rounds
450+
451+Loop_${dir}c:
452+ ?vperm v2,v2,v1,v5
453+ v${n}cipher v0,v0,v2
454+ lvx v2,$idx,$key
455+ addi $idx,$idx,16
456+ ?vperm v1,v1,v2,v5
457+ v${n}cipher v0,v0,v1
458+ lvx v1,$idx,$key
459+ addi $idx,$idx,16
460+ bdnz Loop_${dir}c
461+
462+ ?vperm v2,v2,v1,v5
463+ v${n}cipher v0,v0,v2
464+ lvx v2,$idx,$key
465+ ?vperm v1,v1,v2,v5
466+ v${n}cipherlast v0,v0,v1
467+
468+ vspltisb v2,-1
469+ vxor v1,v1,v1
470+ li $idx,15 # 15 is not typo
471+ ?vperm v2,v1,v2,v3 # outmask
472+ le?vxor v3,v3,v4
473+ lvx v1,0,$out # outhead
474+ vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
475+ vsel v1,v1,v0,v2
476+ lvx v4,$idx,$out
477+ stvx v1,0,$out
478+ vsel v0,v0,v4,v2
479+ stvx v0,$idx,$out
480+
481+ mtspr 256,$vrsave
482+ blr
483+ .long 0
484+ .byte 0,12,0x14,0,0,0,3,0
485+ .long 0
486+.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
487+___
488+}
489+&gen_block("en");
490+&gen_block("de");
491+}}}
492+#########################################################################
493+{{{ # CBC en- and decrypt procedures #
494+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
495+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
496+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
497+ map("v$_",(4..10));
498+$code.=<<___;
499+.globl .${prefix}_cbc_encrypt
500+.align 5
501+.${prefix}_cbc_encrypt:
502+ ${UCMP}i $len,16
503+ bltlr-
504+
505+ cmpwi $enc,0 # test direction
506+ lis r0,0xffe0
507+ mfspr $vrsave,256
508+ mtspr 256,r0
509+
510+ li $idx,15
511+ vxor $rndkey0,$rndkey0,$rndkey0
512+ le?vspltisb $tmp,0x0f
513+
514+ lvx $ivec,0,$ivp # load [unaligned] iv
515+ lvsl $inpperm,0,$ivp
516+ lvx $inptail,$idx,$ivp
517+ le?vxor $inpperm,$inpperm,$tmp
518+ vperm $ivec,$ivec,$inptail,$inpperm
519+
520+ neg r11,$inp
521+ ?lvsl $keyperm,0,$key # prepare for unaligned key
522+ lwz $rounds,240($key)
523+
524+ lvsr $inpperm,0,r11 # prepare for unaligned load
525+ lvx $inptail,0,$inp
526+ addi $inp,$inp,15 # 15 is not typo
527+ le?vxor $inpperm,$inpperm,$tmp
528+
529+ ?lvsr $outperm,0,$out # prepare for unaligned store
530+ vspltisb $outmask,-1
531+ lvx $outhead,0,$out
532+ ?vperm $outmask,$rndkey0,$outmask,$outperm
533+ le?vxor $outperm,$outperm,$tmp
534+
535+ srwi $rounds,$rounds,1
536+ li $idx,16
537+ subi $rounds,$rounds,1
538+ beq Lcbc_dec
539+
540+Lcbc_enc:
541+ vmr $inout,$inptail
542+ lvx $inptail,0,$inp
543+ addi $inp,$inp,16
544+ mtctr $rounds
545+ subi $len,$len,16 # len-=16
546+
547+ lvx $rndkey0,0,$key
548+ vperm $inout,$inout,$inptail,$inpperm
549+ lvx $rndkey1,$idx,$key
550+ addi $idx,$idx,16
551+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
552+ vxor $inout,$inout,$rndkey0
553+ lvx $rndkey0,$idx,$key
554+ addi $idx,$idx,16
555+ vxor $inout,$inout,$ivec
556+
557+Loop_cbc_enc:
558+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
559+ vcipher $inout,$inout,$rndkey1
560+ lvx $rndkey1,$idx,$key
561+ addi $idx,$idx,16
562+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
563+ vcipher $inout,$inout,$rndkey0
564+ lvx $rndkey0,$idx,$key
565+ addi $idx,$idx,16
566+ bdnz Loop_cbc_enc
567+
568+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
569+ vcipher $inout,$inout,$rndkey1
570+ lvx $rndkey1,$idx,$key
571+ li $idx,16
572+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
573+ vcipherlast $ivec,$inout,$rndkey0
574+ ${UCMP}i $len,16
575+
576+ vperm $tmp,$ivec,$ivec,$outperm
577+ vsel $inout,$outhead,$tmp,$outmask
578+ vmr $outhead,$tmp
579+ stvx $inout,0,$out
580+ addi $out,$out,16
581+ bge Lcbc_enc
582+
583+ b Lcbc_done
584+
585+.align 4
586+Lcbc_dec:
587+ ${UCMP}i $len,128
588+ bge _aesp8_cbc_decrypt8x
589+ vmr $tmp,$inptail
590+ lvx $inptail,0,$inp
591+ addi $inp,$inp,16
592+ mtctr $rounds
593+ subi $len,$len,16 # len-=16
594+
595+ lvx $rndkey0,0,$key
596+ vperm $tmp,$tmp,$inptail,$inpperm
597+ lvx $rndkey1,$idx,$key
598+ addi $idx,$idx,16
599+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
600+ vxor $inout,$tmp,$rndkey0
601+ lvx $rndkey0,$idx,$key
602+ addi $idx,$idx,16
603+
604+Loop_cbc_dec:
605+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
606+ vncipher $inout,$inout,$rndkey1
607+ lvx $rndkey1,$idx,$key
608+ addi $idx,$idx,16
609+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
610+ vncipher $inout,$inout,$rndkey0
611+ lvx $rndkey0,$idx,$key
612+ addi $idx,$idx,16
613+ bdnz Loop_cbc_dec
614+
615+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
616+ vncipher $inout,$inout,$rndkey1
617+ lvx $rndkey1,$idx,$key
618+ li $idx,16
619+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
620+ vncipherlast $inout,$inout,$rndkey0
621+ ${UCMP}i $len,16
622+
623+ vxor $inout,$inout,$ivec
624+ vmr $ivec,$tmp
625+ vperm $tmp,$inout,$inout,$outperm
626+ vsel $inout,$outhead,$tmp,$outmask
627+ vmr $outhead,$tmp
628+ stvx $inout,0,$out
629+ addi $out,$out,16
630+ bge Lcbc_dec
631+
632+Lcbc_done:
633+ addi $out,$out,-1
634+ lvx $inout,0,$out # redundant in aligned case
635+ vsel $inout,$outhead,$inout,$outmask
636+ stvx $inout,0,$out
637+
638+ neg $enc,$ivp # write [unaligned] iv
639+ li $idx,15 # 15 is not typo
640+ vxor $rndkey0,$rndkey0,$rndkey0
641+ vspltisb $outmask,-1
642+ le?vspltisb $tmp,0x0f
643+ ?lvsl $outperm,0,$enc
644+ ?vperm $outmask,$rndkey0,$outmask,$outperm
645+ le?vxor $outperm,$outperm,$tmp
646+ lvx $outhead,0,$ivp
647+ vperm $ivec,$ivec,$ivec,$outperm
648+ vsel $inout,$outhead,$ivec,$outmask
649+ lvx $inptail,$idx,$ivp
650+ stvx $inout,0,$ivp
651+ vsel $inout,$ivec,$inptail,$outmask
652+ stvx $inout,$idx,$ivp
653+
654+ mtspr 256,$vrsave
655+ blr
656+ .long 0
657+ .byte 0,12,0x14,0,0,0,6,0
658+ .long 0
659+___
660+#########################################################################
661+{{ # Optimized CBC decrypt procedure #
662+my $key_="r11";
663+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
664+ $x00=0 if ($flavour =~ /osx/);
665+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
666+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
667+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
668+ # v26-v31 last 6 round keys
669+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
670+
671+$code.=<<___;
672+.align 5
673+_aesp8_cbc_decrypt8x:
674+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
675+ li r10,`$FRAME+8*16+15`
676+ li r11,`$FRAME+8*16+31`
677+ stvx v20,r10,$sp # ABI says so
678+ addi r10,r10,32
679+ stvx v21,r11,$sp
680+ addi r11,r11,32
681+ stvx v22,r10,$sp
682+ addi r10,r10,32
683+ stvx v23,r11,$sp
684+ addi r11,r11,32
685+ stvx v24,r10,$sp
686+ addi r10,r10,32
687+ stvx v25,r11,$sp
688+ addi r11,r11,32
689+ stvx v26,r10,$sp
690+ addi r10,r10,32
691+ stvx v27,r11,$sp
692+ addi r11,r11,32
693+ stvx v28,r10,$sp
694+ addi r10,r10,32
695+ stvx v29,r11,$sp
696+ addi r11,r11,32
697+ stvx v30,r10,$sp
698+ stvx v31,r11,$sp
699+ li r0,-1
700+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
701+ li $x10,0x10
702+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
703+ li $x20,0x20
704+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
705+ li $x30,0x30
706+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
707+ li $x40,0x40
708+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
709+ li $x50,0x50
710+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
711+ li $x60,0x60
712+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
713+ li $x70,0x70
714+ mtspr 256,r0
715+
716+ subi $rounds,$rounds,3 # -4 in total
717+ subi $len,$len,128 # bias
718+
719+ lvx $rndkey0,$x00,$key # load key schedule
720+ lvx v30,$x10,$key
721+ addi $key,$key,0x20
722+ lvx v31,$x00,$key
723+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
724+ addi $key_,$sp,$FRAME+15
725+ mtctr $rounds
726+
727+Load_cbc_dec_key:
728+ ?vperm v24,v30,v31,$keyperm
729+ lvx v30,$x10,$key
730+ addi $key,$key,0x20
731+ stvx v24,$x00,$key_ # off-load round[1]
732+ ?vperm v25,v31,v30,$keyperm
733+ lvx v31,$x00,$key
734+ stvx v25,$x10,$key_ # off-load round[2]
735+ addi $key_,$key_,0x20
736+ bdnz Load_cbc_dec_key
737+
738+ lvx v26,$x10,$key
739+ ?vperm v24,v30,v31,$keyperm
740+ lvx v27,$x20,$key
741+ stvx v24,$x00,$key_ # off-load round[3]
742+ ?vperm v25,v31,v26,$keyperm
743+ lvx v28,$x30,$key
744+ stvx v25,$x10,$key_ # off-load round[4]
745+ addi $key_,$sp,$FRAME+15 # rewind $key_
746+ ?vperm v26,v26,v27,$keyperm
747+ lvx v29,$x40,$key
748+ ?vperm v27,v27,v28,$keyperm
749+ lvx v30,$x50,$key
750+ ?vperm v28,v28,v29,$keyperm
751+ lvx v31,$x60,$key
752+ ?vperm v29,v29,v30,$keyperm
753+ lvx $out0,$x70,$key # borrow $out0
754+ ?vperm v30,v30,v31,$keyperm
755+ lvx v24,$x00,$key_ # pre-load round[1]
756+ ?vperm v31,v31,$out0,$keyperm
757+ lvx v25,$x10,$key_ # pre-load round[2]
758+
759+ #lvx $inptail,0,$inp # "caller" already did this
760+ #addi $inp,$inp,15 # 15 is not typo
761+ subi $inp,$inp,15 # undo "caller"
762+
763+ le?li $idx,8
764+ lvx_u $in0,$x00,$inp # load first 8 "words"
765+ le?lvsl $inpperm,0,$idx
766+ le?vspltisb $tmp,0x0f
767+ lvx_u $in1,$x10,$inp
768+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
769+ lvx_u $in2,$x20,$inp
770+ le?vperm $in0,$in0,$in0,$inpperm
771+ lvx_u $in3,$x30,$inp
772+ le?vperm $in1,$in1,$in1,$inpperm
773+ lvx_u $in4,$x40,$inp
774+ le?vperm $in2,$in2,$in2,$inpperm
775+ vxor $out0,$in0,$rndkey0
776+ lvx_u $in5,$x50,$inp
777+ le?vperm $in3,$in3,$in3,$inpperm
778+ vxor $out1,$in1,$rndkey0
779+ lvx_u $in6,$x60,$inp
780+ le?vperm $in4,$in4,$in4,$inpperm
781+ vxor $out2,$in2,$rndkey0
782+ lvx_u $in7,$x70,$inp
783+ addi $inp,$inp,0x80
784+ le?vperm $in5,$in5,$in5,$inpperm
785+ vxor $out3,$in3,$rndkey0
786+ le?vperm $in6,$in6,$in6,$inpperm
787+ vxor $out4,$in4,$rndkey0
788+ le?vperm $in7,$in7,$in7,$inpperm
789+ vxor $out5,$in5,$rndkey0
790+ vxor $out6,$in6,$rndkey0
791+ vxor $out7,$in7,$rndkey0
792+
793+ mtctr $rounds
794+ b Loop_cbc_dec8x
795+.align 5
796+Loop_cbc_dec8x:
797+ vncipher $out0,$out0,v24
798+ vncipher $out1,$out1,v24
799+ vncipher $out2,$out2,v24
800+ vncipher $out3,$out3,v24
801+ vncipher $out4,$out4,v24
802+ vncipher $out5,$out5,v24
803+ vncipher $out6,$out6,v24
804+ vncipher $out7,$out7,v24
805+ lvx v24,$x20,$key_ # round[3]
806+ addi $key_,$key_,0x20
807+
808+ vncipher $out0,$out0,v25
809+ vncipher $out1,$out1,v25
810+ vncipher $out2,$out2,v25
811+ vncipher $out3,$out3,v25
812+ vncipher $out4,$out4,v25
813+ vncipher $out5,$out5,v25
814+ vncipher $out6,$out6,v25
815+ vncipher $out7,$out7,v25
816+ lvx v25,$x10,$key_ # round[4]
817+ bdnz Loop_cbc_dec8x
818+
819+ subic $len,$len,128 # $len-=128
820+ vncipher $out0,$out0,v24
821+ vncipher $out1,$out1,v24
822+ vncipher $out2,$out2,v24
823+ vncipher $out3,$out3,v24
824+ vncipher $out4,$out4,v24
825+ vncipher $out5,$out5,v24
826+ vncipher $out6,$out6,v24
827+ vncipher $out7,$out7,v24
828+
829+ subfe. r0,r0,r0 # borrow?-1:0
830+ vncipher $out0,$out0,v25
831+ vncipher $out1,$out1,v25
832+ vncipher $out2,$out2,v25
833+ vncipher $out3,$out3,v25
834+ vncipher $out4,$out4,v25
835+ vncipher $out5,$out5,v25
836+ vncipher $out6,$out6,v25
837+ vncipher $out7,$out7,v25
838+
839+ and r0,r0,$len
840+ vncipher $out0,$out0,v26
841+ vncipher $out1,$out1,v26
842+ vncipher $out2,$out2,v26
843+ vncipher $out3,$out3,v26
844+ vncipher $out4,$out4,v26
845+ vncipher $out5,$out5,v26
846+ vncipher $out6,$out6,v26
847+ vncipher $out7,$out7,v26
848+
849+ add $inp,$inp,r0 # $inp is adjusted in such
850+ # way that at exit from the
851+ # loop inX-in7 are loaded
852+ # with last "words"
853+ vncipher $out0,$out0,v27
854+ vncipher $out1,$out1,v27
855+ vncipher $out2,$out2,v27
856+ vncipher $out3,$out3,v27
857+ vncipher $out4,$out4,v27
858+ vncipher $out5,$out5,v27
859+ vncipher $out6,$out6,v27
860+ vncipher $out7,$out7,v27
861+
862+ addi $key_,$sp,$FRAME+15 # rewind $key_
863+ vncipher $out0,$out0,v28
864+ vncipher $out1,$out1,v28
865+ vncipher $out2,$out2,v28
866+ vncipher $out3,$out3,v28
867+ vncipher $out4,$out4,v28
868+ vncipher $out5,$out5,v28
869+ vncipher $out6,$out6,v28
870+ vncipher $out7,$out7,v28
871+ lvx v24,$x00,$key_ # re-pre-load round[1]
872+
873+ vncipher $out0,$out0,v29
874+ vncipher $out1,$out1,v29
875+ vncipher $out2,$out2,v29
876+ vncipher $out3,$out3,v29
877+ vncipher $out4,$out4,v29
878+ vncipher $out5,$out5,v29
879+ vncipher $out6,$out6,v29
880+ vncipher $out7,$out7,v29
881+ lvx v25,$x10,$key_ # re-pre-load round[2]
882+
883+ vncipher $out0,$out0,v30
884+ vxor $ivec,$ivec,v31 # xor with last round key
885+ vncipher $out1,$out1,v30
886+ vxor $in0,$in0,v31
887+ vncipher $out2,$out2,v30
888+ vxor $in1,$in1,v31
889+ vncipher $out3,$out3,v30
890+ vxor $in2,$in2,v31
891+ vncipher $out4,$out4,v30
892+ vxor $in3,$in3,v31
893+ vncipher $out5,$out5,v30
894+ vxor $in4,$in4,v31
895+ vncipher $out6,$out6,v30
896+ vxor $in5,$in5,v31
897+ vncipher $out7,$out7,v30
898+ vxor $in6,$in6,v31
899+
900+ vncipherlast $out0,$out0,$ivec
901+ vncipherlast $out1,$out1,$in0
902+ lvx_u $in0,$x00,$inp # load next input block
903+ vncipherlast $out2,$out2,$in1
904+ lvx_u $in1,$x10,$inp
905+ vncipherlast $out3,$out3,$in2
906+ le?vperm $in0,$in0,$in0,$inpperm
907+ lvx_u $in2,$x20,$inp
908+ vncipherlast $out4,$out4,$in3
909+ le?vperm $in1,$in1,$in1,$inpperm
910+ lvx_u $in3,$x30,$inp
911+ vncipherlast $out5,$out5,$in4
912+ le?vperm $in2,$in2,$in2,$inpperm
913+ lvx_u $in4,$x40,$inp
914+ vncipherlast $out6,$out6,$in5
915+ le?vperm $in3,$in3,$in3,$inpperm
916+ lvx_u $in5,$x50,$inp
917+ vncipherlast $out7,$out7,$in6
918+ le?vperm $in4,$in4,$in4,$inpperm
919+ lvx_u $in6,$x60,$inp
920+ vmr $ivec,$in7
921+ le?vperm $in5,$in5,$in5,$inpperm
922+ lvx_u $in7,$x70,$inp
923+ addi $inp,$inp,0x80
924+
925+ le?vperm $out0,$out0,$out0,$inpperm
926+ le?vperm $out1,$out1,$out1,$inpperm
927+ stvx_u $out0,$x00,$out
928+ le?vperm $in6,$in6,$in6,$inpperm
929+ vxor $out0,$in0,$rndkey0
930+ le?vperm $out2,$out2,$out2,$inpperm
931+ stvx_u $out1,$x10,$out
932+ le?vperm $in7,$in7,$in7,$inpperm
933+ vxor $out1,$in1,$rndkey0
934+ le?vperm $out3,$out3,$out3,$inpperm
935+ stvx_u $out2,$x20,$out
936+ vxor $out2,$in2,$rndkey0
937+ le?vperm $out4,$out4,$out4,$inpperm
938+ stvx_u $out3,$x30,$out
939+ vxor $out3,$in3,$rndkey0
940+ le?vperm $out5,$out5,$out5,$inpperm
941+ stvx_u $out4,$x40,$out
942+ vxor $out4,$in4,$rndkey0
943+ le?vperm $out6,$out6,$out6,$inpperm
944+ stvx_u $out5,$x50,$out
945+ vxor $out5,$in5,$rndkey0
946+ le?vperm $out7,$out7,$out7,$inpperm
947+ stvx_u $out6,$x60,$out
948+ vxor $out6,$in6,$rndkey0
949+ stvx_u $out7,$x70,$out
950+ addi $out,$out,0x80
951+ vxor $out7,$in7,$rndkey0
952+
953+ mtctr $rounds
954+ beq Loop_cbc_dec8x # did $len-=128 borrow?
955+
956+ addic. $len,$len,128
957+ beq Lcbc_dec8x_done
958+ nop
959+ nop
960+
961+Loop_cbc_dec8x_tail: # up to 7 "words" tail...
962+ vncipher $out1,$out1,v24
963+ vncipher $out2,$out2,v24
964+ vncipher $out3,$out3,v24
965+ vncipher $out4,$out4,v24
966+ vncipher $out5,$out5,v24
967+ vncipher $out6,$out6,v24
968+ vncipher $out7,$out7,v24
969+ lvx v24,$x20,$key_ # round[3]
970+ addi $key_,$key_,0x20
971+
972+ vncipher $out1,$out1,v25
973+ vncipher $out2,$out2,v25
974+ vncipher $out3,$out3,v25
975+ vncipher $out4,$out4,v25
976+ vncipher $out5,$out5,v25
977+ vncipher $out6,$out6,v25
978+ vncipher $out7,$out7,v25
979+ lvx v25,$x10,$key_ # round[4]
980+ bdnz Loop_cbc_dec8x_tail
981+
982+ vncipher $out1,$out1,v24
983+ vncipher $out2,$out2,v24
984+ vncipher $out3,$out3,v24
985+ vncipher $out4,$out4,v24
986+ vncipher $out5,$out5,v24
987+ vncipher $out6,$out6,v24
988+ vncipher $out7,$out7,v24
989+
990+ vncipher $out1,$out1,v25
991+ vncipher $out2,$out2,v25
992+ vncipher $out3,$out3,v25
993+ vncipher $out4,$out4,v25
994+ vncipher $out5,$out5,v25
995+ vncipher $out6,$out6,v25
996+ vncipher $out7,$out7,v25
997+
998+ vncipher $out1,$out1,v26
999+ vncipher $out2,$out2,v26
1000+ vncipher $out3,$out3,v26
1001+ vncipher $out4,$out4,v26
1002+ vncipher $out5,$out5,v26
1003+ vncipher $out6,$out6,v26
1004+ vncipher $out7,$out7,v26
1005+
1006+ vncipher $out1,$out1,v27
1007+ vncipher $out2,$out2,v27
1008+ vncipher $out3,$out3,v27
1009+ vncipher $out4,$out4,v27
1010+ vncipher $out5,$out5,v27
1011+ vncipher $out6,$out6,v27
1012+ vncipher $out7,$out7,v27
1013+
1014+ vncipher $out1,$out1,v28
1015+ vncipher $out2,$out2,v28
1016+ vncipher $out3,$out3,v28
1017+ vncipher $out4,$out4,v28
1018+ vncipher $out5,$out5,v28
1019+ vncipher $out6,$out6,v28
1020+ vncipher $out7,$out7,v28
1021+
1022+ vncipher $out1,$out1,v29
1023+ vncipher $out2,$out2,v29
1024+ vncipher $out3,$out3,v29
1025+ vncipher $out4,$out4,v29
1026+ vncipher $out5,$out5,v29
1027+ vncipher $out6,$out6,v29
1028+ vncipher $out7,$out7,v29
1029+
1030+ vncipher $out1,$out1,v30
1031+ vxor $ivec,$ivec,v31 # last round key
1032+ vncipher $out2,$out2,v30
1033+ vxor $in1,$in1,v31
1034+ vncipher $out3,$out3,v30
1035+ vxor $in2,$in2,v31
1036+ vncipher $out4,$out4,v30
1037+ vxor $in3,$in3,v31
1038+ vncipher $out5,$out5,v30
1039+ vxor $in4,$in4,v31
1040+ vncipher $out6,$out6,v30
1041+ vxor $in5,$in5,v31
1042+ vncipher $out7,$out7,v30
1043+ vxor $in6,$in6,v31
1044+
1045+ cmplwi $len,32 # switch($len)
1046+ blt Lcbc_dec8x_one
1047+ nop
1048+ beq Lcbc_dec8x_two
1049+ cmplwi $len,64
1050+ blt Lcbc_dec8x_three
1051+ nop
1052+ beq Lcbc_dec8x_four
1053+ cmplwi $len,96
1054+ blt Lcbc_dec8x_five
1055+ nop
1056+ beq Lcbc_dec8x_six
1057+
1058+Lcbc_dec8x_seven:
1059+ vncipherlast $out1,$out1,$ivec
1060+ vncipherlast $out2,$out2,$in1
1061+ vncipherlast $out3,$out3,$in2
1062+ vncipherlast $out4,$out4,$in3
1063+ vncipherlast $out5,$out5,$in4
1064+ vncipherlast $out6,$out6,$in5
1065+ vncipherlast $out7,$out7,$in6
1066+ vmr $ivec,$in7
1067+
1068+ le?vperm $out1,$out1,$out1,$inpperm
1069+ le?vperm $out2,$out2,$out2,$inpperm
1070+ stvx_u $out1,$x00,$out
1071+ le?vperm $out3,$out3,$out3,$inpperm
1072+ stvx_u $out2,$x10,$out
1073+ le?vperm $out4,$out4,$out4,$inpperm
1074+ stvx_u $out3,$x20,$out
1075+ le?vperm $out5,$out5,$out5,$inpperm
1076+ stvx_u $out4,$x30,$out
1077+ le?vperm $out6,$out6,$out6,$inpperm
1078+ stvx_u $out5,$x40,$out
1079+ le?vperm $out7,$out7,$out7,$inpperm
1080+ stvx_u $out6,$x50,$out
1081+ stvx_u $out7,$x60,$out
1082+ addi $out,$out,0x70
1083+ b Lcbc_dec8x_done
1084+
1085+.align 5
1086+Lcbc_dec8x_six:
1087+ vncipherlast $out2,$out2,$ivec
1088+ vncipherlast $out3,$out3,$in2
1089+ vncipherlast $out4,$out4,$in3
1090+ vncipherlast $out5,$out5,$in4
1091+ vncipherlast $out6,$out6,$in5
1092+ vncipherlast $out7,$out7,$in6
1093+ vmr $ivec,$in7
1094+
1095+ le?vperm $out2,$out2,$out2,$inpperm
1096+ le?vperm $out3,$out3,$out3,$inpperm
1097+ stvx_u $out2,$x00,$out
1098+ le?vperm $out4,$out4,$out4,$inpperm
1099+ stvx_u $out3,$x10,$out
1100+ le?vperm $out5,$out5,$out5,$inpperm
1101+ stvx_u $out4,$x20,$out
1102+ le?vperm $out6,$out6,$out6,$inpperm
1103+ stvx_u $out5,$x30,$out
1104+ le?vperm $out7,$out7,$out7,$inpperm
1105+ stvx_u $out6,$x40,$out
1106+ stvx_u $out7,$x50,$out
1107+ addi $out,$out,0x60
1108+ b Lcbc_dec8x_done
1109+
1110+.align 5
1111+Lcbc_dec8x_five:
1112+ vncipherlast $out3,$out3,$ivec
1113+ vncipherlast $out4,$out4,$in3
1114+ vncipherlast $out5,$out5,$in4
1115+ vncipherlast $out6,$out6,$in5
1116+ vncipherlast $out7,$out7,$in6
1117+ vmr $ivec,$in7
1118+
1119+ le?vperm $out3,$out3,$out3,$inpperm
1120+ le?vperm $out4,$out4,$out4,$inpperm
1121+ stvx_u $out3,$x00,$out
1122+ le?vperm $out5,$out5,$out5,$inpperm
1123+ stvx_u $out4,$x10,$out
1124+ le?vperm $out6,$out6,$out6,$inpperm
1125+ stvx_u $out5,$x20,$out
1126+ le?vperm $out7,$out7,$out7,$inpperm
1127+ stvx_u $out6,$x30,$out
1128+ stvx_u $out7,$x40,$out
1129+ addi $out,$out,0x50
1130+ b Lcbc_dec8x_done
1131+
1132+.align 5
1133+Lcbc_dec8x_four:
1134+ vncipherlast $out4,$out4,$ivec
1135+ vncipherlast $out5,$out5,$in4
1136+ vncipherlast $out6,$out6,$in5
1137+ vncipherlast $out7,$out7,$in6
1138+ vmr $ivec,$in7
1139+
1140+ le?vperm $out4,$out4,$out4,$inpperm
1141+ le?vperm $out5,$out5,$out5,$inpperm
1142+ stvx_u $out4,$x00,$out
1143+ le?vperm $out6,$out6,$out6,$inpperm
1144+ stvx_u $out5,$x10,$out
1145+ le?vperm $out7,$out7,$out7,$inpperm
1146+ stvx_u $out6,$x20,$out
1147+ stvx_u $out7,$x30,$out
1148+ addi $out,$out,0x40
1149+ b Lcbc_dec8x_done
1150+
1151+.align 5
1152+Lcbc_dec8x_three:
1153+ vncipherlast $out5,$out5,$ivec
1154+ vncipherlast $out6,$out6,$in5
1155+ vncipherlast $out7,$out7,$in6
1156+ vmr $ivec,$in7
1157+
1158+ le?vperm $out5,$out5,$out5,$inpperm
1159+ le?vperm $out6,$out6,$out6,$inpperm
1160+ stvx_u $out5,$x00,$out
1161+ le?vperm $out7,$out7,$out7,$inpperm
1162+ stvx_u $out6,$x10,$out
1163+ stvx_u $out7,$x20,$out
1164+ addi $out,$out,0x30
1165+ b Lcbc_dec8x_done
1166+
1167+.align 5
1168+Lcbc_dec8x_two:
1169+ vncipherlast $out6,$out6,$ivec
1170+ vncipherlast $out7,$out7,$in6
1171+ vmr $ivec,$in7
1172+
1173+ le?vperm $out6,$out6,$out6,$inpperm
1174+ le?vperm $out7,$out7,$out7,$inpperm
1175+ stvx_u $out6,$x00,$out
1176+ stvx_u $out7,$x10,$out
1177+ addi $out,$out,0x20
1178+ b Lcbc_dec8x_done
1179+
1180+.align 5
1181+Lcbc_dec8x_one:
1182+ vncipherlast $out7,$out7,$ivec
1183+ vmr $ivec,$in7
1184+
1185+ le?vperm $out7,$out7,$out7,$inpperm
1186+ stvx_u $out7,0,$out
1187+ addi $out,$out,0x10
1188+
1189+Lcbc_dec8x_done:
1190+ le?vperm $ivec,$ivec,$ivec,$inpperm
1191+ stvx_u $ivec,0,$ivp # write [unaligned] iv
1192+
1193+ li r10,`$FRAME+15`
1194+ li r11,`$FRAME+31`
1195+ stvx $inpperm,r10,$sp # wipe copies of round keys
1196+ addi r10,r10,32
1197+ stvx $inpperm,r11,$sp
1198+ addi r11,r11,32
1199+ stvx $inpperm,r10,$sp
1200+ addi r10,r10,32
1201+ stvx $inpperm,r11,$sp
1202+ addi r11,r11,32
1203+ stvx $inpperm,r10,$sp
1204+ addi r10,r10,32
1205+ stvx $inpperm,r11,$sp
1206+ addi r11,r11,32
1207+ stvx $inpperm,r10,$sp
1208+ addi r10,r10,32
1209+ stvx $inpperm,r11,$sp
1210+ addi r11,r11,32
1211+
1212+ mtspr 256,$vrsave
1213+ lvx v20,r10,$sp # ABI says so
1214+ addi r10,r10,32
1215+ lvx v21,r11,$sp
1216+ addi r11,r11,32
1217+ lvx v22,r10,$sp
1218+ addi r10,r10,32
1219+ lvx v23,r11,$sp
1220+ addi r11,r11,32
1221+ lvx v24,r10,$sp
1222+ addi r10,r10,32
1223+ lvx v25,r11,$sp
1224+ addi r11,r11,32
1225+ lvx v26,r10,$sp
1226+ addi r10,r10,32
1227+ lvx v27,r11,$sp
1228+ addi r11,r11,32
1229+ lvx v28,r10,$sp
1230+ addi r10,r10,32
1231+ lvx v29,r11,$sp
1232+ addi r11,r11,32
1233+ lvx v30,r10,$sp
1234+ lvx v31,r11,$sp
1235+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1236+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1237+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1238+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1239+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1240+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1241+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1242+ blr
1243+ .long 0
1244+ .byte 0,12,0x04,0,0x80,6,6,0
1245+ .long 0
1246+.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1247+___
1248+}} }}}
1249+
1250+#########################################################################
1251+{{{ # CTR procedure[s] #
1252+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1253+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
1254+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1255+ map("v$_",(4..11));
1256+my $dat=$tmp;
1257+
1258+$code.=<<___;
1259+.globl .${prefix}_ctr32_encrypt_blocks
1260+.align 5
1261+.${prefix}_ctr32_encrypt_blocks:
1262+ ${UCMP}i $len,1
1263+ bltlr-
1264+
1265+ lis r0,0xfff0
1266+ mfspr $vrsave,256
1267+ mtspr 256,r0
1268+
1269+ li $idx,15
1270+ vxor $rndkey0,$rndkey0,$rndkey0
1271+ le?vspltisb $tmp,0x0f
1272+
1273+ lvx $ivec,0,$ivp # load [unaligned] iv
1274+ lvsl $inpperm,0,$ivp
1275+ lvx $inptail,$idx,$ivp
1276+ vspltisb $one,1
1277+ le?vxor $inpperm,$inpperm,$tmp
1278+ vperm $ivec,$ivec,$inptail,$inpperm
1279+ vsldoi $one,$rndkey0,$one,1
1280+
1281+ neg r11,$inp
1282+ ?lvsl $keyperm,0,$key # prepare for unaligned key
1283+ lwz $rounds,240($key)
1284+
1285+ lvsr $inpperm,0,r11 # prepare for unaligned load
1286+ lvx $inptail,0,$inp
1287+ addi $inp,$inp,15 # 15 is not typo
1288+ le?vxor $inpperm,$inpperm,$tmp
1289+
1290+ srwi $rounds,$rounds,1
1291+ li $idx,16
1292+ subi $rounds,$rounds,1
1293+
1294+ ${UCMP}i $len,8
1295+ bge _aesp8_ctr32_encrypt8x
1296+
1297+ ?lvsr $outperm,0,$out # prepare for unaligned store
1298+ vspltisb $outmask,-1
1299+ lvx $outhead,0,$out
1300+ ?vperm $outmask,$rndkey0,$outmask,$outperm
1301+ le?vxor $outperm,$outperm,$tmp
1302+
1303+ lvx $rndkey0,0,$key
1304+ mtctr $rounds
1305+ lvx $rndkey1,$idx,$key
1306+ addi $idx,$idx,16
1307+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1308+ vxor $inout,$ivec,$rndkey0
1309+ lvx $rndkey0,$idx,$key
1310+ addi $idx,$idx,16
1311+ b Loop_ctr32_enc
1312+
1313+.align 5
1314+Loop_ctr32_enc:
1315+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
1316+ vcipher $inout,$inout,$rndkey1
1317+ lvx $rndkey1,$idx,$key
1318+ addi $idx,$idx,16
1319+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1320+ vcipher $inout,$inout,$rndkey0
1321+ lvx $rndkey0,$idx,$key
1322+ addi $idx,$idx,16
1323+ bdnz Loop_ctr32_enc
1324+
1325+ vadduwm $ivec,$ivec,$one
1326+ vmr $dat,$inptail
1327+ lvx $inptail,0,$inp
1328+ addi $inp,$inp,16
1329+ subic. $len,$len,1 # blocks--
1330+
1331+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
1332+ vcipher $inout,$inout,$rndkey1
1333+ lvx $rndkey1,$idx,$key
1334+ vperm $dat,$dat,$inptail,$inpperm
1335+ li $idx,16
1336+ ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
1337+ lvx $rndkey0,0,$key
1338+ vxor $dat,$dat,$rndkey1 # last round key
1339+ vcipherlast $inout,$inout,$dat
1340+
1341+ lvx $rndkey1,$idx,$key
1342+ addi $idx,$idx,16
1343+ vperm $inout,$inout,$inout,$outperm
1344+ vsel $dat,$outhead,$inout,$outmask
1345+ mtctr $rounds
1346+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1347+ vmr $outhead,$inout
1348+ vxor $inout,$ivec,$rndkey0
1349+ lvx $rndkey0,$idx,$key
1350+ addi $idx,$idx,16
1351+ stvx $dat,0,$out
1352+ addi $out,$out,16
1353+ bne Loop_ctr32_enc
1354+
1355+ addi $out,$out,-1
1356+ lvx $inout,0,$out # redundant in aligned case
1357+ vsel $inout,$outhead,$inout,$outmask
1358+ stvx $inout,0,$out
1359+
1360+ mtspr 256,$vrsave
1361+ blr
1362+ .long 0
1363+ .byte 0,12,0x14,0,0,0,6,0
1364+ .long 0
1365+___
1366+#########################################################################
1367+{{ # Optimized CTR procedure #
1368+my $key_="r11";
1369+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1370+ $x00=0 if ($flavour =~ /osx/);
1371+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1372+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1373+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
1374+ # v26-v31 last 6 round keys
1375+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
1376+my ($two,$three,$four)=($outhead,$outperm,$outmask);
1377+
1378+$code.=<<___;
1379+.align 5
1380+_aesp8_ctr32_encrypt8x:
1381+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1382+ li r10,`$FRAME+8*16+15`
1383+ li r11,`$FRAME+8*16+31`
1384+ stvx v20,r10,$sp # ABI says so
1385+ addi r10,r10,32
1386+ stvx v21,r11,$sp
1387+ addi r11,r11,32
1388+ stvx v22,r10,$sp
1389+ addi r10,r10,32
1390+ stvx v23,r11,$sp
1391+ addi r11,r11,32
1392+ stvx v24,r10,$sp
1393+ addi r10,r10,32
1394+ stvx v25,r11,$sp
1395+ addi r11,r11,32
1396+ stvx v26,r10,$sp
1397+ addi r10,r10,32
1398+ stvx v27,r11,$sp
1399+ addi r11,r11,32
1400+ stvx v28,r10,$sp
1401+ addi r10,r10,32
1402+ stvx v29,r11,$sp
1403+ addi r11,r11,32
1404+ stvx v30,r10,$sp
1405+ stvx v31,r11,$sp
1406+ li r0,-1
1407+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
1408+ li $x10,0x10
1409+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1410+ li $x20,0x20
1411+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1412+ li $x30,0x30
1413+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1414+ li $x40,0x40
1415+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1416+ li $x50,0x50
1417+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1418+ li $x60,0x60
1419+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1420+ li $x70,0x70
1421+ mtspr 256,r0
1422+
1423+ subi $rounds,$rounds,3 # -4 in total
1424+
1425+ lvx $rndkey0,$x00,$key # load key schedule
1426+ lvx v30,$x10,$key
1427+ addi $key,$key,0x20
1428+ lvx v31,$x00,$key
1429+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
1430+ addi $key_,$sp,$FRAME+15
1431+ mtctr $rounds
1432+
1433+Load_ctr32_enc_key:
1434+ ?vperm v24,v30,v31,$keyperm
1435+ lvx v30,$x10,$key
1436+ addi $key,$key,0x20
1437+ stvx v24,$x00,$key_ # off-load round[1]
1438+ ?vperm v25,v31,v30,$keyperm
1439+ lvx v31,$x00,$key
1440+ stvx v25,$x10,$key_ # off-load round[2]
1441+ addi $key_,$key_,0x20
1442+ bdnz Load_ctr32_enc_key
1443+
1444+ lvx v26,$x10,$key
1445+ ?vperm v24,v30,v31,$keyperm
1446+ lvx v27,$x20,$key
1447+ stvx v24,$x00,$key_ # off-load round[3]
1448+ ?vperm v25,v31,v26,$keyperm
1449+ lvx v28,$x30,$key
1450+ stvx v25,$x10,$key_ # off-load round[4]
1451+ addi $key_,$sp,$FRAME+15 # rewind $key_
1452+ ?vperm v26,v26,v27,$keyperm
1453+ lvx v29,$x40,$key
1454+ ?vperm v27,v27,v28,$keyperm
1455+ lvx v30,$x50,$key
1456+ ?vperm v28,v28,v29,$keyperm
1457+ lvx v31,$x60,$key
1458+ ?vperm v29,v29,v30,$keyperm
1459+ lvx $out0,$x70,$key # borrow $out0
1460+ ?vperm v30,v30,v31,$keyperm
1461+ lvx v24,$x00,$key_ # pre-load round[1]
1462+ ?vperm v31,v31,$out0,$keyperm
1463+ lvx v25,$x10,$key_ # pre-load round[2]
1464+
1465+ vadduwm $two,$one,$one
1466+ subi $inp,$inp,15 # undo "caller"
1467+ $SHL $len,$len,4
1468+
1469+ vadduwm $out1,$ivec,$one # counter values ...
1470+ vadduwm $out2,$ivec,$two
1471+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
1472+ le?li $idx,8
1473+ vadduwm $out3,$out1,$two
1474+ vxor $out1,$out1,$rndkey0
1475+ le?lvsl $inpperm,0,$idx
1476+ vadduwm $out4,$out2,$two
1477+ vxor $out2,$out2,$rndkey0
1478+ le?vspltisb $tmp,0x0f
1479+ vadduwm $out5,$out3,$two
1480+ vxor $out3,$out3,$rndkey0
1481+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
1482+ vadduwm $out6,$out4,$two
1483+ vxor $out4,$out4,$rndkey0
1484+ vadduwm $out7,$out5,$two
1485+ vxor $out5,$out5,$rndkey0
1486+ vadduwm $ivec,$out6,$two # next counter value
1487+ vxor $out6,$out6,$rndkey0
1488+ vxor $out7,$out7,$rndkey0
1489+
1490+ mtctr $rounds
1491+ b Loop_ctr32_enc8x
1492+.align 5
1493+Loop_ctr32_enc8x:
1494+ vcipher $out0,$out0,v24
1495+ vcipher $out1,$out1,v24
1496+ vcipher $out2,$out2,v24
1497+ vcipher $out3,$out3,v24
1498+ vcipher $out4,$out4,v24
1499+ vcipher $out5,$out5,v24
1500+ vcipher $out6,$out6,v24
1501+ vcipher $out7,$out7,v24
1502+Loop_ctr32_enc8x_middle:
1503+ lvx v24,$x20,$key_ # round[3]
1504+ addi $key_,$key_,0x20
1505+
1506+ vcipher $out0,$out0,v25
1507+ vcipher $out1,$out1,v25
1508+ vcipher $out2,$out2,v25
1509+ vcipher $out3,$out3,v25
1510+ vcipher $out4,$out4,v25
1511+ vcipher $out5,$out5,v25
1512+ vcipher $out6,$out6,v25
1513+ vcipher $out7,$out7,v25
1514+ lvx v25,$x10,$key_ # round[4]
1515+ bdnz Loop_ctr32_enc8x
1516+
1517+ subic r11,$len,256 # $len-256, borrow $key_
1518+ vcipher $out0,$out0,v24
1519+ vcipher $out1,$out1,v24
1520+ vcipher $out2,$out2,v24
1521+ vcipher $out3,$out3,v24
1522+ vcipher $out4,$out4,v24
1523+ vcipher $out5,$out5,v24
1524+ vcipher $out6,$out6,v24
1525+ vcipher $out7,$out7,v24
1526+
1527+ subfe r0,r0,r0 # borrow?-1:0
1528+ vcipher $out0,$out0,v25
1529+ vcipher $out1,$out1,v25
1530+ vcipher $out2,$out2,v25
1531+ vcipher $out3,$out3,v25
1532+ vcipher $out4,$out4,v25
1533+ vcipher $out5,$out5,v25
1534+ vcipher $out6,$out6,v25
1535+ vcipher $out7,$out7,v25
1536+
1537+ and r0,r0,r11
1538+ addi $key_,$sp,$FRAME+15 # rewind $key_
1539+ vcipher $out0,$out0,v26
1540+ vcipher $out1,$out1,v26
1541+ vcipher $out2,$out2,v26
1542+ vcipher $out3,$out3,v26
1543+ vcipher $out4,$out4,v26
1544+ vcipher $out5,$out5,v26
1545+ vcipher $out6,$out6,v26
1546+ vcipher $out7,$out7,v26
1547+ lvx v24,$x00,$key_ # re-pre-load round[1]
1548+
1549+ subic $len,$len,129 # $len-=129
1550+ vcipher $out0,$out0,v27
1551+ addi $len,$len,1 # $len-=128 really
1552+ vcipher $out1,$out1,v27
1553+ vcipher $out2,$out2,v27
1554+ vcipher $out3,$out3,v27
1555+ vcipher $out4,$out4,v27
1556+ vcipher $out5,$out5,v27
1557+ vcipher $out6,$out6,v27
1558+ vcipher $out7,$out7,v27
1559+ lvx v25,$x10,$key_ # re-pre-load round[2]
1560+
1561+ vcipher $out0,$out0,v28
1562+ lvx_u $in0,$x00,$inp # load input
1563+ vcipher $out1,$out1,v28
1564+ lvx_u $in1,$x10,$inp
1565+ vcipher $out2,$out2,v28
1566+ lvx_u $in2,$x20,$inp
1567+ vcipher $out3,$out3,v28
1568+ lvx_u $in3,$x30,$inp
1569+ vcipher $out4,$out4,v28
1570+ lvx_u $in4,$x40,$inp
1571+ vcipher $out5,$out5,v28
1572+ lvx_u $in5,$x50,$inp
1573+ vcipher $out6,$out6,v28
1574+ lvx_u $in6,$x60,$inp
1575+ vcipher $out7,$out7,v28
1576+ lvx_u $in7,$x70,$inp
1577+ addi $inp,$inp,0x80
1578+
1579+ vcipher $out0,$out0,v29
1580+ le?vperm $in0,$in0,$in0,$inpperm
1581+ vcipher $out1,$out1,v29
1582+ le?vperm $in1,$in1,$in1,$inpperm
1583+ vcipher $out2,$out2,v29
1584+ le?vperm $in2,$in2,$in2,$inpperm
1585+ vcipher $out3,$out3,v29
1586+ le?vperm $in3,$in3,$in3,$inpperm
1587+ vcipher $out4,$out4,v29
1588+ le?vperm $in4,$in4,$in4,$inpperm
1589+ vcipher $out5,$out5,v29
1590+ le?vperm $in5,$in5,$in5,$inpperm
1591+ vcipher $out6,$out6,v29
1592+ le?vperm $in6,$in6,$in6,$inpperm
1593+ vcipher $out7,$out7,v29
1594+ le?vperm $in7,$in7,$in7,$inpperm
1595+
1596+ add $inp,$inp,r0 # $inp is adjusted in such
1597+ # way that at exit from the
1598+ # loop inX-in7 are loaded
1599+ # with last "words"
1600+ subfe. r0,r0,r0 # borrow?-1:0
1601+ vcipher $out0,$out0,v30
1602+ vxor $in0,$in0,v31 # xor with last round key
1603+ vcipher $out1,$out1,v30
1604+ vxor $in1,$in1,v31
1605+ vcipher $out2,$out2,v30
1606+ vxor $in2,$in2,v31
1607+ vcipher $out3,$out3,v30
1608+ vxor $in3,$in3,v31
1609+ vcipher $out4,$out4,v30
1610+ vxor $in4,$in4,v31
1611+ vcipher $out5,$out5,v30
1612+ vxor $in5,$in5,v31
1613+ vcipher $out6,$out6,v30
1614+ vxor $in6,$in6,v31
1615+ vcipher $out7,$out7,v30
1616+ vxor $in7,$in7,v31
1617+
1618+ bne Lctr32_enc8x_break # did $len-129 borrow?
1619+
1620+ vcipherlast $in0,$out0,$in0
1621+ vcipherlast $in1,$out1,$in1
1622+ vadduwm $out1,$ivec,$one # counter values ...
1623+ vcipherlast $in2,$out2,$in2
1624+ vadduwm $out2,$ivec,$two
1625+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
1626+ vcipherlast $in3,$out3,$in3
1627+ vadduwm $out3,$out1,$two
1628+ vxor $out1,$out1,$rndkey0
1629+ vcipherlast $in4,$out4,$in4
1630+ vadduwm $out4,$out2,$two
1631+ vxor $out2,$out2,$rndkey0
1632+ vcipherlast $in5,$out5,$in5
1633+ vadduwm $out5,$out3,$two
1634+ vxor $out3,$out3,$rndkey0
1635+ vcipherlast $in6,$out6,$in6
1636+ vadduwm $out6,$out4,$two
1637+ vxor $out4,$out4,$rndkey0
1638+ vcipherlast $in7,$out7,$in7
1639+ vadduwm $out7,$out5,$two
1640+ vxor $out5,$out5,$rndkey0
1641+ le?vperm $in0,$in0,$in0,$inpperm
1642+ vadduwm $ivec,$out6,$two # next counter value
1643+ vxor $out6,$out6,$rndkey0
1644+ le?vperm $in1,$in1,$in1,$inpperm
1645+ vxor $out7,$out7,$rndkey0
1646+ mtctr $rounds
1647+
1648+ vcipher $out0,$out0,v24
1649+ stvx_u $in0,$x00,$out
1650+ le?vperm $in2,$in2,$in2,$inpperm
1651+ vcipher $out1,$out1,v24
1652+ stvx_u $in1,$x10,$out
1653+ le?vperm $in3,$in3,$in3,$inpperm
1654+ vcipher $out2,$out2,v24
1655+ stvx_u $in2,$x20,$out
1656+ le?vperm $in4,$in4,$in4,$inpperm
1657+ vcipher $out3,$out3,v24
1658+ stvx_u $in3,$x30,$out
1659+ le?vperm $in5,$in5,$in5,$inpperm
1660+ vcipher $out4,$out4,v24
1661+ stvx_u $in4,$x40,$out
1662+ le?vperm $in6,$in6,$in6,$inpperm
1663+ vcipher $out5,$out5,v24
1664+ stvx_u $in5,$x50,$out
1665+ le?vperm $in7,$in7,$in7,$inpperm
1666+ vcipher $out6,$out6,v24
1667+ stvx_u $in6,$x60,$out
1668+ vcipher $out7,$out7,v24
1669+ stvx_u $in7,$x70,$out
1670+ addi $out,$out,0x80
1671+
1672+ b Loop_ctr32_enc8x_middle
1673+
1674+.align 5
1675+Lctr32_enc8x_break:
1676+ cmpwi $len,-0x60
1677+ blt Lctr32_enc8x_one
1678+ nop
1679+ beq Lctr32_enc8x_two
1680+ cmpwi $len,-0x40
1681+ blt Lctr32_enc8x_three
1682+ nop
1683+ beq Lctr32_enc8x_four
1684+ cmpwi $len,-0x20
1685+ blt Lctr32_enc8x_five
1686+ nop
1687+ beq Lctr32_enc8x_six
1688+ cmpwi $len,0x00
1689+ blt Lctr32_enc8x_seven
1690+
1691+Lctr32_enc8x_eight:
1692+ vcipherlast $out0,$out0,$in0
1693+ vcipherlast $out1,$out1,$in1
1694+ vcipherlast $out2,$out2,$in2
1695+ vcipherlast $out3,$out3,$in3
1696+ vcipherlast $out4,$out4,$in4
1697+ vcipherlast $out5,$out5,$in5
1698+ vcipherlast $out6,$out6,$in6
1699+ vcipherlast $out7,$out7,$in7
1700+
1701+ le?vperm $out0,$out0,$out0,$inpperm
1702+ le?vperm $out1,$out1,$out1,$inpperm
1703+ stvx_u $out0,$x00,$out
1704+ le?vperm $out2,$out2,$out2,$inpperm
1705+ stvx_u $out1,$x10,$out
1706+ le?vperm $out3,$out3,$out3,$inpperm
1707+ stvx_u $out2,$x20,$out
1708+ le?vperm $out4,$out4,$out4,$inpperm
1709+ stvx_u $out3,$x30,$out
1710+ le?vperm $out5,$out5,$out5,$inpperm
1711+ stvx_u $out4,$x40,$out
1712+ le?vperm $out6,$out6,$out6,$inpperm
1713+ stvx_u $out5,$x50,$out
1714+ le?vperm $out7,$out7,$out7,$inpperm
1715+ stvx_u $out6,$x60,$out
1716+ stvx_u $out7,$x70,$out
1717+ addi $out,$out,0x80
1718+ b Lctr32_enc8x_done
1719+
1720+.align 5
1721+Lctr32_enc8x_seven:
1722+ vcipherlast $out0,$out0,$in1
1723+ vcipherlast $out1,$out1,$in2
1724+ vcipherlast $out2,$out2,$in3
1725+ vcipherlast $out3,$out3,$in4
1726+ vcipherlast $out4,$out4,$in5
1727+ vcipherlast $out5,$out5,$in6
1728+ vcipherlast $out6,$out6,$in7
1729+
1730+ le?vperm $out0,$out0,$out0,$inpperm
1731+ le?vperm $out1,$out1,$out1,$inpperm
1732+ stvx_u $out0,$x00,$out
1733+ le?vperm $out2,$out2,$out2,$inpperm
1734+ stvx_u $out1,$x10,$out
1735+ le?vperm $out3,$out3,$out3,$inpperm
1736+ stvx_u $out2,$x20,$out
1737+ le?vperm $out4,$out4,$out4,$inpperm
1738+ stvx_u $out3,$x30,$out
1739+ le?vperm $out5,$out5,$out5,$inpperm
1740+ stvx_u $out4,$x40,$out
1741+ le?vperm $out6,$out6,$out6,$inpperm
1742+ stvx_u $out5,$x50,$out
1743+ stvx_u $out6,$x60,$out
1744+ addi $out,$out,0x70
1745+ b Lctr32_enc8x_done
1746+
1747+.align 5
1748+Lctr32_enc8x_six:
1749+ vcipherlast $out0,$out0,$in2
1750+ vcipherlast $out1,$out1,$in3
1751+ vcipherlast $out2,$out2,$in4
1752+ vcipherlast $out3,$out3,$in5
1753+ vcipherlast $out4,$out4,$in6
1754+ vcipherlast $out5,$out5,$in7
1755+
1756+ le?vperm $out0,$out0,$out0,$inpperm
1757+ le?vperm $out1,$out1,$out1,$inpperm
1758+ stvx_u $out0,$x00,$out
1759+ le?vperm $out2,$out2,$out2,$inpperm
1760+ stvx_u $out1,$x10,$out
1761+ le?vperm $out3,$out3,$out3,$inpperm
1762+ stvx_u $out2,$x20,$out
1763+ le?vperm $out4,$out4,$out4,$inpperm
1764+ stvx_u $out3,$x30,$out
1765+ le?vperm $out5,$out5,$out5,$inpperm
1766+ stvx_u $out4,$x40,$out
1767+ stvx_u $out5,$x50,$out
1768+ addi $out,$out,0x60
1769+ b Lctr32_enc8x_done
1770+
1771+.align 5
1772+Lctr32_enc8x_five:
1773+ vcipherlast $out0,$out0,$in3
1774+ vcipherlast $out1,$out1,$in4
1775+ vcipherlast $out2,$out2,$in5
1776+ vcipherlast $out3,$out3,$in6
1777+ vcipherlast $out4,$out4,$in7
1778+
1779+ le?vperm $out0,$out0,$out0,$inpperm
1780+ le?vperm $out1,$out1,$out1,$inpperm
1781+ stvx_u $out0,$x00,$out
1782+ le?vperm $out2,$out2,$out2,$inpperm
1783+ stvx_u $out1,$x10,$out
1784+ le?vperm $out3,$out3,$out3,$inpperm
1785+ stvx_u $out2,$x20,$out
1786+ le?vperm $out4,$out4,$out4,$inpperm
1787+ stvx_u $out3,$x30,$out
1788+ stvx_u $out4,$x40,$out
1789+ addi $out,$out,0x50
1790+ b Lctr32_enc8x_done
1791+
1792+.align 5
1793+Lctr32_enc8x_four:
1794+ vcipherlast $out0,$out0,$in4
1795+ vcipherlast $out1,$out1,$in5
1796+ vcipherlast $out2,$out2,$in6
1797+ vcipherlast $out3,$out3,$in7
1798+
1799+ le?vperm $out0,$out0,$out0,$inpperm
1800+ le?vperm $out1,$out1,$out1,$inpperm
1801+ stvx_u $out0,$x00,$out
1802+ le?vperm $out2,$out2,$out2,$inpperm
1803+ stvx_u $out1,$x10,$out
1804+ le?vperm $out3,$out3,$out3,$inpperm
1805+ stvx_u $out2,$x20,$out
1806+ stvx_u $out3,$x30,$out
1807+ addi $out,$out,0x40
1808+ b Lctr32_enc8x_done
1809+
1810+.align 5
1811+Lctr32_enc8x_three:
1812+ vcipherlast $out0,$out0,$in5
1813+ vcipherlast $out1,$out1,$in6
1814+ vcipherlast $out2,$out2,$in7
1815+
1816+ le?vperm $out0,$out0,$out0,$inpperm
1817+ le?vperm $out1,$out1,$out1,$inpperm
1818+ stvx_u $out0,$x00,$out
1819+ le?vperm $out2,$out2,$out2,$inpperm
1820+ stvx_u $out1,$x10,$out
1821+ stvx_u $out2,$x20,$out
1822+ addi $out,$out,0x30
1823+ b Lcbc_dec8x_done
1824+
1825+.align 5
1826+Lctr32_enc8x_two:
1827+ vcipherlast $out0,$out0,$in6
1828+ vcipherlast $out1,$out1,$in7
1829+
1830+ le?vperm $out0,$out0,$out0,$inpperm
1831+ le?vperm $out1,$out1,$out1,$inpperm
1832+ stvx_u $out0,$x00,$out
1833+ stvx_u $out1,$x10,$out
1834+ addi $out,$out,0x20
1835+ b Lcbc_dec8x_done
1836+
1837+.align 5
1838+Lctr32_enc8x_one:
1839+ vcipherlast $out0,$out0,$in7
1840+
1841+ le?vperm $out0,$out0,$out0,$inpperm
1842+ stvx_u $out0,0,$out
1843+ addi $out,$out,0x10
1844+
1845+Lctr32_enc8x_done:
1846+ li r10,`$FRAME+15`
1847+ li r11,`$FRAME+31`
1848+ stvx $inpperm,r10,$sp # wipe copies of round keys
1849+ addi r10,r10,32
1850+ stvx $inpperm,r11,$sp
1851+ addi r11,r11,32
1852+ stvx $inpperm,r10,$sp
1853+ addi r10,r10,32
1854+ stvx $inpperm,r11,$sp
1855+ addi r11,r11,32
1856+ stvx $inpperm,r10,$sp
1857+ addi r10,r10,32
1858+ stvx $inpperm,r11,$sp
1859+ addi r11,r11,32
1860+ stvx $inpperm,r10,$sp
1861+ addi r10,r10,32
1862+ stvx $inpperm,r11,$sp
1863+ addi r11,r11,32
1864+
1865+ mtspr 256,$vrsave
1866+ lvx v20,r10,$sp # ABI says so
1867+ addi r10,r10,32
1868+ lvx v21,r11,$sp
1869+ addi r11,r11,32
1870+ lvx v22,r10,$sp
1871+ addi r10,r10,32
1872+ lvx v23,r11,$sp
1873+ addi r11,r11,32
1874+ lvx v24,r10,$sp
1875+ addi r10,r10,32
1876+ lvx v25,r11,$sp
1877+ addi r11,r11,32
1878+ lvx v26,r10,$sp
1879+ addi r10,r10,32
1880+ lvx v27,r11,$sp
1881+ addi r11,r11,32
1882+ lvx v28,r10,$sp
1883+ addi r10,r10,32
1884+ lvx v29,r11,$sp
1885+ addi r11,r11,32
1886+ lvx v30,r10,$sp
1887+ lvx v31,r11,$sp
1888+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1889+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1890+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1891+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1892+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1893+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1894+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1895+ blr
1896+ .long 0
1897+ .byte 0,12,0x04,0,0x80,6,6,0
1898+ .long 0
1899+.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1900+___
1901+}} }}}
1902+
1903+#########################################################################
1904+{{{ # XTS procedures #
1905+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
1906+my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
1907+my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
1908+my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
1909+my $taillen = $key2;
1910+
1911+ ($inp,$idx) = ($idx,$inp); # reassign
1912+
1913+$code.=<<___;
1914+.globl .${prefix}_xts_encrypt
1915+.align 5
1916+.${prefix}_xts_encrypt:
1917+ mr $inp,r3 # reassign
1918+ li r3,-1
1919+ ${UCMP}i $len,16
1920+ bltlr-
1921+
1922+ lis r0,0xfff0
1923+ mfspr r12,256 # save vrsave
1924+ li r11,0
1925+ mtspr 256,r0
1926+
1927+ vspltisb $seven,0x07 # 0x070707..07
1928+ le?lvsl $leperm,r11,r11
1929+ le?vspltisb $tmp,0x0f
1930+ le?vxor $leperm,$leperm,$seven
1931+
1932+ li $idx,15
1933+ lvx $tweak,0,$ivp # load [unaligned] iv
1934+ lvsl $inpperm,0,$ivp
1935+ lvx $inptail,$idx,$ivp
1936+ le?vxor $inpperm,$inpperm,$tmp
1937+ vperm $tweak,$tweak,$inptail,$inpperm
1938+
1939+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
1940+ lwz $rounds,240($key2)
1941+ srwi $rounds,$rounds,1
1942+ subi $rounds,$rounds,1
1943+ li $idx,16
1944+
1945+ neg r11,$inp
1946+ lvsr $inpperm,0,r11 # prepare for unaligned load
1947+ lvx $inout,0,$inp
1948+ addi $inp,$inp,15 # 15 is not typo
1949+ le?vxor $inpperm,$inpperm,$tmp
1950+
1951+ lvx $rndkey0,0,$key2
1952+ lvx $rndkey1,$idx,$key2
1953+ addi $idx,$idx,16
1954+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1955+ vxor $tweak,$tweak,$rndkey0
1956+ lvx $rndkey0,$idx,$key2
1957+ addi $idx,$idx,16
1958+ mtctr $rounds
1959+
1960+Ltweak_xts_enc:
1961+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
1962+ vcipher $tweak,$tweak,$rndkey1
1963+ lvx $rndkey1,$idx,$key2
1964+ addi $idx,$idx,16
1965+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1966+ vcipher $tweak,$tweak,$rndkey0
1967+ lvx $rndkey0,$idx,$key2
1968+ addi $idx,$idx,16
1969+ bdnz Ltweak_xts_enc
1970+
1971+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
1972+ vcipher $tweak,$tweak,$rndkey1
1973+ lvx $rndkey1,$idx,$key2
1974+ li $idx,16
1975+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
1976+ vcipherlast $tweak,$tweak,$rndkey0
1977+
1978+ lvx $inptail,0,$inp
1979+ addi $inp,$inp,16
1980+
1981+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
1982+ lwz $rounds,240($key1)
1983+ srwi $rounds,$rounds,1
1984+ subi $rounds,$rounds,1
1985+ li $idx,16
1986+
1987+ vslb $eighty7,$seven,$seven # 0x808080..80
1988+ vor $eighty7,$eighty7,$seven # 0x878787..87
1989+ vspltisb $tmp,1 # 0x010101..01
1990+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
1991+
1992+ ${UCMP}i $len,96
1993+ bge _aesp8_xts_encrypt6x
1994+
1995+ andi. $taillen,$len,15
1996+ subic r0,$len,32
1997+ subi $taillen,$taillen,16
1998+ subfe r0,r0,r0
1999+ and r0,r0,$taillen
2000+ add $inp,$inp,r0
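# (The subic/subfe pair above is a branchless trick: r0 ends up all-ones
# when $len < 32 and zero otherwise, via the carry bit of the unsigned
# compare, masking the ($len mod 16) - 16 adjustment that positions $inp
# for the ciphertext-stealing tail.)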
2001+
2002+ lvx $rndkey0,0,$key1
2003+ lvx $rndkey1,$idx,$key1
2004+ addi $idx,$idx,16
2005+ vperm $inout,$inout,$inptail,$inpperm
2006+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2007+ vxor $inout,$inout,$tweak
2008+ vxor $inout,$inout,$rndkey0
2009+ lvx $rndkey0,$idx,$key1
2010+ addi $idx,$idx,16
2011+ mtctr $rounds
2012+ b Loop_xts_enc
2013+
2014+.align 5
2015+Loop_xts_enc:
2016+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2017+ vcipher $inout,$inout,$rndkey1
2018+ lvx $rndkey1,$idx,$key1
2019+ addi $idx,$idx,16
2020+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2021+ vcipher $inout,$inout,$rndkey0
2022+ lvx $rndkey0,$idx,$key1
2023+ addi $idx,$idx,16
2024+ bdnz Loop_xts_enc
2025+
2026+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2027+ vcipher $inout,$inout,$rndkey1
2028+ lvx $rndkey1,$idx,$key1
2029+ li $idx,16
2030+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2031+ vxor $rndkey0,$rndkey0,$tweak
2032+ vcipherlast $output,$inout,$rndkey0
2033+
2034+ le?vperm $tmp,$output,$output,$leperm
2035+ be?nop
2036+ le?stvx_u $tmp,0,$out
2037+ be?stvx_u $output,0,$out
2038+ addi $out,$out,16
2039+
2040+ subic. $len,$len,16
2041+ beq Lxts_enc_done
2042+
2043+ vmr $inout,$inptail
2044+ lvx $inptail,0,$inp
2045+ addi $inp,$inp,16
2046+ lvx $rndkey0,0,$key1
2047+ lvx $rndkey1,$idx,$key1
2048+ addi $idx,$idx,16
2049+
2050+ subic r0,$len,32
2051+ subfe r0,r0,r0
2052+ and r0,r0,$taillen
2053+ add $inp,$inp,r0
2054+
2055+ vsrab $tmp,$tweak,$seven # next tweak value
2056+ vaddubm $tweak,$tweak,$tweak
2057+ vsldoi $tmp,$tmp,$tmp,15
2058+ vand $tmp,$tmp,$eighty7
2059+ vxor $tweak,$tweak,$tmp
2060+
2061+ vperm $inout,$inout,$inptail,$inpperm
2062+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2063+ vxor $inout,$inout,$tweak
2064+ vxor $output,$output,$rndkey0 # just in case $len<16
2065+ vxor $inout,$inout,$rndkey0
2066+ lvx $rndkey0,$idx,$key1
2067+ addi $idx,$idx,16
2068+
2069+ mtctr $rounds
2070+ ${UCMP}i $len,16
2071+ bge Loop_xts_enc
2072+
2073+ vxor $output,$output,$tweak
2074+ lvsr $inpperm,0,$len # $inpperm is no longer needed
2075+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
2076+ vspltisb $tmp,-1
2077+ vperm $inptail,$inptail,$tmp,$inpperm
2078+ vsel $inout,$inout,$output,$inptail
2079+
2080+ subi r11,$out,17
2081+ subi $out,$out,16
2082+ mtctr $len
2083+ li $len,16
2084+Loop_xts_enc_steal:
2085+ lbzu r0,1(r11)
2086+ stb r0,16(r11)
2087+ bdnz Loop_xts_enc_steal
2088+
2089+ mtctr $rounds
2090+ b Loop_xts_enc # one more time...
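# (Standard XTS ciphertext stealing: the byte loop above copies the leading
# tail-length bytes of the last full ciphertext block forward into the
# partial-block output position, and the branch re-encrypts the partial
# plaintext, padded with the remaining bytes of that ciphertext block, as
# the new final full block.)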
2091+
2092+Lxts_enc_done:
2093+ mtspr 256,r12 # restore vrsave
2094+ li r3,0
2095+ blr
2096+ .long 0
2097+ .byte 0,12,0x04,0,0x80,6,6,0
2098+ .long 0
2099+.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2100+
2101+.globl .${prefix}_xts_decrypt
2102+.align 5
2103+.${prefix}_xts_decrypt:
2104+ mr $inp,r3 # reassign
2105+ li r3,-1
2106+ ${UCMP}i $len,16
2107+ bltlr-
2108+
2109+ lis r0,0xfff8
2110+ mfspr r12,256 # save vrsave
2111+ li r11,0
2112+ mtspr 256,r0
2113+
2114+ andi. r0,$len,15
2115+ neg r0,r0
2116+ andi. r0,r0,16
2117+ sub $len,$len,r0
2118+
2119+ vspltisb $seven,0x07 # 0x070707..07
2120+ le?lvsl $leperm,r11,r11
2121+ le?vspltisb $tmp,0x0f
2122+ le?vxor $leperm,$leperm,$seven
2123+
2124+ li $idx,15
2125+ lvx $tweak,0,$ivp # load [unaligned] iv
2126+ lvsl $inpperm,0,$ivp
2127+ lvx $inptail,$idx,$ivp
2128+ le?vxor $inpperm,$inpperm,$tmp
2129+ vperm $tweak,$tweak,$inptail,$inpperm
2130+
2131+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
2132+ lwz $rounds,240($key2)
2133+ srwi $rounds,$rounds,1
2134+ subi $rounds,$rounds,1
2135+ li $idx,16
2136+
2137+ neg r11,$inp
2138+ lvsr $inpperm,0,r11 # prepare for unaligned load
2139+ lvx $inout,0,$inp
2140+ addi $inp,$inp,15 # 15 is not typo
2141+ le?vxor $inpperm,$inpperm,$tmp
2142+
2143+ lvx $rndkey0,0,$key2
2144+ lvx $rndkey1,$idx,$key2
2145+ addi $idx,$idx,16
2146+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2147+ vxor $tweak,$tweak,$rndkey0
2148+ lvx $rndkey0,$idx,$key2
2149+ addi $idx,$idx,16
2150+ mtctr $rounds
2151+
2152+Ltweak_xts_dec:
2153+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2154+ vcipher $tweak,$tweak,$rndkey1
2155+ lvx $rndkey1,$idx,$key2
2156+ addi $idx,$idx,16
2157+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2158+ vcipher $tweak,$tweak,$rndkey0
2159+ lvx $rndkey0,$idx,$key2
2160+ addi $idx,$idx,16
2161+ bdnz Ltweak_xts_dec
2162+
2163+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2164+ vcipher $tweak,$tweak,$rndkey1
2165+ lvx $rndkey1,$idx,$key2
2166+ li $idx,16
2167+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2168+ vcipherlast $tweak,$tweak,$rndkey0
2169+
2170+ lvx $inptail,0,$inp
2171+ addi $inp,$inp,16
2172+
2173+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
2174+ lwz $rounds,240($key1)
2175+ srwi $rounds,$rounds,1
2176+ subi $rounds,$rounds,1
2177+ li $idx,16
2178+
2179+ vslb $eighty7,$seven,$seven # 0x808080..80
2180+ vor $eighty7,$eighty7,$seven # 0x878787..87
2181+ vspltisb $tmp,1 # 0x010101..01
2182+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
2183+
2184+ ${UCMP}i $len,96
2185+ bge _aesp8_xts_decrypt6x
2186+
2187+ lvx $rndkey0,0,$key1
2188+ lvx $rndkey1,$idx,$key1
2189+ addi $idx,$idx,16
2190+ vperm $inout,$inout,$inptail,$inpperm
2191+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2192+ vxor $inout,$inout,$tweak
2193+ vxor $inout,$inout,$rndkey0
2194+ lvx $rndkey0,$idx,$key1
2195+ addi $idx,$idx,16
2196+ mtctr $rounds
2197+
2198+ ${UCMP}i $len,16
2199+ blt Ltail_xts_dec
2200+ be?b Loop_xts_dec
2201+
2202+.align 5
2203+Loop_xts_dec:
2204+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2205+ vncipher $inout,$inout,$rndkey1
2206+ lvx $rndkey1,$idx,$key1
2207+ addi $idx,$idx,16
2208+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2209+ vncipher $inout,$inout,$rndkey0
2210+ lvx $rndkey0,$idx,$key1
2211+ addi $idx,$idx,16
2212+ bdnz Loop_xts_dec
2213+
2214+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2215+ vncipher $inout,$inout,$rndkey1
2216+ lvx $rndkey1,$idx,$key1
2217+ li $idx,16
2218+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2219+ vxor $rndkey0,$rndkey0,$tweak
2220+ vncipherlast $output,$inout,$rndkey0
2221+
2222+ le?vperm $tmp,$output,$output,$leperm
2223+ be?nop
2224+ le?stvx_u $tmp,0,$out
2225+ be?stvx_u $output,0,$out
2226+ addi $out,$out,16
2227+
2228+ subic. $len,$len,16
2229+ beq Lxts_dec_done
2230+
2231+ vmr $inout,$inptail
2232+ lvx $inptail,0,$inp
2233+ addi $inp,$inp,16
2234+ lvx $rndkey0,0,$key1
2235+ lvx $rndkey1,$idx,$key1
2236+ addi $idx,$idx,16
2237+
2238+ vsrab $tmp,$tweak,$seven # next tweak value
2239+ vaddubm $tweak,$tweak,$tweak
2240+ vsldoi $tmp,$tmp,$tmp,15
2241+ vand $tmp,$tmp,$eighty7
2242+ vxor $tweak,$tweak,$tmp
2243+
2244+ vperm $inout,$inout,$inptail,$inpperm
2245+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2246+ vxor $inout,$inout,$tweak
2247+ vxor $inout,$inout,$rndkey0
2248+ lvx $rndkey0,$idx,$key1
2249+ addi $idx,$idx,16
2250+
2251+ mtctr $rounds
2252+ ${UCMP}i $len,16
2253+ bge Loop_xts_dec
2254+
2255+Ltail_xts_dec:
2256+ vsrab $tmp,$tweak,$seven # next tweak value
2257+ vaddubm $tweak1,$tweak,$tweak
2258+ vsldoi $tmp,$tmp,$tmp,15
2259+ vand $tmp,$tmp,$eighty7
2260+ vxor $tweak1,$tweak1,$tmp
2261+
2262+ subi $inp,$inp,16
2263+ add $inp,$inp,$len
2264+
2265+ vxor $inout,$inout,$tweak # :-(
2266+ vxor $inout,$inout,$tweak1 # :-)
2267+
2268+Loop_xts_dec_short:
2269+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2270+ vncipher $inout,$inout,$rndkey1
2271+ lvx $rndkey1,$idx,$key1
2272+ addi $idx,$idx,16
2273+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2274+ vncipher $inout,$inout,$rndkey0
2275+ lvx $rndkey0,$idx,$key1
2276+ addi $idx,$idx,16
2277+ bdnz Loop_xts_dec_short
2278+
2279+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
2280+ vncipher $inout,$inout,$rndkey1
2281+ lvx $rndkey1,$idx,$key1
2282+ li $idx,16
2283+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2284+ vxor $rndkey0,$rndkey0,$tweak1
2285+ vncipherlast $output,$inout,$rndkey0
2286+
2287+ le?vperm $tmp,$output,$output,$leperm
2288+ be?nop
2289+ le?stvx_u $tmp,0,$out
2290+ be?stvx_u $output,0,$out
2291+
2292+ vmr $inout,$inptail
2293+ lvx $inptail,0,$inp
2294+ #addi $inp,$inp,16
2295+ lvx $rndkey0,0,$key1
2296+ lvx $rndkey1,$idx,$key1
2297+ addi $idx,$idx,16
2298+ vperm $inout,$inout,$inptail,$inpperm
2299+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
2300+
2301+ lvsr $inpperm,0,$len # $inpperm is no longer needed
2302+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
2303+ vspltisb $tmp,-1
2304+ vperm $inptail,$inptail,$tmp,$inpperm
2305+ vsel $inout,$inout,$output,$inptail
2306+
2307+ vxor $rndkey0,$rndkey0,$tweak
2308+ vxor $inout,$inout,$rndkey0
2309+ lvx $rndkey0,$idx,$key1
2310+ addi $idx,$idx,16
2311+
2312+ subi r11,$out,1
2313+ mtctr $len
2314+ li $len,16
2315+Loop_xts_dec_steal:
2316+ lbzu r0,1(r11)
2317+ stb r0,16(r11)
2318+ bdnz Loop_xts_dec_steal
2319+
2320+ mtctr $rounds
2321+ b Loop_xts_dec # one more time...
2322+
2323+Lxts_dec_done:
2324+ mtspr 256,r12 # restore vrsave
2325+ li r3,0
2326+ blr
2327+ .long 0
2328+ .byte 0,12,0x04,0,0x80,6,6,0
2329+ .long 0
2330+.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2331+___
2332+#########################################################################
2333+{{ # Optimized XTS procedures #
2334+my $key_="r11";
2335+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
2336+ $x00=0 if ($flavour =~ /osx/);
2337+my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
2338+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2339+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2340+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
2341+ # v26-v31 last 6 round keys
2342+my ($keyperm)=($out0); # aliases with "caller", redundant assignment
2343+my $taillen=$x70;
2344+
2345+$code.=<<___;
2346+.align 5
2347+_aesp8_xts_encrypt6x:
2348+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2349+ mflr r0
2350+ li r7,`$FRAME+8*16+15`
2351+ li r8,`$FRAME+8*16+31`
2352+ $PUSH r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2353+ stvx v20,r7,$sp # ABI says so
2354+ addi r7,r7,32
2355+ stvx v21,r8,$sp
2356+ addi r8,r8,32
2357+ stvx v22,r7,$sp
2358+ addi r7,r7,32
2359+ stvx v23,r8,$sp
2360+ addi r8,r8,32
2361+ stvx v24,r7,$sp
2362+ addi r7,r7,32
2363+ stvx v25,r8,$sp
2364+ addi r8,r8,32
2365+ stvx v26,r7,$sp
2366+ addi r7,r7,32
2367+ stvx v27,r8,$sp
2368+ addi r8,r8,32
2369+ stvx v28,r7,$sp
2370+ addi r7,r7,32
2371+ stvx v29,r8,$sp
2372+ addi r8,r8,32
2373+ stvx v30,r7,$sp
2374+ stvx v31,r8,$sp
2375+ mr r7,r0
2376+ li r0,-1
2377+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
2378+ li $x10,0x10
2379+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2380+ li $x20,0x20
2381+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2382+ li $x30,0x30
2383+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2384+ li $x40,0x40
2385+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2386+ li $x50,0x50
2387+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2388+ li $x60,0x60
2389+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2390+ li $x70,0x70
2391+ mtspr 256,r0
2392+
2393+ subi $rounds,$rounds,3 # -4 in total
2394+
2395+ lvx $rndkey0,$x00,$key1 # load key schedule
2396+ lvx v30,$x10,$key1
2397+ addi $key1,$key1,0x20
2398+ lvx v31,$x00,$key1
2399+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
2400+ addi $key_,$sp,$FRAME+15
2401+ mtctr $rounds
2402+
2403+Load_xts_enc_key:
2404+ ?vperm v24,v30,v31,$keyperm
2405+ lvx v30,$x10,$key1
2406+ addi $key1,$key1,0x20
2407+ stvx v24,$x00,$key_ # off-load round[1]
2408+ ?vperm v25,v31,v30,$keyperm
2409+ lvx v31,$x00,$key1
2410+ stvx v25,$x10,$key_ # off-load round[2]
2411+ addi $key_,$key_,0x20
2412+ bdnz Load_xts_enc_key
2413+
2414+ lvx v26,$x10,$key1
2415+ ?vperm v24,v30,v31,$keyperm
2416+ lvx v27,$x20,$key1
2417+ stvx v24,$x00,$key_ # off-load round[3]
2418+ ?vperm v25,v31,v26,$keyperm
2419+ lvx v28,$x30,$key1
2420+ stvx v25,$x10,$key_ # off-load round[4]
2421+ addi $key_,$sp,$FRAME+15 # rewind $key_
2422+ ?vperm v26,v26,v27,$keyperm
2423+ lvx v29,$x40,$key1
2424+ ?vperm v27,v27,v28,$keyperm
2425+ lvx v30,$x50,$key1
2426+ ?vperm v28,v28,v29,$keyperm
2427+ lvx v31,$x60,$key1
2428+ ?vperm v29,v29,v30,$keyperm
2429+ lvx $twk5,$x70,$key1 # borrow $twk5
2430+ ?vperm v30,v30,v31,$keyperm
2431+ lvx v24,$x00,$key_ # pre-load round[1]
2432+ ?vperm v31,v31,$twk5,$keyperm
2433+ lvx v25,$x10,$key_ # pre-load round[2]
2434+
2435+ vperm $in0,$inout,$inptail,$inpperm
2436+ subi $inp,$inp,31 # undo "caller"
2437+ vxor $twk0,$tweak,$rndkey0
2438+ vsrab $tmp,$tweak,$seven # next tweak value
2439+ vaddubm $tweak,$tweak,$tweak
2440+ vsldoi $tmp,$tmp,$tmp,15
2441+ vand $tmp,$tmp,$eighty7
2442+ vxor $out0,$in0,$twk0
2443+ vxor $tweak,$tweak,$tmp
2444+
2445+ lvx_u $in1,$x10,$inp
2446+ vxor $twk1,$tweak,$rndkey0
2447+ vsrab $tmp,$tweak,$seven # next tweak value
2448+ vaddubm $tweak,$tweak,$tweak
2449+ vsldoi $tmp,$tmp,$tmp,15
2450+ le?vperm $in1,$in1,$in1,$leperm
2451+ vand $tmp,$tmp,$eighty7
2452+ vxor $out1,$in1,$twk1
2453+ vxor $tweak,$tweak,$tmp
2454+
2455+ lvx_u $in2,$x20,$inp
2456+ andi. $taillen,$len,15
2457+ vxor $twk2,$tweak,$rndkey0
2458+ vsrab $tmp,$tweak,$seven # next tweak value
2459+ vaddubm $tweak,$tweak,$tweak
2460+ vsldoi $tmp,$tmp,$tmp,15
2461+ le?vperm $in2,$in2,$in2,$leperm
2462+ vand $tmp,$tmp,$eighty7
2463+ vxor $out2,$in2,$twk2
2464+ vxor $tweak,$tweak,$tmp
2465+
2466+ lvx_u $in3,$x30,$inp
2467+ sub $len,$len,$taillen
2468+ vxor $twk3,$tweak,$rndkey0
2469+ vsrab $tmp,$tweak,$seven # next tweak value
2470+ vaddubm $tweak,$tweak,$tweak
2471+ vsldoi $tmp,$tmp,$tmp,15
2472+ le?vperm $in3,$in3,$in3,$leperm
2473+ vand $tmp,$tmp,$eighty7
2474+ vxor $out3,$in3,$twk3
2475+ vxor $tweak,$tweak,$tmp
2476+
2477+ lvx_u $in4,$x40,$inp
2478+ subi $len,$len,0x60
2479+ vxor $twk4,$tweak,$rndkey0
2480+ vsrab $tmp,$tweak,$seven # next tweak value
2481+ vaddubm $tweak,$tweak,$tweak
2482+ vsldoi $tmp,$tmp,$tmp,15
2483+ le?vperm $in4,$in4,$in4,$leperm
2484+ vand $tmp,$tmp,$eighty7
2485+ vxor $out4,$in4,$twk4
2486+ vxor $tweak,$tweak,$tmp
2487+
2488+ lvx_u $in5,$x50,$inp
2489+ addi $inp,$inp,0x60
2490+ vxor $twk5,$tweak,$rndkey0
2491+ vsrab $tmp,$tweak,$seven # next tweak value
2492+ vaddubm $tweak,$tweak,$tweak
2493+ vsldoi $tmp,$tmp,$tmp,15
2494+ le?vperm $in5,$in5,$in5,$leperm
2495+ vand $tmp,$tmp,$eighty7
2496+ vxor $out5,$in5,$twk5
2497+ vxor $tweak,$tweak,$tmp
2498+
2499+ vxor v31,v31,$rndkey0
2500+ mtctr $rounds
2501+ b Loop_xts_enc6x
2502+
2503+.align 5
2504+Loop_xts_enc6x:
2505+ vcipher $out0,$out0,v24
2506+ vcipher $out1,$out1,v24
2507+ vcipher $out2,$out2,v24
2508+ vcipher $out3,$out3,v24
2509+ vcipher $out4,$out4,v24
2510+ vcipher $out5,$out5,v24
2511+ lvx v24,$x20,$key_ # round[3]
2512+ addi $key_,$key_,0x20
2513+
2514+ vcipher $out0,$out0,v25
2515+ vcipher $out1,$out1,v25
2516+ vcipher $out2,$out2,v25
2517+ vcipher $out3,$out3,v25
2518+ vcipher $out4,$out4,v25
2519+ vcipher $out5,$out5,v25
2520+ lvx v25,$x10,$key_ # round[4]
2521+ bdnz Loop_xts_enc6x
2522+
2523+ subic $len,$len,96 # $len-=96
2524+ vxor $in0,$twk0,v31 # xor with last round key
2525+ vcipher $out0,$out0,v24
2526+ vcipher $out1,$out1,v24
2527+ vsrab $tmp,$tweak,$seven # next tweak value
2528+ vxor $twk0,$tweak,$rndkey0
2529+ vaddubm $tweak,$tweak,$tweak
2530+ vcipher $out2,$out2,v24
2531+ vcipher $out3,$out3,v24
2532+ vsldoi $tmp,$tmp,$tmp,15
2533+ vcipher $out4,$out4,v24
2534+ vcipher $out5,$out5,v24
2535+
2536+ subfe. r0,r0,r0 # borrow?-1:0
2537+ vand $tmp,$tmp,$eighty7
2538+ vcipher $out0,$out0,v25
2539+ vcipher $out1,$out1,v25
2540+ vxor $tweak,$tweak,$tmp
2541+ vcipher $out2,$out2,v25
2542+ vcipher $out3,$out3,v25
2543+ vxor $in1,$twk1,v31
2544+ vsrab $tmp,$tweak,$seven # next tweak value
2545+ vxor $twk1,$tweak,$rndkey0
2546+ vcipher $out4,$out4,v25
2547+ vcipher $out5,$out5,v25
2548+
2549+ and r0,r0,$len
2550+ vaddubm $tweak,$tweak,$tweak
2551+ vsldoi $tmp,$tmp,$tmp,15
2552+ vcipher $out0,$out0,v26
2553+ vcipher $out1,$out1,v26
2554+ vand $tmp,$tmp,$eighty7
2555+ vcipher $out2,$out2,v26
2556+ vcipher $out3,$out3,v26
2557+ vxor $tweak,$tweak,$tmp
2558+ vcipher $out4,$out4,v26
2559+ vcipher $out5,$out5,v26
2560+
2561+ add $inp,$inp,r0 # $inp is adjusted in such
2562+ # way that at exit from the
2563+ # loop inX-in5 are loaded
2564+ # with last "words"
2565+ vxor $in2,$twk2,v31
2566+ vsrab $tmp,$tweak,$seven # next tweak value
2567+ vxor $twk2,$tweak,$rndkey0
2568+ vaddubm $tweak,$tweak,$tweak
2569+ vcipher $out0,$out0,v27
2570+ vcipher $out1,$out1,v27
2571+ vsldoi $tmp,$tmp,$tmp,15
2572+ vcipher $out2,$out2,v27
2573+ vcipher $out3,$out3,v27
2574+ vand $tmp,$tmp,$eighty7
2575+ vcipher $out4,$out4,v27
2576+ vcipher $out5,$out5,v27
2577+
2578+ addi $key_,$sp,$FRAME+15 # rewind $key_
2579+ vxor $tweak,$tweak,$tmp
2580+ vcipher $out0,$out0,v28
2581+ vcipher $out1,$out1,v28
2582+ vxor $in3,$twk3,v31
2583+ vsrab $tmp,$tweak,$seven # next tweak value
2584+ vxor $twk3,$tweak,$rndkey0
2585+ vcipher $out2,$out2,v28
2586+ vcipher $out3,$out3,v28
2587+ vaddubm $tweak,$tweak,$tweak
2588+ vsldoi $tmp,$tmp,$tmp,15
2589+ vcipher $out4,$out4,v28
2590+ vcipher $out5,$out5,v28
2591+ lvx v24,$x00,$key_ # re-pre-load round[1]
2592+ vand $tmp,$tmp,$eighty7
2593+
2594+ vcipher $out0,$out0,v29
2595+ vcipher $out1,$out1,v29
2596+ vxor $tweak,$tweak,$tmp
2597+ vcipher $out2,$out2,v29
2598+ vcipher $out3,$out3,v29
2599+ vxor $in4,$twk4,v31
2600+ vsrab $tmp,$tweak,$seven # next tweak value
2601+ vxor $twk4,$tweak,$rndkey0
2602+ vcipher $out4,$out4,v29
2603+ vcipher $out5,$out5,v29
2604+ lvx v25,$x10,$key_ # re-pre-load round[2]
2605+ vaddubm $tweak,$tweak,$tweak
2606+ vsldoi $tmp,$tmp,$tmp,15
2607+
2608+ vcipher $out0,$out0,v30
2609+ vcipher $out1,$out1,v30
2610+ vand $tmp,$tmp,$eighty7
2611+ vcipher $out2,$out2,v30
2612+ vcipher $out3,$out3,v30
2613+ vxor $tweak,$tweak,$tmp
2614+ vcipher $out4,$out4,v30
2615+ vcipher $out5,$out5,v30
2616+ vxor $in5,$twk5,v31
2617+ vsrab $tmp,$tweak,$seven # next tweak value
2618+ vxor $twk5,$tweak,$rndkey0
2619+
2620+ vcipherlast $out0,$out0,$in0
2621+ lvx_u $in0,$x00,$inp # load next input block
2622+ vaddubm $tweak,$tweak,$tweak
2623+ vsldoi $tmp,$tmp,$tmp,15
2624+ vcipherlast $out1,$out1,$in1
2625+ lvx_u $in1,$x10,$inp
2626+ vcipherlast $out2,$out2,$in2
2627+ le?vperm $in0,$in0,$in0,$leperm
2628+ lvx_u $in2,$x20,$inp
2629+ vand $tmp,$tmp,$eighty7
2630+ vcipherlast $out3,$out3,$in3
2631+ le?vperm $in1,$in1,$in1,$leperm
2632+ lvx_u $in3,$x30,$inp
2633+ vcipherlast $out4,$out4,$in4
2634+ le?vperm $in2,$in2,$in2,$leperm
2635+ lvx_u $in4,$x40,$inp
2636+ vxor $tweak,$tweak,$tmp
2637+ vcipherlast $tmp,$out5,$in5 # last block might be needed
2638+ # in stealing mode
2639+ le?vperm $in3,$in3,$in3,$leperm
2640+ lvx_u $in5,$x50,$inp
2641+ addi $inp,$inp,0x60
2642+ le?vperm $in4,$in4,$in4,$leperm
2643+ le?vperm $in5,$in5,$in5,$leperm
2644+
2645+ le?vperm $out0,$out0,$out0,$leperm
2646+ le?vperm $out1,$out1,$out1,$leperm
2647+ stvx_u $out0,$x00,$out # store output
2648+ vxor $out0,$in0,$twk0
2649+ le?vperm $out2,$out2,$out2,$leperm
2650+ stvx_u $out1,$x10,$out
2651+ vxor $out1,$in1,$twk1
2652+ le?vperm $out3,$out3,$out3,$leperm
2653+ stvx_u $out2,$x20,$out
2654+ vxor $out2,$in2,$twk2
2655+ le?vperm $out4,$out4,$out4,$leperm
2656+ stvx_u $out3,$x30,$out
2657+ vxor $out3,$in3,$twk3
2658+ le?vperm $out5,$tmp,$tmp,$leperm
2659+ stvx_u $out4,$x40,$out
2660+ vxor $out4,$in4,$twk4
2661+ le?stvx_u $out5,$x50,$out
2662+ be?stvx_u $tmp, $x50,$out
2663+ vxor $out5,$in5,$twk5
2664+ addi $out,$out,0x60
2665+
2666+ mtctr $rounds
2667+ beq Loop_xts_enc6x # did $len-=96 borrow?
2668+
2669+ addic. $len,$len,0x60
2670+ beq Lxts_enc6x_zero
2671+ cmpwi $len,0x20
2672+ blt Lxts_enc6x_one
2673+ nop
2674+ beq Lxts_enc6x_two
2675+ cmpwi $len,0x40
2676+ blt Lxts_enc6x_three
2677+ nop
2678+ beq Lxts_enc6x_four
2679+
2680+Lxts_enc6x_five:
2681+ vxor $out0,$in1,$twk0
2682+ vxor $out1,$in2,$twk1
2683+ vxor $out2,$in3,$twk2
2684+ vxor $out3,$in4,$twk3
2685+ vxor $out4,$in5,$twk4
2686+
2687+ bl _aesp8_xts_enc5x
2688+
2689+ le?vperm $out0,$out0,$out0,$leperm
2690+ vmr $twk0,$twk5 # unused tweak
2691+ le?vperm $out1,$out1,$out1,$leperm
2692+ stvx_u $out0,$x00,$out # store output
2693+ le?vperm $out2,$out2,$out2,$leperm
2694+ stvx_u $out1,$x10,$out
2695+ le?vperm $out3,$out3,$out3,$leperm
2696+ stvx_u $out2,$x20,$out
2697+ vxor $tmp,$out4,$twk5 # last block prep for stealing
2698+ le?vperm $out4,$out4,$out4,$leperm
2699+ stvx_u $out3,$x30,$out
2700+ stvx_u $out4,$x40,$out
2701+ addi $out,$out,0x50
2702+ bne Lxts_enc6x_steal
2703+ b Lxts_enc6x_done
2704+
2705+.align 4
2706+Lxts_enc6x_four:
2707+ vxor $out0,$in2,$twk0
2708+ vxor $out1,$in3,$twk1
2709+ vxor $out2,$in4,$twk2
2710+ vxor $out3,$in5,$twk3
2711+ vxor $out4,$out4,$out4
2712+
2713+ bl _aesp8_xts_enc5x
2714+
2715+ le?vperm $out0,$out0,$out0,$leperm
2716+ vmr $twk0,$twk4 # unused tweak
2717+ le?vperm $out1,$out1,$out1,$leperm
2718+ stvx_u $out0,$x00,$out # store output
2719+ le?vperm $out2,$out2,$out2,$leperm
2720+ stvx_u $out1,$x10,$out
2721+ vxor $tmp,$out3,$twk4 # last block prep for stealing
2722+ le?vperm $out3,$out3,$out3,$leperm
2723+ stvx_u $out2,$x20,$out
2724+ stvx_u $out3,$x30,$out
2725+ addi $out,$out,0x40
2726+ bne Lxts_enc6x_steal
2727+ b Lxts_enc6x_done
2728+
2729+.align 4
2730+Lxts_enc6x_three:
2731+ vxor $out0,$in3,$twk0
2732+ vxor $out1,$in4,$twk1
2733+ vxor $out2,$in5,$twk2
2734+ vxor $out3,$out3,$out3
2735+ vxor $out4,$out4,$out4
2736+
2737+ bl _aesp8_xts_enc5x
2738+
2739+ le?vperm $out0,$out0,$out0,$leperm
2740+ vmr $twk0,$twk3 # unused tweak
2741+ le?vperm $out1,$out1,$out1,$leperm
2742+ stvx_u $out0,$x00,$out # store output
2743+ vxor $tmp,$out2,$twk3 # last block prep for stealing
2744+ le?vperm $out2,$out2,$out2,$leperm
2745+ stvx_u $out1,$x10,$out
2746+ stvx_u $out2,$x20,$out
2747+ addi $out,$out,0x30
2748+ bne Lxts_enc6x_steal
2749+ b Lxts_enc6x_done
2750+
2751+.align 4
2752+Lxts_enc6x_two:
2753+ vxor $out0,$in4,$twk0
2754+ vxor $out1,$in5,$twk1
2755+ vxor $out2,$out2,$out2
2756+ vxor $out3,$out3,$out3
2757+ vxor $out4,$out4,$out4
2758+
2759+ bl _aesp8_xts_enc5x
2760+
2761+ le?vperm $out0,$out0,$out0,$leperm
2762+ vmr $twk0,$twk2 # unused tweak
2763+ vxor $tmp,$out1,$twk2 # last block prep for stealing
2764+ le?vperm $out1,$out1,$out1,$leperm
2765+ stvx_u $out0,$x00,$out # store output
2766+ stvx_u $out1,$x10,$out
2767+ addi $out,$out,0x20
2768+ bne Lxts_enc6x_steal
2769+ b Lxts_enc6x_done
2770+
2771+.align 4
2772+Lxts_enc6x_one:
2773+ vxor $out0,$in5,$twk0
2774+ nop
2775+Loop_xts_enc1x:
2776+ vcipher $out0,$out0,v24
2777+ lvx v24,$x20,$key_ # round[3]
2778+ addi $key_,$key_,0x20
2779+
2780+ vcipher $out0,$out0,v25
2781+ lvx v25,$x10,$key_ # round[4]
2782+ bdnz Loop_xts_enc1x
2783+
2784+ add $inp,$inp,$taillen
2785+ cmpwi $taillen,0
2786+ vcipher $out0,$out0,v24
2787+
2788+ subi $inp,$inp,16
2789+ vcipher $out0,$out0,v25
2790+
2791+ lvsr $inpperm,0,$taillen
2792+ vcipher $out0,$out0,v26
2793+
2794+ lvx_u $in0,0,$inp
2795+ vcipher $out0,$out0,v27
2796+
2797+ addi $key_,$sp,$FRAME+15 # rewind $key_
2798+ vcipher $out0,$out0,v28
2799+ lvx v24,$x00,$key_ # re-pre-load round[1]
2800+
2801+ vcipher $out0,$out0,v29
2802+ lvx v25,$x10,$key_ # re-pre-load round[2]
2803+ vxor $twk0,$twk0,v31
2804+
2805+ le?vperm $in0,$in0,$in0,$leperm
2806+ vcipher $out0,$out0,v30
2807+
2808+ vperm $in0,$in0,$in0,$inpperm
2809+ vcipherlast $out0,$out0,$twk0
2810+
2811+ vmr $twk0,$twk1 # unused tweak
2812+ vxor $tmp,$out0,$twk1 # last block prep for stealing
2813+ le?vperm $out0,$out0,$out0,$leperm
2814+ stvx_u $out0,$x00,$out # store output
2815+ addi $out,$out,0x10
2816+ bne Lxts_enc6x_steal
2817+ b Lxts_enc6x_done
2818+
2819+.align 4
2820+Lxts_enc6x_zero:
2821+ cmpwi $taillen,0
2822+ beq Lxts_enc6x_done
2823+
2824+ add $inp,$inp,$taillen
2825+ subi $inp,$inp,16
2826+ lvx_u $in0,0,$inp
2827+ lvsr $inpperm,0,$taillen # $in5 is no more
2828+ le?vperm $in0,$in0,$in0,$leperm
2829+ vperm $in0,$in0,$in0,$inpperm
2830+ vxor $tmp,$tmp,$twk0
2831+Lxts_enc6x_steal:
2832+ vxor $in0,$in0,$twk0
2833+ vxor $out0,$out0,$out0
2834+ vspltisb $out1,-1
2835+ vperm $out0,$out0,$out1,$inpperm
2836+ vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
2837+
2838+ subi r3,$out,17
2839+ subi $out,$out,16
2840+ mtctr $taillen
2841+Loop_xts_enc6x_steal:
2842+ lbzu r0,1(r3)
2843+ stb r0,16(r3)
2844+ bdnz Loop_xts_enc6x_steal
2845+
2846+ li $taillen,0
2847+ mtctr $rounds
2848+ b Loop_xts_enc1x # one more time...
2849+
2850+.align 4
2851+Lxts_enc6x_done:
2852+ mtlr r7
2853+ li r10,`$FRAME+15`
2854+ li r11,`$FRAME+31`
2855+ stvx $seven,r10,$sp # wipe copies of round keys
2856+ addi r10,r10,32
2857+ stvx $seven,r11,$sp
2858+ addi r11,r11,32
2859+ stvx $seven,r10,$sp
2860+ addi r10,r10,32
2861+ stvx $seven,r11,$sp
2862+ addi r11,r11,32
2863+ stvx $seven,r10,$sp
2864+ addi r10,r10,32
2865+ stvx $seven,r11,$sp
2866+ addi r11,r11,32
2867+ stvx $seven,r10,$sp
2868+ addi r10,r10,32
2869+ stvx $seven,r11,$sp
2870+ addi r11,r11,32
2871+
2872+ mtspr 256,$vrsave
2873+ lvx v20,r10,$sp # ABI says so
2874+ addi r10,r10,32
2875+ lvx v21,r11,$sp
2876+ addi r11,r11,32
2877+ lvx v22,r10,$sp
2878+ addi r10,r10,32
2879+ lvx v23,r11,$sp
2880+ addi r11,r11,32
2881+ lvx v24,r10,$sp
2882+ addi r10,r10,32
2883+ lvx v25,r11,$sp
2884+ addi r11,r11,32
2885+ lvx v26,r10,$sp
2886+ addi r10,r10,32
2887+ lvx v27,r11,$sp
2888+ addi r11,r11,32
2889+ lvx v28,r10,$sp
2890+ addi r10,r10,32
2891+ lvx v29,r11,$sp
2892+ addi r11,r11,32
2893+ lvx v30,r10,$sp
2894+ lvx v31,r11,$sp
2895+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2896+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2897+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2898+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2899+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2900+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2901+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2902+ blr
2903+ .long 0
2904+ .byte 0,12,0x04,1,0x80,6,6,0
2905+ .long 0
2906+
2907+.align 5
2908+_aesp8_xts_enc5x:
2909+ vcipher $out0,$out0,v24
2910+ vcipher $out1,$out1,v24
2911+ vcipher $out2,$out2,v24
2912+ vcipher $out3,$out3,v24
2913+ vcipher $out4,$out4,v24
2914+ lvx v24,$x20,$key_ # round[3]
2915+ addi $key_,$key_,0x20
2916+
2917+ vcipher $out0,$out0,v25
2918+ vcipher $out1,$out1,v25
2919+ vcipher $out2,$out2,v25
2920+ vcipher $out3,$out3,v25
2921+ vcipher $out4,$out4,v25
2922+ lvx v25,$x10,$key_ # round[4]
2923+ bdnz _aesp8_xts_enc5x
2924+
2925+ add $inp,$inp,$taillen
2926+ cmpwi $taillen,0
2927+ vcipher $out0,$out0,v24
2928+ vcipher $out1,$out1,v24
2929+ vcipher $out2,$out2,v24
2930+ vcipher $out3,$out3,v24
2931+ vcipher $out4,$out4,v24
2932+
2933+ subi $inp,$inp,16
2934+ vcipher $out0,$out0,v25
2935+ vcipher $out1,$out1,v25
2936+ vcipher $out2,$out2,v25
2937+ vcipher $out3,$out3,v25
2938+ vcipher $out4,$out4,v25
2939+ vxor $twk0,$twk0,v31
2940+
2941+ vcipher $out0,$out0,v26
2942+ lvsr $inpperm,r0,$taillen # $in5 is no more
2943+ vcipher $out1,$out1,v26
2944+ vcipher $out2,$out2,v26
2945+ vcipher $out3,$out3,v26
2946+ vcipher $out4,$out4,v26
2947+ vxor $in1,$twk1,v31
2948+
2949+ vcipher $out0,$out0,v27
2950+ lvx_u $in0,0,$inp
2951+ vcipher $out1,$out1,v27
2952+ vcipher $out2,$out2,v27
2953+ vcipher $out3,$out3,v27
2954+ vcipher $out4,$out4,v27
2955+ vxor $in2,$twk2,v31
2956+
2957+ addi $key_,$sp,$FRAME+15 # rewind $key_
2958+ vcipher $out0,$out0,v28
2959+ vcipher $out1,$out1,v28
2960+ vcipher $out2,$out2,v28
2961+ vcipher $out3,$out3,v28
2962+ vcipher $out4,$out4,v28
2963+ lvx v24,$x00,$key_ # re-pre-load round[1]
2964+ vxor $in3,$twk3,v31
2965+
2966+ vcipher $out0,$out0,v29
2967+ le?vperm $in0,$in0,$in0,$leperm
2968+ vcipher $out1,$out1,v29
2969+ vcipher $out2,$out2,v29
2970+ vcipher $out3,$out3,v29
2971+ vcipher $out4,$out4,v29
2972+ lvx v25,$x10,$key_ # re-pre-load round[2]
2973+ vxor $in4,$twk4,v31
2974+
2975+ vcipher $out0,$out0,v30
2976+ vperm $in0,$in0,$in0,$inpperm
2977+ vcipher $out1,$out1,v30
2978+ vcipher $out2,$out2,v30
2979+ vcipher $out3,$out3,v30
2980+ vcipher $out4,$out4,v30
2981+
2982+ vcipherlast $out0,$out0,$twk0
2983+ vcipherlast $out1,$out1,$in1
2984+ vcipherlast $out2,$out2,$in2
2985+ vcipherlast $out3,$out3,$in3
2986+ vcipherlast $out4,$out4,$in4
2987+ blr
2988+ .long 0
2989+ .byte 0,12,0x14,0,0,0,0,0
2990+
2991+.align 5
2992+_aesp8_xts_decrypt6x:
2993+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2994+ mflr r0
2995+ li r7,`$FRAME+8*16+15`
2996+ li r8,`$FRAME+8*16+31`
2997+ $PUSH r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2998+ stvx v20,r7,$sp # ABI says so
2999+ addi r7,r7,32
3000+ stvx v21,r8,$sp
3001+ addi r8,r8,32
3002+ stvx v22,r7,$sp
3003+ addi r7,r7,32
3004+ stvx v23,r8,$sp
3005+ addi r8,r8,32
3006+ stvx v24,r7,$sp
3007+ addi r7,r7,32
3008+ stvx v25,r8,$sp
3009+ addi r8,r8,32
3010+ stvx v26,r7,$sp
3011+ addi r7,r7,32
3012+ stvx v27,r8,$sp
3013+ addi r8,r8,32
3014+ stvx v28,r7,$sp
3015+ addi r7,r7,32
3016+ stvx v29,r8,$sp
3017+ addi r8,r8,32
3018+ stvx v30,r7,$sp
3019+ stvx v31,r8,$sp
3020+ mr r7,r0
3021+ li r0,-1
3022+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
3023+ li $x10,0x10
3024+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3025+ li $x20,0x20
3026+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3027+ li $x30,0x30
3028+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3029+ li $x40,0x40
3030+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3031+ li $x50,0x50
3032+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3033+ li $x60,0x60
3034+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3035+ li $x70,0x70
3036+ mtspr 256,r0
3037+
3038+ subi $rounds,$rounds,3 # -4 in total
3039+
3040+ lvx $rndkey0,$x00,$key1 # load key schedule
3041+ lvx v30,$x10,$key1
3042+ addi $key1,$key1,0x20
3043+ lvx v31,$x00,$key1
3044+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
3045+ addi $key_,$sp,$FRAME+15
3046+ mtctr $rounds
3047+
3048+Load_xts_dec_key:
3049+ ?vperm v24,v30,v31,$keyperm
3050+ lvx v30,$x10,$key1
3051+ addi $key1,$key1,0x20
3052+ stvx v24,$x00,$key_ # off-load round[1]
3053+ ?vperm v25,v31,v30,$keyperm
3054+ lvx v31,$x00,$key1
3055+ stvx v25,$x10,$key_ # off-load round[2]
3056+ addi $key_,$key_,0x20
3057+ bdnz Load_xts_dec_key
3058+
3059+ lvx v26,$x10,$key1
3060+ ?vperm v24,v30,v31,$keyperm
3061+ lvx v27,$x20,$key1
3062+ stvx v24,$x00,$key_ # off-load round[3]
3063+ ?vperm v25,v31,v26,$keyperm
3064+ lvx v28,$x30,$key1
3065+ stvx v25,$x10,$key_ # off-load round[4]
3066+ addi $key_,$sp,$FRAME+15 # rewind $key_
3067+ ?vperm v26,v26,v27,$keyperm
3068+ lvx v29,$x40,$key1
3069+ ?vperm v27,v27,v28,$keyperm
3070+ lvx v30,$x50,$key1
3071+ ?vperm v28,v28,v29,$keyperm
3072+ lvx v31,$x60,$key1
3073+ ?vperm v29,v29,v30,$keyperm
3074+ lvx $twk5,$x70,$key1 # borrow $twk5
3075+ ?vperm v30,v30,v31,$keyperm
3076+ lvx v24,$x00,$key_ # pre-load round[1]
3077+ ?vperm v31,v31,$twk5,$keyperm
3078+ lvx v25,$x10,$key_ # pre-load round[2]
3079+
3080+ vperm $in0,$inout,$inptail,$inpperm
3081+ subi $inp,$inp,31 # undo "caller"
3082+ vxor $twk0,$tweak,$rndkey0
3083+ vsrab $tmp,$tweak,$seven # next tweak value
3084+ vaddubm $tweak,$tweak,$tweak
3085+ vsldoi $tmp,$tmp,$tmp,15
3086+ vand $tmp,$tmp,$eighty7
3087+ vxor $out0,$in0,$twk0
3088+ vxor $tweak,$tweak,$tmp
3089+
3090+ lvx_u $in1,$x10,$inp
3091+ vxor $twk1,$tweak,$rndkey0
3092+ vsrab $tmp,$tweak,$seven # next tweak value
3093+ vaddubm $tweak,$tweak,$tweak
3094+ vsldoi $tmp,$tmp,$tmp,15
3095+ le?vperm $in1,$in1,$in1,$leperm
3096+ vand $tmp,$tmp,$eighty7
3097+ vxor $out1,$in1,$twk1
3098+ vxor $tweak,$tweak,$tmp
3099+
3100+ lvx_u $in2,$x20,$inp
3101+ andi. $taillen,$len,15
3102+ vxor $twk2,$tweak,$rndkey0
3103+ vsrab $tmp,$tweak,$seven # next tweak value
3104+ vaddubm $tweak,$tweak,$tweak
3105+ vsldoi $tmp,$tmp,$tmp,15
3106+ le?vperm $in2,$in2,$in2,$leperm
3107+ vand $tmp,$tmp,$eighty7
3108+ vxor $out2,$in2,$twk2
3109+ vxor $tweak,$tweak,$tmp
3110+
3111+ lvx_u $in3,$x30,$inp
3112+ sub $len,$len,$taillen
3113+ vxor $twk3,$tweak,$rndkey0
3114+ vsrab $tmp,$tweak,$seven # next tweak value
3115+ vaddubm $tweak,$tweak,$tweak
3116+ vsldoi $tmp,$tmp,$tmp,15
3117+ le?vperm $in3,$in3,$in3,$leperm
3118+ vand $tmp,$tmp,$eighty7
3119+ vxor $out3,$in3,$twk3
3120+ vxor $tweak,$tweak,$tmp
3121+
3122+ lvx_u $in4,$x40,$inp
3123+ subi $len,$len,0x60
3124+ vxor $twk4,$tweak,$rndkey0
3125+ vsrab $tmp,$tweak,$seven # next tweak value
3126+ vaddubm $tweak,$tweak,$tweak
3127+ vsldoi $tmp,$tmp,$tmp,15
3128+ le?vperm $in4,$in4,$in4,$leperm
3129+ vand $tmp,$tmp,$eighty7
3130+ vxor $out4,$in4,$twk4
3131+ vxor $tweak,$tweak,$tmp
3132+
3133+ lvx_u $in5,$x50,$inp
3134+ addi $inp,$inp,0x60
3135+ vxor $twk5,$tweak,$rndkey0
3136+ vsrab $tmp,$tweak,$seven # next tweak value
3137+ vaddubm $tweak,$tweak,$tweak
3138+ vsldoi $tmp,$tmp,$tmp,15
3139+ le?vperm $in5,$in5,$in5,$leperm
3140+ vand $tmp,$tmp,$eighty7
3141+ vxor $out5,$in5,$twk5
3142+ vxor $tweak,$tweak,$tmp
3143+
3144+ vxor v31,v31,$rndkey0
3145+ mtctr $rounds
3146+ b Loop_xts_dec6x
3147+
3148+.align 5
3149+Loop_xts_dec6x:
3150+ vncipher $out0,$out0,v24
3151+ vncipher $out1,$out1,v24
3152+ vncipher $out2,$out2,v24
3153+ vncipher $out3,$out3,v24
3154+ vncipher $out4,$out4,v24
3155+ vncipher $out5,$out5,v24
3156+ lvx v24,$x20,$key_ # round[3]
3157+ addi $key_,$key_,0x20
3158+
3159+ vncipher $out0,$out0,v25
3160+ vncipher $out1,$out1,v25
3161+ vncipher $out2,$out2,v25
3162+ vncipher $out3,$out3,v25
3163+ vncipher $out4,$out4,v25
3164+ vncipher $out5,$out5,v25
3165+ lvx v25,$x10,$key_ # round[4]
3166+ bdnz Loop_xts_dec6x
3167+
3168+ subic $len,$len,96 # $len-=96
3169+ vxor $in0,$twk0,v31 # xor with last round key
3170+ vncipher $out0,$out0,v24
3171+ vncipher $out1,$out1,v24
3172+ vsrab $tmp,$tweak,$seven # next tweak value
3173+ vxor $twk0,$tweak,$rndkey0
3174+ vaddubm $tweak,$tweak,$tweak
3175+ vncipher $out2,$out2,v24
3176+ vncipher $out3,$out3,v24
3177+ vsldoi $tmp,$tmp,$tmp,15
3178+ vncipher $out4,$out4,v24
3179+ vncipher $out5,$out5,v24
3180+
3181+ subfe. r0,r0,r0 # borrow?-1:0
3182+ vand $tmp,$tmp,$eighty7
3183+ vncipher $out0,$out0,v25
3184+ vncipher $out1,$out1,v25
3185+ vxor $tweak,$tweak,$tmp
3186+ vncipher $out2,$out2,v25
3187+ vncipher $out3,$out3,v25
3188+ vxor $in1,$twk1,v31
3189+ vsrab $tmp,$tweak,$seven # next tweak value
3190+ vxor $twk1,$tweak,$rndkey0
3191+ vncipher $out4,$out4,v25
3192+ vncipher $out5,$out5,v25
3193+
3194+ and r0,r0,$len
3195+ vaddubm $tweak,$tweak,$tweak
3196+ vsldoi $tmp,$tmp,$tmp,15
3197+ vncipher $out0,$out0,v26
3198+ vncipher $out1,$out1,v26
3199+ vand $tmp,$tmp,$eighty7
3200+ vncipher $out2,$out2,v26
3201+ vncipher $out3,$out3,v26
3202+ vxor $tweak,$tweak,$tmp
3203+ vncipher $out4,$out4,v26
3204+ vncipher $out5,$out5,v26
3205+
3206+ add $inp,$inp,r0 # $inp is adjusted in such
3207+ # way that at exit from the
3208+ # loop inX-in5 are loaded
3209+ # with last "words"
3210+ vxor $in2,$twk2,v31
3211+ vsrab $tmp,$tweak,$seven # next tweak value
3212+ vxor $twk2,$tweak,$rndkey0
3213+ vaddubm $tweak,$tweak,$tweak
3214+ vncipher $out0,$out0,v27
3215+ vncipher $out1,$out1,v27
3216+ vsldoi $tmp,$tmp,$tmp,15
3217+ vncipher $out2,$out2,v27
3218+ vncipher $out3,$out3,v27
3219+ vand $tmp,$tmp,$eighty7
3220+ vncipher $out4,$out4,v27
3221+ vncipher $out5,$out5,v27
3222+
3223+ addi $key_,$sp,$FRAME+15 # rewind $key_
3224+ vxor $tweak,$tweak,$tmp
3225+ vncipher $out0,$out0,v28
3226+ vncipher $out1,$out1,v28
3227+ vxor $in3,$twk3,v31
3228+ vsrab $tmp,$tweak,$seven # next tweak value
3229+ vxor $twk3,$tweak,$rndkey0
3230+ vncipher $out2,$out2,v28
3231+ vncipher $out3,$out3,v28
3232+ vaddubm $tweak,$tweak,$tweak
3233+ vsldoi $tmp,$tmp,$tmp,15
3234+ vncipher $out4,$out4,v28
3235+ vncipher $out5,$out5,v28
3236+ lvx v24,$x00,$key_ # re-pre-load round[1]
3237+ vand $tmp,$tmp,$eighty7
3238+
3239+ vncipher $out0,$out0,v29
3240+ vncipher $out1,$out1,v29
3241+ vxor $tweak,$tweak,$tmp
3242+ vncipher $out2,$out2,v29
3243+ vncipher $out3,$out3,v29
3244+ vxor $in4,$twk4,v31
3245+ vsrab $tmp,$tweak,$seven # next tweak value
3246+ vxor $twk4,$tweak,$rndkey0
3247+ vncipher $out4,$out4,v29
3248+ vncipher $out5,$out5,v29
3249+ lvx v25,$x10,$key_ # re-pre-load round[2]
3250+ vaddubm $tweak,$tweak,$tweak
3251+ vsldoi $tmp,$tmp,$tmp,15
3252+
3253+ vncipher $out0,$out0,v30
3254+ vncipher $out1,$out1,v30
3255+ vand $tmp,$tmp,$eighty7
3256+ vncipher $out2,$out2,v30
3257+ vncipher $out3,$out3,v30
3258+ vxor $tweak,$tweak,$tmp
3259+ vncipher $out4,$out4,v30
3260+ vncipher $out5,$out5,v30
3261+ vxor $in5,$twk5,v31
3262+ vsrab $tmp,$tweak,$seven # next tweak value
3263+ vxor $twk5,$tweak,$rndkey0
3264+
3265+ vncipherlast $out0,$out0,$in0
3266+ lvx_u $in0,$x00,$inp # load next input block
3267+ vaddubm $tweak,$tweak,$tweak
3268+ vsldoi $tmp,$tmp,$tmp,15
3269+ vncipherlast $out1,$out1,$in1
3270+ lvx_u $in1,$x10,$inp
3271+ vncipherlast $out2,$out2,$in2
3272+ le?vperm $in0,$in0,$in0,$leperm
3273+ lvx_u $in2,$x20,$inp
3274+ vand $tmp,$tmp,$eighty7
3275+ vncipherlast $out3,$out3,$in3
3276+ le?vperm $in1,$in1,$in1,$leperm
3277+ lvx_u $in3,$x30,$inp
3278+ vncipherlast $out4,$out4,$in4
3279+ le?vperm $in2,$in2,$in2,$leperm
3280+ lvx_u $in4,$x40,$inp
3281+ vxor $tweak,$tweak,$tmp
3282+ vncipherlast $out5,$out5,$in5
3283+ le?vperm $in3,$in3,$in3,$leperm
3284+ lvx_u $in5,$x50,$inp
3285+ addi $inp,$inp,0x60
3286+ le?vperm $in4,$in4,$in4,$leperm
3287+ le?vperm $in5,$in5,$in5,$leperm
3288+
3289+ le?vperm $out0,$out0,$out0,$leperm
3290+ le?vperm $out1,$out1,$out1,$leperm
3291+ stvx_u $out0,$x00,$out # store output
3292+ vxor $out0,$in0,$twk0
3293+ le?vperm $out2,$out2,$out2,$leperm
3294+ stvx_u $out1,$x10,$out
3295+ vxor $out1,$in1,$twk1
3296+ le?vperm $out3,$out3,$out3,$leperm
3297+ stvx_u $out2,$x20,$out
3298+ vxor $out2,$in2,$twk2
3299+ le?vperm $out4,$out4,$out4,$leperm
3300+ stvx_u $out3,$x30,$out
3301+ vxor $out3,$in3,$twk3
3302+ le?vperm $out5,$out5,$out5,$leperm
3303+ stvx_u $out4,$x40,$out
3304+ vxor $out4,$in4,$twk4
3305+ stvx_u $out5,$x50,$out
3306+ vxor $out5,$in5,$twk5
3307+ addi $out,$out,0x60
3308+
3309+ mtctr $rounds
3310+ beq Loop_xts_dec6x # did $len-=96 borrow?
3311+
3312+ addic. $len,$len,0x60
3313+ beq Lxts_dec6x_zero
3314+ cmpwi $len,0x20
3315+ blt Lxts_dec6x_one
3316+ nop
3317+ beq Lxts_dec6x_two
3318+ cmpwi $len,0x40
3319+ blt Lxts_dec6x_three
3320+ nop
3321+ beq Lxts_dec6x_four
3322+
3323+Lxts_dec6x_five:
3324+ vxor $out0,$in1,$twk0
3325+ vxor $out1,$in2,$twk1
3326+ vxor $out2,$in3,$twk2
3327+ vxor $out3,$in4,$twk3
3328+ vxor $out4,$in5,$twk4
3329+
3330+ bl _aesp8_xts_dec5x
3331+
3332+ le?vperm $out0,$out0,$out0,$leperm
3333+ vmr $twk0,$twk5 # unused tweak
3334+ vxor $twk1,$tweak,$rndkey0
3335+ le?vperm $out1,$out1,$out1,$leperm
3336+ stvx_u $out0,$x00,$out # store output
3337+ vxor $out0,$in0,$twk1
3338+ le?vperm $out2,$out2,$out2,$leperm
3339+ stvx_u $out1,$x10,$out
3340+ le?vperm $out3,$out3,$out3,$leperm
3341+ stvx_u $out2,$x20,$out
3342+ le?vperm $out4,$out4,$out4,$leperm
3343+ stvx_u $out3,$x30,$out
3344+ stvx_u $out4,$x40,$out
3345+ addi $out,$out,0x50
3346+ bne Lxts_dec6x_steal
3347+ b Lxts_dec6x_done
3348+
3349+.align 4
3350+Lxts_dec6x_four:
3351+ vxor $out0,$in2,$twk0
3352+ vxor $out1,$in3,$twk1
3353+ vxor $out2,$in4,$twk2
3354+ vxor $out3,$in5,$twk3
3355+ vxor $out4,$out4,$out4
3356+
3357+ bl _aesp8_xts_dec5x
3358+
3359+ le?vperm $out0,$out0,$out0,$leperm
3360+ vmr $twk0,$twk4 # unused tweak
3361+ vmr $twk1,$twk5
3362+ le?vperm $out1,$out1,$out1,$leperm
3363+ stvx_u $out0,$x00,$out # store output
3364+ vxor $out0,$in0,$twk5
3365+ le?vperm $out2,$out2,$out2,$leperm
3366+ stvx_u $out1,$x10,$out
3367+ le?vperm $out3,$out3,$out3,$leperm
3368+ stvx_u $out2,$x20,$out
3369+ stvx_u $out3,$x30,$out
3370+ addi $out,$out,0x40
3371+ bne Lxts_dec6x_steal
3372+ b Lxts_dec6x_done
3373+
3374+.align 4
3375+Lxts_dec6x_three:
3376+ vxor $out0,$in3,$twk0
3377+ vxor $out1,$in4,$twk1
3378+ vxor $out2,$in5,$twk2
3379+ vxor $out3,$out3,$out3
3380+ vxor $out4,$out4,$out4
3381+
3382+ bl _aesp8_xts_dec5x
3383+
3384+ le?vperm $out0,$out0,$out0,$leperm
3385+ vmr $twk0,$twk3 # unused tweak
3386+ vmr $twk1,$twk4
3387+ le?vperm $out1,$out1,$out1,$leperm
3388+ stvx_u $out0,$x00,$out # store output
3389+ vxor $out0,$in0,$twk4
3390+ le?vperm $out2,$out2,$out2,$leperm
3391+ stvx_u $out1,$x10,$out
3392+ stvx_u $out2,$x20,$out
3393+ addi $out,$out,0x30
3394+ bne Lxts_dec6x_steal
3395+ b Lxts_dec6x_done
3396+
3397+.align 4
3398+Lxts_dec6x_two:
3399+ vxor $out0,$in4,$twk0
3400+ vxor $out1,$in5,$twk1
3401+ vxor $out2,$out2,$out2
3402+ vxor $out3,$out3,$out3
3403+ vxor $out4,$out4,$out4
3404+
3405+ bl _aesp8_xts_dec5x
3406+
3407+ le?vperm $out0,$out0,$out0,$leperm
3408+ vmr $twk0,$twk2 # unused tweak
3409+ vmr $twk1,$twk3
3410+ le?vperm $out1,$out1,$out1,$leperm
3411+ stvx_u $out0,$x00,$out # store output
3412+ vxor $out0,$in0,$twk3
3413+ stvx_u $out1,$x10,$out
3414+ addi $out,$out,0x20
3415+ bne Lxts_dec6x_steal
3416+ b Lxts_dec6x_done
3417+
3418+.align 4
3419+Lxts_dec6x_one:
3420+ vxor $out0,$in5,$twk0
3421+ nop
3422+Loop_xts_dec1x:
3423+ vncipher $out0,$out0,v24
3424+ lvx v24,$x20,$key_ # round[3]
3425+ addi $key_,$key_,0x20
3426+
3427+ vncipher $out0,$out0,v25
3428+ lvx v25,$x10,$key_ # round[4]
3429+ bdnz Loop_xts_dec1x
3430+
3431+ subi r0,$taillen,1
3432+ vncipher $out0,$out0,v24
3433+
3434+ andi. r0,r0,16
3435+ cmpwi $taillen,0
3436+ vncipher $out0,$out0,v25
3437+
3438+ sub $inp,$inp,r0
3439+ vncipher $out0,$out0,v26
3440+
3441+ lvx_u $in0,0,$inp
3442+ vncipher $out0,$out0,v27
3443+
3444+ addi $key_,$sp,$FRAME+15 # rewind $key_
3445+ vncipher $out0,$out0,v28
3446+ lvx v24,$x00,$key_ # re-pre-load round[1]
3447+
3448+ vncipher $out0,$out0,v29
3449+ lvx v25,$x10,$key_ # re-pre-load round[2]
3450+ vxor $twk0,$twk0,v31
3451+
3452+ le?vperm $in0,$in0,$in0,$leperm
3453+ vncipher $out0,$out0,v30
3454+
3455+ mtctr $rounds
3456+ vncipherlast $out0,$out0,$twk0
3457+
3458+ vmr $twk0,$twk1 # unused tweak
3459+ vmr $twk1,$twk2
3460+ le?vperm $out0,$out0,$out0,$leperm
3461+ stvx_u $out0,$x00,$out # store output
3462+ addi $out,$out,0x10
3463+ vxor $out0,$in0,$twk2
3464+ bne Lxts_dec6x_steal
3465+ b Lxts_dec6x_done
3466+
3467+.align 4
3468+Lxts_dec6x_zero:
3469+ cmpwi $taillen,0
3470+ beq Lxts_dec6x_done
3471+
3472+ lvx_u $in0,0,$inp
3473+ le?vperm $in0,$in0,$in0,$leperm
3474+ vxor $out0,$in0,$twk1
3475+Lxts_dec6x_steal:
3476+ vncipher $out0,$out0,v24
3477+ lvx v24,$x20,$key_ # round[3]
3478+ addi $key_,$key_,0x20
3479+
3480+ vncipher $out0,$out0,v25
3481+ lvx v25,$x10,$key_ # round[4]
3482+ bdnz Lxts_dec6x_steal
3483+
3484+ add $inp,$inp,$taillen
3485+ vncipher $out0,$out0,v24
3486+
3487+ cmpwi $taillen,0
3488+ vncipher $out0,$out0,v25
3489+
3490+ lvx_u $in0,0,$inp
3491+ vncipher $out0,$out0,v26
3492+
3493+ lvsr $inpperm,0,$taillen # $in5 is no more
3494+ vncipher $out0,$out0,v27
3495+
3496+ addi $key_,$sp,$FRAME+15 # rewind $key_
3497+ vncipher $out0,$out0,v28
3498+ lvx v24,$x00,$key_ # re-pre-load round[1]
3499+
3500+ vncipher $out0,$out0,v29
3501+ lvx v25,$x10,$key_ # re-pre-load round[2]
3502+ vxor $twk1,$twk1,v31
3503+
3504+ le?vperm $in0,$in0,$in0,$leperm
3505+ vncipher $out0,$out0,v30
3506+
3507+ vperm $in0,$in0,$in0,$inpperm
3508+ vncipherlast $tmp,$out0,$twk1
3509+
3510+ le?vperm $out0,$tmp,$tmp,$leperm
3511+ le?stvx_u $out0,0,$out
3512+ be?stvx_u $tmp,0,$out
3513+
3514+ vxor $out0,$out0,$out0
3515+ vspltisb $out1,-1
3516+ vperm $out0,$out0,$out1,$inpperm
3517+ vsel $out0,$in0,$tmp,$out0
3518+ vxor $out0,$out0,$twk0
3519+
3520+ subi r3,$out,1
3521+ mtctr $taillen
3522+Loop_xts_dec6x_steal:
3523+ lbzu r0,1(r3)
3524+ stb r0,16(r3)
3525+ bdnz Loop_xts_dec6x_steal
3526+
3527+ li $taillen,0
3528+ mtctr $rounds
3529+ b Loop_xts_dec1x # one more time...
3530+
3531+.align 4
3532+Lxts_dec6x_done:
3533+ mtlr r7
3534+ li r10,`$FRAME+15`
3535+ li r11,`$FRAME+31`
3536+ stvx $seven,r10,$sp # wipe copies of round keys
3537+ addi r10,r10,32
3538+ stvx $seven,r11,$sp
3539+ addi r11,r11,32
3540+ stvx $seven,r10,$sp
3541+ addi r10,r10,32
3542+ stvx $seven,r11,$sp
3543+ addi r11,r11,32
3544+ stvx $seven,r10,$sp
3545+ addi r10,r10,32
3546+ stvx $seven,r11,$sp
3547+ addi r11,r11,32
3548+ stvx $seven,r10,$sp
3549+ addi r10,r10,32
3550+ stvx $seven,r11,$sp
3551+ addi r11,r11,32
3552+
3553+ mtspr 256,$vrsave
3554+ lvx v20,r10,$sp # ABI says so
3555+ addi r10,r10,32
3556+ lvx v21,r11,$sp
3557+ addi r11,r11,32
3558+ lvx v22,r10,$sp
3559+ addi r10,r10,32
3560+ lvx v23,r11,$sp
3561+ addi r11,r11,32
3562+ lvx v24,r10,$sp
3563+ addi r10,r10,32
3564+ lvx v25,r11,$sp
3565+ addi r11,r11,32
3566+ lvx v26,r10,$sp
3567+ addi r10,r10,32
3568+ lvx v27,r11,$sp
3569+ addi r11,r11,32
3570+ lvx v28,r10,$sp
3571+ addi r10,r10,32
3572+ lvx v29,r11,$sp
3573+ addi r11,r11,32
3574+ lvx v30,r10,$sp
3575+ lvx v31,r11,$sp
3576+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3577+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3578+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3579+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3580+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3581+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3582+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3583+ blr
3584+ .long 0
3585+ .byte 0,12,0x04,1,0x80,6,6,0
3586+ .long 0
3587+
3588+.align 5
3589+_aesp8_xts_dec5x:
3590+ vncipher $out0,$out0,v24
3591+ vncipher $out1,$out1,v24
3592+ vncipher $out2,$out2,v24
3593+ vncipher $out3,$out3,v24
3594+ vncipher $out4,$out4,v24
3595+ lvx v24,$x20,$key_ # round[3]
3596+ addi $key_,$key_,0x20
3597+
3598+ vncipher $out0,$out0,v25
3599+ vncipher $out1,$out1,v25
3600+ vncipher $out2,$out2,v25
3601+ vncipher $out3,$out3,v25
3602+ vncipher $out4,$out4,v25
3603+ lvx v25,$x10,$key_ # round[4]
3604+ bdnz _aesp8_xts_dec5x
3605+
3606+ subi r0,$taillen,1
3607+ vncipher $out0,$out0,v24
3608+ vncipher $out1,$out1,v24
3609+ vncipher $out2,$out2,v24
3610+ vncipher $out3,$out3,v24
3611+ vncipher $out4,$out4,v24
3612+
3613+ andi. r0,r0,16
3614+ cmpwi $taillen,0
3615+ vncipher $out0,$out0,v25
3616+ vncipher $out1,$out1,v25
3617+ vncipher $out2,$out2,v25
3618+ vncipher $out3,$out3,v25
3619+ vncipher $out4,$out4,v25
3620+ vxor $twk0,$twk0,v31
3621+
3622+ sub $inp,$inp,r0
3623+ vncipher $out0,$out0,v26
3624+ vncipher $out1,$out1,v26
3625+ vncipher $out2,$out2,v26
3626+ vncipher $out3,$out3,v26
3627+ vncipher $out4,$out4,v26
3628+ vxor $in1,$twk1,v31
3629+
3630+ vncipher $out0,$out0,v27
3631+ lvx_u $in0,0,$inp
3632+ vncipher $out1,$out1,v27
3633+ vncipher $out2,$out2,v27
3634+ vncipher $out3,$out3,v27
3635+ vncipher $out4,$out4,v27
3636+ vxor $in2,$twk2,v31
3637+
3638+ addi $key_,$sp,$FRAME+15 # rewind $key_
3639+ vncipher $out0,$out0,v28
3640+ vncipher $out1,$out1,v28
3641+ vncipher $out2,$out2,v28
3642+ vncipher $out3,$out3,v28
3643+ vncipher $out4,$out4,v28
3644+ lvx v24,$x00,$key_ # re-pre-load round[1]
3645+ vxor $in3,$twk3,v31
3646+
3647+ vncipher $out0,$out0,v29
3648+ le?vperm $in0,$in0,$in0,$leperm
3649+ vncipher $out1,$out1,v29
3650+ vncipher $out2,$out2,v29
3651+ vncipher $out3,$out3,v29
3652+ vncipher $out4,$out4,v29
3653+ lvx v25,$x10,$key_ # re-pre-load round[2]
3654+ vxor $in4,$twk4,v31
3655+
3656+ vncipher $out0,$out0,v30
3657+ vncipher $out1,$out1,v30
3658+ vncipher $out2,$out2,v30
3659+ vncipher $out3,$out3,v30
3660+ vncipher $out4,$out4,v30
3661+
3662+ vncipherlast $out0,$out0,$twk0
3663+ vncipherlast $out1,$out1,$in1
3664+ vncipherlast $out2,$out2,$in2
3665+ vncipherlast $out3,$out3,$in3
3666+ vncipherlast $out4,$out4,$in4
3667+ mtctr $rounds
3668+ blr
3669+ .long 0
3670+ .byte 0,12,0x14,0,0,0,0,0
3671+___
3672+}} }}}
3673+
3674+my $consts=1;
3675+foreach(split("\n",$code)) {
3676+ s/\`([^\`]*)\`/eval($1)/geo;
3677+
3678+ # constants table endian-specific conversion
3679+ if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3680+ my $conv=$3;
3681+ my @bytes=();
3682+
3683+ # convert to endian-agnostic format
3684+ if ($1 eq "long") {
3685+ foreach (split(/,\s*/,$2)) {
3686+ my $l = /^0/?oct:int;
3687+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3688+ }
3689+ } else {
3690+ @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3691+ }
3692+
3693+ # little-endian conversion
3694+ if ($flavour =~ /le$/o) {
3695+ SWITCH: for($conv) {
3696+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
3697+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
3698+ }
3699+ }
3700+
3701+ #emit
3702+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3703+ next;
3704+ }
3705+ $consts=0 if (m/Lconsts:/o); # end of table
3706+
3707+ # instructions prefixed with '?' are endian-specific and need
3708+ # to be adjusted accordingly...
3709+ if ($flavour =~ /le$/o) { # little-endian
3710+ s/le\?//o or
3711+ s/be\?/#be#/o or
3712+ s/\?lvsr/lvsl/o or
3713+ s/\?lvsl/lvsr/o or
3714+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3715+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3716+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3717+ } else { # big-endian
3718+ s/le\?/#le#/o or
3719+ s/be\?//o or
3720+ s/\?([a-z]+)/$1/o;
3721+ }
3722+
3723+ print $_,"\n";
3724+}
3725+
3726+close STDOUT;
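Throughout the XTS code above, the recurring vsrab / vaddubm / vsldoi / vand / vxor sequence, together with the 0x870101..01 constant built from $seven, advances the tweak to the next block, i.e. multiplies it by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: vaddubm doubles every byte independently, and the byte-rotated, masked copy re-injects each byte's lost top bit into its neighbour, folding the top byte's bit back into byte 0 as 0x87. A scalar C sketch of the same update, assuming only the usual XTS convention that the tweak is 16 bytes with byte 0 least significant:

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x (alpha); byte 0 is the least
 * significant byte, as in the XTS little-endian convention. */
void xts_mul_alpha(uint8_t t[16])
{
    uint8_t carry = 0;
    for (int i = 0; i < 16; i++) {
        uint8_t msb = t[i] >> 7;            /* bit shifted out of this byte */
        t[i] = (uint8_t)((t[i] << 1) | carry);
        carry = msb;
    }
    if (carry)
        t[0] ^= 0x87;                       /* reduce by x^128+x^7+x^2+x+1 */
}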
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -191,7 +191,7 @@ L1st:
191191
192192 addi $j,$j,$BNSZ ; j++
193193 addi $tp,$tp,$BNSZ ; tp++
194- bdnz- L1st
194+ bdnz L1st
195195 ;L1st
196196 addc $lo0,$alo,$hi0
197197 addze $hi0,$ahi
@@ -253,7 +253,7 @@ Linner:
253253 addze $hi1,$hi1
254254 $ST $lo1,0($tp) ; tp[j-1]
255255 addi $tp,$tp,$BNSZ ; tp++
256- bdnz- Linner
256+ bdnz Linner
257257 ;Linner
258258 $LD $tj,$BNSZ($tp) ; tp[j]
259259 addc $lo0,$alo,$hi0
@@ -276,7 +276,7 @@ Linner:
276276 slwi $tj,$num,`log($BNSZ)/log(2)`
277277 $UCMP $i,$tj
278278 addi $i,$i,$BNSZ
279- ble- Louter
279+ ble Louter
280280
281281 addi $num,$num,2 ; restore $num
282282 subfc $j,$j,$j ; j=0 and "clear" XER[CA]
@@ -289,7 +289,7 @@ Lsub: $LDX $tj,$tp,$j
289289 subfe $aj,$nj,$tj ; tp[j]-np[j]
290290 $STX $aj,$rp,$j
291291 addi $j,$j,$BNSZ
292- bdnz- Lsub
292+ bdnz Lsub
293293
294294 li $j,0
295295 mtctr $num
@@ -304,7 +304,7 @@ Lcopy: ; copy or in-place refresh
304304 $STX $tj,$rp,$j
305305 $STX $j,$tp,$j ; zap at once
306306 addi $j,$j,$BNSZ
307- bdnz- Lcopy
307+ bdnz Lcopy
308308
309309 $POP $tj,0($sp)
310310 li r3,1
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -1552,7 +1552,7 @@ Lppcasm_sub_mainloop:
15521552 # if carry = 1 this is r7-r8. Else it
15531553 # is r7-r8 -1 as we need.
15541554 $STU r6,$BNSZ(r3)
1555- bdnz- Lppcasm_sub_mainloop
1555+ bdnz Lppcasm_sub_mainloop
15561556 Lppcasm_sub_adios:
15571557 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
15581558 andi. r3,r3,1 # keep only last bit.
@@ -1598,7 +1598,7 @@ Lppcasm_add_mainloop:
15981598 $LDU r8,$BNSZ(r5)
15991599 adde r8,r7,r8
16001600 $STU r8,$BNSZ(r3)
1601- bdnz- Lppcasm_add_mainloop
1601+ bdnz Lppcasm_add_mainloop
16021602 Lppcasm_add_adios:
16031603 addze r3,r0 #return carry bit.
16041604 blr
@@ -1755,7 +1755,7 @@ Lppcasm_sqr_mainloop:
17551755 $UMULH r8,r6,r6
17561756 $STU r7,$BNSZ(r3)
17571757 $STU r8,$BNSZ(r3)
1758- bdnz- Lppcasm_sqr_mainloop
1758+ bdnz Lppcasm_sqr_mainloop
17591759 Lppcasm_sqr_adios:
17601760 blr
17611761 .long 0
@@ -1819,7 +1819,7 @@ Lppcasm_mw_LOOP:
18191819
18201820 addi r3,r3,`4*$BNSZ`
18211821 addi r4,r4,`4*$BNSZ`
1822- bdnz- Lppcasm_mw_LOOP
1822+ bdnz Lppcasm_mw_LOOP
18231823
18241824 Lppcasm_mw_REM:
18251825 andi. r5,r5,0x3
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -561,7 +561,7 @@ $code.=<<___;
561561 stfd $T3b,`$FRAME+56`($sp)
562562 std $t0,8($tp) ; tp[j-1]
563563 stdu $t4,16($tp) ; tp[j]
564- bdnz- L1st
564+ bdnz L1st
565565
566566 fctid $dota,$dota
567567 fctid $dotb,$dotb
@@ -856,7 +856,7 @@ $code.=<<___;
856856 addze $carry,$carry
857857 std $t3,-16($tp) ; tp[j-1]
858858 std $t5,-8($tp) ; tp[j]
859- bdnz- Linner
859+ bdnz Linner
860860
861861 fctid $dota,$dota
862862 fctid $dotb,$dotb
@@ -954,7 +954,7 @@ Lsub: ldx $t0,$tp,$i
954954 stdx $t0,$rp,$i
955955 stdx $t2,$t6,$i
956956 addi $i,$i,16
957- bdnz- Lsub
957+ bdnz Lsub
958958
959959 li $i,0
960960 subfe $ovf,$i,$ovf ; handle upmost overflow bit
@@ -981,7 +981,7 @@ Lcopy: ; copy or in-place refresh
981981 stdx $i,$tp,$i ; zap tp at once
982982 stdx $i,$t4,$i
983983 addi $i,$i,16
984- bdnz- Lcopy
984+ bdnz Lcopy
985985 ___
986986 $code.=<<___ if ($SIZE_T==4);
987987 subf $np,$num,$np ; rewind np
@@ -1014,7 +1014,7 @@ Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
10141014 stw $t5,8($rp)
10151015 stw $t6,12($rp)
10161016 stwu $t7,16($rp)
1017- bdnz- Lsub
1017+ bdnz Lsub
10181018
10191019 li $i,0
10201020 subfe $ovf,$i,$ovf ; handle upmost overflow bit
@@ -1046,7 +1046,7 @@ Lcopy: ; copy or in-place refresh
10461046 stwu $t3,16($rp)
10471047 std $i,8($tp) ; zap tp at once
10481048 stdu $i,16($tp)
1049- bdnz- Lcopy
1049+ bdnz Lcopy
10501050 ___
10511051
10521052 $code.=<<___;
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -140,6 +140,19 @@ void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
140140 const unsigned char ivec[AES_BLOCK_SIZE]);
141141 #endif
142142
143+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
144+extern int OPENSSL_ppccap_P;
145+# define HWAES_CAPABLE (OPENSSL_ppccap_P & (1<<2))
146+# define HWAES_set_encrypt_key aes_p8_set_encrypt_key
147+# define HWAES_set_decrypt_key aes_p8_set_decrypt_key
148+# define HWAES_encrypt aes_p8_encrypt
149+# define HWAES_decrypt aes_p8_decrypt
150+# define HWAES_cbc_encrypt aes_p8_cbc_encrypt
151+# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
152+# define HWAES_xts_encrypt aes_p8_xts_encrypt
153+# define HWAES_xts_decrypt aes_p8_xts_decrypt
154+#endif
155+
143156 #if defined(AES_ASM) && !defined(I386_ONLY) && ( \
144157 ((defined(__i386) || defined(__i386__) || \
145158 defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
@@ -498,6 +511,13 @@ void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
498511 unsigned char *ivec, const int enc);
499512 void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
500513 size_t len, const AES_KEY *key, const unsigned char ivec[16]);
514+void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
515+ size_t len, const AES_KEY *key1,
516+ const AES_KEY *key2, const unsigned char iv[16]);
517+void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out,
518+ size_t len, const AES_KEY *key1,
519+ const AES_KEY *key2, const unsigned char iv[16]);
520+
501521 #endif
502522
503523 #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
@@ -1131,11 +1151,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
11311151 {
11321152 HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
11331153 xctx->xts.block1 = (block128_f)HWAES_encrypt;
1154+#ifdef HWAES_xts_encrypt
1155+ xctx->stream = HWAES_xts_encrypt;
1156+#endif
11341157 }
11351158 else
11361159 {
11371160 HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
11381161 xctx->xts.block1 = (block128_f)HWAES_decrypt;
1162+#ifdef HWAES_xts_decrypt
1163+ xctx->stream = HWAES_xts_decrypt;
1164+#endif
11391165 }
11401166
11411167 HWAES_set_encrypt_key(key + ctx->key_len/2,
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -58,6 +58,8 @@ ghash-parisc.s: asm/ghash-parisc.pl
5858 $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
5959 ghashv8-armx.S: asm/ghashv8-armx.pl
6060 $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
61+ghashp8-ppc.s: asm/ghashp8-ppc.pl
62+ $(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
6163
6264 # GNU make "catch all"
6365 ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
--- /dev/null
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,663 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+#
10+# GHASH for PowerISA v2.07.
11+#
12+# July 2014
13+#
14+# Accurate performance measurements are problematic, because it's
15+# always a virtualized setup with a possibly throttled processor.
16+# Relative comparison is therefore more informative. This initial
17+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
18+# faster than "4-bit" integer-only compiler-generated 64-bit code.
19+# "Initial version" means that there is room for further improvement.
20+
21+# May 2016
22+#
23+# 2x aggregated reduction improves performance by 50% (the resulting
24+# performance on POWER8 is 1 cycle per processed byte), and 4x
25+# aggregated reduction improves it by 170%, i.e. 2.7x (resulting in 0.55 cpb).
26+
27+$flavour=shift;
28+$output =shift;
29+
30+if ($flavour =~ /64/) {
31+ $SIZE_T=8;
32+ $LRSAVE=2*$SIZE_T;
33+ $STU="stdu";
34+ $POP="ld";
35+ $PUSH="std";
36+ $UCMP="cmpld";
37+ $SHRI="srdi";
38+} elsif ($flavour =~ /32/) {
39+ $SIZE_T=4;
40+ $LRSAVE=$SIZE_T;
41+ $STU="stwu";
42+ $POP="lwz";
43+ $PUSH="stw";
44+ $UCMP="cmplw";
45+ $SHRI="srwi";
46+} else { die "nonsense $flavour"; }
47+
48+$sp="r1";
49+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
50+
51+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
53+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
54+die "can't locate ppc-xlate.pl";
55+
56+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
57+
58+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
59+
60+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
61+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
62+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
63+my $vrsave="r12";
64+
65+$code=<<___;
66+.machine "any"
67+
68+.text
69+
70+.globl .gcm_init_p8
71+.align 5
72+.gcm_init_p8:
73+ li r0,-4096
74+ li r8,0x10
75+ mfspr $vrsave,256
76+ li r9,0x20
77+ mtspr 256,r0
78+ li r10,0x30
79+ lvx_u $H,0,r4 # load H
80+
81+ vspltisb $xC2,-16 # 0xf0
82+ vspltisb $t0,1 # one
83+ vaddubm $xC2,$xC2,$xC2 # 0xe0
84+ vxor $zero,$zero,$zero
85+ vor $xC2,$xC2,$t0 # 0xe1
86+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
87+ vsldoi $t1,$zero,$t0,1 # ...1
88+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
89+ vspltisb $t2,7
90+ vor $xC2,$xC2,$t1 # 0xc2....01
91+ vspltb $t1,$H,0 # most significant byte
92+ vsl $H,$H,$t0 # H<<=1
93+ vsrab $t1,$t1,$t2 # broadcast carry bit
94+ vand $t1,$t1,$xC2
95+ vxor $IN,$H,$t1 # twisted H
96+
97+ vsldoi $H,$IN,$IN,8 # twist even more ...
98+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
99+ vsldoi $Hl,$zero,$H,8 # ... and split
100+ vsldoi $Hh,$H,$zero,8
101+
102+ stvx_u $xC2,0,r3 # save pre-computed table
103+ stvx_u $Hl,r8,r3
104+ li r8,0x40
105+ stvx_u $H, r9,r3
106+ li r9,0x50
107+ stvx_u $Hh,r10,r3
108+ li r10,0x60
109+
110+ vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
111+ vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
112+ vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
113+
114+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
115+
116+ vsldoi $t0,$Xm,$zero,8
117+ vsldoi $t1,$zero,$Xm,8
118+ vxor $Xl,$Xl,$t0
119+ vxor $Xh,$Xh,$t1
120+
121+ vsldoi $Xl,$Xl,$Xl,8
122+ vxor $Xl,$Xl,$t2
123+
124+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
125+ vpmsumd $Xl,$Xl,$xC2
126+ vxor $t1,$t1,$Xh
127+ vxor $IN1,$Xl,$t1
128+
129+ vsldoi $H2,$IN1,$IN1,8
130+ vsldoi $H2l,$zero,$H2,8
131+ vsldoi $H2h,$H2,$zero,8
132+
133+ stvx_u $H2l,r8,r3 # save H^2
134+ li r8,0x70
135+ stvx_u $H2,r9,r3
136+ li r9,0x80
137+ stvx_u $H2h,r10,r3
138+ li r10,0x90
139+___
140+{
141+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
142+$code.=<<___;
143+ vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
144+ vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
145+ vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
146+ vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
147+ vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
148+ vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
149+
150+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
151+ vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
152+
153+ vsldoi $t0,$Xm,$zero,8
154+ vsldoi $t1,$zero,$Xm,8
155+ vsldoi $t4,$Xm1,$zero,8
156+ vsldoi $t5,$zero,$Xm1,8
157+ vxor $Xl,$Xl,$t0
158+ vxor $Xh,$Xh,$t1
159+ vxor $Xl1,$Xl1,$t4
160+ vxor $Xh1,$Xh1,$t5
161+
162+ vsldoi $Xl,$Xl,$Xl,8
163+ vsldoi $Xl1,$Xl1,$Xl1,8
164+ vxor $Xl,$Xl,$t2
165+ vxor $Xl1,$Xl1,$t6
166+
167+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
168+ vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
169+ vpmsumd $Xl,$Xl,$xC2
170+ vpmsumd $Xl1,$Xl1,$xC2
171+ vxor $t1,$t1,$Xh
172+ vxor $t5,$t5,$Xh1
173+ vxor $Xl,$Xl,$t1
174+ vxor $Xl1,$Xl1,$t5
175+
176+ vsldoi $H,$Xl,$Xl,8
177+ vsldoi $H2,$Xl1,$Xl1,8
178+ vsldoi $Hl,$zero,$H,8
179+ vsldoi $Hh,$H,$zero,8
180+ vsldoi $H2l,$zero,$H2,8
181+ vsldoi $H2h,$H2,$zero,8
182+
183+ stvx_u $Hl,r8,r3 # save H^3
184+ li r8,0xa0
185+ stvx_u $H,r9,r3
186+ li r9,0xb0
187+ stvx_u $Hh,r10,r3
188+ li r10,0xc0
189+ stvx_u $H2l,r8,r3 # save H^4
190+ stvx_u $H2,r9,r3
191+ stvx_u $H2h,r10,r3
192+
193+ mtspr 256,$vrsave
194+ blr
195+ .long 0
196+ .byte 0,12,0x14,0,0,0,2,0
197+ .long 0
198+.size .gcm_init_p8,.-.gcm_init_p8
199+___
200+}
201+$code.=<<___;
202+.globl .gcm_gmult_p8
203+.align 5
204+.gcm_gmult_p8:
205+ lis r0,0xfff8
206+ li r8,0x10
207+ mfspr $vrsave,256
208+ li r9,0x20
209+ mtspr 256,r0
210+ li r10,0x30
211+ lvx_u $IN,0,$Xip # load Xi
212+
213+ lvx_u $Hl,r8,$Htbl # load pre-computed table
214+ le?lvsl $lemask,r0,r0
215+ lvx_u $H, r9,$Htbl
216+ le?vspltisb $t0,0x07
217+ lvx_u $Hh,r10,$Htbl
218+ le?vxor $lemask,$lemask,$t0
219+ lvx_u $xC2,0,$Htbl
220+ le?vperm $IN,$IN,$IN,$lemask
221+ vxor $zero,$zero,$zero
222+
223+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
224+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
225+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
226+
227+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
228+
229+ vsldoi $t0,$Xm,$zero,8
230+ vsldoi $t1,$zero,$Xm,8
231+ vxor $Xl,$Xl,$t0
232+ vxor $Xh,$Xh,$t1
233+
234+ vsldoi $Xl,$Xl,$Xl,8
235+ vxor $Xl,$Xl,$t2
236+
237+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
238+ vpmsumd $Xl,$Xl,$xC2
239+ vxor $t1,$t1,$Xh
240+ vxor $Xl,$Xl,$t1
241+
242+ le?vperm $Xl,$Xl,$Xl,$lemask
243+ stvx_u $Xl,0,$Xip # write out Xi
244+
245+ mtspr 256,$vrsave
246+ blr
247+ .long 0
248+ .byte 0,12,0x14,0,0,0,2,0
249+ .long 0
250+.size .gcm_gmult_p8,.-.gcm_gmult_p8
251+
252+.globl .gcm_ghash_p8
253+.align 5
254+.gcm_ghash_p8:
255+ li r0,-4096
256+ li r8,0x10
257+ mfspr $vrsave,256
258+ li r9,0x20
259+ mtspr 256,r0
260+ li r10,0x30
261+ lvx_u $Xl,0,$Xip # load Xi
262+
263+ lvx_u $Hl,r8,$Htbl # load pre-computed table
264+ li r8,0x40
265+ le?lvsl $lemask,r0,r0
266+ lvx_u $H, r9,$Htbl
267+ li r9,0x50
268+ le?vspltisb $t0,0x07
269+ lvx_u $Hh,r10,$Htbl
270+ li r10,0x60
271+ le?vxor $lemask,$lemask,$t0
272+ lvx_u $xC2,0,$Htbl
273+ le?vperm $Xl,$Xl,$Xl,$lemask
274+ vxor $zero,$zero,$zero
275+
276+ ${UCMP}i $len,64
277+ bge Lgcm_ghash_p8_4x
278+
279+ lvx_u $IN,0,$inp
280+ addi $inp,$inp,16
281+ subic. $len,$len,16
282+ le?vperm $IN,$IN,$IN,$lemask
283+ vxor $IN,$IN,$Xl
284+ beq Lshort
285+
286+ lvx_u $H2l,r8,$Htbl # load H^2
287+ li r8,16
288+ lvx_u $H2, r9,$Htbl
289+ add r9,$inp,$len # end of input
290+ lvx_u $H2h,r10,$Htbl
291+ be?b Loop_2x
292+
293+.align 5
294+Loop_2x:
295+ lvx_u $IN1,0,$inp
296+ le?vperm $IN1,$IN1,$IN1,$lemask
297+
298+ subic $len,$len,32
299+ vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
300+ vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
301+ subfe r0,r0,r0 # borrow?-1:0
302+ vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
303+ vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
304+ and r0,r0,$len
305+ vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
306+ vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
307+ add $inp,$inp,r0
308+
309+ vxor $Xl,$Xl,$Xl1
310+ vxor $Xm,$Xm,$Xm1
311+
312+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
313+
314+ vsldoi $t0,$Xm,$zero,8
315+ vsldoi $t1,$zero,$Xm,8
316+ vxor $Xh,$Xh,$Xh1
317+ vxor $Xl,$Xl,$t0
318+ vxor $Xh,$Xh,$t1
319+
320+ vsldoi $Xl,$Xl,$Xl,8
321+ vxor $Xl,$Xl,$t2
322+ lvx_u $IN,r8,$inp
323+ addi $inp,$inp,32
324+
325+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
326+ vpmsumd $Xl,$Xl,$xC2
327+ le?vperm $IN,$IN,$IN,$lemask
328+ vxor $t1,$t1,$Xh
329+ vxor $IN,$IN,$t1
330+ vxor $IN,$IN,$Xl
331+ $UCMP r9,$inp
332+ bgt Loop_2x # done yet?
333+
334+ cmplwi $len,0
335+ bne Leven
336+
337+Lshort:
338+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
339+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
340+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
341+
342+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
343+
344+ vsldoi $t0,$Xm,$zero,8
345+ vsldoi $t1,$zero,$Xm,8
346+ vxor $Xl,$Xl,$t0
347+ vxor $Xh,$Xh,$t1
348+
349+ vsldoi $Xl,$Xl,$Xl,8
350+ vxor $Xl,$Xl,$t2
351+
352+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
353+ vpmsumd $Xl,$Xl,$xC2
354+ vxor $t1,$t1,$Xh
355+
356+Leven:
357+ vxor $Xl,$Xl,$t1
358+ le?vperm $Xl,$Xl,$Xl,$lemask
359+ stvx_u $Xl,0,$Xip # write out Xi
360+
361+ mtspr 256,$vrsave
362+ blr
363+ .long 0
364+ .byte 0,12,0x14,0,0,0,4,0
365+ .long 0
366+___
367+{
368+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
369+ $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
370+my $IN0=$IN;
371+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
372+
373+$code.=<<___;
374+.align 5
375+.gcm_ghash_p8_4x:
376+Lgcm_ghash_p8_4x:
377+ $STU $sp,-$FRAME($sp)
378+ li r10,`15+6*$SIZE_T`
379+ li r11,`31+6*$SIZE_T`
380+ stvx v20,r10,$sp
381+ addi r10,r10,32
382+ stvx v21,r11,$sp
383+ addi r11,r11,32
384+ stvx v22,r10,$sp
385+ addi r10,r10,32
386+ stvx v23,r11,$sp
387+ addi r11,r11,32
388+ stvx v24,r10,$sp
389+ addi r10,r10,32
390+ stvx v25,r11,$sp
391+ addi r11,r11,32
392+ stvx v26,r10,$sp
393+ addi r10,r10,32
394+ stvx v27,r11,$sp
395+ addi r11,r11,32
396+ stvx v28,r10,$sp
397+ addi r10,r10,32
398+ stvx v29,r11,$sp
399+ addi r11,r11,32
400+ stvx v30,r10,$sp
401+ li r10,0x60
402+ stvx v31,r11,$sp
403+ li r0,-1
404+ stw $vrsave,`$FRAME-4`($sp) # save vrsave
405+ mtspr 256,r0 # preserve all AltiVec registers
406+
407+ lvsl $t0,0,r8 # 0x0001..0e0f
408+ #lvx_u $H2l,r8,$Htbl # load H^2
409+ li r8,0x70
410+ lvx_u $H2, r9,$Htbl
411+ li r9,0x80
412+ vspltisb $t1,8 # 0x0808..0808
413+ #lvx_u $H2h,r10,$Htbl
414+ li r10,0x90
415+ lvx_u $H3l,r8,$Htbl # load H^3
416+ li r8,0xa0
417+ lvx_u $H3, r9,$Htbl
418+ li r9,0xb0
419+ lvx_u $H3h,r10,$Htbl
420+ li r10,0xc0
421+ lvx_u $H4l,r8,$Htbl # load H^4
422+ li r8,0x10
423+ lvx_u $H4, r9,$Htbl
424+ li r9,0x20
425+ lvx_u $H4h,r10,$Htbl
426+ li r10,0x30
427+
428+ vsldoi $t2,$zero,$t1,8 # 0x0000..0808
429+ vaddubm $hiperm,$t0,$t2 # 0x0001..1617
430+ vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
431+
432+ $SHRI $len,$len,4 # this allows using the sign bit
433+ # as a carry
434+ lvx_u $IN0,0,$inp # load input
435+ lvx_u $IN1,r8,$inp
436+ subic. $len,$len,8
437+ lvx_u $IN2,r9,$inp
438+ lvx_u $IN3,r10,$inp
439+ addi $inp,$inp,0x40
440+ le?vperm $IN0,$IN0,$IN0,$lemask
441+ le?vperm $IN1,$IN1,$IN1,$lemask
442+ le?vperm $IN2,$IN2,$IN2,$lemask
443+ le?vperm $IN3,$IN3,$IN3,$lemask
444+
445+ vxor $Xh,$IN0,$Xl
446+
447+ vpmsumd $Xl1,$IN1,$H3l
448+ vpmsumd $Xm1,$IN1,$H3
449+ vpmsumd $Xh1,$IN1,$H3h
450+
451+ vperm $H21l,$H2,$H,$hiperm
452+ vperm $t0,$IN2,$IN3,$loperm
453+ vperm $H21h,$H2,$H,$loperm
454+ vperm $t1,$IN2,$IN3,$hiperm
455+ vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
456+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
457+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
458+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
459+
460+ vxor $Xm2,$Xm2,$Xm1
461+ vxor $Xl3,$Xl3,$Xl1
462+ vxor $Xm3,$Xm3,$Xm2
463+ vxor $Xh3,$Xh3,$Xh1
464+
465+ blt Ltail_4x
466+
467+Loop_4x:
468+ lvx_u $IN0,0,$inp
469+ lvx_u $IN1,r8,$inp
470+ subic. $len,$len,4
471+ lvx_u $IN2,r9,$inp
472+ lvx_u $IN3,r10,$inp
473+ addi $inp,$inp,0x40
474+ le?vperm $IN1,$IN1,$IN1,$lemask
475+ le?vperm $IN2,$IN2,$IN2,$lemask
476+ le?vperm $IN3,$IN3,$IN3,$lemask
477+ le?vperm $IN0,$IN0,$IN0,$lemask
478+
479+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
480+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
481+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
482+ vpmsumd $Xl1,$IN1,$H3l
483+ vpmsumd $Xm1,$IN1,$H3
484+ vpmsumd $Xh1,$IN1,$H3h
485+
486+ vxor $Xl,$Xl,$Xl3
487+ vxor $Xm,$Xm,$Xm3
488+ vxor $Xh,$Xh,$Xh3
489+ vperm $t0,$IN2,$IN3,$loperm
490+ vperm $t1,$IN2,$IN3,$hiperm
491+
492+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
493+ vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
494+ vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
495+
496+ vsldoi $t0,$Xm,$zero,8
497+ vsldoi $t1,$zero,$Xm,8
498+ vxor $Xl,$Xl,$t0
499+ vxor $Xh,$Xh,$t1
500+
501+ vsldoi $Xl,$Xl,$Xl,8
502+ vxor $Xl,$Xl,$t2
503+
504+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
505+ vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
506+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
507+ vpmsumd $Xl,$Xl,$xC2
508+
509+ vxor $Xl3,$Xl3,$Xl1
510+ vxor $Xh3,$Xh3,$Xh1
511+ vxor $Xh,$Xh,$IN0
512+ vxor $Xm2,$Xm2,$Xm1
513+ vxor $Xh,$Xh,$t1
514+ vxor $Xm3,$Xm3,$Xm2
515+ vxor $Xh,$Xh,$Xl
516+ bge Loop_4x
517+
518+Ltail_4x:
519+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
520+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
521+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
522+
523+ vxor $Xl,$Xl,$Xl3
524+ vxor $Xm,$Xm,$Xm3
525+
526+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
527+
528+ vsldoi $t0,$Xm,$zero,8
529+ vsldoi $t1,$zero,$Xm,8
530+ vxor $Xh,$Xh,$Xh3
531+ vxor $Xl,$Xl,$t0
532+ vxor $Xh,$Xh,$t1
533+
534+ vsldoi $Xl,$Xl,$Xl,8
535+ vxor $Xl,$Xl,$t2
536+
537+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
538+ vpmsumd $Xl,$Xl,$xC2
539+ vxor $t1,$t1,$Xh
540+ vxor $Xl,$Xl,$t1
541+
542+ addic. $len,$len,4
543+ beq Ldone_4x
544+
545+ lvx_u $IN0,0,$inp
546+ ${UCMP}i $len,2
547+ li $len,-4
548+ blt Lone
549+ lvx_u $IN1,r8,$inp
550+ beq Ltwo
551+
552+Lthree:
553+ lvx_u $IN2,r9,$inp
554+ le?vperm $IN0,$IN0,$IN0,$lemask
555+ le?vperm $IN1,$IN1,$IN1,$lemask
556+ le?vperm $IN2,$IN2,$IN2,$lemask
557+
558+ vxor $Xh,$IN0,$Xl
559+ vmr $H4l,$H3l
560+ vmr $H4, $H3
561+ vmr $H4h,$H3h
562+
563+ vperm $t0,$IN1,$IN2,$loperm
564+ vperm $t1,$IN1,$IN2,$hiperm
565+ vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
566+ vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
567+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
568+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
569+
570+ vxor $Xm3,$Xm3,$Xm2
571+ b Ltail_4x
572+
573+.align 4
574+Ltwo:
575+ le?vperm $IN0,$IN0,$IN0,$lemask
576+ le?vperm $IN1,$IN1,$IN1,$lemask
577+
578+ vxor $Xh,$IN0,$Xl
579+ vperm $t0,$zero,$IN1,$loperm
580+ vperm $t1,$zero,$IN1,$hiperm
581+
582+ vsldoi $H4l,$zero,$H2,8
583+ vmr $H4, $H2
584+ vsldoi $H4h,$H2,$zero,8
585+
586+ vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
587+ vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
588+ vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
589+
590+ b Ltail_4x
591+
592+.align 4
593+Lone:
594+ le?vperm $IN0,$IN0,$IN0,$lemask
595+
596+ vsldoi $H4l,$zero,$H,8
597+ vmr $H4, $H
598+ vsldoi $H4h,$H,$zero,8
599+
600+ vxor $Xh,$IN0,$Xl
601+ vxor $Xl3,$Xl3,$Xl3
602+ vxor $Xm3,$Xm3,$Xm3
603+ vxor $Xh3,$Xh3,$Xh3
604+
605+ b Ltail_4x
606+
607+Ldone_4x:
608+ le?vperm $Xl,$Xl,$Xl,$lemask
609+ stvx_u $Xl,0,$Xip # write out Xi
610+
611+ li r10,`15+6*$SIZE_T`
612+ li r11,`31+6*$SIZE_T`
613+ mtspr 256,$vrsave
614+ lvx v20,r10,$sp
615+ addi r10,r10,32
616+ lvx v21,r11,$sp
617+ addi r11,r11,32
618+ lvx v22,r10,$sp
619+ addi r10,r10,32
620+ lvx v23,r11,$sp
621+ addi r11,r11,32
622+ lvx v24,r10,$sp
623+ addi r10,r10,32
624+ lvx v25,r11,$sp
625+ addi r11,r11,32
626+ lvx v26,r10,$sp
627+ addi r10,r10,32
628+ lvx v27,r11,$sp
629+ addi r11,r11,32
630+ lvx v28,r10,$sp
631+ addi r10,r10,32
632+ lvx v29,r11,$sp
633+ addi r11,r11,32
634+ lvx v30,r10,$sp
635+ lvx v31,r11,$sp
636+ addi $sp,$sp,$FRAME
637+ blr
638+ .long 0
639+ .byte 0,12,0x04,0,0x80,0,4,0
640+ .long 0
641+___
642+}
643+$code.=<<___;
644+.size .gcm_ghash_p8,.-.gcm_ghash_p8
645+
646+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
647+.align 2
648+___
649+
650+foreach (split("\n",$code)) {
651+ s/\`([^\`]*)\`/eval $1/geo;
652+
653+ if ($flavour =~ /le$/o) { # little-endian
654+ s/le\?//o or
655+ s/be\?/#be#/o;
656+ } else {
657+ s/le\?/#le#/o or
658+ s/be\?//o;
659+ }
660+ print $_,"\n";
661+}
662+
663+close STDOUT; # enforce flush
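A note on the 2x/4x "aggregated reduction" mentioned in the header comment of ghashp8-ppc.pl: gcm_init_p8 precomputes and stores H, H^2, H^3 and H^4, so Loop_2x and Loop_4x can fold two or four input blocks per iteration and pay the two-phase reduction only once per iteration instead of once per block. In standard GHASH notation (Y_i is the accumulator, X_i the input blocks, H the hash key; all products are in GF(2^128) reduced modulo x^128 + x^7 + x^2 + x + 1), the identity being exploited is, as a sketch rather than literal patch code:

    Y_{i+4} = (Y_i \oplus X_{i+1}) \cdot H^4 \oplus X_{i+2} \cdot H^3
              \oplus X_{i+3} \cdot H^2 \oplus X_{i+4} \cdot H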
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -683,6 +683,14 @@ void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
683683 void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
684684 void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
685685 # endif
686+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
687+# define GHASH_ASM_PPC
688+# define GCM_FUNCREF_4BIT
689+extern int OPENSSL_ppccap_P;
690+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
691+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
692+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
693+ size_t len);
686694 # elif defined(_TMS320C6400_PLUS)
687695 # define GHASH_ASM_C64Xplus
688696 # endif
@@ -767,6 +775,16 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
767775 ctx->gmult = gcm_gmult_4bit;
768776 ctx->ghash = gcm_ghash_4bit;
769777 }
778+# elif defined(GHASH_ASM_PPC)
779+ if (OPENSSL_ppccap_P & (1<<2)) {
780+ gcm_init_p8(ctx->Htable, ctx->H.u);
781+ ctx->gmult = gcm_gmult_p8;
782+ ctx->ghash = gcm_ghash_p8;
783+ } else {
784+ gcm_init_4bit(ctx->Htable, ctx->H.u);
785+ ctx->gmult = gcm_gmult_4bit;
786+ ctx->ghash = gcm_ghash_4bit;
787+ }
770788 # elif defined(GHASH_ASM_C64Xplus)
771789 /* C64x+ assembler doesn't use tables, skip gcm_init_4bit.
772790 * This is likely to trigger "function never referenced"
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -27,7 +27,8 @@ my $globl = sub {
2727 /osx/ && do { $name = "_$name";
2828 last;
2929 };
30- /linux.*32/ && do { $ret .= ".globl $name\n";
30+ /linux.*(32|64le)/
31+ && do { $ret .= ".globl $name\n";
3132 $ret .= ".type $name,\@function";
3233 last;
3334 };
@@ -37,7 +38,6 @@ my $globl = sub {
3738 $ret .= ".align 3\n";
3839 $ret .= "$name:\n";
3940 $ret .= ".quad .$name,.TOC.\@tocbase,0\n";
40- $ret .= ".size $name,24\n";
4141 $ret .= ".previous\n";
4242
4343 $name = ".$name";
@@ -50,7 +50,9 @@ my $globl = sub {
5050 $ret;
5151 };
5252 my $text = sub {
53- ($flavour =~ /aix/) ? ".csect" : ".text";
53+ my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
54+ $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/);
55+ $ret;
5456 };
5557 my $machine = sub {
5658 my $junk = shift;
@@ -62,9 +64,12 @@ my $machine = sub {
6264 ".machine $arch";
6365 };
6466 my $size = sub {
65- if ($flavour =~ /linux.*32/)
67+ if ($flavour =~ /linux/)
6668 { shift;
67- ".size " . join(",",@_);
69+ my $name = shift; $name =~ s|^[\.\_]||;
70+ my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name;
71+ $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/);
72+ $ret;
6873 }
6974 else
7075 { ""; }
@@ -77,6 +82,25 @@ my $asciz = sub {
7782 else
7883 { ""; }
7984 };
85+my $quad = sub {
86+ shift;
87+ my @ret;
88+ my ($hi,$lo);
89+ for (@_) {
90+ if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
91+ { $hi=$1?"0x$1":"0"; $lo="0x$2"; }
92+ elsif (/^([0-9]+)$/o)
93+ { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl
94+ else
95+ { $hi=undef; $lo=$_; }
96+
97+ if (defined($hi))
98+ { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); }
99+ else
100+ { push(@ret,".quad $lo"); }
101+ }
102+ join("\n",@ret);
103+};
80104
81105 ################################################################
82106 # simplified mnemonics not handled by at least one assembler
@@ -122,6 +146,66 @@ my $extrdi = sub {
122146 $b = ($b+$n)&63; $n = 64-$n;
123147 " rldicl $ra,$rs,$b,$n";
124148 };
149+my $vmr = sub {
150+ my ($f,$vx,$vy) = @_;
151+ " vor $vx,$vy,$vy";
152+};
153+
154+# Some ABIs specify vrsave, special-purpose register #256, as reserved
155+# for system use.
156+my $no_vrsave = ($flavour =~ /aix|linux64le/);
157+my $mtspr = sub {
158+ my ($f,$idx,$ra) = @_;
159+ if ($idx == 256 && $no_vrsave) {
160+ " or $ra,$ra,$ra";
161+ } else {
162+ " mtspr $idx,$ra";
163+ }
164+};
165+my $mfspr = sub {
166+ my ($f,$rd,$idx) = @_;
167+ if ($idx == 256 && $no_vrsave) {
168+ " li $rd,-1";
169+ } else {
170+ " mfspr $rd,$idx";
171+ }
172+};
173+
174+# PowerISA 2.06 stuff
175+sub vsxmem_op {
176+ my ($f, $vrt, $ra, $rb, $op) = @_;
177+ " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
178+}
179+# made-up unaligned memory reference AltiVec/VMX instructions
180+my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x
181+my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x
182+my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx
183+my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx
184+my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x
185+my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x
186+
187+# PowerISA 2.07 stuff
188+sub vcrypto_op {
189+ my ($f, $vrt, $vra, $vrb, $op) = @_;
190+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
191+}
192+my $vcipher = sub { vcrypto_op(@_, 1288); };
193+my $vcipherlast = sub { vcrypto_op(@_, 1289); };
194+my $vncipher = sub { vcrypto_op(@_, 1352); };
195+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
196+my $vsbox = sub { vcrypto_op(@_, 0, 1480); };
197+my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
198+my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
199+my $vpmsumb = sub { vcrypto_op(@_, 1032); };
200+my $vpmsumd = sub { vcrypto_op(@_, 1224); };
201+my $vpmsubh = sub { vcrypto_op(@_, 1096); };
202+my $vpmsumw = sub { vcrypto_op(@_, 1160); };
203+my $vaddudm = sub { vcrypto_op(@_, 192); };
204+
205+my $mtsle = sub {
206+ my ($f, $arg) = @_;
207+ " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
208+};
125209
126210 while($line=<>) {
127211
@@ -138,7 +222,10 @@ while($line=<>) {
138222 {
139223 $line =~ s|(^[\.\w]+)\:\s*||;
140224 my $label = $1;
141- printf "%s:",($GLOBALS{$label} or $label) if ($label);
225+ if ($label) {
226+ printf "%s:",($GLOBALS{$label} or $label);
227+ printf "\n.localentry\t$GLOBALS{$label},0" if ($GLOBALS{$label} && $flavour =~ /linux.*64le/);
228+ }
142229 }
143230
144231 {
@@ -147,7 +234,7 @@ while($line=<>) {
147234 my $mnemonic = $2;
148235 my $f = $3;
149236 my $opcode = eval("\$$mnemonic");
150- $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/);
237+ $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
151238 if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
152239 elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; }
153240 }
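The vsxmem_op()/vcrypto_op() helpers added above emit PowerISA 2.06/2.07 instructions as raw ".long" words so that older assemblers lacking the mnemonics can still build the modules. Below is a minimal C sketch of the same field packing; vsxmem_op/vcrypto_op mirror the perl subs, while main() and its output formatting are purely illustrative. The two sample encodings reproduce the constants hard-coded in OPENSSL_crypto207_probe in the ppccpuid.pl hunk further down.

#include <stdio.h>
#include <stdint.h>

/* Packs the instruction fields exactly as ppc-xlate.pl's helpers do. */
static uint32_t vsxmem_op(unsigned vrt, unsigned ra, unsigned rb, unsigned op)
{
    return (31u << 26) | (vrt << 21) | (ra << 16) | (rb << 11) | (op * 2 + 1);
}

static uint32_t vcrypto_op(unsigned vrt, unsigned vra, unsigned vrb, unsigned op)
{
    return (4u << 26) | (vrt << 21) | (vra << 16) | (vrb << 11) | op;
}

int main(void)
{
    /* lxvd2x aka lvx_u v0,0,r1 -> 0x7C000E99; vcipher v0,v0,v0 -> 0x10000508 */
    printf("lvx_u   v0,0,r1  -> .long 0x%08X\n", (unsigned)vsxmem_op(0, 0, 1, 844));
    printf("vcipher v0,v0,v0 -> .long 0x%08X\n", (unsigned)vcrypto_op(0, 0, 0, 1288));
    return 0;
}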
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -3,13 +3,24 @@
33 #include <string.h>
44 #include <setjmp.h>
55 #include <signal.h>
6+#include <unistd.h>
7+#if defined(__linux) || defined(_AIX)
8+# include <sys/utsname.h>
9+#endif
10+#if defined(_AIX53) /* defined even on post-5.3 */
11+# include <sys/systemcfg.h>
12+# if !defined(__power_set)
13+# define __power_set(a) (_system_configuration.implementation & (a))
14+# endif
15+#endif
616 #include <crypto.h>
717 #include <openssl/bn.h>
818
919 #define PPC_FPU64 (1<<0)
1020 #define PPC_ALTIVEC (1<<1)
21+#define PPC_CRYPTO207 (1<<2)
1122
12-static int OPENSSL_ppccap_P = 0;
23+int OPENSSL_ppccap_P = 0;
1324
1425 static sigset_t all_masked;
1526
@@ -49,10 +60,28 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
4960 }
5061 #endif
5162
63+void sha256_block_p8(void *ctx, const void *inp, size_t len);
64+void sha256_block_ppc(void *ctx, const void *inp, size_t len);
65+void sha256_block_data_order(void *ctx, const void *inp, size_t len)
66+{
67+ OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha256_block_p8(ctx, inp, len) :
68+ sha256_block_ppc(ctx, inp, len);
69+}
70+
71+void sha512_block_p8(void *ctx, const void *inp, size_t len);
72+void sha512_block_ppc(void *ctx, const void *inp, size_t len);
73+void sha512_block_data_order(void *ctx, const void *inp, size_t len)
74+{
75+ OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha512_block_p8(ctx, inp, len) :
76+ sha512_block_ppc(ctx, inp, len);
77+}
78+
5279 static sigjmp_buf ill_jmp;
5380 static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
5481
5582 void OPENSSL_ppc64_probe(void);
83+void OPENSSL_altivec_probe(void);
84+void OPENSSL_crypto207_probe(void);
5685
5786 void OPENSSL_cpuid_setup(void)
5887 {
@@ -82,6 +111,45 @@ void OPENSSL_cpuid_setup(void)
82111
83112 OPENSSL_ppccap_P = 0;
84113
114+#if defined(_AIX)
115+ if (sizeof(size_t) == 4) {
116+ struct utsname uts;
117+# if defined(_SC_AIX_KERNEL_BITMODE)
118+ if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64)
119+ return;
120+# endif
121+ if (uname(&uts) != 0 || atoi(uts.version) < 6)
122+ return;
123+ }
124+
125+# if defined(__power_set)
126+ /*
127+ * The value used in __power_set is a single-bit 1<<n mask denoting a
128+ * specific processor class. Incidentally, 0xffffffff<<n can be
129+ * used to denote a specific processor and its successors.
130+ */
131+ if (sizeof(size_t) == 4) {
132+ /* In the 32-bit case PPC_FPU64 is always fastest [if option] */
133+ if (__power_set(0xffffffffU<<13)) /* POWER5 and later */
134+ OPENSSL_ppccap_P |= PPC_FPU64;
135+ } else {
136+ /* In the 64-bit case PPC_FPU64 is fastest only on POWER6 */
137+# if 0 /* to keep compatibility with previous validations */
138+ if (__power_set(0x1U<<14)) /* POWER6 */
139+ OPENSSL_ppccap_P |= PPC_FPU64;
140+# endif
141+ }
142+
143+ if (__power_set(0xffffffffU<<14)) /* POWER6 and later */
144+ OPENSSL_ppccap_P |= PPC_ALTIVEC;
145+
146+ if (__power_set(0xffffffffU<<16)) /* POWER8 and later */
147+ OPENSSL_ppccap_P |= PPC_CRYPTO207;
148+
149+ return;
150+# endif
151+#endif
152+
85153 memset(&ill_act,0,sizeof(ill_act));
86154 ill_act.sa_handler = ill_handler;
87155 ill_act.sa_mask = all_masked;
@@ -108,6 +176,11 @@ void OPENSSL_cpuid_setup(void)
108176 {
109177 OPENSSL_altivec_probe();
110178 OPENSSL_ppccap_P |= PPC_ALTIVEC;
179+ if (sigsetjmp(ill_jmp, 1) == 0)
180+ {
181+ OPENSSL_crypto207_probe();
182+ OPENSSL_ppccap_P |= PPC_CRYPTO207;
183+ }
111184 }
112185
113186 sigaction (SIGILL,&ill_oact,NULL);
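For platforms without __power_set, the hunk above keeps the SIGILL-probe approach: OPENSSL_crypto207_probe executes one 2.07 crypto instruction inside a sigsetjmp region, and PPC_CRYPTO207 is set only if it does not trap. A minimal standalone sketch of that pattern follows; probe_jmp, probe_ill and have_crypto207 are illustrative names, and only OPENSSL_crypto207_probe comes from the patch.

#include <setjmp.h>
#include <signal.h>
#include <string.h>

static sigjmp_buf probe_jmp;
static void probe_ill(int sig) { siglongjmp(probe_jmp, sig); }

extern void OPENSSL_crypto207_probe(void);   /* executes lvx_u + vcipher once */

static int have_crypto207(void)
{
    struct sigaction act, oact;
    int ok = 0;

    memset(&act, 0, sizeof(act));
    act.sa_handler = probe_ill;
    sigaction(SIGILL, &act, &oact);
    if (sigsetjmp(probe_jmp, 1) == 0) {
        OPENSSL_crypto207_probe();            /* raises SIGILL on pre-2.07 parts */
        ok = 1;                               /* instruction executed cleanly */
    }
    sigaction(SIGILL, &oact, NULL);           /* restore the previous handler */
    return ok;
}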
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -40,6 +40,16 @@ $code=<<___;
4040 .long 0
4141 .byte 0,12,0x14,0,0,0,0,0
4242
43+.globl .OPENSSL_crypto207_probe
44+.align 4
45+.OPENSSL_crypto207_probe:
46+ .long 0x7C000E99 # lvx_u v0,0,r1
47+ .long 0x10000508 # vcipher v0,v0,v0
48+ blr
49+ .long 0
50+ .byte 0,12,0x14,0,0,0,0,0
51+.size .OPENSSL_crypto207_probe,.-.OPENSSL_crypto207_probe
52+
4353 .globl .OPENSSL_wipe_cpu
4454 .align 4
4555 .OPENSSL_wipe_cpu:
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@@ -73,6 +73,8 @@ sha512-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAG
7373 sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
7474 sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
7575 sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
76+sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
77+sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
7678
7779 sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@
7880 sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -210,7 +210,7 @@ Lunaligned:
210210 srwi. $t1,$t1,6 ; t1/=64
211211 beq Lcross_page
212212 $UCMP $num,$t1
213- ble- Laligned ; didn't cross the page boundary
213+ ble Laligned ; didn't cross the page boundary
214214 mtctr $t1
215215 subfc $num,$t1,$num
216216 bl Lsha1_block_private
@@ -238,7 +238,7 @@ Lmemcpy:
238238 bl Lsha1_block_private
239239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
240240 addic. $num,$num,-1
241- bne- Lunaligned
241+ bne Lunaligned
242242
243243 Ldone:
244244 $POP r0,`$FRAME+$LRSAVE`($sp)
@@ -312,7 +312,7 @@ $code.=<<___;
312312 stw r20,16($ctx)
313313 mr $E,r20
314314 addi $inp,$inp,`16*4`
315- bdnz- Lsha1_block_private
315+ bdnz Lsha1_block_private
316316 blr
317317 .long 0
318318 .byte 0,12,0x14,0,0,0,0,0
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl
@@ -64,7 +64,7 @@ die "can't locate ppc-xlate.pl";
6464 open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
6565
6666 if ($output =~ /512/) {
67- $func="sha512_block_data_order";
67+ $func="sha512_block_ppc";
6868 $SZ=8;
6969 @Sigma0=(28,34,39);
7070 @Sigma1=(14,18,41);
@@ -76,7 +76,7 @@ if ($output =~ /512/) {
7676 $ROR="rotrdi";
7777 $SHR="srdi";
7878 } else {
79- $func="sha256_block_data_order";
79+ $func="sha256_block_ppc";
8080 $SZ=4;
8181 @Sigma0=( 2,13,22);
8282 @Sigma1=( 6,11,25);
@@ -243,7 +243,7 @@ Lunaligned:
243243 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
244244 beq Lcross_page
245245 $UCMP $num,$t1
246- ble- Laligned ; didn't cross the page boundary
246+ ble Laligned ; didn't cross the page boundary
247247 subfc $num,$t1,$num
248248 add $t1,$inp,$t1
249249 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
@@ -279,7 +279,7 @@ Lmemcpy:
279279 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
280280 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
281281 addic. $num,$num,`-16*$SZ` ; num--
282- bne- Lunaligned
282+ bne Lunaligned
283283
284284 Ldone:
285285 $POP r0,`$FRAME+$LRSAVE`($sp)
@@ -339,7 +339,7 @@ for(;$i<32;$i++) {
339339 unshift(@V,pop(@V));
340340 }
341341 $code.=<<___;
342- bdnz- Lrounds
342+ bdnz Lrounds
343343
344344 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
345345 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
--- /dev/null
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -0,0 +1,431 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# SHA256/512 for PowerISA v2.07.
11+#
12+# Accurate performance measurements are problematic, because it's
13+# always a virtualized setup with a possibly throttled processor.
14+# Relative comparison is therefore more informative. This module is
15+# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
16+# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
17+# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
18+# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
19+# result is the degree of computational resource utilization. POWER8 is
20+# a "massively multi-threaded chip", and the difference between single-
21+# and maximum multi-process benchmark results shows that utilization is
22+# a whopping 94%. For sha512-ppc.pl we get [a not unimpressive] 84% and
23+# for sha1-ppc.pl 73%. 100% means that the multi-process result equals
24+# the single-process one, given that all threads end up on the same
25+# physical core.
26+#
27+#######################################################################
28+#
29+# SHA256/pre-2.07(*) SHA512/pre-2.07(*) SHA1(*)
30+# POWER8 9.3 /14.8 5.8 /9.5 7.1
31+#
32+# (*) presented for reference/comparison purposes;
33+
34+$flavour=shift;
35+$output =shift;
36+
37+if ($flavour =~ /64/) {
38+ $SIZE_T=8;
39+ $LRSAVE=2*$SIZE_T;
40+ $STU="stdu";
41+ $POP="ld";
42+ $PUSH="std";
43+} elsif ($flavour =~ /32/) {
44+ $SIZE_T=4;
45+ $LRSAVE=$SIZE_T;
46+ $STU="stwu";
47+ $POP="lwz";
48+ $PUSH="stw";
49+} else { die "nonsense $flavour"; }
50+
51+$LENDIAN=($flavour=~/le/);
52+
53+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
55+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
56+die "can't locate ppc-xlate.pl";
57+
58+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
59+
60+if ($output =~ /512/) {
61+ $bits=512;
62+ $SZ=8;
63+ $sz="d";
64+ $rounds=80;
65+} else {
66+ $bits=256;
67+ $SZ=4;
68+ $sz="w";
69+ $rounds=64;
70+}
71+
72+$func="sha${bits}_block_p8";
73+$FRAME=8*$SIZE_T;
74+
75+$sp ="r1";
76+$toc="r2";
77+$ctx="r3";
78+$inp="r4";
79+$num="r5";
80+$Tbl="r6";
81+$idx="r7";
82+$lrsave="r8";
83+$offload="r11";
84+$vrsave="r12";
85+($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
86+ $x00=0 if ($flavour =~ /osx/);
87+
88+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
89+@X=map("v$_",(8..23));
90+($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
91+
92+sub ROUND {
93+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
94+my $j=($i+1)%16;
95+
96+$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
97+ lvx_u @X[$i+1],0,$inp ; load X[i] in advance
98+ addi $inp,$inp,16
99+___
100+$code.=<<___ if ($i<16 && ($i%(16/$SZ)));
101+ vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
102+___
103+$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
104+ vperm @X[$i],@X[$i],@X[$i],$lemask
105+___
106+$code.=<<___;
107+ `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
108+ vsel $Func,$g,$f,$e ; Ch(e,f,g)
109+ vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
110+ vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
111+ vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
112+ `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
113+ vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
114+ vxor $Func,$a,$b
115+ `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
116+ vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
117+ vsel $Func,$b,$c,$Func ; Maj(a,b,c)
118+ vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
119+ vaddu${sz}m $d,$d,$h ; d+=h
120+ vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
121+ `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
122+ lvx $Ki,$idx,$Tbl ; load next K[i]
123+ addi $idx,$idx,16
124+ vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
125+ `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
126+___
127+}
128+
129+$code=<<___;
130+.machine "any"
131+.text
132+
133+.globl $func
134+.align 6
135+$func:
136+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
137+ mflr $lrsave
138+ li r10,`$FRAME+8*16+15`
139+ li r11,`$FRAME+8*16+31`
140+ stvx v20,r10,$sp # ABI says so
141+ addi r10,r10,32
142+ mfspr $vrsave,256
143+ stvx v21,r11,$sp
144+ addi r11,r11,32
145+ stvx v22,r10,$sp
146+ addi r10,r10,32
147+ stvx v23,r11,$sp
148+ addi r11,r11,32
149+ stvx v24,r10,$sp
150+ addi r10,r10,32
151+ stvx v25,r11,$sp
152+ addi r11,r11,32
153+ stvx v26,r10,$sp
154+ addi r10,r10,32
155+ stvx v27,r11,$sp
156+ addi r11,r11,32
157+ stvx v28,r10,$sp
158+ addi r10,r10,32
159+ stvx v29,r11,$sp
160+ addi r11,r11,32
161+ stvx v30,r10,$sp
162+ stvx v31,r11,$sp
163+ li r11,-1
164+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
165+ li $x10,0x10
166+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
167+ li $x20,0x20
168+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
169+ li $x30,0x30
170+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
171+ li $x40,0x40
172+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
173+ li $x50,0x50
174+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
175+ li $x60,0x60
176+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
177+ li $x70,0x70
178+ $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
179+ mtspr 256,r11
180+
181+ bl LPICmeup
182+ addi $offload,$sp,$FRAME+15
183+___
184+$code.=<<___ if ($LENDIAN);
185+ li $idx,8
186+ lvsl $lemask,0,$idx
187+ vspltisb $Ki,0x0f
188+ vxor $lemask,$lemask,$Ki
189+___
190+$code.=<<___ if ($SZ==4);
191+ lvx_4w $A,$x00,$ctx
192+ lvx_4w $E,$x10,$ctx
193+ vsldoi $B,$A,$A,4 # unpack
194+ vsldoi $C,$A,$A,8
195+ vsldoi $D,$A,$A,12
196+ vsldoi $F,$E,$E,4
197+ vsldoi $G,$E,$E,8
198+ vsldoi $H,$E,$E,12
199+___
200+$code.=<<___ if ($SZ==8);
201+ lvx_u $A,$x00,$ctx
202+ lvx_u $C,$x10,$ctx
203+ lvx_u $E,$x20,$ctx
204+ vsldoi $B,$A,$A,8 # unpack
205+ lvx_u $G,$x30,$ctx
206+ vsldoi $D,$C,$C,8
207+ vsldoi $F,$E,$E,8
208+ vsldoi $H,$G,$G,8
209+___
210+$code.=<<___;
211+ li r0,`($rounds-16)/16` # inner loop counter
212+ b Loop
213+.align 5
214+Loop:
215+ lvx $Ki,$x00,$Tbl
216+ li $idx,16
217+ lvx_u @X[0],0,$inp
218+ addi $inp,$inp,16
219+ stvx $A,$x00,$offload # offload $A-$H
220+ stvx $B,$x10,$offload
221+ stvx $C,$x20,$offload
222+ stvx $D,$x30,$offload
223+ stvx $E,$x40,$offload
224+ stvx $F,$x50,$offload
225+ stvx $G,$x60,$offload
226+ stvx $H,$x70,$offload
227+ vaddu${sz}m $H,$H,$Ki # h+K[i]
228+ lvx $Ki,$idx,$Tbl
229+ addi $idx,$idx,16
230+___
231+for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
232+$code.=<<___;
233+ mtctr r0
234+ b L16_xx
235+.align 5
236+L16_xx:
237+___
238+for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
239+$code.=<<___;
240+ bdnz L16_xx
241+
242+ lvx @X[2],$x00,$offload
243+ subic. $num,$num,1
244+ lvx @X[3],$x10,$offload
245+ vaddu${sz}m $A,$A,@X[2]
246+ lvx @X[4],$x20,$offload
247+ vaddu${sz}m $B,$B,@X[3]
248+ lvx @X[5],$x30,$offload
249+ vaddu${sz}m $C,$C,@X[4]
250+ lvx @X[6],$x40,$offload
251+ vaddu${sz}m $D,$D,@X[5]
252+ lvx @X[7],$x50,$offload
253+ vaddu${sz}m $E,$E,@X[6]
254+ lvx @X[8],$x60,$offload
255+ vaddu${sz}m $F,$F,@X[7]
256+ lvx @X[9],$x70,$offload
257+ vaddu${sz}m $G,$G,@X[8]
258+ vaddu${sz}m $H,$H,@X[9]
259+ bne Loop
260+___
261+$code.=<<___ if ($SZ==4);
262+ lvx @X[0],$idx,$Tbl
263+ addi $idx,$idx,16
264+ vperm $A,$A,$B,$Ki # pack the answer
265+ lvx @X[1],$idx,$Tbl
266+ vperm $E,$E,$F,$Ki
267+ vperm $A,$A,$C,@X[0]
268+ vperm $E,$E,$G,@X[0]
269+ vperm $A,$A,$D,@X[1]
270+ vperm $E,$E,$H,@X[1]
271+ stvx_4w $A,$x00,$ctx
272+ stvx_4w $E,$x10,$ctx
273+___
274+$code.=<<___ if ($SZ==8);
275+ vperm $A,$A,$B,$Ki # pack the answer
276+ vperm $C,$C,$D,$Ki
277+ vperm $E,$E,$F,$Ki
278+ vperm $G,$G,$H,$Ki
279+ stvx_u $A,$x00,$ctx
280+ stvx_u $C,$x10,$ctx
281+ stvx_u $E,$x20,$ctx
282+ stvx_u $G,$x30,$ctx
283+___
284+$code.=<<___;
285+ li r10,`$FRAME+8*16+15`
286+ mtlr $lrsave
287+ li r11,`$FRAME+8*16+31`
288+ mtspr 256,$vrsave
289+ lvx v20,r10,$sp # ABI says so
290+ addi r10,r10,32
291+ lvx v21,r11,$sp
292+ addi r11,r11,32
293+ lvx v22,r10,$sp
294+ addi r10,r10,32
295+ lvx v23,r11,$sp
296+ addi r11,r11,32
297+ lvx v24,r10,$sp
298+ addi r10,r10,32
299+ lvx v25,r11,$sp
300+ addi r11,r11,32
301+ lvx v26,r10,$sp
302+ addi r10,r10,32
303+ lvx v27,r11,$sp
304+ addi r11,r11,32
305+ lvx v28,r10,$sp
306+ addi r10,r10,32
307+ lvx v29,r11,$sp
308+ addi r11,r11,32
309+ lvx v30,r10,$sp
310+ lvx v31,r11,$sp
311+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
312+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
313+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
314+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
315+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
316+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
317+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
318+ blr
319+ .long 0
320+ .byte 0,12,4,1,0x80,6,3,0
321+ .long 0
322+.size $func,.-$func
323+___
324+
325+# Ugly hack here, because PPC assembler syntax seems to vary too
326+# much from platform to platform...
327+$code.=<<___;
328+.align 6
329+LPICmeup:
330+ mflr r0
331+ bcl 20,31,\$+4
332+ mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
333+ addi $Tbl,$Tbl,`64-8`
334+ mtlr r0
335+ blr
336+ .long 0
337+ .byte 0,12,0x14,0,0,0,0,0
338+ .space `64-9*4`
339+___
340+
341+if ($SZ==8) {
342+ local *table = sub {
343+ foreach(@_) { $code.=".quad $_,$_\n"; }
344+ };
345+ table(
346+ "0x428a2f98d728ae22","0x7137449123ef65cd",
347+ "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
348+ "0x3956c25bf348b538","0x59f111f1b605d019",
349+ "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
350+ "0xd807aa98a3030242","0x12835b0145706fbe",
351+ "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
352+ "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
353+ "0x9bdc06a725c71235","0xc19bf174cf692694",
354+ "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
355+ "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
356+ "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
357+ "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
358+ "0x983e5152ee66dfab","0xa831c66d2db43210",
359+ "0xb00327c898fb213f","0xbf597fc7beef0ee4",
360+ "0xc6e00bf33da88fc2","0xd5a79147930aa725",
361+ "0x06ca6351e003826f","0x142929670a0e6e70",
362+ "0x27b70a8546d22ffc","0x2e1b21385c26c926",
363+ "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
364+ "0x650a73548baf63de","0x766a0abb3c77b2a8",
365+ "0x81c2c92e47edaee6","0x92722c851482353b",
366+ "0xa2bfe8a14cf10364","0xa81a664bbc423001",
367+ "0xc24b8b70d0f89791","0xc76c51a30654be30",
368+ "0xd192e819d6ef5218","0xd69906245565a910",
369+ "0xf40e35855771202a","0x106aa07032bbd1b8",
370+ "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
371+ "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
372+ "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
373+ "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
374+ "0x748f82ee5defb2fc","0x78a5636f43172f60",
375+ "0x84c87814a1f0ab72","0x8cc702081a6439ec",
376+ "0x90befffa23631e28","0xa4506cebde82bde9",
377+ "0xbef9a3f7b2c67915","0xc67178f2e372532b",
378+ "0xca273eceea26619c","0xd186b8c721c0c207",
379+ "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
380+ "0x06f067aa72176fba","0x0a637dc5a2c898a6",
381+ "0x113f9804bef90dae","0x1b710b35131c471b",
382+ "0x28db77f523047d84","0x32caab7b40c72493",
383+ "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
384+ "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
385+ "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
386+$code.=<<___ if (!$LENDIAN);
387+.quad 0x0001020304050607,0x1011121314151617
388+___
389+$code.=<<___ if ($LENDIAN); # quad-swapped
390+.quad 0x1011121314151617,0x0001020304050607
391+___
392+} else {
393+ local *table = sub {
394+ foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
395+ };
396+ table(
397+ "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
398+ "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
399+ "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
400+ "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
401+ "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
402+ "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
403+ "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
404+ "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
405+ "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
406+ "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
407+ "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
408+ "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
409+ "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
410+ "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
411+ "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
412+ "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
413+$code.=<<___ if (!$LENDIAN);
414+.long 0x00010203,0x10111213,0x10111213,0x10111213
415+.long 0x00010203,0x04050607,0x10111213,0x10111213
416+.long 0x00010203,0x04050607,0x08090a0b,0x10111213
417+___
418+$code.=<<___ if ($LENDIAN); # word-swapped
419+.long 0x10111213,0x10111213,0x10111213,0x00010203
420+.long 0x10111213,0x10111213,0x04050607,0x00010203
421+.long 0x10111213,0x08090a0b,0x04050607,0x00010203
422+___
423+}
424+$code.=<<___;
425+.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
426+.align 2
427+___
428+
429+$code =~ s/\`([^\`]*)\`/eval $1/gem;
430+print $code;
431+close STDOUT;
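For reference, the vshasigma/vsel sequence generated by ROUND() in sha512p8-ppc.pl evaluates the standard FIPS 180-4 round; the patch merely folds K[i] into the next round's h ahead of time. Below is a scalar SHA-256 sketch of what each vector lane computes; rotr32 and sha256_round are illustrative names, not patch code.

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round: s[0..7] = a..h, Ki = round constant, Wi = message word. */
static void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Wi)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); /* vshasigmaw ..,$e,1,15 */
    uint32_t ch  = (e & f) ^ (~e & g);                           /* vsel $Func,$g,$f,$e   */
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); /* vshasigmaw ..,$a,1,0  */
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);                  /* vxor + vsel           */
    uint32_t T1  = h + S1 + ch + Ki + Wi;
    uint32_t T2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}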
--- a/fips/fips_premain.c
+++ b/fips/fips_premain.c
@@ -140,6 +140,9 @@ void FINGERPRINT_premain(void)
140140 }
141141 #endif
142142 } while(0);
143+#if defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)
144+ fips_openssl_cpuid_setup();
145+#endif
143146 }
144147
145148 #else
--- a/fips/fips_premain.c.sha1
+++ b/fips/fips_premain.c.sha1
@@ -1 +1 @@
1-HMAC-SHA1(fips_premain.c)= 65b20c3cec235cec85af848e1cd2dfdfa101804a
1+HMAC-SHA1(fips_premain.c)= 2bfb57ef540bdba29220a45d65e1b4080de9adc1
--- a/fips/fipssyms.h
+++ b/fips/fipssyms.h
@@ -712,6 +712,23 @@
712712 #define _bn_GF2m_mul_2x2 _fips_bn_GF2m_mul_2x2
713713 #define _OPENSSL_cleanse _FIPS_openssl_cleanse
714714 #endif
715+#define aes_p8_encrypt fips_aes_p8_encrypt
716+#define aes_p8_decrypt fips_aes_p8_decrypt
717+#define aes_p8_set_encrypt_key fips_aes_p8_set_encrypt_key
718+#define aes_p8_set_decrypt_key fips_aes_p8_set_decrypt_key
719+#define aes_p8_cbc_encrypt fips_aes_p8_cbc_encrypt
720+#define aes_p8_ctr32_encrypt_blocks fips_aes_p8_ctr32_encrypt_blocks
721+#define aes_p8_xts_encrypt fips_aes_p8_xts_encrypt
722+#define aes_p8_xts_decrypt fips_aes_p8_xts_decrypt
723+#define gcm_init_p8 fips_gcm_init_p8
724+#define gcm_gmult_p8 fips_gcm_gmult_p8
725+#define gcm_ghash_p8 fips_gcm_ghash_p8
726+#define sha256_block_p8 fips_sha256_block_p8
727+#define sha512_block_p8 fips_sha512_block_p8
728+#define sha256_block_ppc fips_sha256_block_ppc
729+#define sha512_block_ppc fips_sha512_block_ppc
730+#define OPENSSL_ppccap_P fips_openssl_ppccap_p
731+#define OPENSSL_crypto207_probe fips_openssl_crypto207_probe
715732
716733 #if defined(_MSC_VER)
717734 # pragma const_seg("fipsro$b")