Skip to content

Commit

Permalink
Fix Mac
Browse files Browse the repository at this point in the history
  • Loading branch information
potuz committed May 23, 2024
1 parent a116e77 commit c98dc10
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 116 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,35 @@ jobs:
- name: Run tests
run: ./build/test

windows:
name: windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v4

- name: Install MinGW
run: |
choco install mingw
echo "C:\tools\mingw64\bin" >> $GITHUB_PATH
- name: Build
run: CC=gcc make all
shell: bash

- name: Run tests
run: ./build/test.exe
shell: bash

macos:
name: macos
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- name: Build
run: make all
- name: Run tests
run: ./build/test

rust-bindings:
runs-on: ubuntu-latest
name: (${{ matrix.target }})
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ There are no dependencies besides the standard `C` header `stdint.h`. Benchmarks
have a dependency on `libm`. For tests and benchmarks on x86-64, an extra
dependency on `cpuid.h` is needed. An optional dependency on OpenSSL allows
testing and benchmarking against OpenSSL. The only build-time dependency is a GCC and
GNU assembler compatible compiler like `gcc` and `gas`.
GNU assembler compatible compiler like `gcc` and `gas`. On macOS with newer Apple Silicon processors, the library can be built with the default clang compiler.
## Compilation
- Start by cloning the repository
Expand Down
4 changes: 4 additions & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,11 @@ $(libname): $(objx86)
$(AR) rcs $@ $(objx86)
endif

ifeq ($(WIN),1)
all: $(libname) test
else
all: $(libname) test bench
endif

test: hashtree.h acutest.h test.c $(libname)
$(CC) $(CFLAGS) $(LDFLAGS) -L$(LIB_DIR) -o $(OUT_DIR)/test test.c $(testlibs)
Expand Down
6 changes: 6 additions & 0 deletions src/hashtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ SOFTWARE.
#include <cpuid.h>
#endif
#ifdef __aarch64__
#ifndef __APPLE__
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#endif

static void init_and_hash(unsigned char *output, const unsigned char *input, uint64_t count);

Expand Down Expand Up @@ -65,6 +67,9 @@ static hashtree_hash_fcn hashtree_detect() {
return (hashtree_hash_fcn)0;
#endif
#ifdef __aarch64__
#ifdef __APPLE__
return &hashtree_sha256_sha_x1;
#else
long hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_SHA2) {
return &hashtree_sha256_sha_x1;
Expand All @@ -76,6 +81,7 @@ static hashtree_hash_fcn hashtree_detect() {

return (hashtree_hash_fcn)0;
#endif
#endif
}

int hashtree_init(hashtree_hash_fcn override) {
Expand Down
104 changes: 62 additions & 42 deletions src/sha256_armv8_crypto.S
Original file line number Diff line number Diff line change
Expand Up @@ -55,35 +55,55 @@ padding .req x5


.macro hashupdate WORD
sha256h q2, q3, WORD\().4s
sha256h2 q3, q8, WORD\().4s
sha256h q2, q3, \WORD
sha256h2 q3, q8, \WORD
mov v8.16b, v2.16b
.endm

.macro schedule A, B, C, D, E, WORD
add \WORD\().4s, \B\().4s, \A\().4s
sha256su0 \B\().4s, \C\().4s
sha256su1 \E\().4s, \C\().4s, \D\().4s
add \WORD, \B, \A
sha256su0 \B, \C
sha256su1 \E, \C, \D
hashupdate \WORD
.endm

#ifdef __APPLE__
.global _hashtree_sha256_sha_x1
#else
.global hashtree_sha256_sha_x1
#endif
#ifndef __APPLE__
.type hashtree_sha256_sha_x1,%function
#endif
.align 5
#ifdef __APPLE__
_hashtree_sha256_sha_x1:
#else
hashtree_sha256_sha_x1:
#endif
// Set up stack, need to save the clobbered registers d8-d11
sub sp, sp, #32
stp d8, d9, [sp]

#ifdef __APPLE__
adrp digest, .LDIGEST@PAGE
add digest, digest, #:lo12:.LDIGEST@PAGEOFF
adrp k256, .LK256@PAGE
add k256, k256, #:lo12:.LK256@PAGEOFF
#else
adrp digest, .LDIGEST
add digest, digest, #:lo12:.LDIGEST

adrp k256, .LK256
add k256, k256, #:lo12:.LK256

#endif
stp d10, d11, [sp, #16]
#ifdef __APPLE__
adrp padding, .LPADDING@PAGE
add padding, padding, #:lo12:.LPADDING@PAGEOFF
#else
adrp padding, .LPADDING
add padding, padding, #:lo12:.LPADDING

#endif
add last, output, count, lsl #5

ld1 {v0.4s, v1.4s}, [digest]
Expand Down Expand Up @@ -113,29 +133,29 @@ hashtree_sha256_sha_x1:

add v9.4s, v4.4s, v16.4s
sha256su0 v4.4s, v5.4s
hashupdate v9

schedule v17, v5, v6, v7, v4, v9
schedule v18, v6, v7, v4, v5, v9
schedule v19, v7, v4, v5, v6, v9
schedule v20, v4, v5, v6, v7, v9
schedule v21, v5, v6, v7, v4, v9
schedule v22, v6, v7, v4, v5, v9
schedule v23, v7, v4, v5, v6, v9
schedule v24, v4, v5, v6, v7, v9
schedule v25, v5, v6, v7, v4, v9
schedule v26, v6, v7, v4, v5, v9
schedule v27, v7, v4, v5, v6, v9
hashupdate v9.4s

schedule v17.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
schedule v18.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
schedule v19.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s
schedule v20.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s
schedule v21.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
schedule v22.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
schedule v23.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s
schedule v24.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s
schedule v25.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
schedule v26.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
schedule v27.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s

add v9.4s, v4.4s, v28.4s
hashupdate v9
hashupdate v9.4s
sha256su1 v7.4s, v5.4s, v6.4s
add v9.4s, v5.4s, v29.4s
hashupdate v9
hashupdate v9.4s
add v9.4s, v6.4s, v30.4s
hashupdate v9
hashupdate v9.4s
add v9.4s, v7.4s, v31.4s
hashupdate v9
hashupdate v9.4s

// Add initial digest and back it up
add v2.4s, v0.4s, v2.4s
Expand All @@ -153,22 +173,22 @@ hashtree_sha256_sha_x1:
ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [padding]
sub padding, padding, #192

hashupdate v16
hashupdate v17
hashupdate v18
hashupdate v19
hashupdate v20
hashupdate v21
hashupdate v22
hashupdate v23
hashupdate v24
hashupdate v25
hashupdate v26
hashupdate v27
hashupdate v28
hashupdate v29
hashupdate v30
hashupdate v31
hashupdate v16.4s
hashupdate v17.4s
hashupdate v18.4s
hashupdate v19.4s
hashupdate v20.4s
hashupdate v21.4s
hashupdate v22.4s
hashupdate v23.4s
hashupdate v24.4s
hashupdate v25.4s
hashupdate v26.4s
hashupdate v27.4s
hashupdate v28.4s
hashupdate v29.4s
hashupdate v30.4s
hashupdate v31.4s

// Add backed up digest
add v2.4s, v10.4s, v2.4s
Expand All @@ -185,7 +205,7 @@ hashtree_sha256_sha_x1:
ldp d10, d11, [sp], #16
ret

.section .rodata
.section .rodata, "a"
.align 4
.LDIGEST:
.word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\
Expand Down
32 changes: 25 additions & 7 deletions src/sha256_armv8_neon_x1.S
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ T5 .req w22
# it reads pre-scheduled words from ptr + offset.
##################################################################################
.macro one_round A, B, C, D, E, F, G, H, ptr, offset
ldr T3, [\ptr, #\offset]
ldr T3, [\ptr, \offset]
ror T1, \E, #6
ror T2, \A, #2
ror T4, \A, #13
Expand Down Expand Up @@ -299,21 +299,35 @@ T5 .req w22
# is writable.
#
########################################################################################################

#ifdef __APPLE__
.global _hashtree_sha256_neon_x1
#else
.global hashtree_sha256_neon_x1
.type hashtree_sha256_neon_x1,%function
#endif
.align 4
#ifdef __APPLE__
_hashtree_sha256_neon_x1:
#else
hashtree_sha256_neon_x1:
#endif
sub sp, sp, #64
stp digest,k256, [sp, #48]

movi VZ.4s, #0
stp padding, x22, [sp, #32]
adrp digest, .LDIGEST
#ifdef __APPLE__
adrp digest, .LDIGEST@PAGE
add digest, digest, .LDIGEST@PAGEOFF
adrp padding, .LPADDING@PAGE
add padding, padding, .LPADDING@PAGEOFF
#else
adrp digest, .LDIGEST
add digest, digest, #:lo12:.LDIGEST

adrp padding, .LPADDING
add padding, padding, #:lo12:.LPADDING

add padding, padding, #:lo12:.LPADDING
#endif
add last, output, count, lsl #5

.Lhash_1_block_loop:
Expand All @@ -322,9 +336,13 @@ hashtree_sha256_neon_x1:
beq .Larmv8_neon_x1_finish

ld1 {VR0.4s, VR1.4s, VR2.4s, VR3.4s}, [input], #64
#ifdef __APPLE__
adrp k256, .LK256@PAGE
add k256, k256, #:lo12:.LK256@PAGEOFF
#else
adrp k256, .LK256
add k256, k256, #:lo12:.LK256

#endif
# change endianness
rev32 VR0.16b, VR0.16b
rev32 VR1.16b, VR1.16b
Expand Down Expand Up @@ -423,7 +441,7 @@ hashtree_sha256_neon_x1:
add sp, sp, #64
ret

.section .rodata
.section .rodata, "a"
.align 4
.LDIGEST:
.word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\
Expand Down
38 changes: 31 additions & 7 deletions src/sha256_armv8_neon_x4.S
Original file line number Diff line number Diff line change
Expand Up @@ -277,23 +277,44 @@ TQ7 .req q22
round_padding \F, \G, \H, \A, \B, \C, \D, \E
.endm

#ifdef __APPLE__
.global _hashtree_sha256_neon_x4
#else
.global hashtree_sha256_neon_x4
#endif
#ifdef __APPLE__
//.type hashtree_sha256_neon_x4,%function
#else
.type hashtree_sha256_neon_x4,%function
#endif

.align 5
#ifdef __APPLE__
_hashtree_sha256_neon_x4:
#else
hashtree_sha256_neon_x4:
#endif
sub sp, sp, #1024
adrp k256, .LK256x4
add k256, k256, #:lo12:.LK256x4

#ifdef __APPLE__
adrp k256,.LK256x4@GOTPAGE
ldr k256, [k256, .LK256x4@GOTPAGEOFF]
adrp padding, .LPADDINGx4@GOTPAGE
ldr padding, [padding, .LPADDINGx4@GOTPAGEOFF]
adrp digest, .LDIGESTx4L@GOTPAGE
ldr digest, [digest, .LDIGESTx4L@GOTPAGEOFF]
adrp digest2, .LDIGESTx4H@GOTPAGE
ldr digest2, [digest2, .LDIGESTx4H@GOTPAGEOFF]
#else
adrp k256,.LK256x4
add k256, k256, #:lo12:.LK256x4
adrp padding, .LPADDINGx4
add padding, padding, #:lo12:.LPADDINGx4

adrp digest, .LDIGESTx4L
add digest, digest, #:lo12:.LDIGESTx4L

adrp digest2, .LDIGESTx4H
add digest2, digest2, #:lo12:.LDIGESTx4H

#endif
mov post64, #64
mov post32, #32
mov postminus80, #-80
Expand Down Expand Up @@ -397,9 +418,12 @@ hashtree_sha256_neon_x4:
b .Larmv8_neon_x4_loop
.Lsha256_armv8_x4_epilog:
add sp, sp, #1024
#ifdef __APPLE__
b _hashtree_sha256_neon_x1
#else
b hashtree_sha256_neon_x1

.section .rodata
#endif
.section .rodata,"a"
.align 4
.LDIGESTx4L:
.word 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667,\
Expand Down
Loading

0 comments on commit c98dc10

Please sign in to comment.