Better BLAS support (#55)
* Better BLAS support

* added more fixups for BLAS

* Test with USE_BLAS should now pass

* cblas_saxpby is Standard 4 of BLAS
oysteijo committed Nov 15, 2023
1 parent 2295b22 commit ea7868c
Showing 11 changed files with 64 additions and 20 deletions.
12 changes: 5 additions & 7 deletions examples/Makefile.in
@@ -14,24 +14,22 @@ NEURALNET_LIBPATH = ../src
 CFLAGS += -I$(NPY_ARRAY_LIBPATH) -I$(NEURALNET_LIBPATH) -I..
 LDFLAGS += -L$(NEURALNET_LIBPATH) -lsimd_neuralnet
 LDFLAGS += -L$(NPY_ARRAY_LIBPATH) -lnpy_array
-LDFLAGS += `pkg-config --libs openblas`
 LDFLAGS += -lzip -ldl -lm
 
-#DEFINES = -DUSE_CBLAS
-DEFINES =
 
 ifeq ($(CC),gcc)
 LDFLAGS += -lgomp
-DEFINES += -Wno-override-init
+DEFINE += -Wno-override-init
 endif
 
 ifeq ($(CC),clang)
 LDFLAGS += -lomp
-DEFINES += -Wno-initializer-overrides
+DEFINE += -Wno-initializer-overrides
 endif
 
-DEFINES += -Wno-unused-parameter
+DEFINE += -Wno-unused-parameter
 
-CFLAGS += $(DEFINES)
+CFLAGS += $(DEFINE)
 
 examples = example_01 example_02 example_03 test_sgd
1 change: 1 addition & 0 deletions examples/configure
@@ -80,5 +80,6 @@ fi
 echo "Architecture options: $cpuinfo ..."
 echo 'arch = '$cpuinfo'' >> Makefile
 
+echo 'include ../src/Makefile.blas' >>Makefile
 echo 'include Makefile.in' >>Makefile
 echo 'Configuration complete, type make to build.'
13 changes: 13 additions & 0 deletions src/Makefile.blas
@@ -0,0 +1,13 @@
+#DEFINE = -DUSE_CBLAS
+# Netlib BLAS
+#BLAS_CFLAGS = `pkg-config --cflags blas`
+#BLAS_LDFLAGS = `pkg-config --libs blas`
+
+# OpenBLAS
+#BLAS_CFLAGS = `pkg-config --cflags openblas`
+#BLAS_LDFLAGS = `pkg-config --libs openblas`
+
+# clBLAS (Needs tweaking)
+#BLAS_CFLAGS = `pkg-config --cflags clBLAS`
+#BLAS_LDFLAGS = `pkg-config --libs clBLAS`
+
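The new Makefile.blas is the single switch point for BLAS support: uncomment the #DEFINE line and one BLAS_CFLAGS/BLAS_LDFLAGS pair, and the sub-builds pick them up through the include lines the configure scripts now emit. A quick way to verify that the selected library actually links is a minimal CBLAS program; this sketch is not part of the commit and assumes the OpenBLAS pkg-config entry:

/* blas_check.c -- tiny sanity check for the selected CBLAS backend.
 * Build (assuming OpenBLAS): cc blas_check.c `pkg-config --cflags --libs openblas`
 */
#include <stdio.h>
#include <cblas.h>

int main(void)
{
    float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float y[4] = { 1.0f, 1.0f, 1.0f, 1.0f };

    cblas_saxpy( 4, 2.0f, x, 1, y, 1 );   /* y = 2x + y => 3 5 7 9 */

    for ( int i = 0; i < 4; i++ )
        printf( "%.1f ", y[i] );
    printf( "\n" );
    return 0;
}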
3 changes: 1 addition & 2 deletions src/Makefile.in
@@ -4,10 +4,9 @@ SHELL := /bin/bash
 
 CC = gcc
 INCLUDE += -I.
-CFLAGS += -std=c99 $(INCLUDE) -Wall -Wextra -O3 -fopenmp $(arch) $(dbg) $(profile)
+CFLAGS += -std=c99 $(DEFINE) $(INCLUDE) $(BLAS_CFLAGS) -Wall -Wextra -O3 -fopenmp $(arch) $(dbg) $(profile)
 SED = sed
 RANLIB = ranlib
 
 obj = $(src:.c=.o)
 dep = $(obj:.o=.d)  # one dependency file for each source
6 changes: 4 additions & 2 deletions src/activation.c
@@ -1,4 +1,5 @@
 #include "activation.h"
+#include "simd.h"
 
 #include <string.h>
 #include <math.h>
@@ -302,10 +303,11 @@ static void relu( const int n, float *y )
     int i = 0;
 #ifdef __AVX__
     const __m256 zero = _mm256_set1_ps(0.0f);
 
     __m256 YMM0, YMM1;
+    for ( ; !is_aligned( y + i ); i++)
+        y[i] = fmaxf(0.0f, y[i]);
 
-    for (i = 0; i <= ((n)-16); i += 16) {
+    for ( ; i <= ((n)-16); i += 16) {
         YMM0 = _mm256_load_ps(y + i);
         YMM1 = _mm256_load_ps(y + i + 8);
         YMM0 = _mm256_max_ps(zero, YMM0);
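The second hunk here is a genuine bug fix, not just cleanup: the newly added peel loop advances i until y + i is aligned, so the main AVX loop must continue from that i. The old for (i = 0; ...) reset the index, which reprocessed the peeled elements and could hand misaligned addresses to _mm256_load_ps. A generic sketch of the peel idiom, with the alignment helper as an assumption (is_aligned() itself is not shown in the diff) and a scalar stand-in for the vector body:

#include <stdint.h>

/* Assumed equivalent of the library's is_aligned(): AVX aligned loads need 32 bytes. */
static int is_aligned_32( const void *p )
{
    return ((uintptr_t)p % 32) == 0;
}

void scale_inplace( const int n, float *y, const float c )
{
    int i = 0;
    /* Peel: scalar elements until y + i is 32-byte aligned. */
    for ( ; i < n && !is_aligned_32( y + i ); i++ )
        y[i] *= c;
    /* Continue from the current i -- resetting it to 0 would redo the
     * peeled elements and lose the alignment guarantee. */
    for ( ; i <= n - 8; i += 8 )
        for ( int k = 0; k < 8; k++ )   /* stand-in for the aligned AVX body */
            y[i + k] *= c;
    /* Tail: leftover elements after the vector loop. */
    for ( ; i < n; i++ )
        y[i] *= c;
}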
1 change: 1 addition & 0 deletions src/configure
@@ -156,5 +156,6 @@ if $have_pkg_config; then
 echo "pkg_config_path = ${pc_path}" >>Makefile
 fi
 
+echo "include Makefile.blas" >>Makefile
 echo "include Makefile.in" >>Makefile
 echo "configure done, type 'make' to build."
20 changes: 20 additions & 0 deletions src/matrix_operations.c
@@ -8,6 +8,7 @@
 
 #ifdef USE_CBLAS
 #include <cblas.h>
+#include <string.h>
 #endif
 
 #ifdef __AVX__
@@ -104,6 +105,11 @@ void vector_vector_outer( int n_rows, int n_cols, const float *x, const float *y
  * However, I'm not sure how much it will improve the performance. */
 void vector_matrix_multiply( int n, int m, const float *weight, const float *bias, const float *input, float *y )
 {
+#ifdef USE_CBLAS
+    memcpy( y, bias, m * sizeof(float));
+    cblas_sgemv( CblasRowMajor, CblasTrans,
+        n, m, 1.0f, weight, m, input, 1, 1.0f, y, 1 );
+#else
     /*
     assert( is_aligned( weight ));
     assert( is_aligned( bias ));
@@ -172,6 +178,7 @@ void vector_matrix_multiply( int n, int m, const float *weight, const float *bias, const float *input, float *y )
             }
         }
     }
+#endif /* USE_CBLAS */
 }
 
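The CBLAS path replaces the entire hand-written loop nest with one call: cblas_sgemv computes y = alpha*op(A)*x + beta*y, so with op(A) = A^T, alpha = beta = 1.0f, and y pre-seeded with the bias by the memcpy, the result is exactly output = W^T * input + bias for an n-input, m-output layer stored row-major. A scalar reference for what the call computes (illustrative, not from the commit):

/* Scalar equivalent of the cblas_sgemv call above.
 * weight is n x m, row-major: weight[i*m + j] links input i to output j. */
void vector_matrix_multiply_ref( int n, int m, const float *weight,
                                 const float *bias, const float *input, float *y )
{
    for ( int j = 0; j < m; j++ ) {
        y[j] = bias[j];                          /* the memcpy of the bias   */
        for ( int i = 0; i < n; i++ )
            y[j] += weight[i*m + j] * input[i];  /* the transposed GEMV step */
    }
}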
@@ -184,6 +191,9 @@ void vector_matrix_multiply( int n, int m, const float *weight, const float *bias, const float *input, float *y )
 /* This is actually the same as saxpy -- y = ax + y, but with a = 1.0 */
 void vector_accumulate( const int n, float *a, const float *b )
 {
+#ifdef USE_CBLAS
+    cblas_saxpy( n, 1.0f, b, 1, a, 1 );
+#else
     int i = 0;
     float *a_ptr = a;
     const float *b_ptr = b;
@@ -197,6 +207,7 @@ void vector_accumulate( const int n, float *a, const float *b )
 #endif
     for (; i < n; i++ )
         *a_ptr++ += *b_ptr++;
+#endif /* USE_CBLAS */
 }
 
@@ -291,6 +302,9 @@ void vector_divide_by_scalar( const int n, float *v, const float scalar )
  */
 void vector_saxpy( const int n, float *a, const float alpha, const float *b )
 {
+#ifdef USE_CBLAS
+    cblas_saxpy( n, alpha, b, 1, a, 1 );
+#else
     int i = 0;
     float *a_ptr = a;
     const float *b_ptr = b;
@@ -304,6 +318,7 @@ void vector_saxpy( const int n, float *a, const float alpha, const float *b )
 #endif
     for (; i < n; i++ )
         *a_ptr++ += alpha * *b_ptr++;
+#endif /* USE_CBLAS */
 }
@@ -326,6 +341,10 @@ void vector_saxpy( const int n, float *a, const float alpha, const float *b )
  */
 void vector_saxpby( const int n, float *a, const float alpha, const float *b, const float beta )
 {
+
+#if USE_CBLAS // # Eh? is it not the blas standard?
+    cblas_saxpby( n, alpha, b, 1, beta, a, 1 );
+#else
     int i = 0;
     float *a_ptr = a;
    const float *b_ptr = b;
@@ -354,6 +373,7 @@ void vector_saxpby( const int n, float *a, const float alpha, const float *b, const float beta )
 #endif
     for (; i < n; i++, a_ptr++ )
         *a_ptr = beta * *a_ptr + alpha * *b_ptr++;
+#endif /* USE_CBLAS */
 }
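As for the question left in the saxpby hunk: saxpby is not in the reference (Netlib) BLAS; it is a BLAS-like extension that OpenBLAS and Intel MKL provide, so linking against a plain reference CBLAS can fail on cblas_saxpby. The operation is simply a = beta*a + alpha*b, and a portable fallback composes it from two standard Level 1 routines (a sketch, not part of the commit):

#include <cblas.h>

/* Portable replacement for cblas_saxpby( n, alpha, b, 1, beta, a, 1 )
 * built from standard CBLAS calls only. */
static void saxpby_fallback( const int n, float *a, const float alpha,
                             const float *b, const float beta )
{
    cblas_sscal( n, beta, a, 1 );          /* a = beta * a    */
    cblas_saxpy( n, alpha, b, 1, a, 1 );   /* a += alpha * b  */
}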
10 changes: 8 additions & 2 deletions src/neuralnet.c
@@ -1,4 +1,4 @@
-/* Copyright -- Øystein Schønning-Johansen 2007-2021 */
+/* Copyright -- Øystein Schønning-Johansen 2007-2023 */
 
 #include "neuralnet.h"
 #include "simd.h"
@@ -646,6 +646,7 @@ neuralnet_t * neuralnet_create( const int n_layers, int sizes[], char *activatio
         return NULL;
     }
 
+#ifndef USE_CBLAS
     /* OK - resize "bad" sized */
     int resizes[n_layers + 1];
     memcpy( resizes, sizes, (n_layers + 1) * sizeof(int));
@@ -657,7 +658,7 @@ neuralnet_t * neuralnet_create( const int n_layers, int sizes[], char *activatio
         fprintf( stderr, "INFO - Neural network size is resized to match CPUs SIMD registers.\nINFO - Hidden size %d is resized to %d.\n",
             sizes[i], resizes[i] );
     }
-
+#endif
     neuralnet_t *nn;
     /* The neural network itself doesn't need to be aligned */
     if ( (nn = malloc( sizeof( neuralnet_t ))) == NULL ){
@@ -675,8 +676,13 @@ neuralnet_t * neuralnet_create( const int n_layers, int sizes[], char *activatio
     }
 
     for( int i = 0; i < nn->n_layers; i++ ){
+#ifdef USE_CBLAS
+        nn->layer[i].n_input  = sizes[i];
+        nn->layer[i].n_output = sizes[i+1];
+#else
         nn->layer[i].n_input  = resizes[i];
         nn->layer[i].n_output = resizes[i+1];
+#endif
     }
 
     if( !_weights_memory_allocate( nn )){
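The #ifndef USE_CBLAS guard is the structural heart of the commit: the SIMD kernels need every layer width padded to a whole number of vector registers, while BLAS handles arbitrary sizes, so under USE_CBLAS the requested sizes are used verbatim. The resize computation itself sits in lines this view leaves collapsed; rounding up to the register width presumably looks something like the hypothetical sketch below, which only the INFO message above corroborates:

/* Hypothetical rounding consistent with the INFO message above:
 * pad a layer size up to the next multiple of the SIMD register width. */
static int round_up_to_simd( int size, int floats_per_simd_register )
{
    int w = floats_per_simd_register;   /* 4 for SSE, 8 for AVX, 16 for AVX-512 */
    return ((size + w - 1) / w) * w;    /* e.g. 4 -> 8 under AVX */
}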
10 changes: 4 additions & 6 deletions tests/Makefile.in
@@ -16,23 +16,21 @@ LDLIBS += -L$(NEURALNET_LIBPATH) -lsimd_neuralnet
 LDLIBS += -L$(NPY_ARRAY_LIBPATH) -lnpy_array
 LDLIBS += -ldl
 LDLIBS += -lm
 
-#DEFINES = -DUSE_CBLAS
-DEFINES =
+LDLIBS += $(BLAS_LDFLAGS)
 
 ifeq ($(CC),gcc)
-DEFINES += -Wno-override-init
+DEFINE += -Wno-override-init
 LDLIBS += -lgomp
 endif
 
 ifeq ($(CC),clang)
-DEFINES += -Wno-initializer-overrides
+DEFINE += -Wno-initializer-overrides
 LDLIBS += -lomp
 endif
 
 
-CFLAGS += $(DEFINES)
+CFLAGS += $(DEFINE)
 
 testprogs = test_neuralnet test_sgd test_backpropagation test_activation test_loss test_metrics
 
1 change: 1 addition & 0 deletions tests/configure
@@ -68,5 +68,6 @@ fi
 
 echo 'arch = '$cpuinfo'' >> Makefile
 
+echo "include ../src/Makefile.blas" >>Makefile
 echo "include Makefile.in" >>Makefile
 echo 'configuration complete, type make to build.'
7 changes: 6 additions & 1 deletion tests/test_neuralnet.c
@@ -33,8 +33,13 @@ int main(int argc, char *argv[] )
         floats_per_simd_register==16 ? "AVX512" :
         floats_per_simd_register==8 ? "AVX/AVX2" :
         floats_per_simd_register==4 ? "SSE" : "No SIMD");
+#ifdef USE_CBLAS
+    const int n_param_342 = 26;
+#else
     const int n_param_342 = floats_per_simd_register == 16 ? 98 :
                             floats_per_simd_register == 8 ? 50 : 26;
+#endif
+    printf( "I believe the number of parameters should be: %d\n", n_param_342 );
 
     CHECK_INT_EQUALS_MSG( n_param_342, neuralnet_total_n_parameters(nn),
         "Checking that total number of parametes is correct" );
@@ -72,7 +77,7 @@ int main(int argc, char *argv[] )
 
     CHECK_NOT_NULL_MSG( nn,
         "Checking that neural network was created" );
-#if floats_per_simd_register == 16
+#if !(defined USE_CBLAS) && floats_per_simd_register == 16
     n_out += 8;
 #endif
 
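The three expected counts in the first hunk are consistent with a 3-4-2 test network (hence n_param_342) in which only the hidden layer is padded to the SIMD register width; counting weights plus biases per layer:

    exact sizes (CBLAS, SSE, or no SIMD):  3*4 + 4  +  4*2 + 2  = 26
    AVX, width 8, hidden 4 -> 8:           3*8 + 8  +  8*2 + 2  = 50
    AVX-512, width 16, hidden 4 -> 16:     3*16 + 16 + 16*2 + 2 = 98

SSE needs no padding here because a hidden width of 4 is already a multiple of its 4-float registers, which is why the non-CBLAS ternary also falls through to 26.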
