| #include <stdio.h> |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <time.h> |
|
|
/*
 * Benchmark problem size. main() allocates an N-row by M-column matrix,
 * an M-element input vector and an N-element output vector.
 */
const int N = 16384; /* matrix rows    (2^14) */
const int M = 16384; /* matrix columns (2^14) */
|
|
/*
 * Reference matrix-vector product: dst = src0 * src1.
 *
 * src0   row-major matrix holding nrows * ncols floats
 * src1   input vector holding ncols floats
 * dst    output vector receiving nrows floats
 * nrows  number of matrix rows (and of dst elements written)
 * ncols  number of matrix columns (and of src1 elements read)
 *
 * Each dst[i] is the left-to-right float accumulation of row i times src1.
 */
void mul_mat_vec_f32_0(
        const float * src0,
        const float * src1,
        float * dst,
        unsigned nrows,
        unsigned ncols) {
    for (unsigned i = 0; i < nrows; i++) {
        /* Widen before multiplying: i*ncols in unsigned arithmetic wraps
         * once the matrix holds more than UINT_MAX elements. */
        const float * row = src0 + (size_t)i * ncols;

        float sum = 0.0f;
        for (unsigned j = 0; j < ncols; j++) {
            sum += row[j]*src1[j];
        }
        dst[i] = sum;
    }
}
/*
 * afloat: a float whose type carries a 32-byte alignment attribute, spelled
 * with the compiler-specific syntax of MSVC or GCC/Clang.
 *
 * NOTE(review): the attribute presumably lets the compiler assume that
 * accesses through afloat pointers are 32-byte aligned. That would only hold
 * for every row start when the buffers are 32-byte aligned and ncols is a
 * multiple of 8 -- confirm before calling with other shapes.
 */
#if defined(_MSC_VER)
typedef float __declspec(align(32)) afloat;
#else
typedef float afloat __attribute__((__aligned__(32)));
#endif

/*
 * Matrix-vector product dst = src0 * src1 over restrict-qualified,
 * alignment-annotated pointers.
 *
 * src0   row-major matrix holding nrows * ncols floats
 * src1   input vector holding ncols floats
 * dst    output vector receiving nrows floats
 * nrows  number of matrix rows
 * ncols  number of matrix columns
 *
 * The restrict qualifiers promise that the three buffers do not overlap.
 */
void mul_mat_vec_f32_1(
        const afloat *restrict src0,
        const afloat *restrict src1,
        afloat *restrict dst,
        unsigned nrows,
        unsigned ncols) {
    for (unsigned i = 0; i < nrows; i++) {
        /* Widen before multiplying: i*ncols in unsigned arithmetic wraps
         * once the matrix holds more than UINT_MAX elements. */
        const afloat * restrict row = src0 + (size_t)i * ncols;
        const afloat * restrict col = src1;

        float sum = 0.0f;
        for (unsigned j = 0; j < ncols; j++) {
            sum += *row++ * *col++;
        }

        dst[i] = sum;
    }
}
|
|
/*
 * Matrix-vector product dst = src0 * src1 that walks the operands as raw
 * bytes and reinterprets each sizeof(float)-sized step as a float.
 *
 * src0   row-major matrix, read as nrows * ncols floats
 * src1   input vector, read as ncols floats
 * dst    output vector, written as nrows floats
 * nrows  number of matrix rows
 * ncols  number of matrix columns
 */
void mul_mat_vec_f32_2(
        const void * src0,
        const void * src1,
        void * dst,
        unsigned nrows,
        unsigned ncols) {
    /* Computed in size_t so large matrices do not wrap unsigned arithmetic. */
    const size_t row_bytes = (size_t)ncols * sizeof(float);

    char * out = (char *)dst;
    for (unsigned i = 0; i < nrows; i++) {
        float sum = 0.0f;

        const char * row = (const char *)src0 + (size_t)i * row_bytes;
        const char * col = (const char *)src1;
        for (unsigned j = 0; j < ncols; j++) {
            /* Inputs are read-only, so keep const on the float view. */
            sum += (*(const float *)row) * (*(const float *)col);
            row += sizeof(float);
            col += sizeof(float);
        }

        *(float *)out = sum;
        out += sizeof(float);
    }
}
|
|
#ifdef _MSC_VER
/*
 * MSVC-only stand-in for C11 aligned_alloc(), forwarding to _aligned_malloc().
 * Note the swapped argument order: _aligned_malloc takes (size, alignment).
 *
 * Memory returned by _aligned_malloc must be released with _aligned_free,
 * not free().
 */
void *aligned_alloc(size_t alignment, size_t size) {
    void *block = _aligned_malloc(size, alignment);
    return block;
}
#endif
|
|
| int main(int argc, const char ** argv) { |
| |
| |
| |
|
|
| afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M)); |
| afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M)); |
| afloat * dst = (float *)(aligned_alloc(32, sizeof(float)*N)); |
|
|
| for (int i = 0; i < N*M; i++) { |
| src0[i] = (afloat)i; |
| } |
|
|
| for (int i = 0; i < M; i++) { |
| src1[i] = (afloat)i; |
| } |
|
|
| const int nIter = 10; |
|
|
| const clock_t start = clock(); |
|
|
| double sum = 0.0f; |
| for (int i = 0; i < nIter; i++) { |
| |
| mul_mat_vec_f32_1(src0, src1, dst, N, M); |
| |
| for (int i = 0; i < N; i++) { |
| sum += dst[i]; |
| } |
| } |
|
|
| { |
| const clock_t end = clock(); |
| printf("%s: elapsed ticks: %ld\n", __func__, end - start); |
| } |
|
|
| printf("%f\n", sum); |
|
|
| return 0; |
| } |
|
|