| #include "ggml_v1.h" |
|
|
| #if defined(_MSC_VER) || defined(__MINGW32__) |
| #include <malloc.h> |
| #elif !defined(__FreeBSD__) |
| #include <alloca.h> |
| #endif |
|
|
| #include <assert.h> |
| #include <time.h> |
| #include <math.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <float.h> |
|
|
| |
| |
| #ifndef static_assert |
| #define static_assert(cond, msg) struct global_scope_noop_trick |
| #endif |
|
|
| #if defined _MSC_VER || defined(__MINGW32__) |
|
|
| #if !defined(__MINGW32__) |
| #include <Windows.h> |
| #else |
| |
| #include <windows.h> |
| #include <errno.h> |
| #endif |
|
|
| typedef volatile LONG atomic_int; |
| typedef atomic_int atomic_bool; |
|
|
| static void atomic_store(atomic_int* ptr, LONG val) { |
| InterlockedExchange(ptr, val); |
| } |
| static LONG atomic_load(atomic_int* ptr) { |
| return InterlockedCompareExchange(ptr, 0, 0); |
| } |
| static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { |
| return InterlockedExchangeAdd(ptr, inc); |
| } |
| static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { |
| return atomic_fetch_add(ptr, -(dec)); |
| } |
|
|
| typedef HANDLE pthread_t; |
|
|
| typedef DWORD thread_ret_t; |
| static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { |
| HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); |
| if (handle == NULL) |
| { |
| return EAGAIN; |
| } |
|
|
| *out = handle; |
| return 0; |
| } |
|
|
| static int pthread_join(pthread_t thread, void* unused) { |
| return (int) WaitForSingleObject(thread, INFINITE); |
| } |
|
|
| static int sched_yield (void) { |
| Sleep (0); |
| return 0; |
| } |
| #else |
| #include <pthread.h> |
| #include <stdatomic.h> |
|
|
| typedef void* thread_ret_t; |
| #endif |
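// The shims above let the pthread-style code in this file compile unchanged on
// Windows. Minimal usage sketch (illustrative only, not part of this file's API):
//
//   static thread_ret_t worker(void * arg) { (void) arg; return 0; }
//   ...
//   pthread_t t;
//   pthread_create(&t, NULL, worker, NULL);
//   pthread_join(t, NULL);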
|
|
#ifdef __HAIKU__
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif

#define GGML_V1_DEBUG 0
#define GGML_V1_GELU_FP16

#define GGML_V1_SOFT_MAX_UNROLL 4
#define GGML_V1_VEC_DOT_UNROLL 2

#ifdef GGML_USE_ACCELERATE
#endif

#if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_V1_MEM_ALIGN 4
#else
#define GGML_V1_MEM_ALIGN 16
#endif

#define UNUSED(x) (void)(x)
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

#define GGML_V1_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "GGML_V1_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)
|
|
#ifdef GGML_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
#endif

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// floating point type used to accumulate sums
typedef double ggml_v1_float;

// 16-bit float
// on Arm, native f16 support makes the conversions no-ops
#ifdef __ARM_NEON

#include <arm_neon.h>

#define GGML_V1_COMPUTE_FP16_TO_FP32(x) (x)
#define GGML_V1_COMPUTE_FP32_TO_FP16(x) (x)

#define GGML_V1_FP16_TO_FP32(x) (x)
#define GGML_V1_FP32_TO_FP16(x) (x)
|
|
#else

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif

#ifdef __F16C__

// the F16C extension provides hardware fp16 <-> fp32 conversion
#define GGML_V1_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_V1_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)

#else

// portable fp16 <-> fp32 conversion via bit manipulation

static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
}

static inline float ggml_v1_compute_fp16_to_fp32(ggml_v1_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}
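// Worked example (for reference): h = 0x3C00 encodes 1.0 in IEEE half precision.
// Then w = 0x3C000000 and two_w = 0x78000000 >= (1 << 27), so the normalized
// path applies: fp32_from_bits((two_w >> 4) + exp_offset) is 0x1.0p+112f, and
// multiplying by exp_scale (0x1.0p-112f) yields exactly 1.0f.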
|
|
static inline ggml_v1_fp16_t ggml_v1_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
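// Worked example (for reference): f = 1.0f gives w = 0x3F800000 and
// shl1_w = 0x7F000000, and the rounding trick lands on nonsign = 0x3C00 -
// the half-precision encoding of 1.0. Inputs with shl1_w > 0xFF000000
// (i.e. NaN) collapse to the canonical NaN 0x7E00.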
|
|
#define GGML_V1_COMPUTE_FP16_TO_FP32(x) ggml_v1_compute_fp16_to_fp32(x)
#define GGML_V1_COMPUTE_FP32_TO_FP16(x) ggml_v1_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // __ARM_NEON

//
// global data
//

// precomputed gelu table for f16 (128 KB)
static ggml_v1_fp16_t table_gelu_f16[1 << 16];

// precomputed exp table for f16 (128 KB)
static ggml_v1_fp16_t table_exp_f16[1 << 16];

// precomputed f32 table for f16 (256 KB)
static float table_f32_f16[1 << 16];

// On platforms without native f16 support, fp16 -> fp32 goes through the
// precomputed table; the 16-bit pattern of the input is the table index.
#if !defined(GGML_V1_FP16_TO_FP32) || !defined(GGML_V1_FP32_TO_FP16)

inline static float ggml_v1_lookup_fp16_to_fp32(ggml_v1_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return table_f32_f16[s];
}

#define GGML_V1_FP16_TO_FP32(x) ggml_v1_lookup_fp16_to_fp32(x)
#define GGML_V1_FP32_TO_FP16(x) GGML_V1_COMPUTE_FP32_TO_FP16(x)

#endif

// note: these are meant to be used via the ggml_v1.h API
float ggml_v1_fp16_to_fp32(ggml_v1_fp16_t x) {
    return GGML_V1_FP16_TO_FP32(x);
}

ggml_v1_fp16_t ggml_v1_fp32_to_fp16(float x) {
    return GGML_V1_FP32_TO_FP16(x);
}

//
// timing
//
|
#if defined(_MSC_VER) || defined(__MINGW32__)
static int64_t timer_freq;
void ggml_v1_time_init(void) {
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    timer_freq = frequency.QuadPart;
}
int64_t ggml_v1_time_ms(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (t.QuadPart * 1000) / timer_freq;
}
int64_t ggml_v1_time_us(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (t.QuadPart * 1000000) / timer_freq;
}
#else
void ggml_v1_time_init(void) {}
int64_t ggml_v1_time_ms(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}

int64_t ggml_v1_time_us(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif

int64_t ggml_v1_cycles(void) {
    return clock();
}

int64_t ggml_v1_cycles_per_ms(void) {
    return CLOCKS_PER_SEC/1000;
}

#ifdef GGML_V1_PERF
#define ggml_v1_perf_time_ms()       ggml_v1_time_ms()
#define ggml_v1_perf_time_us()       ggml_v1_time_us()
#define ggml_v1_perf_cycles()        ggml_v1_cycles()
#define ggml_v1_perf_cycles_per_ms() ggml_v1_cycles_per_ms()
#else
#define ggml_v1_perf_time_ms()       0
#define ggml_v1_perf_time_us()       0
#define ggml_v1_perf_cycles()        0
#define ggml_v1_perf_cycles_per_ms() 0
#endif

//
// cache line
//

#if defined(__cpp_lib_hardware_interference_size)
#define CACHE_LINE_SIZE hardware_destructive_interference_size
#else
#if defined(__POWER9_VECTOR__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

//
// quantization
//

// Values are quantized in blocks of QK floats. q4_0 stores one f32 scale (delta)
// per block followed by QK/2 bytes, each byte packing two 4-bit values.
#define QK 32
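// A row of k floats therefore quantizes to (k/QK)*(sizeof(float) + QK/2) bytes
// in q4_0, e.g. k = 128 -> 4 blocks -> 4*(4 + 16) = 80 bytes, versus 512 bytes as f32.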
|
|
// quantize a row of k floats into q4_0 (SIMD paths plus a reference scalar path)
static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    assert(k % QK == 0);

    const int nb = k / QK;

    float   * restrict pd = (float *)   (y);
    uint8_t * restrict pb = (uint8_t *) (pd + nb);

    uint8_t pp[QK/2];

#if __ARM_NEON
#if QK == 32
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);

        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);

        amax = MAX(
                MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
                MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));

        const float d  = amax / ((1 << 3) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        pd[i] = d;

        for (int l = 0; l < 8; l++) {
            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
            const int32x4_t   vi = vcvtq_s32_f32(vf);

            pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
            pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
        }

        memcpy(pb + i*16, pp, sizeof(pp));
    }
#else
#error "not implemented for QK"
#endif
#elif defined(__wasm_simd128__)
#if QK == 32
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        v128_t srcv [8];
        v128_t asrcv[8];
        v128_t amaxv[8];

        for (int l = 0; l < 8; l++) srcv[l]  = wasm_v128_load(x + i*32 + 4*l);
        for (int l = 0; l < 8; l++) asrcv[l] = wasm_f32x4_abs(srcv[l]);

        for (int l = 0; l < 4; l++) amaxv[2*l] = wasm_f32x4_max(asrcv[2*l], asrcv[2*l+1]);
        for (int l = 0; l < 2; l++) amaxv[4*l] = wasm_f32x4_max(amaxv[4*l], amaxv[4*l+2]);
        for (int l = 0; l < 1; l++) amaxv[8*l] = wasm_f32x4_max(amaxv[8*l], amaxv[8*l+4]);

        amax = MAX(
                MAX(wasm_f32x4_extract_lane(amaxv[0], 0), wasm_f32x4_extract_lane(amaxv[0], 1)),
                MAX(wasm_f32x4_extract_lane(amaxv[0], 2), wasm_f32x4_extract_lane(amaxv[0], 3)));

        const float d  = amax / ((1 << 3) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        pd[i] = d;

        for (int l = 0; l < 8; l++) {
            const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
            const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);

            pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
            pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
        }

        memcpy(pb + i*16, pp, sizeof(pp));
    }
#else
#error "not implemented for QK"
#endif
#else
    // scalar
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int l = 0; l < QK; l++) {
            const float v = x[i*QK + l];
            amax = MAX(amax, fabsf(v));
        }

        const float d  = amax / ((1 << 3) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        pd[i] = d;

        for (int l = 0; l < QK; l += 2) {
            const float v0 = x[i*QK + l + 0]*id;
            const float v1 = x[i*QK + l + 1]*id;

            const uint8_t vi0 = ((int8_t) (roundf(v0))) + 8;
            const uint8_t vi1 = ((int8_t) (roundf(v1))) + 8;

            assert(vi0 < 16);
            assert(vi1 < 16);

            pp[l/2] = vi0 | (vi1 << 4);
        }

        memcpy(pb + i*QK/2, pp, sizeof(pp));
    }
#endif
}
|
|
// q4_1: like q4_0 but asymmetric - each block stores a min (offset) and a delta,
// so the 4-bit values are unsigned offsets from the block minimum
static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
    assert(k % QK == 0);

    const int nb = k / QK;

    float   * restrict pm = (float *)   (y);
    float   * restrict pd = (float *)   (pm + nb);
    uint8_t * restrict pb = (uint8_t *) (pd + nb);

    uint8_t pp[QK/2];

    for (int i = 0; i < nb; i++) {
        float min = FLT_MAX;
        float max = -FLT_MAX;

        for (int l = 0; l < QK; l++) {
            const float v = x[i*QK + l];
            if (v < min) min = v;
            if (v > max) max = v;
        }

        const float d  = (max - min) / ((1 << 4) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        pm[i] = min;
        pd[i] = d;

        for (int l = 0; l < QK; l += 2) {
            const float v0 = (x[i*QK + l + 0] - min)*id;
            const float v1 = (x[i*QK + l + 1] - min)*id;

            const uint8_t vi0 = roundf(v0);
            const uint8_t vi1 = roundf(v1);

            assert(vi0 < 16);
            assert(vi1 < 16);

            pp[l/2] = vi0 | (vi1 << 4);
        }

        memcpy(pb + i*QK/2, pp, sizeof(pp));
    }
}
|
|
static void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
    assert(k % QK == 0);

    const int nb = k / QK;

    const float   * restrict pd = (const float *)   (x);
    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);

    for (int i = 0; i < nb; i++) {
        const float d = pd[i];

        const uint8_t * restrict pp = pb + i*QK/2;

        for (int l = 0; l < QK; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf;
            const int8_t vi1 = vi >> 4;

            const float v0 = (vi0 - 8)*d;
            const float v1 = (vi1 - 8)*d;

            y[i*QK + l + 0] = v0;
            y[i*QK + l + 1] = v1;

            assert(!isnan(y[i*QK + l + 0]));
            assert(!isnan(y[i*QK + l + 1]));
        }
    }
}
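// Round-trip usage sketch (illustrative; buffer sizes assume QK == 32):
//
//   float   src[2*QK], dst[2*QK];
//   uint8_t q[2*(sizeof(float) + QK/2)]; // two q4_0 blocks
//   quantize_row_q4_0(src, q, 2*QK);
//   dequantize_row_q4_0(q, dst, 2*QK);   // dst matches src to within ~d/2 per block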
|
|
static void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
    assert(k % QK == 0);

    const int nb = k / QK;

    const float   * restrict pm = (const float *)   (x);
    const float   * restrict pd = (const float *)   (pm + nb);
    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);

    for (int i = 0; i < nb; i++) {
        const float m = pm[i];
        const float d = pd[i];

        const uint8_t * restrict pp = pb + i*QK/2;

        for (int l = 0; l < QK; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf;
            const int8_t vi1 = vi >> 4;

            const float v0 = vi0*d + m;
            const float v1 = vi1*d + m;

            y[i*QK + l + 0] = v0;
            y[i*QK + l + 1] = v1;

            assert(!isnan(y[i*QK + l + 0]));
            assert(!isnan(y[i*QK + l + 1]));
        }
    }
}
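// Note: q4_1 reconstructs v = vi*d + m, so each block's minimum (vi = 0) and
// maximum (vi = 15) are represented exactly; q4_0 instead spans a symmetric
// range around zero, which is cheaper (one float per block) but loses more
// precision on blocks whose values are not zero-centered.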
|
|
//
// simd mappings
//
// We define a common set of C macros which map to architecture-specific
// intrinsics, and then implement the fundamental vector operations below
// using only those macros. Adding support for a new architecture only
// requires defining the corresponding mappings.
//

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_V1_SIMD

// F32 NEON

#define GGML_V1_F32_STEP 16
#define GGML_V1_F32_EPR 4

#define GGML_V1_F32x4 float32x4_t
#define GGML_V1_F32x4_ZERO vdupq_n_f32(0.0f)
#define GGML_V1_F32x4_SET1(x) vdupq_n_f32(x)
#define GGML_V1_F32x4_LOAD vld1q_f32
#define GGML_V1_F32x4_STORE vst1q_f32
#define GGML_V1_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_V1_F32x4_ADD vaddq_f32
#define GGML_V1_F32x4_MUL vmulq_f32
#if defined(__ARM_FEATURE_QRDMX)
#define GGML_V1_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#else
#define GGML_V1_F32x4_REDUCE_ONE(x) \
    (vgetq_lane_f32(x, 0) + \
     vgetq_lane_f32(x, 1) + \
     vgetq_lane_f32(x, 2) + \
     vgetq_lane_f32(x, 3))
#endif
#define GGML_V1_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F32_ARR/2; ++i) { \
        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/4; ++i) { \
        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/8; ++i) { \
        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
    } \
    res = GGML_V1_F32x4_REDUCE_ONE(x[0]); \
}

#define GGML_V1_F32_VEC GGML_V1_F32x4
#define GGML_V1_F32_VEC_ZERO GGML_V1_F32x4_ZERO
#define GGML_V1_F32_VEC_SET1 GGML_V1_F32x4_SET1
#define GGML_V1_F32_VEC_LOAD GGML_V1_F32x4_LOAD
#define GGML_V1_F32_VEC_STORE GGML_V1_F32x4_STORE
#define GGML_V1_F32_VEC_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F32_VEC_ADD GGML_V1_F32x4_ADD
#define GGML_V1_F32_VEC_MUL GGML_V1_F32x4_MUL
#define GGML_V1_F32_VEC_REDUCE GGML_V1_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_V1_F16_STEP 32
#define GGML_V1_F16_EPR 8

#define GGML_V1_F16x8 float16x8_t
#define GGML_V1_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_V1_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_V1_F16x8_LOAD vld1q_f16
#define GGML_V1_F16x8_STORE vst1q_f16
#define GGML_V1_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_V1_F16x8_ADD vaddq_f16
#define GGML_V1_F16x8_MUL vmulq_f16
#define GGML_V1_F16x8_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F16_ARR/2; ++i) { \
        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F16_ARR/4; ++i) { \
        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F16_ARR/8; ++i) { \
        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
    } \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
    res = vaddvq_f32(vaddq_f32(t0, t1)); \
}

#define GGML_V1_F16_VEC GGML_V1_F16x8
#define GGML_V1_F16_VEC_ZERO GGML_V1_F16x8_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F16x8_SET1
#define GGML_V1_F16_VEC_LOAD(p, i) GGML_V1_F16x8_LOAD(p)
#define GGML_V1_F16_VEC_STORE(p, r, i) GGML_V1_F16x8_STORE(p, r[i])
#define GGML_V1_F16_VEC_FMA GGML_V1_F16x8_FMA
#define GGML_V1_F16_VEC_ADD GGML_V1_F16x8_ADD
#define GGML_V1_F16_VEC_MUL GGML_V1_F16x8_MUL
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, use FP32 instead and rely on
// the vcvt_ intrinsics to convert to/from FP16

#define GGML_V1_F16_STEP 16
#define GGML_V1_F16_EPR 4

#define GGML_V1_F32Cx4 float32x4_t
#define GGML_V1_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_V1_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_V1_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
#define GGML_V1_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_V1_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_V1_F32Cx4_ADD vaddq_f32
#define GGML_V1_F32Cx4_MUL vmulq_f32
#define GGML_V1_F32Cx4_REDUCE GGML_V1_F32x4_REDUCE

#define GGML_V1_F16_VEC GGML_V1_F32Cx4
#define GGML_V1_F16_VEC_ZERO GGML_V1_F32Cx4_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F32Cx4_SET1
#define GGML_V1_F16_VEC_LOAD(p, i) GGML_V1_F32Cx4_LOAD(p)
#define GGML_V1_F16_VEC_STORE(p, r, i) GGML_V1_F32Cx4_STORE(p, r[i])
#define GGML_V1_F16_VEC_FMA GGML_V1_F32Cx4_FMA
#define GGML_V1_F16_VEC_ADD GGML_V1_F32Cx4_ADD
#define GGML_V1_F16_VEC_MUL GGML_V1_F32Cx4_MUL
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F32Cx4_REDUCE
#endif
|
|
#elif defined(__AVX__)

#define GGML_V1_SIMD

// F32 AVX

#define GGML_V1_F32_STEP 32
#define GGML_V1_F32_EPR 8

#define GGML_V1_F32x8 __m256
#define GGML_V1_F32x8_ZERO _mm256_setzero_ps()
#define GGML_V1_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_V1_F32x8_LOAD _mm256_loadu_ps
#define GGML_V1_F32x8_STORE _mm256_storeu_ps
#if defined(__FMA__)
#define GGML_V1_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
#define GGML_V1_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_V1_F32x8_ADD _mm256_add_ps
#define GGML_V1_F32x8_MUL _mm256_mul_ps
#define GGML_V1_F32x8_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F32_ARR/2; ++i) { \
        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/4; ++i) { \
        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/8; ++i) { \
        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
    } \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0); \
    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
}

#define GGML_V1_F32_VEC GGML_V1_F32x8
#define GGML_V1_F32_VEC_ZERO GGML_V1_F32x8_ZERO
#define GGML_V1_F32_VEC_SET1 GGML_V1_F32x8_SET1
#define GGML_V1_F32_VEC_LOAD GGML_V1_F32x8_LOAD
#define GGML_V1_F32_VEC_STORE GGML_V1_F32x8_STORE
#define GGML_V1_F32_VEC_FMA GGML_V1_F32x8_FMA
#define GGML_V1_F32_VEC_ADD GGML_V1_F32x8_ADD
#define GGML_V1_F32_VEC_MUL GGML_V1_F32x8_MUL
#define GGML_V1_F32_VEC_REDUCE GGML_V1_F32x8_REDUCE

// F16 AVX

#define GGML_V1_F16_STEP 32
#define GGML_V1_F16_EPR 8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_V1_F32Cx8 __m256
#define GGML_V1_F32Cx8_ZERO _mm256_setzero_ps()
#define GGML_V1_F32Cx8_SET1(x) _mm256_set1_ps(x)
#if defined(__F16C__)
#define GGML_V1_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
#define GGML_V1_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(ggml_v1_fp16_t *x) {
    float tmp[8];

    for (int i = 0; i < 8; i++)
        tmp[i] = GGML_V1_FP16_TO_FP32(x[i]);

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_v1_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++)
        x[i] = GGML_V1_FP32_TO_FP16(arr[i]);
}
#define GGML_V1_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_V1_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_V1_F32Cx8_FMA GGML_V1_F32x8_FMA
#define GGML_V1_F32Cx8_ADD _mm256_add_ps
#define GGML_V1_F32Cx8_MUL _mm256_mul_ps
#define GGML_V1_F32Cx8_REDUCE GGML_V1_F32x8_REDUCE

#define GGML_V1_F16_VEC GGML_V1_F32Cx8
#define GGML_V1_F16_VEC_ZERO GGML_V1_F32Cx8_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F32Cx8_SET1
#define GGML_V1_F16_VEC_LOAD(p, i) GGML_V1_F32Cx8_LOAD(p)
#define GGML_V1_F16_VEC_STORE(p, r, i) GGML_V1_F32Cx8_STORE(p, r[i])
#define GGML_V1_F16_VEC_FMA GGML_V1_F32Cx8_FMA
#define GGML_V1_F16_VEC_ADD GGML_V1_F32Cx8_ADD
#define GGML_V1_F16_VEC_MUL GGML_V1_F32Cx8_MUL
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F32Cx8_REDUCE
|
|
#elif defined(__POWER9_VECTOR__)

#define GGML_V1_SIMD

// F32 POWER9

#define GGML_V1_F32_STEP 32
#define GGML_V1_F32_EPR 4

#define GGML_V1_F32x4 vector float
#define GGML_V1_F32x4_ZERO 0.0f
#define GGML_V1_F32x4_SET1 vec_splats
#define GGML_V1_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_V1_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_V1_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_V1_F32x4_ADD vec_add
#define GGML_V1_F32x4_MUL vec_mul
#define GGML_V1_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F32_ARR/2; ++i) { \
        x[2*i] = vec_add(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/4; ++i) { \
        x[4*i] = vec_add(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/8; ++i) { \
        x[8*i] = vec_add(x[8*i], x[8*i+4]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}

#define GGML_V1_F32_VEC GGML_V1_F32x4
#define GGML_V1_F32_VEC_ZERO GGML_V1_F32x4_ZERO
#define GGML_V1_F32_VEC_SET1 GGML_V1_F32x4_SET1
#define GGML_V1_F32_VEC_LOAD GGML_V1_F32x4_LOAD
#define GGML_V1_F32_VEC_STORE GGML_V1_F32x4_STORE
#define GGML_V1_F32_VEC_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F32_VEC_ADD GGML_V1_F32x4_ADD
#define GGML_V1_F32_VEC_MUL GGML_V1_F32x4_MUL
#define GGML_V1_F32_VEC_REDUCE GGML_V1_F32x4_REDUCE

// F16 POWER9
#define GGML_V1_F16_STEP GGML_V1_F32_STEP
#define GGML_V1_F16_EPR GGML_V1_F32_EPR
#define GGML_V1_F16_VEC GGML_V1_F32x4
#define GGML_V1_F16_VEC_ZERO GGML_V1_F32x4_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F32x4_SET1
#define GGML_V1_F16_VEC_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F32x4_REDUCE
// f16 values are packed two SIMD widths per 128-bit load, so loads/stores are
// handled in pairs: odd indices take the high half, even indices the low half
#define GGML_V1_F16_VEC_LOAD(p, i) (i & 0x1) ? \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_V1_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
#define GGML_V1_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
#define GGML_V1_F16_VEC_STORE(p, r, i) \
  if (i & 0x1) \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_V1_ENDIAN_BYTE(1)], \
                                   r[i - GGML_V1_ENDIAN_BYTE(0)]), \
            0, p - GGML_V1_F16_EPR)
|
|
#elif defined(__wasm_simd128__)

#define GGML_V1_SIMD

// F32 WASM

#define GGML_V1_F32_STEP 16
#define GGML_V1_F32_EPR 4

#define GGML_V1_F32x4 v128_t
#define GGML_V1_F32x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_V1_F32x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_V1_F32x4_LOAD wasm_v128_load
#define GGML_V1_F32x4_STORE wasm_v128_store
#define GGML_V1_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_V1_F32x4_ADD wasm_f32x4_add
#define GGML_V1_F32x4_MUL wasm_f32x4_mul
#define GGML_V1_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F32_ARR/2; ++i) { \
        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/4; ++i) { \
        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/8; ++i) { \
        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3); \
}

#define GGML_V1_F32_VEC GGML_V1_F32x4
#define GGML_V1_F32_VEC_ZERO GGML_V1_F32x4_ZERO
#define GGML_V1_F32_VEC_SET1 GGML_V1_F32x4_SET1
#define GGML_V1_F32_VEC_LOAD GGML_V1_F32x4_LOAD
#define GGML_V1_F32_VEC_STORE GGML_V1_F32x4_STORE
#define GGML_V1_F32_VEC_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F32_VEC_ADD GGML_V1_F32x4_ADD
#define GGML_V1_F32_VEC_MUL GGML_V1_F32x4_MUL
#define GGML_V1_F32_VEC_REDUCE GGML_V1_F32x4_REDUCE

// F16 WASM

#define GGML_V1_F16_STEP 16
#define GGML_V1_F16_EPR 4

inline static v128_t __wasm_f16x4_load(const ggml_v1_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_V1_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_V1_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_V1_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_V1_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_v1_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_V1_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_V1_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_V1_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_V1_FP32_TO_FP16(tmp[3]);
}

#define GGML_V1_F16x4 v128_t
#define GGML_V1_F16x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_V1_F16x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_V1_F16x4_LOAD(x) __wasm_f16x4_load(x)
#define GGML_V1_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_V1_F16x4_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F16x4_ADD wasm_f32x4_add
#define GGML_V1_F16x4_MUL wasm_f32x4_mul
#define GGML_V1_F16x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F16_ARR/2; ++i) { \
        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F16_ARR/4; ++i) { \
        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F16_ARR/8; ++i) { \
        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3); \
}

#define GGML_V1_F16_VEC GGML_V1_F16x4
#define GGML_V1_F16_VEC_ZERO GGML_V1_F16x4_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F16x4_SET1
#define GGML_V1_F16_VEC_LOAD(p, i) GGML_V1_F16x4_LOAD(p)
#define GGML_V1_F16_VEC_STORE(p, r, i) GGML_V1_F16x4_STORE(p, r[i])
#define GGML_V1_F16_VEC_FMA GGML_V1_F16x4_FMA
#define GGML_V1_F16_VEC_ADD GGML_V1_F16x4_ADD
#define GGML_V1_F16_VEC_MUL GGML_V1_F16x4_MUL
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F16x4_REDUCE
|
|
#elif defined(__SSE3__)

#define GGML_V1_SIMD

// F32 SSE

#define GGML_V1_F32_STEP 32
#define GGML_V1_F32_EPR 4

#define GGML_V1_F32x4 __m128
#define GGML_V1_F32x4_ZERO _mm_setzero_ps()
#define GGML_V1_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_V1_F32x4_LOAD _mm_loadu_ps
#define GGML_V1_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
#define GGML_V1_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
#define GGML_V1_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_V1_F32x4_ADD _mm_add_ps
#define GGML_V1_F32x4_MUL _mm_mul_ps
#define GGML_V1_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_V1_F32_ARR/2; ++i) { \
        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/4; ++i) { \
        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
    } \
    for (int i = 0; i < GGML_V1_F32_ARR/8; ++i) { \
        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
    } \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}

#define GGML_V1_F32_VEC GGML_V1_F32x4
#define GGML_V1_F32_VEC_ZERO GGML_V1_F32x4_ZERO
#define GGML_V1_F32_VEC_SET1 GGML_V1_F32x4_SET1
#define GGML_V1_F32_VEC_LOAD GGML_V1_F32x4_LOAD
#define GGML_V1_F32_VEC_STORE GGML_V1_F32x4_STORE
#define GGML_V1_F32_VEC_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F32_VEC_ADD GGML_V1_F32x4_ADD
#define GGML_V1_F32_VEC_MUL GGML_V1_F32x4_MUL
#define GGML_V1_F32_VEC_REDUCE GGML_V1_F32x4_REDUCE

// F16 SSE

#define GGML_V1_F16_STEP 32
#define GGML_V1_F16_EPR 4

static inline __m128 __sse_f16x4_load(ggml_v1_fp16_t *x) {
    float tmp[4];

    tmp[0] = GGML_V1_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_V1_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_V1_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_V1_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_v1_fp16_t *x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_V1_FP32_TO_FP16(arr[0]);
    x[1] = GGML_V1_FP32_TO_FP16(arr[1]);
    x[2] = GGML_V1_FP32_TO_FP16(arr[2]);
    x[3] = GGML_V1_FP32_TO_FP16(arr[3]);
}

#define GGML_V1_F32Cx4 __m128
#define GGML_V1_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_V1_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_V1_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_V1_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_V1_F32Cx4_FMA GGML_V1_F32x4_FMA
#define GGML_V1_F32Cx4_ADD _mm_add_ps
#define GGML_V1_F32Cx4_MUL _mm_mul_ps
#define GGML_V1_F32Cx4_REDUCE GGML_V1_F32x4_REDUCE

#define GGML_V1_F16_VEC GGML_V1_F32Cx4
#define GGML_V1_F16_VEC_ZERO GGML_V1_F32Cx4_ZERO
#define GGML_V1_F16_VEC_SET1 GGML_V1_F32Cx4_SET1
#define GGML_V1_F16_VEC_LOAD(p, i) GGML_V1_F32Cx4_LOAD(p)
#define GGML_V1_F16_VEC_STORE(p, r, i) GGML_V1_F32Cx4_STORE(p, r[i])
#define GGML_V1_F16_VEC_FMA GGML_V1_F32Cx4_FMA
#define GGML_V1_F16_VEC_ADD GGML_V1_F32Cx4_ADD
#define GGML_V1_F16_VEC_MUL GGML_V1_F32Cx4_MUL
#define GGML_V1_F16_VEC_REDUCE GGML_V1_F32Cx4_REDUCE

#endif
|
|
// GGML_V1_Fxx_ARR: number of SIMD registers accumulated per step
#ifdef GGML_V1_SIMD
#define GGML_V1_F32_ARR (GGML_V1_F32_STEP/GGML_V1_F32_EPR)
#define GGML_V1_F16_ARR (GGML_V1_F16_STEP/GGML_V1_F16_EPR)
#endif
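// Example: with AVX, GGML_V1_F32_STEP = 32 and GGML_V1_F32_EPR = 8 (floats per
// __m256 register), so GGML_V1_F32_ARR = 4 accumulator registers are carried
// through each step of the vectorized loops below.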
|
|
//
// fundamental operations
//

inline static void ggml_v1_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_v1_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_v1_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_v1_vec_set_f16(const int n, ggml_v1_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_v1_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_v1_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_v1_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_v1_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_v1_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_v1_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_v1_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_v1_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_v1_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
|
inline static void ggml_v1_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
    ggml_v1_float sumf = 0.0;

#ifdef GGML_V1_SIMD
    const int np = (n & ~(GGML_V1_F32_STEP - 1));

    GGML_V1_F32_VEC sum[GGML_V1_F32_ARR] = { GGML_V1_F32_VEC_ZERO };

    GGML_V1_F32_VEC ax[GGML_V1_F32_ARR];
    GGML_V1_F32_VEC ay[GGML_V1_F32_ARR];

    for (int i = 0; i < np; i += GGML_V1_F32_STEP) {
        for (int j = 0; j < GGML_V1_F32_ARR; j++) {
            ax[j] = GGML_V1_F32_VEC_LOAD(x + i + j*GGML_V1_F32_EPR);
            ay[j] = GGML_V1_F32_VEC_LOAD(y + i + j*GGML_V1_F32_EPR);

            sum[j] = GGML_V1_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    // reduce the partial sums into sumf
    GGML_V1_F32_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += x[i]*y[i];
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        sumf += x[i]*y[i];
    }
#endif

    *s = sumf;
}
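// Usage sketch (illustrative): cosine similarity built on these primitives,
// using ggml_v1_vec_norm_f32 which is defined further below in this file:
//
//   float dot, nx, ny;
//   ggml_v1_vec_dot_f32(n, &dot, x, y);
//   ggml_v1_vec_norm_f32(n, &nx, x);
//   ggml_v1_vec_norm_f32(n, &ny, y);
//   const float cos_sim = dot/(nx*ny);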
|
|
inline static void ggml_v1_vec_dot_f16(const int n, float * restrict s, ggml_v1_fp16_t * restrict x, ggml_v1_fp16_t * restrict y) {
    ggml_v1_float sumf = 0.0;

#if defined(GGML_V1_SIMD)
    const int np = (n & ~(GGML_V1_F16_STEP - 1));

    GGML_V1_F16_VEC sum[GGML_V1_F16_ARR] = { GGML_V1_F16_VEC_ZERO };

    GGML_V1_F16_VEC ax[GGML_V1_F16_ARR];
    GGML_V1_F16_VEC ay[GGML_V1_F16_ARR];

    for (int i = 0; i < np; i += GGML_V1_F16_STEP) {
        for (int j = 0; j < GGML_V1_F16_ARR; j++) {
            ax[j] = GGML_V1_F16_VEC_LOAD(x + i + j*GGML_V1_F16_EPR, j);
            ay[j] = GGML_V1_F16_VEC_LOAD(y + i + j*GGML_V1_F16_EPR, j);

            sum[j] = GGML_V1_F16_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    // reduce the partial sums into sumf
    GGML_V1_F16_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += GGML_V1_FP16_TO_FP32(x[i])*GGML_V1_FP16_TO_FP32(y[i]);
    }
#else
    for (int i = 0; i < n; ++i) {
        sumf += GGML_V1_FP16_TO_FP32(x[i])*GGML_V1_FP16_TO_FP32(y[i]);
    }
#endif

    *s = sumf;
}
|
|
inline static void ggml_v1_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
    const int nb = n / QK;

    assert(n % QK == 0);
    assert(nb % 2 == 0);

    const float * restrict pd0 = (const float *) x;
    const float * restrict pd1 = (const float *) y;

    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);

    float sumf = 0.0;

#ifdef __ARM_NEON
#if QK == 32
    float sum0 = 0.0f;
    float sum1 = 0.0f;

    for (int i = 0; i < nb; i += 2) {
        const float d0_0 = pd0[i + 0];
        const float d1_0 = pd1[i + 0];
        const float d0_1 = pd0[i + 1];
        const float d1_1 = pd1[i + 1];

        const uint8_t * restrict p0 = pb0 + i*16;
        const uint8_t * restrict p1 = pb1 + i*16;

        const uint8x16_t m4b = vdupq_n_u8(0xf);
        const int8x16_t  s8b = vdupq_n_s8(0x8);

        const uint8x16_t v0_0 = vld1q_u8(p0);
        const uint8x16_t v1_0 = vld1q_u8(p1);
        const uint8x16_t v0_1 = vld1q_u8(p0 + 16);
        const uint8x16_t v1_1 = vld1q_u8(p1 + 16);

        // 4-bit -> 8-bit
        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));

        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));

        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));

        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));

        // sub 8
        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);

        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);

        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);

        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);

        // dot product into int16x8_t
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));

        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));

        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));

        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));

        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);

        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);

        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);

#if defined(__ARM_FEATURE_QRDMX)
        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
#else
        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
#endif
    }

    sumf = sum0 + sum1;
#else
#error "not implemented for QK"
#endif
#elif defined(__wasm_simd128__)
#if QK == 32
    float sum0 = 0.0f;
    float sum1 = 0.0f;

    for (int i = 0; i < nb; i += 2) {
        const float d0_0 = pd0[i + 0];
        const float d0_1 = pd0[i + 1];
        const float d1_0 = pd1[i + 0];
        const float d1_1 = pd1[i + 1];

        const uint8_t * restrict p0 = pb0 + i*16;
        const uint8_t * restrict p1 = pb1 + i*16;

        const v128_t m4b = wasm_u8x16_splat(0xf);
        const v128_t s8b = wasm_i8x16_splat(0x8);

        const v128_t v0_0 = wasm_v128_load(p0);
        const v128_t v0_1 = wasm_v128_load(p0 + 16);
        const v128_t v1_0 = wasm_v128_load(p1);
        const v128_t v1_1 = wasm_v128_load(p1 + 16);

        // 4-bit -> 8-bit
        const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
        const v128_t v1_0l = wasm_v128_and(v1_0, m4b);

        const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
        const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4);

        const v128_t v0_1l = wasm_v128_and(v0_1, m4b);
        const v128_t v1_1l = wasm_v128_and(v1_1, m4b);

        const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
        const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4);

        // sub 8
        const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
        const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b);

        const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
        const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b);

        const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
        const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b);

        const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
        const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b);

        // dot product into i16x8
        const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls));
        const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls));

        const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs));
        const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs));

        const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls));
        const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), wasm_i16x8_extend_high_i8x16(v1_1ls));

        const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs));
        const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs));

        const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h);
        const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h);

        const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h);
        const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h);

        const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0);
        const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1);

        sum0 += d0_0*d1_0*(
                wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) +
                wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) +
                wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) +
                wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7));
        sum1 += d0_1*d1_1*(
                wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) +
                wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) +
                wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) +
                wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7));
    }

    sumf = sum0 + sum1;
#else
#error "not implemented for QK"
#endif
#else
    // scalar
    for (int i = 0; i < nb; i++) {
        const float d0 = pd0[i];
        const float d1 = pd1[i];

        const uint8_t * restrict p0 = pb0 + i*QK/2;
        const uint8_t * restrict p1 = pb1 + i*QK/2;

        for (int j = 0; j < QK/2; j++) {
            const uint8_t v0 = p0[j];
            const uint8_t v1 = p1[j];

            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
            const float f1 = d0*((int8_t) (v0 >> 4) - 8);

            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
            const float f3 = d1*((int8_t) (v1 >> 4) - 8);

            sumf += f0*f2 + f1*f3;
        }
    }
#endif

    *s = sumf;
}
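// The SIMD paths above rely on the dot product factoring per block pair as
// d0*d1 * sum_j (vi0[j] - 8)*(vi1[j] - 8): the 4-bit values are widened to
// 8/16-bit integers so all inner arithmetic stays in integer registers, and
// only one float multiply-accumulate per block pair remains.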
|
|
inline static void ggml_v1_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
    const int nb = n / QK;

    const float * restrict pm0 = (const float *) x;
    const float * restrict pm1 = (const float *) y;

    const float * restrict pd0 = (const float *) (pm0 + nb);
    const float * restrict pd1 = (const float *) (pm1 + nb);

    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);

    float sumf = 0.0;

#if 1
    // scalar
    for (int i = 0; i < nb; i++) {
        const float m0 = pm0[i];
        const float m1 = pm1[i];

        const float d0 = pd0[i];
        const float d1 = pd1[i];

        const uint8_t * restrict p0 = pb0 + i*QK/2;
        const uint8_t * restrict p1 = pb1 + i*QK/2;

        for (int j = 0; j < QK/2; j++) {
            const uint8_t v0 = p0[j];
            const uint8_t v1 = p1[j];

            const float f0 = d0*(v0 & 0xf) + m0;
            const float f1 = d0*(v0 >> 4) + m0;

            const float f2 = d1*(v1 & 0xf) + m1;
            const float f3 = d1*(v1 >> 4) + m1;

            sumf += f0*f2 + f1*f3;
        }
    }
#endif

    *s = sumf;
}
|
|
// compute GGML_V1_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_v1_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_v1_fp16_t * restrict y) {
    ggml_v1_float sumf[GGML_V1_VEC_DOT_UNROLL] = { 0.0 };

    ggml_v1_fp16_t * restrict x[GGML_V1_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_V1_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_v1_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_V1_SIMD)
    const int np = (n & ~(GGML_V1_F16_STEP - 1));

    GGML_V1_F16_VEC sum[GGML_V1_VEC_DOT_UNROLL][GGML_V1_F16_ARR] = { { GGML_V1_F16_VEC_ZERO } };

    GGML_V1_F16_VEC ax[GGML_V1_F16_ARR];
    GGML_V1_F16_VEC ay[GGML_V1_F16_ARR];

    for (int i = 0; i < np; i += GGML_V1_F16_STEP) {
        for (int j = 0; j < GGML_V1_F16_ARR; j++) {
            ay[j] = GGML_V1_F16_VEC_LOAD(y + i + j*GGML_V1_F16_EPR, j);

            for (int k = 0; k < GGML_V1_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_V1_F16_VEC_LOAD(x[k] + i + j*GGML_V1_F16_EPR, j);

                sum[k][j] = GGML_V1_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce the partial sums
    for (int k = 0; k < GGML_V1_VEC_DOT_UNROLL; ++k) {
        GGML_V1_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_V1_VEC_DOT_UNROLL; ++j) {
            sumf[j] += GGML_V1_FP16_TO_FP32(x[j][i])*GGML_V1_FP16_TO_FP32(y[i]);
        }
    }
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_V1_VEC_DOT_UNROLL; ++j) {
            sumf[j] += GGML_V1_FP16_TO_FP32(x[j][i])*GGML_V1_FP16_TO_FP32(y[i]);
        }
    }
#endif

    for (int i = 0; i < GGML_V1_VEC_DOT_UNROLL; ++i) {
        s[i] = sumf[i];
    }
}
|
|
inline static void ggml_v1_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
#if defined(GGML_V1_SIMD)
    const int np = (n & ~(GGML_V1_F32_STEP - 1));

    GGML_V1_F32_VEC vx = GGML_V1_F32_VEC_SET1(v);

    GGML_V1_F32_VEC ax[GGML_V1_F32_ARR];
    GGML_V1_F32_VEC ay[GGML_V1_F32_ARR];

    for (int i = 0; i < np; i += GGML_V1_F32_STEP) {
        for (int j = 0; j < GGML_V1_F32_ARR; j++) {
            ax[j] = GGML_V1_F32_VEC_LOAD(x + i + j*GGML_V1_F32_EPR);
            ay[j] = GGML_V1_F32_VEC_LOAD(y + i + j*GGML_V1_F32_EPR);
            ay[j] = GGML_V1_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_V1_F32_VEC_STORE(y + i + j*GGML_V1_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}
|
|
inline static void ggml_v1_vec_mad_f16(const int n, ggml_v1_fp16_t * restrict y, ggml_v1_fp16_t * restrict x, const float v) {
#if defined(GGML_V1_SIMD)
    const int np = (n & ~(GGML_V1_F16_STEP - 1));

    GGML_V1_F16_VEC vx = GGML_V1_F16_VEC_SET1(v);

    GGML_V1_F16_VEC ax[GGML_V1_F16_ARR];
    GGML_V1_F16_VEC ay[GGML_V1_F16_ARR];

    for (int i = 0; i < np; i += GGML_V1_F16_STEP) {
        for (int j = 0; j < GGML_V1_F16_ARR; j++) {
            ax[j] = GGML_V1_F16_VEC_LOAD(x + i + j*GGML_V1_F16_EPR, j);
            ay[j] = GGML_V1_F16_VEC_LOAD(y + i + j*GGML_V1_F16_EPR, j);
            ay[j] = GGML_V1_F16_VEC_FMA(ay[j], ax[j], vx);

            GGML_V1_F16_VEC_STORE(y + i + j*GGML_V1_F16_EPR, ay, j);
        }
    }

    // leftovers - this path is not expected to be taken, hence the assert
    for (int i = np; i < n; ++i) {
        GGML_V1_ASSERT(false);
        y[i] = GGML_V1_FP32_TO_FP16(GGML_V1_FP16_TO_FP32(y[i]) + GGML_V1_FP16_TO_FP32(x[i])*v);
    }
#else
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_V1_FP32_TO_FP16(GGML_V1_FP16_TO_FP32(y[i]) + GGML_V1_FP16_TO_FP32(x[i])*v);
    }
#endif
}
|
|
inline static void ggml_v1_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) {
    assert(n % QK == 0);

    const int nb = n / QK;

    const float   * restrict pd = (const float *)   (x);
    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);

    for (int i = 0; i < nb; i++) {
        const float d = pd[i];

        const uint8_t * restrict pp = pb + i*QK/2;

        for (int l = 0; l < QK; l += 2) {
            const uint8_t vi = pp[l/2];

            const int8_t vi0 = vi & 0xf;
            const int8_t vi1 = vi >> 4;

            const float v0 = (vi0 - 8)*d;
            const float v1 = (vi1 - 8)*d;

            y[i*QK + l + 0] += v0*v;
            y[i*QK + l + 1] += v1*v;

            assert(!isnan(y[i*QK + l + 0]));
            assert(!isnan(y[i*QK + l + 1]));
            assert(!isinf(y[i*QK + l + 0]));
            assert(!isinf(y[i*QK + l + 1]));
        }
    }
}
|
|
inline static void ggml_v1_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) {
    assert(n % QK == 0);

    const int nb = n / QK;

    const float   * restrict pm = (const float *)   (x);
    const float   * restrict pd = (const float *)   (pm + nb);
    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);

    for (int i = 0; i < nb; i++) {
        const float m = pm[i];
        const float d = pd[i];

        const uint8_t * restrict pp = pb + i*QK/2;

        for (int l = 0; l < QK; l += 2) {
            const uint8_t vi = pp[l/2];

            const uint8_t vi0 = vi & 0xf;
            const uint8_t vi1 = vi >> 4;

            const float v0 = d*vi0 + m;
            const float v1 = d*vi1 + m;

            y[i*QK + l + 0] += v0*v;
            y[i*QK + l + 1] += v1*v;

            assert(!isnan(y[i*QK + l + 0]));
            assert(!isnan(y[i*QK + l + 1]));
            assert(!isinf(y[i*QK + l + 0]));
            assert(!isinf(y[i*QK + l + 1]));
        }
    }
}
|
|
inline static void ggml_v1_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_V1_SIMD)
    const int np = (n & ~(GGML_V1_F32_STEP - 1));

    GGML_V1_F32_VEC vx = GGML_V1_F32_VEC_SET1(v);

    GGML_V1_F32_VEC ay[GGML_V1_F32_ARR];

    for (int i = 0; i < np; i += GGML_V1_F32_STEP) {
        for (int j = 0; j < GGML_V1_F32_ARR; j++) {
            ay[j] = GGML_V1_F32_VEC_LOAD(y + i + j*GGML_V1_F32_EPR);
            ay[j] = GGML_V1_F32_VEC_MUL(ay[j], vx);

            GGML_V1_F32_VEC_STORE(y + i + j*GGML_V1_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}
|
|
inline static void ggml_v1_vec_norm_f32 (const int n, float * s, const float * x) { ggml_v1_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
inline static void ggml_v1_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_v1_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_v1_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_v1_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_v1_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_v1_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

static const ggml_v1_float GELU_COEF_A = 0.044715;
static const ggml_v1_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;

inline static float ggml_v1_gelu_f32(float x) {
    return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
}
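// This is the standard tanh approximation of GELU,
//   gelu(x) ~ 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// written with the x^3 term factored as x*(1 + GELU_COEF_A*x*x).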
|
|
inline static void ggml_v1_vec_gelu_f16(const int n, ggml_v1_fp16_t * y, const ggml_v1_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = table_gelu_f16[i16[i]];
    }
}

#ifdef GGML_V1_GELU_FP16
inline static void ggml_v1_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_v1_fp16_t fp16 = GGML_V1_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_V1_FP16_TO_FP32(table_gelu_f16[t]);
    }
}
#else
inline static void ggml_v1_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_v1_gelu_f32(x[i]);
    }
}
#endif
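// With GGML_V1_GELU_FP16 enabled (the default above), gelu is a table lookup:
// the f32 input is squashed to f16 and its 16-bit pattern indexes the
// 64K-entry table_gelu_f16. The tables are assumed to be filled in during
// library initialization, which is outside this section.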
|
|
inline static void ggml_v1_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_v1_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += x[i];
    }
    *s = sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_v1_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_v1_float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_v1_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_v1_vec_norm_f32(n, s, x); *s = 1./(*s); }
|
|
//
// logging
//

#if (GGML_V1_DEBUG >= 1)
#define GGML_V1_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_V1_PRINT_DEBUG(...)
#endif

#if (GGML_V1_DEBUG >= 5)
#define GGML_V1_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_V1_PRINT_DEBUG_5(...)
#endif

#if (GGML_V1_DEBUG >= 10)
#define GGML_V1_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_V1_PRINT_DEBUG_10(...)
#endif

#define GGML_V1_PRINT(...) printf(__VA_ARGS__)
|
|
//
// data types
//

// number of values packed per stored block, indexed by ggml_v1_type
// (order: q4_0, q4_1, i8, i16, i32, f16, f32)
static const int GGML_V1_BLCK_SIZE[GGML_V1_TYPE_COUNT] = {
    QK, // q4_0
    QK, // q4_1
    1,  // i8
    1,  // i16
    1,  // i32
    1,  // f16
    1,  // f32
};

static_assert(GGML_V1_TYPE_COUNT == 7, "GGML_V1_TYPE_COUNT != 7");

// size in bytes of one block (quantized types) or one element (the rest)
static const size_t GGML_V1_TYPE_SIZE[GGML_V1_TYPE_COUNT] = {
    sizeof(float  )   + QK/2, // q4_0: delta + QK/2 packed bytes
    sizeof(float  )*2 + QK/2, // q4_1: min + delta + QK/2 packed bytes
    sizeof(int8_t ),
    sizeof(int16_t),
    sizeof(int32_t),
    sizeof(ggml_v1_fp16_t),
    sizeof(float  ),
};

static_assert(GGML_V1_TYPE_COUNT == 7, "GGML_V1_TYPE_COUNT != 7");
|
|
static const char * GGML_V1_OP_LABEL[GGML_V1_OP_COUNT] = {
    "NONE",

    "DUP",
    "ADD",
    "SUB",
    "MUL",
    "DIV",
    "SQR",
    "SQRT",
    "SUM",
    "MEAN",
    "REPEAT",
    "ABS",
    "SGN",
    "NEG",
    "STEP",
    "RELU",
    "GELU",
    "NORM",

    "MUL_MAT",

    "SCALE",
    "CPY",
    "RESHAPE",
    "VIEW",
    "PERMUTE",
    "TRANSPOSE",
    "GET_ROWS",
    "DIAG_MASK_INF",
    "SOFT_MAX",
    "ROPE",
    "CONV_1D_1S",
    "CONV_1D_2S",

    "FLASH_ATTN",
    "FLASH_FF",
};

static_assert(GGML_V1_OP_COUNT == 33, "GGML_V1_OP_COUNT != 33");
|
|
static const char * GGML_V1_OP_SYMBOL[GGML_V1_OP_COUNT] = {
    "none",

    "x",
    "x+y",
    "x-y",
    "x*y",
    "x/y",
    "x^2",
    "√x",
    "Σx",
    "Σx/n",
    "repeat(x)",
    "abs(x)",
    "sgn(x)",
    "-x",
    "step(x)",
    "relu(x)",
    "gelu(x)",
    "norm(x)",

    "X*Y",

    "x*v",
    "x-\\>y",
    "reshape(x)",
    "view(x)",
    "permute(x)",
    "transpose(x)",
    "get_rows(x)",
    "diag_mask_inf(x)",
    "soft_max(x)",
    "rope(x)",
    "conv_1d_1s(x)",
    "conv_1d_2s(x)",

    "flash_attn(x)",
    "flash_ff(x)",
};

static_assert(GGML_V1_OP_COUNT == 33, "GGML_V1_OP_COUNT != 33");
|
|
//
// ggml object
//

struct ggml_v1_object {
    size_t offs;
    size_t size;

    struct ggml_v1_object * next;

    char padding[8];
};

static const size_t GGML_V1_OBJECT_SIZE = sizeof(struct ggml_v1_object);

static_assert(sizeof(struct ggml_v1_object)%GGML_V1_MEM_ALIGN == 0, "ggml_v1_object size must be a multiple of GGML_V1_MEM_ALIGN");
static_assert(sizeof(struct ggml_v1_tensor)%GGML_V1_MEM_ALIGN == 0, "ggml_v1_tensor size must be a multiple of GGML_V1_MEM_ALIGN");
|
|
//
// ggml context
//

struct ggml_v1_context {
    size_t mem_size;
    void * mem_buffer;
    bool   mem_buffer_owned;

    int n_objects;

    struct ggml_v1_object * objects_begin;
    struct ggml_v1_object * objects_end;

    struct ggml_v1_scratch scratch;
    struct ggml_v1_scratch scratch_save;
};

struct ggml_v1_context_container {
    bool used;

    struct ggml_v1_context context;
};
|
|
| |
| |
| |
|
|
| enum ggml_v1_task_type { |
| GGML_V1_TASK_INIT = 0, |
| GGML_V1_TASK_COMPUTE, |
| GGML_V1_TASK_FINALIZE, |
| }; |
|
|
| struct ggml_v1_compute_params { |
| enum ggml_v1_task_type type; |
|
|
| int ith, nth; |
|
|
| |
| size_t wsize; |
| void * wdata; |
| }; |
|
|
| |
| |
| |
|
|
| struct ggml_v1_state { |
| struct ggml_v1_context_container contexts[GGML_V1_MAX_CONTEXTS]; |
| }; |
|
|
| |
| static struct ggml_v1_state g_state; |
| static atomic_int g_state_barrier = 0; |
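|
| // g_state_barrier implements a simple spin lock: a thread "enters" by |
| // incrementing the counter and owns the critical section only if the fetched |
| // value was 0; otherwise it backs out, yields the CPU, and retries. |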
|
|
| |
| inline static void ggml_v1_critical_section_start(void) { |
| int processing = atomic_fetch_add(&g_state_barrier, 1); |
|
|
| while (processing > 0) { |
| |
| atomic_fetch_sub(&g_state_barrier, 1); |
| sched_yield(); |
| processing = atomic_fetch_add(&g_state_barrier, 1); |
| } |
| } |
|
|
| |
| |
| inline static void ggml_v1_critical_section_end(void) { |
| atomic_fetch_sub(&g_state_barrier, 1); |
| } |
|
|
| |
|
|
| void ggml_v1_print_object(const struct ggml_v1_object * obj) { |
| GGML_V1_PRINT(" - ggml_v1_object: offset = %zu, size = %zu, next = %p\n", |
| obj->offs, obj->size, (const void *) obj->next); |
| } |
|
|
| void ggml_v1_print_objects(const struct ggml_v1_context * ctx) { |
| struct ggml_v1_object * obj = ctx->objects_begin; |
|
|
| GGML_V1_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); |
|
|
| while (obj != NULL) { |
| ggml_v1_print_object(obj); |
| obj = obj->next; |
| } |
|
|
| GGML_V1_PRINT("%s: --- end ---\n", __func__); |
| } |
|
|
| int ggml_v1_nelements(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; |
| } |
|
|
| int ggml_v1_nrows(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; |
| } |
|
|
| size_t ggml_v1_nbytes(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return (ggml_v1_nelements(tensor)*GGML_V1_TYPE_SIZE[tensor->type])/GGML_V1_BLCK_SIZE[tensor->type]; |
| } |
|
|
| int ggml_v1_blck_size(enum ggml_v1_type type) { |
| return GGML_V1_BLCK_SIZE[type]; |
| } |
|
|
| size_t ggml_v1_type_size(enum ggml_v1_type type) { |
| return GGML_V1_TYPE_SIZE[type]; |
| } |
|
|
| float ggml_v1_type_sizef(enum ggml_v1_type type) { |
| return ((float)(GGML_V1_TYPE_SIZE[type]))/GGML_V1_BLCK_SIZE[type]; |
| } |
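|
| // for the quantized types the effective bytes-per-element is fractional; e.g. |
| // assuming QK == 32 (its value is defined elsewhere in this file), a Q4_0 |
| // block is 4 + 16 = 20 bytes for 32 elements, so ggml_v1_type_sizef returns |
| // 0.625 for it while ggml_v1_type_size returns the whole-block size of 20. |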
|
|
| size_t ggml_v1_element_size(const struct ggml_v1_tensor * tensor) { |
| return GGML_V1_TYPE_SIZE[tensor->type]; |
| } |
|
|
| static inline bool ggml_v1_is_scalar(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; |
| } |
|
|
| static inline bool ggml_v1_is_vector(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; |
| } |
|
|
| static inline bool ggml_v1_is_matrix(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return tensor->ne[2] == 1 && tensor->ne[3] == 1; |
| } |
|
|
| static inline bool ggml_v1_can_mul_mat(const struct ggml_v1_tensor * t0, const struct ggml_v1_tensor * t1) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return |
| (t0->ne[0] == t1->ne[0]) && |
| (t0->ne[2] == t1->ne[2]) && |
| (t0->ne[3] == t1->ne[3]); |
| } |
|
|
| static inline bool ggml_v1_is_contiguous(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return |
| tensor->nb[0] == GGML_V1_TYPE_SIZE[tensor->type] && |
| tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_V1_BLCK_SIZE[tensor->type] && |
| tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && |
| tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; |
| } |
|
|
| static inline bool ggml_v1_is_padded_1d(const struct ggml_v1_tensor * tensor) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return |
| tensor->nb[0] == GGML_V1_TYPE_SIZE[tensor->type] && |
| tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && |
| tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; |
| } |
|
|
| static inline bool ggml_v1_are_same_shape(const struct ggml_v1_tensor * t0, const struct ggml_v1_tensor * t1) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return |
| (t0->ne[0] == t1->ne[0] ) && |
| (t0->ne[1] == t1->ne[1] ) && |
| (t0->ne[2] == t1->ne[2] ) && |
| (t0->ne[3] == t1->ne[3] ); |
| } |
|
|
| |
| static inline bool ggml_v1_can_repeat(const struct ggml_v1_tensor * t0, const struct ggml_v1_tensor * t1) { |
| static_assert(GGML_V1_MAX_DIMS == 4, "GGML_V1_MAX_DIMS is not 4 - update this function"); |
|
|
| return |
| (t1->ne[0]%t0->ne[0] == 0) && |
| (t1->ne[1]%t0->ne[1] == 0) && |
| (t1->ne[2]%t0->ne[2] == 0) && |
| (t1->ne[3]%t0->ne[3] == 0); |
| } |
|
|
| static inline int ggml_v1_up32(int n) { |
| return (n + 31) & ~31; |
| } |
|
|
| static inline int ggml_v1_up64(int n) { |
| return (n + 63) & ~63; |
| } |
|
|
| static inline int ggml_v1_up(int n, int m) { |
| |
| GGML_V1_ASSERT((m & (m - 1)) == 0); |
| return (n + m - 1) & ~(m - 1); |
| } |
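|
| // examples: ggml_v1_up32(33) == 64 and ggml_v1_up(10, 8) == 16; the bit-mask |
| // round-up in ggml_v1_up is only correct because m is asserted to be a power |
| // of two. |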
|
|
| |
| #define ggml_v1_assert_aligned(ptr) \ |
| assert(((uintptr_t) (ptr))%GGML_V1_MEM_ALIGN == 0) |
|
|
| |
|
|
| struct ggml_v1_context * ggml_v1_init(struct ggml_v1_init_params params) { |
| |
| ggml_v1_critical_section_start(); |
|
|
| static bool is_first_call = true; |
|
|
| if (is_first_call) { |
| |
| { |
| const uint64_t t_start = ggml_v1_time_us(); UNUSED(t_start); |
|
|
| ggml_v1_fp16_t ii; |
| for (int i = 0; i < (1 << 16); ++i) { |
| uint16_t ui = i; |
| memcpy(&ii, &ui, sizeof(ii)); |
| const float f = table_f32_f16[i] = GGML_V1_COMPUTE_FP16_TO_FP32(ii); |
| table_gelu_f16[i] = GGML_V1_FP32_TO_FP16(ggml_v1_gelu_f32(f)); |
| table_exp_f16[i] = GGML_V1_FP32_TO_FP16(exp(f)); |
| } |
|
|
| const uint64_t t_end = ggml_v1_time_us(); UNUSED(t_end); |
|
|
| GGML_V1_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); |
| } |
|
|
| |
| { |
| const uint64_t t_start = ggml_v1_time_us(); UNUSED(t_start); |
|
|
| g_state = (struct ggml_v1_state) { |
| { { 0 } }, |
| }; |
|
|
| for (int i = 0; i < GGML_V1_MAX_CONTEXTS; ++i) { |
| g_state.contexts[i].used = false; |
| } |
|
|
| const uint64_t t_end = ggml_v1_time_us(); UNUSED(t_end); |
|
|
| GGML_V1_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); |
| } |
|
|
| is_first_call = false; |
| } |
|
|
| |
| struct ggml_v1_context * ctx = NULL; |
|
|
| for (int i = 0; i < GGML_V1_MAX_CONTEXTS; i++) { |
| if (!g_state.contexts[i].used) { |
| g_state.contexts[i].used = true; |
| ctx = &g_state.contexts[i].context; |
|
|
| GGML_V1_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); |
| break; |
| } |
| } |
|
|
| if (ctx == NULL) { |
| GGML_V1_PRINT_DEBUG("%s: no unused context found\n", __func__); |
|
|
| ggml_v1_critical_section_end(); |
|
|
| return NULL; |
| } |
|
|
| *ctx = (struct ggml_v1_context) { |
| params.mem_size, |
| params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), |
| params.mem_buffer ? false : true, |
| 0, |
| NULL, |
| NULL, |
| { 0, 0, NULL, }, |
| { 0, 0, NULL, }, |
| }; |
|
|
| ggml_v1_assert_aligned(ctx->mem_buffer); |
|
|
| GGML_V1_PRINT_DEBUG("%s: context initialized\n", __func__); |
|
|
| ggml_v1_critical_section_end(); |
|
|
| return ctx; |
| } |
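|
| // minimal lifecycle sketch (illustrative only; the pool size below is an |
| // arbitrary choice, not a requirement): |
| // |
| //   struct ggml_v1_init_params params = { |
| //       /*.mem_size   =*/ 16*1024*1024, |
| //       /*.mem_buffer =*/ NULL, // NULL: the context mallocs (and later frees) the pool |
| //   }; |
| //   struct ggml_v1_context * ctx = ggml_v1_init(params); |
| //   if (ctx == NULL) { /* all GGML_V1_MAX_CONTEXTS slots are in use */ } |
| //   ... create tensors and build operators ... |
| //   ggml_v1_free(ctx); |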
|
|
| void ggml_v1_free(struct ggml_v1_context * ctx) { |
| |
| ggml_v1_critical_section_start(); |
|
|
| bool found = false; |
|
|
| for (int i = 0; i < GGML_V1_MAX_CONTEXTS; i++) { |
| if (&g_state.contexts[i].context == ctx) { |
| g_state.contexts[i].used = false; |
|
|
| GGML_V1_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", |
| __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); |
|
|
| if (ctx->mem_buffer_owned) { |
| free(ctx->mem_buffer); |
| } |
|
|
| found = true; |
| break; |
| } |
| } |
|
|
| if (!found) { |
| GGML_V1_PRINT_DEBUG("%s: context not found\n", __func__); |
| } |
|
|
| ggml_v1_critical_section_end(); |
| } |
|
|
| size_t ggml_v1_used_mem(const struct ggml_v1_context * ctx) { |
| return ctx->objects_end->offs + ctx->objects_end->size; |
| } |
|
|
| size_t ggml_v1_set_scratch(struct ggml_v1_context * ctx, struct ggml_v1_scratch scratch) { |
| const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; |
|
|
| ctx->scratch = scratch; |
|
|
| return result; |
| } |
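|
| // tensor allocation: a new object header is appended at the end of the used |
| // region of the pool; its payload is the tensor struct plus, when data == NULL |
| // and no scratch buffer is set, the tensor data (rounded up to a multiple of |
| // GGML_V1_MEM_ALIGN). When a scratch buffer is active the data is carved out |
| // of the scratch region instead and only the tensor struct lives in the pool. |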
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor_impl( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int n_dims, |
| const int * ne, |
| void * data) { |
| |
| struct ggml_v1_object * obj_cur = ctx->objects_end; |
|
|
| const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; |
| const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size; |
| const size_t cur_end = cur_offs + cur_size; |
|
|
| size_t size_needed = 0; |
|
|
| if (data == NULL) { |
| size_needed += GGML_V1_TYPE_SIZE[type]*(ne[0]/GGML_V1_BLCK_SIZE[type]); |
| for (int i = 1; i < n_dims; i++) { |
| size_needed *= ne[i]; |
| } |
| |
| size_needed = ((size_needed + GGML_V1_MEM_ALIGN - 1)/GGML_V1_MEM_ALIGN)*GGML_V1_MEM_ALIGN; |
| } |
|
|
| char * const mem_buffer = ctx->mem_buffer; |
| struct ggml_v1_object * const obj_new = (struct ggml_v1_object *)(mem_buffer + cur_end); |
|
|
| if (ctx->scratch.data == NULL || data != NULL) { |
| size_needed += sizeof(struct ggml_v1_tensor); |
|
|
| if (cur_end + size_needed + GGML_V1_OBJECT_SIZE > ctx->mem_size) { |
| GGML_V1_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", |
| __func__, cur_end + size_needed + GGML_V1_OBJECT_SIZE, ctx->mem_size); |
| assert(false); |
| return NULL; |
| } |
|
|
| *obj_new = (struct ggml_v1_object) { |
| .offs = cur_end + GGML_V1_OBJECT_SIZE, |
| .size = size_needed, |
| .next = NULL, |
| }; |
| } else { |
| if (ctx->scratch.offs + size_needed > ctx->scratch.size) { |
| GGML_V1_PRINT("%s: not enough space in the scratch memory\n", __func__); |
| assert(false); |
| return NULL; |
| } |
|
|
| if (cur_end + sizeof(struct ggml_v1_tensor) + GGML_V1_OBJECT_SIZE > ctx->mem_size) { |
| GGML_V1_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", |
| __func__, cur_end + sizeof(struct ggml_v1_tensor) + GGML_V1_OBJECT_SIZE, ctx->mem_size); |
| assert(false); |
| return NULL; |
| } |
|
|
| data = (char * const) ctx->scratch.data + ctx->scratch.offs; |
|
|
| *obj_new = (struct ggml_v1_object) { |
| .offs = cur_end + GGML_V1_OBJECT_SIZE, |
| .size = sizeof(struct ggml_v1_tensor), |
| .next = NULL, |
| }; |
|
|
| |
|
|
| ctx->scratch.offs += size_needed; |
| } |
|
|
| if (obj_cur != NULL) { |
| obj_cur->next = obj_new; |
| } else { |
| |
| ctx->objects_begin = obj_new; |
| } |
|
|
| ctx->objects_end = obj_new; |
|
|
| |
|
|
| struct ggml_v1_tensor * const result = (struct ggml_v1_tensor *)(mem_buffer + obj_new->offs); |
|
|
| ggml_v1_assert_aligned(result); |
|
|
| *result = (struct ggml_v1_tensor) { |
| type, |
| n_dims, |
| { 1, 1, 1, 1 }, |
| { 0, 0, 0, 0 }, |
| GGML_V1_OP_NONE, |
| false, |
| NULL, |
| NULL, |
| NULL, |
| { NULL }, |
| 0, |
| 0, |
| 0, |
| 0, |
| data == NULL ? (void *)(result + 1) : data, |
| { 0 }, |
| }; |
|
|
| ggml_v1_assert_aligned(result->data); |
|
|
| for (int i = 0; i < n_dims; i++) { |
| result->ne[i] = ne[i]; |
| } |
|
|
| result->nb[0] = GGML_V1_TYPE_SIZE[type]; |
| result->nb[1] = result->nb[0]*(result->ne[0]/GGML_V1_BLCK_SIZE[type]); |
| for (int i = 2; i < GGML_V1_MAX_DIMS; i++) { |
| result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; |
| } |
|
|
| ctx->n_objects++; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int n_dims, |
| const int * ne) { |
| return ggml_v1_new_tensor_impl(ctx, type, n_dims, ne, NULL); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor_1d( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int ne0) { |
| return ggml_v1_new_tensor(ctx, type, 1, &ne0); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor_2d( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int ne0, |
| int ne1) { |
| const int ne[2] = { ne0, ne1 }; |
| return ggml_v1_new_tensor(ctx, type, 2, ne); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor_3d( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int ne0, |
| int ne1, |
| int ne2) { |
| const int ne[3] = { ne0, ne1, ne2 }; |
| return ggml_v1_new_tensor(ctx, type, 3, ne); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_tensor_4d( |
| struct ggml_v1_context * ctx, |
| enum ggml_v1_type type, |
| int ne0, |
| int ne1, |
| int ne2, |
| int ne3) { |
| const int ne[4] = { ne0, ne1, ne2, ne3 }; |
| return ggml_v1_new_tensor(ctx, type, 4, ne); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_i32(struct ggml_v1_context * ctx, int32_t value) { |
| ctx->scratch_save = ctx->scratch; |
| ctx->scratch.data = NULL; |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_I32, 1); |
|
|
| ctx->scratch = ctx->scratch_save; |
|
|
| ggml_v1_set_i32(result, value); |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_new_f32(struct ggml_v1_context * ctx, float value) { |
| ctx->scratch_save = ctx->scratch; |
| ctx->scratch.data = NULL; |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, 1); |
|
|
| ctx->scratch = ctx->scratch_save; |
|
|
| ggml_v1_set_f32(result, value); |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_dup_tensor(struct ggml_v1_context * ctx, const struct ggml_v1_tensor * src) { |
| return ggml_v1_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_set_zero(struct ggml_v1_tensor * tensor) { |
| memset(tensor->data, 0, ggml_v1_nbytes(tensor)); |
| return tensor; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_set_i32 (struct ggml_v1_tensor * tensor, int32_t value) { |
| const int n = ggml_v1_nrows(tensor); |
| const int nc = tensor->ne[0]; |
| const size_t n1 = tensor->nb[1]; |
|
|
| char * const data = tensor->data; |
|
|
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| assert(tensor->nb[0] == sizeof(int8_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i8(nc, (int8_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| assert(tensor->nb[0] == sizeof(int16_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i16(nc, (int16_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| assert(tensor->nb[0] == sizeof(int32_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i32(nc, (int32_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| assert(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_f16(nc, (ggml_v1_fp16_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| assert(tensor->nb[0] == sizeof(float)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_f32(nc, (float *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| return tensor; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_set_f32(struct ggml_v1_tensor * tensor, float value) { |
| const int n = ggml_v1_nrows(tensor); |
| const int nc = tensor->ne[0]; |
| const size_t n1 = tensor->nb[1]; |
|
|
| char * const data = tensor->data; |
|
|
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| assert(tensor->nb[0] == sizeof(int8_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i8(nc, (int8_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| assert(tensor->nb[0] == sizeof(int16_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i16(nc, (int16_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| assert(tensor->nb[0] == sizeof(int32_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_i32(nc, (int32_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| assert(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_f16(nc, (ggml_v1_fp16_t *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| assert(tensor->nb[0] == sizeof(float)); |
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_set_f32(nc, (float *)(data + i*n1), value); |
| } |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| return tensor; |
| } |
|
|
| int32_t ggml_v1_get_i32_1d(const struct ggml_v1_tensor * tensor, int i) { |
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int8_t)); |
| return ((int8_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int16_t)); |
| return ((int16_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int32_t)); |
| return ((int32_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| return GGML_V1_FP16_TO_FP32(((ggml_v1_fp16_t *)(tensor->data))[i]); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(float)); |
| return ((float *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| return 0; |
| } |
|
|
| void ggml_v1_set_i32_1d(const struct ggml_v1_tensor * tensor, int i, int32_t value) { |
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int8_t)); |
| ((int8_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int16_t)); |
| ((int16_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int32_t)); |
| ((int32_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| ((ggml_v1_fp16_t *)(tensor->data))[i] = GGML_V1_FP32_TO_FP16(value); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(float)); |
| ((float *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| float ggml_v1_get_f32_1d(const struct ggml_v1_tensor * tensor, int i) { |
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int8_t)); |
| return ((int8_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int16_t)); |
| return ((int16_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int32_t)); |
| return ((int32_t *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| return GGML_V1_FP16_TO_FP32(((ggml_v1_fp16_t *)(tensor->data))[i]); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(float)); |
| return ((float *)(tensor->data))[i]; |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| return 0.0f; |
| } |
|
|
| void ggml_v1_set_f32_1d(const struct ggml_v1_tensor * tensor, int i, float value) { |
| switch (tensor->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_I8: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int8_t)); |
| ((int8_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_I16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int16_t)); |
| ((int16_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_I32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(int32_t)); |
| ((int32_t *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(ggml_v1_fp16_t)); |
| ((ggml_v1_fp16_t *)(tensor->data))[i] = GGML_V1_FP32_TO_FP16(value); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| GGML_V1_ASSERT(tensor->nb[0] == sizeof(float)); |
| ((float *)(tensor->data))[i] = value; |
| } break; |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| void * ggml_v1_get_data(const struct ggml_v1_tensor * tensor) { |
| return tensor->data; |
| } |
|
|
| float * ggml_v1_get_data_f32(const struct ggml_v1_tensor * tensor) { |
| assert(tensor->type == GGML_V1_TYPE_F32); |
| return (float *)(tensor->data); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_view_tensor( |
| struct ggml_v1_context * ctx, |
| const struct ggml_v1_tensor * src) { |
| return ggml_v1_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); |
| } |
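|
| // operator construction: the builders below do not compute anything. Each one |
| // allocates (or views) the destination tensor and records op/src0/src1 so the |
| // actual work can be done later by the ggml_v1_compute_forward_* routines. |
| // When a source carries a gradient, the result is marked as a graph node by |
| // attaching a gradient tensor of the same shape. |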
|
|
| |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_dup_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_DUP; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_dup( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_dup_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_dup_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_dup_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_add_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_ADD; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_add( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_add_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_add_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_add_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_sub_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SUB; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sub( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_sub_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sub_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_sub_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_mul_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| is_node = true; |
| } |
|
|
| if (inplace) { |
| GGML_V1_ASSERT(is_node == false); |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_MUL; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_mul( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_mul_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_mul_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_mul_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_div_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| is_node = true; |
| } |
|
|
| if (inplace) { |
| GGML_V1_ASSERT(is_node == false); |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_DIV; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_div( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_div_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_div_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_div_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_sqr_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SQR; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sqr( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sqr_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sqr_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sqr_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_sqrt_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SQRT; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sqrt( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sqrt_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sqrt_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sqrt_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_sum( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| bool is_node = false; |
|
|
| if (a->grad) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_1d(ctx, a->type, 1); |
|
|
| result->op = GGML_V1_OP_SUM; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_mean( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| int ne[GGML_V1_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, a->n_dims, ne); |
|
|
| result->op = GGML_V1_OP_MEAN; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_repeat( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_can_repeat(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (a->grad) { |
| is_node = true; |
| } |
|
|
| if (ggml_v1_are_same_shape(a, b) && !is_node) { |
| return a; |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, a->type, b->n_dims, b->ne); |
|
|
| result->op = GGML_V1_OP_REPEAT; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_abs_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_ABS; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_abs( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_abs_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_abs_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_abs_impl(ctx, a, true); |
| } |
|
|
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_sgn_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SGN; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sgn( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sgn_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_sgn_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_sgn_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_neg_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_NEG; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_neg( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_neg_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_neg_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_neg_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_step_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_STEP; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_step( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_step_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_step_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_step_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_relu_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_RELU; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_relu( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_relu_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_relu_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_relu_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_gelu_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_GELU; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_gelu( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_gelu_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_gelu_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_gelu_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_norm_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| bool inplace) { |
| bool is_node = false; |
|
|
| if (!inplace && (a->grad)) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = inplace ? ggml_v1_view_tensor(ctx, a) : ggml_v1_dup_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_NORM; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_norm( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_norm_impl(ctx, a, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_norm_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| return ggml_v1_norm_impl(ctx, a, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_mul_mat( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_can_mul_mat(a, b)); |
|
|
| bool is_node = false; |
|
|
| if (a->grad || b->grad) { |
| is_node = true; |
| } |
|
|
| const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); |
|
|
| result->op = GGML_V1_OP_MUL_MAT; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
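|
| // shape sketch: ne[0] is the inner (dot product) dimension shared by a and b, |
| // so for 2-D operands with a->ne = { K, M } and b->ne = { K, N } the F32 |
| // result has ne = { M, N }: each element is the dot product of a row of a |
| // with a row of b. |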
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_scale_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_is_scalar(b)); |
| GGML_V1_ASSERT(ggml_v1_is_padded_1d(a)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| |
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SCALE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_scale( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_scale_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_scale_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_scale_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_cpy_impl( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b, |
| bool inplace) { |
| GGML_V1_ASSERT(ggml_v1_nelements(a) == ggml_v1_nelements(b)); |
|
|
| bool is_node = false; |
|
|
| if (!inplace && (a->grad || b->grad)) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, b); |
|
|
| result->op = GGML_V1_OP_CPY; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_cpy( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_cpy_impl(ctx, a, b, false); |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_cpy_inplace( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| return ggml_v1_cpy_impl(ctx, a, b, true); |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_reshape( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(a)); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(b)); |
| GGML_V1_ASSERT(ggml_v1_nelements(a) == ggml_v1_nelements(b)); |
|
|
| bool is_node = false; |
|
|
| if (a->grad || b->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); |
|
|
| result->op = GGML_V1_OP_RESHAPE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_reshape_2d( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int ne0, |
| int ne1) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(a)); |
| GGML_V1_ASSERT(ggml_v1_nelements(a) == ne0*ne1); |
|
|
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| const int ne[2] = { ne0, ne1 }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_impl(ctx, a->type, 2, ne, a->data); |
|
|
| result->op = GGML_V1_OP_RESHAPE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| struct ggml_v1_tensor * ggml_v1_reshape_3d( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int ne0, |
| int ne1, |
| int ne2) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(a)); |
| GGML_V1_ASSERT(ggml_v1_nelements(a) == ne0*ne1*ne2); |
|
|
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| const int ne[3] = { ne0, ne1, ne2 }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_impl(ctx, a->type, 3, ne, a->data); |
|
|
| result->op = GGML_V1_OP_RESHAPE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_view_1d( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int ne0, |
| size_t offset) { |
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); |
|
|
| result->op = GGML_V1_OP_VIEW; |
| result->grad = NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_view_2d( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int ne0, |
| int ne1, |
| size_t nb1, |
| size_t offset) { |
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| } |
|
|
| const int ne[GGML_V1_MAX_DIMS] = { ne0, ne1, 1, 1 }; |
|
|
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); |
|
|
| result->nb[1] = nb1; |
| result->nb[2] = result->nb[1]*ne1; |
| result->nb[3] = result->nb[2]; |
|
|
| result->op = GGML_V1_OP_VIEW; |
| result->grad = NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_permute( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int axis0, |
| int axis1, |
| int axis2, |
| int axis3) { |
| GGML_V1_ASSERT(axis0 >= 0 && axis0 < GGML_V1_MAX_DIMS); |
| GGML_V1_ASSERT(axis1 >= 0 && axis1 < GGML_V1_MAX_DIMS); |
| GGML_V1_ASSERT(axis2 >= 0 && axis2 < GGML_V1_MAX_DIMS); |
| GGML_V1_ASSERT(axis3 >= 0 && axis3 < GGML_V1_MAX_DIMS); |
|
|
| GGML_V1_ASSERT(axis0 != axis1); |
| GGML_V1_ASSERT(axis0 != axis2); |
| GGML_V1_ASSERT(axis0 != axis3); |
| GGML_V1_ASSERT(axis1 != axis2); |
| GGML_V1_ASSERT(axis1 != axis3); |
| GGML_V1_ASSERT(axis2 != axis3); |
|
|
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
|
|
| int ne[GGML_V1_MAX_DIMS]; |
| int nb[GGML_V1_MAX_DIMS]; |
|
|
| ne[axis0] = a->ne[0]; |
| ne[axis1] = a->ne[1]; |
| ne[axis2] = a->ne[2]; |
| ne[axis3] = a->ne[3]; |
|
|
| nb[axis0] = a->nb[0]; |
| nb[axis1] = a->nb[1]; |
| nb[axis2] = a->nb[2]; |
| nb[axis3] = a->nb[3]; |
|
|
| result->ne[0] = ne[0]; |
| result->ne[1] = ne[1]; |
| result->ne[2] = ne[2]; |
| result->ne[3] = ne[3]; |
|
|
| result->nb[0] = nb[0]; |
| result->nb[1] = nb[1]; |
| result->nb[2] = nb[2]; |
| result->nb[3] = nb[3]; |
|
|
| result->op = GGML_V1_OP_PERMUTE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
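|
| // note: permute (and transpose below) only relabel the ne/nb bookkeeping; no |
| // data is moved, so the result is generally not contiguous and may need a |
| // ggml_v1_cpy into a fresh tensor before ops that assume contiguous memory. |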
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_transpose( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
|
|
| result->ne[0] = a->ne[1]; |
| result->ne[1] = a->ne[0]; |
|
|
| result->nb[0] = a->nb[1]; |
| result->nb[1] = a->nb[0]; |
|
|
| result->op = GGML_V1_OP_TRANSPOSE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_get_rows( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_is_matrix(a) && ggml_v1_is_vector(b) && b->type == GGML_V1_TYPE_I32); |
|
|
| bool is_node = false; |
|
|
| if (a->grad || b->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor_2d(ctx, GGML_V1_TYPE_F32, a->ne[0], b->ne[0]); |
|
|
| result->op = GGML_V1_OP_GET_ROWS; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_diag_mask_inf( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int n_past) { |
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| |
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
| struct ggml_v1_tensor * b = ggml_v1_new_i32(ctx, n_past); |
|
|
| result->op = GGML_V1_OP_DIAG_MASK_INF; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_soft_max( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a) { |
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| |
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
|
|
| result->op = GGML_V1_OP_SOFT_MAX; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = NULL; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_rope( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| int n_past, |
| int n_dims, |
| int mode) { |
| GGML_V1_ASSERT(n_past >= 0); |
| bool is_node = false; |
|
|
| if (a->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| |
| struct ggml_v1_tensor * result = ggml_v1_view_tensor(ctx, a); |
|
|
| struct ggml_v1_tensor * b = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_I32, 3); |
| ((int32_t *) b->data)[0] = n_past; |
| ((int32_t *) b->data)[1] = n_dims; |
| ((int32_t *) b->data)[2] = mode; |
|
|
| result->op = GGML_V1_OP_ROPE; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_conv_1d_1s( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_is_matrix(b)); |
| GGML_V1_ASSERT(a->ne[1] == b->ne[1]); |
| GGML_V1_ASSERT(a->ne[3] == 1); |
| bool is_node = false; |
|
|
| if (a->grad || b->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| const int ne[4] = { b->ne[0], a->ne[2], 1, 1, }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, 2, ne); |
|
|
| result->op = GGML_V1_OP_CONV_1D_1S; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_conv_1d_2s( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b) { |
| GGML_V1_ASSERT(ggml_v1_is_matrix(b)); |
| GGML_V1_ASSERT(a->ne[1] == b->ne[1]); |
| GGML_V1_ASSERT(a->ne[3] == 1); |
| bool is_node = false; |
|
|
| if (a->grad || b->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, 2, ne); |
|
|
| result->op = GGML_V1_OP_CONV_1D_2S; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b; |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_flash_attn( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * q, |
| struct ggml_v1_tensor * k, |
| struct ggml_v1_tensor * v, |
| bool masked) { |
| GGML_V1_ASSERT(ggml_v1_can_mul_mat(k, q)); |
| |
|
|
| bool is_node = false; |
|
|
| if (q->grad || k->grad || v->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, 4, q->ne); |
|
|
| result->op = GGML_V1_OP_FLASH_ATTN; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = q; |
| result->src1 = k; |
| result->opt[0] = v; |
| result->opt[1] = ggml_v1_new_i32(ctx, masked ? 1 : 0); |
|
|
| return result; |
| } |
|
|
| |
|
|
| struct ggml_v1_tensor * ggml_v1_flash_ff( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * a, |
| struct ggml_v1_tensor * b0, |
| struct ggml_v1_tensor * b1, |
| struct ggml_v1_tensor * c0, |
| struct ggml_v1_tensor * c1) { |
| GGML_V1_ASSERT(ggml_v1_can_mul_mat(b0, a)); |
| |
|
|
| bool is_node = false; |
|
|
| if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { |
| GGML_V1_ASSERT(false); |
| is_node = true; |
| } |
|
|
| |
| struct ggml_v1_tensor * result = ggml_v1_new_tensor(ctx, GGML_V1_TYPE_F32, 4, a->ne); |
|
|
| result->op = GGML_V1_OP_FLASH_FF; |
| result->grad = is_node ? ggml_v1_dup_tensor(ctx, result) : NULL; |
| result->src0 = a; |
| result->src1 = b0; |
| result->opt[0] = b1; |
| result->opt[1] = c0; |
| result->opt[2] = c1; |
|
|
| return result; |
| } |
|
|
| |
|
|
| void ggml_v1_set_param( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_tensor * tensor) { |
| tensor->is_param = true; |
|
|
| GGML_V1_ASSERT(tensor->grad == NULL); |
| tensor->grad = ggml_v1_dup_tensor(ctx, tensor); |
| } |
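|
| // forward compute: one ggml_v1_compute_forward_* routine per op, dispatched |
| // on the source tensor type. params->type selects the INIT/COMPUTE/FINALIZE |
| // phase and params->ith / params->nth identify this thread's share of the work. |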
|
|
| |
|
|
| static void ggml_v1_compute_forward_dup_f16( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(params->ith == 0); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(dst)); |
| GGML_V1_ASSERT(ggml_v1_nelements(dst) == ggml_v1_nelements(src0)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const size_t nb00 = src0->nb[0]; |
| const size_t nb01 = src0->nb[1]; |
| const size_t nb02 = src0->nb[2]; |
| const size_t nb03 = src0->nb[3]; |
|
|
| if (ggml_v1_is_contiguous(src0) && src0->type == dst->type) { |
| memcpy(dst->data, src0->data, ggml_v1_nelements(dst) * GGML_V1_TYPE_SIZE[src0->type]); |
| return; |
| } |
|
|
| if (src0->nb[0] == sizeof(ggml_v1_fp16_t)) { |
| if (dst->type == GGML_V1_TYPE_F16) { |
| int id = 0; |
| const size_t rs = ne00*nb00; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; |
| char * dst_ptr = (char *) dst->data + id*rs; |
|
|
| memcpy(dst_ptr, src0_ptr, rs); |
|
|
| id++; |
| } |
| } |
| } |
| } else if (dst->type == GGML_V1_TYPE_F32) { |
| int id = 0; |
| float * dst_ptr = (float *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const ggml_v1_fp16_t * src0_ptr = (ggml_v1_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = GGML_V1_FP16_TO_FP32(*src0_ptr); |
| id++; |
| } |
| } |
| } |
| } |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
| } else { |
| |
|
|
| if (dst->type == GGML_V1_TYPE_F32) { |
| int id = 0; |
| float * dst_ptr = (float *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const ggml_v1_fp16_t * src0_ptr = (ggml_v1_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = GGML_V1_FP16_TO_FP32(*src0_ptr); |
| id++; |
| } |
| } |
| } |
| } |
| } else if (dst->type == GGML_V1_TYPE_F16) { |
| int id = 0; |
| ggml_v1_fp16_t * dst_ptr = (ggml_v1_fp16_t *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const ggml_v1_fp16_t * src0_ptr = (ggml_v1_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = *src0_ptr; |
| id++; |
| } |
| } |
| } |
| } |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_dup_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(params->ith == 0); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(dst)); |
| GGML_V1_ASSERT(ggml_v1_nelements(dst) == ggml_v1_nelements(src0)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const size_t nb00 = src0->nb[0]; |
| const size_t nb01 = src0->nb[1]; |
| const size_t nb02 = src0->nb[2]; |
| const size_t nb03 = src0->nb[3]; |
|
|
| if (ggml_v1_is_contiguous(src0) && src0->type == dst->type) { |
| memcpy(dst->data, src0->data, ggml_v1_nelements(dst) * GGML_V1_TYPE_SIZE[src0->type]); |
| return; |
| } |
|
|
| if (src0->nb[0] == sizeof(float)) { |
| if (dst->type == GGML_V1_TYPE_F32) { |
| int id = 0; |
| const size_t rs = ne00*nb00; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; |
| char * dst_ptr = (char *) dst->data + id*rs; |
|
|
| memcpy(dst_ptr, src0_ptr, rs); |
|
|
| id++; |
| } |
| } |
| } |
| } else if (dst->type == GGML_V1_TYPE_F16) { |
| int id = 0; |
| ggml_v1_fp16_t * dst_ptr = (ggml_v1_fp16_t *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = GGML_V1_FP32_TO_FP16(*src0_ptr); |
| id++; |
| } |
| } |
| } |
| } |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
| } else { |
| |
|
|
| if (dst->type == GGML_V1_TYPE_F32) { |
| int id = 0; |
| float * dst_ptr = (float *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = *src0_ptr; |
| id++; |
| } |
| } |
| } |
| } |
| } else if (dst->type == GGML_V1_TYPE_F16) { |
| int id = 0; |
| ggml_v1_fp16_t * dst_ptr = (ggml_v1_fp16_t *) dst->data; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| for (int i00 = 0; i00 < ne00; i00++) { |
| const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| dst_ptr[id] = GGML_V1_FP32_TO_FP16(*src0_ptr); |
| id++; |
| } |
| } |
| } |
| } |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_dup( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_dup_f16(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_dup_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_add_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(src0, src1) && ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| const size_t nb00 = src0->nb[0]; |
| const size_t nb01 = src0->nb[1]; |
|
|
| const size_t nb10 = src1->nb[0]; |
| const size_t nb11 = src1->nb[1]; |
|
|
| const size_t nb0 = dst->nb[0]; |
| const size_t nb1 = dst->nb[1]; |
|
|
| GGML_V1_ASSERT( nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb00 == sizeof(float)); |
|
|
| if (nb10 == sizeof(float)) { |
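| // rows are split evenly across threads; the last thread also takes the remainder |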
| const int j0 = (n/nth)*ith; |
| const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1); |
|
|
| for (int j = j0; j < j1; j++) { |
| ggml_v1_vec_add_f32(nc, |
| (float *) ((char *) dst->data + j*nb1), |
| (float *) ((char *) src0->data + j*nb01), |
| (float *) ((char *) src1->data + j*nb11)); |
| } |
| } else { |
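| // src1 is not contiguous along dim 0: fall back to element-wise addition |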
| |
| for (int j = ith; j < n; j += nth) { |
| float * dst_ptr = (float *) ((char *) dst->data + j*nb1); |
| float * src0_ptr = (float *) ((char *) src0->data + j*nb01); |
| for (int i = 0; i < nc; i++) { |
| float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); |
|
|
| dst_ptr[i] = src0_ptr[i] + *src1_ptr; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_add( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_add_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_sub_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, src1) && ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
| assert(src1->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_sub_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1])), |
| (float *) ((char *) src1->data + i*(src1->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_sub( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_sub_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_mul_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, src1) && ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
| assert(src1->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_mul_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1])), |
| (float *) ((char *) src1->data + i*(src1->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_mul( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_mul_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_div_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, src1) && ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
| assert(src1->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_div_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1])), |
| (float *) ((char *) src1->data + i*(src1->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_div( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_div_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_sqr_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_sqr_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_sqr( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_sqr_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_sqrt_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_sqrt_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_sqrt( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_sqrt_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_sum_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_is_scalar(dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| assert(src0->nb[0] == sizeof(float)); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const size_t nb01 = src0->nb[1]; |
| const size_t nb02 = src0->nb[2]; |
| const size_t nb03 = src0->nb[3]; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| ggml_v1_vec_sum_f32(ne00, |
| (float *) (dst->data), |
| (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_sum( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_sum_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_mean_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| assert(src0->nb[0] == sizeof(float)); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const size_t nb01 = src0->nb[1]; |
| const size_t nb02 = src0->nb[2]; |
| const size_t nb03 = src0->nb[3]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| const int ne3 = dst->ne[3]; |
|
|
| assert(ne0 == 1); |
| assert(ne1 == ne01); |
| assert(ne2 == ne02); |
| assert(ne3 == ne03); |
|
|
| UNUSED(ne0); |
| UNUSED(ne1); |
| UNUSED(ne2); |
| UNUSED(ne3); |
|
|
| const size_t nb1 = dst->nb[1]; |
| const size_t nb2 = dst->nb[2]; |
| const size_t nb3 = dst->nb[3]; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| ggml_v1_vec_sum_f32(ne00, |
| (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), |
| (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); |
|
|
| *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_mean( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_mean_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_repeat_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_can_repeat(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| |
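| // only rank-2 tensors are supported here, as the asserts below enforce |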
| assert(src0->ne[2] == 1); |
| assert(src0->ne[3] == 1); |
| assert( dst->ne[2] == 1); |
| assert( dst->ne[3] == 1); |
|
|
| const int nc = dst->ne[0]; |
| const int nr = dst->ne[1]; |
| const int nc0 = src0->ne[0]; |
| const int nr0 = src0->ne[1]; |
| const int ncr = nc/nc0; |
| const int nrr = nr/nr0; |
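| // ncr and nrr are exact integers, guaranteed by the ggml_v1_can_repeat check above |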
|
|
| |
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| |
| for (int i = 0; i < nrr; i++) { |
| for (int j = 0; j < ncr; j++) { |
| for (int k = 0; k < nr0; k++) { |
| ggml_v1_vec_cpy_f32(nc0, |
| (float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), |
| (float *) ((char *) src0->data + ( k)*(src0->nb[1]))); |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_repeat( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_repeat_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_abs_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert(dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_abs_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_abs( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_abs_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_sgn_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert(dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_sgn_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_sgn( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_sgn_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_neg_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert(dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_neg_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_neg( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_neg_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_step_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert(dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_step_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_step( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_step_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_relu_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
|
|
| assert(dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < n; i++) { |
| ggml_v1_vec_relu_f32(nc, |
| (float *) ((char *) dst->data + i*( dst->nb[1])), |
| (float *) ((char *) src0->data + i*(src0->nb[1]))); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_relu( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_relu_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_gelu_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(src0)); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(dst)); |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nrows(src0); |
|
|
| |
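| // rows per thread |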
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
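| // row range for this thread |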
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| ggml_v1_vec_gelu_f32(nc, |
| (float *) ((char *) dst->data + i1*( dst->nb[1])), |
| (float *) ((char *) src0->data + i1*(src0->nb[1]))); |
|
|
| #ifndef NDEBUG |
| for (int k = 0; k < nc; k++) { |
| const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; |
| UNUSED(x); |
| assert(!isnan(x)); |
| assert(!isinf(x)); |
| } |
| #endif |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_gelu( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_gelu_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_norm_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| GGML_V1_ASSERT(src0->nb[0] == sizeof(float)); |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const size_t nb01 = src0->nb[1]; |
| const size_t nb02 = src0->nb[2]; |
| const size_t nb03 = src0->nb[3]; |
|
|
| const size_t nb1 = dst->nb[1]; |
| const size_t nb2 = dst->nb[2]; |
| const size_t nb3 = dst->nb[3]; |
|
|
| const ggml_v1_float eps = 1e-5f; |
|
|
| |
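| // normalization per row: subtract the mean, then scale by 1/sqrt(variance + eps) |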
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = ith; i01 < ne01; i01 += nth) { |
| const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); |
|
|
| ggml_v1_float mean = 0.0; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| mean += x[i00]; |
| } |
|
|
| mean /= ne00; |
|
|
| float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); |
|
|
| ggml_v1_float sum2 = 0.0; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| ggml_v1_float v = x[i00] - mean; |
| y[i00] = v; |
| sum2 += v*v; |
| } |
|
|
| const float scale = (float) (1.0/sqrt(sum2/ne00 + eps)); |
|
|
| ggml_v1_vec_scale_f32(ne00, y, scale); |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_norm( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_norm_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| |
| |
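| // helper to decide whether to route the matrix product through BLAS; |
| // for large, contiguous operands sgemm is typically faster than the hand-written kernels |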
| static bool ggml_v1_compute_forward_mul_mat_use_blas( |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| UNUSED(src0); |
|
|
| const int ne10 = src1->ne[0]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
|
|
| |
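| // the minimum sizes below are heuristic thresholds |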
| if (ggml_v1_is_contiguous(src0) && |
| ggml_v1_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { |
| |
| return true; |
| } |
|
|
| return false; |
| } |
| #endif |
|
|
| static void ggml_v1_compute_forward_mul_mat_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| const int ne12 = src1->ne[2]; |
| const int ne13 = src1->ne[3]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| const int ne3 = dst->ne[3]; |
| const int ne = ne0*ne1*ne2*ne3; |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| const int nb03 = src0->nb[3]; |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| const int nb12 = src1->nb[2]; |
| const int nb13 = src1->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| assert(ne02 == ne12); |
| assert(ne03 == ne13); |
| assert(ne2 == ne12); |
| assert(ne3 == ne13); |
|
|
| |
| assert(nb00 == sizeof(float) || nb01 == sizeof(float)); |
|
|
| |
| assert(nb0 == sizeof(float)); |
| assert(nb0 <= nb1); |
| assert(nb1 <= nb2); |
| assert(nb2 <= nb3); |
|
|
| assert(ne0 == ne01); |
| assert(ne1 == ne11); |
| assert(ne2 == ne02); |
| assert(ne3 == ne03); |
|
|
| |
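| // nb01 >= nb00 - src0 is not transposed |
| //   compute by src0 rows |
| // |
| // nb00 < nb01 - src0 is transposed |
| //   compute by src0 columns |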
| |
| |
| |
| |
|
|
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) { |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->ith != 0) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| const float * x = (float *) (src0->data); |
| const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); |
|
|
| float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); |
|
|
| |
| { |
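| // d = y * x^T: (ne11 x ne10) * (ne10 x ne01) -> (ne11 x ne01), row-major |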
| cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, |
| ne11, ne01, ne10, |
| 1.0f, y, ne10, |
| x, ne10, |
| 0.0f, d, ne01); |
| } |
| } |
| } |
|
|
| |
|
|
| return; |
| } |
| #endif |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| if (nb01 >= nb00) { |
| return; |
| } |
|
|
| |
| memset(params->wdata, 0, params->wsize); |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| if (nb01 >= nb00) { |
| return; |
| } |
|
|
| |
| |
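| // src0 was transposed: reduce the per-thread partial sums from wdata into dst |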
|
|
| float * const wdata = params->wdata; |
|
|
| |
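| // elements per thread, followed by this thread's element range |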
| const int dc = (ne + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, ne); |
|
|
| ggml_v1_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); |
|
|
| for (int k = 1; k < nth; k++) { |
| ggml_v1_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); |
| } |
|
|
| return; |
| } |
|
|
| if (nb01 >= nb00) { |
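| // src0 is not transposed: parallelize over src0 rows, one dot product |
| // per (src0 row, src1 column) pair |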
| |
| assert(nb10 == sizeof(float)); |
|
|
| |
|
|
| |
| const int nr = ne01*ne02*ne03; |
|
|
| |
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| |
| const int i03 = ir/(ne02*ne01); |
| const int i02 = (ir - i03*ne02*ne01)/ne01; |
| const int i01 = (ir - i03*ne02*ne01 - i02*ne01); |
|
|
| for (int ic = 0; ic < ne11; ++ic) { |
| |
| const int i13 = i03; |
| const int i12 = i02; |
| const int i11 = ic; |
|
|
| |
| const int i0 = i01; |
| const int i1 = i11; |
| const int i2 = i02; |
| const int i3 = i03; |
|
|
| ggml_v1_vec_dot_f32(ne00, |
| (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), |
| (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), |
| (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); |
| } |
| } |
| } else { |
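| // src0 is transposed: parallelize over src1 columns and accumulate scaled |
| // src0 columns (mad/axpy) into this thread's region of wdata |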
| |
| |
| |
|
|
| |
| const int nc = ne10; |
|
|
| |
| const int dc = (nc + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, nc); |
|
|
| |
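| // per-thread offset into wdata, padded by one cache line to avoid false sharing |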
| const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; |
| float * const wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| for (int ic = ic0; ic < ic1; ++ic) { |
| |
| const int i10 = ic; |
|
|
| |
| const int i03 = i13; |
| const int i02 = i12; |
| const int i00 = ic; |
|
|
| |
| const int i1 = i11; |
| const int i2 = i12; |
| const int i3 = i13; |
|
|
| assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); |
|
|
| ggml_v1_vec_mad_f32(ne01, |
| (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0), |
| (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)), |
| *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13))); |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| } |
|
|
| static void ggml_v1_compute_forward_mul_mat_f16_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| const int ne12 = src1->ne[2]; |
| const int ne13 = src1->ne[3]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| const int ne3 = dst->ne[3]; |
| const int ne = ne0*ne1*ne2*ne3; |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| const int nb03 = src0->nb[3]; |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| const int nb12 = src1->nb[2]; |
| const int nb13 = src1->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| GGML_V1_ASSERT(ne02 == ne12); |
| GGML_V1_ASSERT(ne03 == ne13); |
| GGML_V1_ASSERT(ne2 == ne12); |
| GGML_V1_ASSERT(ne3 == ne13); |
|
|
| |
| GGML_V1_ASSERT(nb00 == sizeof(ggml_v1_fp16_t) || nb01 == sizeof(ggml_v1_fp16_t)); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| GGML_V1_ASSERT(ne0 == ne01); |
| GGML_V1_ASSERT(ne1 == ne11); |
| GGML_V1_ASSERT(ne2 == ne02); |
| GGML_V1_ASSERT(ne3 == ne03); |
|
|
| |
| |
| |
| |
| |
|
|
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) { |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->ith != 0) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| float * const wdata = params->wdata; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| { |
| int id = 0; |
| for (int i01 = 0; i01 < ne01; ++i01) { |
| for (int i00 = 0; i00 < ne00; ++i00) { |
| wdata[id++] = GGML_V1_FP16_TO_FP32(*(ggml_v1_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); |
| } |
| } |
| } |
|
|
| const float * x = wdata; |
| const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); |
|
|
| |
| |
| |
| |
| |
| |
|
|
| { |
| #if 1 |
| |
| cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, |
| ne11, ne01, ne10, |
| 1.0f, y, ne00, |
| x, ne00, |
| 0.0f, d, ne01); |
| #else |
| |
| cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, |
| ne01, ne11, ne10, |
| 1.0f, x, ne00, |
| y, ne00, |
| 0.0f, d, ne01); |
| #endif |
| } |
| } |
| } |
|
|
| |
|
|
| return; |
| } |
| #endif |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| if (nb01 >= nb00) { |
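| // convert src1 to fp16 rows in wdata so the dot products below run entirely in fp16 |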
| ggml_v1_fp16_t * const wdata = params->wdata; |
|
|
| int id = 0; |
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| for (int i10 = 0; i10 < ne10; ++i10) { |
| wdata[id++] = GGML_V1_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); |
| } |
| } |
| } |
| } |
|
|
| GGML_V1_ASSERT(id*sizeof(ggml_v1_fp16_t) <= params->wsize); |
|
|
| return; |
| } |
|
|
| |
| memset(params->wdata, 0, params->wsize); |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| if (nb01 >= nb00) { |
| return; |
| } |
|
|
| |
| |
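| // src0 was transposed: reduce the per-thread fp16 partial sums into dst as f32 |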
|
|
| ggml_v1_fp16_t * const wdata = params->wdata; |
|
|
| |
| const int dc = (ne + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, ne); |
|
|
| for (int i = ic0; i < ic1; ++i) { |
| ((float *) dst->data)[i] = GGML_V1_FP16_TO_FP32(wdata[i]); |
| } |
|
|
| for (int k = 1; k < nth; k++) { |
| for (int i = ic0; i < ic1; ++i) { |
| ((float *) dst->data)[i] += GGML_V1_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]); |
| } |
| } |
|
|
| return; |
| } |
|
|
| if (nb01 >= nb00) { |
| |
| |
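| // nb10 == sizeof(float) on entry; half of that is sizeof(ggml_v1_fp16_t), |
| // matching the fp16 copy of src1 prepared during INIT |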
| assert(nb10/2 == sizeof(ggml_v1_fp16_t)); |
|
|
| |
|
|
| |
| const int nr = ne01*ne02*ne03; |
|
|
| |
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| ggml_v1_fp16_t * wdata = params->wdata; |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| |
| const int i03 = ir/(ne02*ne01); |
| const int i02 = (ir - i03*ne02*ne01)/ne01; |
| const int i01 = (ir - i03*ne02*ne01 - i02*ne01); |
|
|
| const int i13 = i03; |
| const int i12 = i02; |
|
|
| const int i0 = i01; |
| const int i2 = i02; |
| const int i3 = i03; |
|
|
| ggml_v1_fp16_t * src0_row = (ggml_v1_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); |
| ggml_v1_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; |
|
|
| float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); |
|
|
| assert(ne00 % 32 == 0); |
|
|
| for (int ic = 0; ic < ne11; ++ic) { |
| ggml_v1_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); |
| } |
| } |
| } else { |
| |
| |
| |
|
|
| |
| const int nc = ne10; |
|
|
| |
| const int dc = (nc + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, nc); |
|
|
| |
| const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; |
| ggml_v1_fp16_t * const wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| |
| const int i1 = i11; |
| const int i2 = i12; |
| const int i3 = i13; |
|
|
| ggml_v1_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; |
|
|
| for (int ic = ic0; ic < ic1; ++ic) { |
| |
| const int i10 = ic; |
|
|
| |
| const int i03 = i13; |
| const int i02 = i12; |
| const int i00 = ic; |
|
|
| assert(sizeof(ggml_v1_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); |
|
|
| ggml_v1_fp16_t * src0_col = (ggml_v1_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); |
| float src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); |
|
|
| ggml_v1_vec_mad_f16(ne01, dst_row, src0_col, src1_val); |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| } |
|
|
| static void ggml_v1_compute_forward_mul_mat_q4_0_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| const int ne12 = src1->ne[2]; |
| const int ne13 = src1->ne[3]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| const int ne3 = dst->ne[3]; |
| const int ne = ne0*ne1*ne2*ne3; |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| const int nb03 = src0->nb[3]; |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| const int nb12 = src1->nb[2]; |
| const int nb13 = src1->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| GGML_V1_ASSERT(ne02 == ne12); |
| GGML_V1_ASSERT(ne03 == ne13); |
| GGML_V1_ASSERT(ne2 == ne12); |
| GGML_V1_ASSERT(ne3 == ne13); |
|
|
| |
| GGML_V1_ASSERT(nb00 == (int) GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0] || nb01 == (int) GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0]); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| GGML_V1_ASSERT(ne0 == ne01); |
| GGML_V1_ASSERT(ne1 == ne11); |
| GGML_V1_ASSERT(ne2 == ne02); |
| GGML_V1_ASSERT(ne3 == ne03); |
|
|
| |
| |
| |
| |
| |
|
|
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) { |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->ith != 0) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| float * const wdata = params->wdata; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| { |
| int id = 0; |
| for (int i01 = 0; i01 < ne01; ++i01) { |
| |
| |
| |
| dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); |
| id += ne00; |
| } |
| } |
|
|
| const float * x = wdata; |
| const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); |
|
|
| |
| |
| |
| |
| |
| |
|
|
| { |
| #if 1 |
| |
| cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, |
| ne11, ne01, ne10, |
| 1.0f, y, ne00, |
| x, ne00, |
| 0.0f, d, ne01); |
| #else |
| |
| cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, |
| ne01, ne11, ne10, |
| 1.0f, x, ne00, |
| y, ne00, |
| 0.0f, d, ne01); |
| #endif |
| } |
| } |
| } |
|
|
| |
|
|
| return; |
| } |
| #endif |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| |
| if (nb01 >= nb00) { |
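| // quantize src1 rows to Q4_0 in wdata so the dot products run block-wise on quantized data |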
| char * wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| |
| |
| |
| quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); |
| wdata += (ne10*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_0]; |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| |
| memset(params->wdata, 0, params->wsize); |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| if (nb01 >= nb00) { |
| return; |
| } |
|
|
| float * const wdata = params->wdata; |
|
|
| |
| const int dc = (ne + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, ne); |
|
|
| ggml_v1_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); |
|
|
| for (int k = 1; k < nth; k++) { |
| ggml_v1_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); |
| } |
|
|
| return; |
| } |
|
|
| if (nb01 >= nb00) { |
| |
|
|
| |
|
|
| |
| const int nr = ne01*ne02*ne03; |
|
|
| |
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| void * wdata = params->wdata; |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| |
| const int i03 = ir/(ne02*ne01); |
| const int i02 = (ir - i03*ne02*ne01)/ne01; |
| const int i01 = (ir - i03*ne02*ne01 - i02*ne01); |
|
|
| const int i13 = i03; |
| const int i12 = i02; |
|
|
| const int i0 = i01; |
| const int i2 = i02; |
| const int i3 = i03; |
|
|
| void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); |
| char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_0]); |
|
|
| float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); |
|
|
| assert(ne00 % 32 == 0); |
|
|
| for (int ic = 0; ic < ne11; ++ic) { |
| ggml_v1_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_0]))); |
| } |
| } |
| } else { |
| |
| |
| |
| |
|
|
| |
| const int nc = ne10; |
|
|
| |
| const int dc = (nc + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, nc); |
|
|
| |
| const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; |
| float * const wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| |
| const int i1 = i11; |
| const int i2 = i12; |
| const int i3 = i13; |
|
|
| float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; |
|
|
| for (int ic = ic0; ic < ic1; ++ic) { |
| |
| const int i10 = ic; |
|
|
| |
| const int i03 = i13; |
| const int i02 = i12; |
| const int i00 = ic; |
|
|
| assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); |
|
|
| void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); |
| float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); |
|
|
| ggml_v1_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val); |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| } |
|
|
| static void ggml_v1_compute_forward_mul_mat_q4_1_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| const int ne03 = src0->ne[3]; |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| const int ne12 = src1->ne[2]; |
| const int ne13 = src1->ne[3]; |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| const int ne3 = dst->ne[3]; |
| const int ne = ne0*ne1*ne2*ne3; |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| const int nb03 = src0->nb[3]; |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| const int nb12 = src1->nb[2]; |
| const int nb13 = src1->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| GGML_V1_ASSERT(ne02 == ne12); |
| GGML_V1_ASSERT(ne03 == ne13); |
| GGML_V1_ASSERT(ne2 == ne12); |
| GGML_V1_ASSERT(ne3 == ne13); |
|
|
| |
| GGML_V1_ASSERT(nb00 == (int) GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1] || nb01 == (int) GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1]); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| GGML_V1_ASSERT(ne0 == ne01); |
| GGML_V1_ASSERT(ne1 == ne11); |
| GGML_V1_ASSERT(ne2 == ne02); |
| GGML_V1_ASSERT(ne3 == ne03); |
|
|
| |
| |
| |
| |
| |
|
|
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) { |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->ith != 0) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| float * const wdata = params->wdata; |
|
|
| for (int i03 = 0; i03 < ne03; i03++) { |
| for (int i02 = 0; i02 < ne02; i02++) { |
| { |
| int id = 0; |
| for (int i01 = 0; i01 < ne01; ++i01) { |
| |
| |
| |
| dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); |
| id += ne00; |
| } |
| } |
|
|
| const float * x = wdata; |
| const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); |
|
|
| |
| |
| |
| |
| |
| |
|
|
| { |
| #if 1 |
| |
| cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, |
| ne11, ne01, ne10, |
| 1.0f, y, ne00, |
| x, ne00, |
| 0.0f, d, ne01); |
| #else |
| |
| cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, |
| ne01, ne11, ne10, |
| 1.0f, x, ne00, |
| y, ne00, |
| 0.0f, d, ne01); |
| #endif |
| } |
| } |
| } |
|
|
| |
|
|
| return; |
| } |
| #endif |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| |
| if (nb01 >= nb00) { |
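| // same as the Q4_0 path: quantize src1 rows, here into the Q4_1 block format |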
| char * wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| |
| |
| |
| quantize_row_q4_1((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); |
| wdata += (ne10*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_1]; |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| |
| memset(params->wdata, 0, params->wsize); |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| if (nb01 >= nb00) { |
| return; |
| } |
|
|
| float * const wdata = params->wdata; |
|
|
| |
| const int dc = (ne + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, ne); |
|
|
| ggml_v1_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); |
|
|
| for (int k = 1; k < nth; k++) { |
| ggml_v1_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); |
| } |
|
|
| return; |
| } |
|
|
| if (nb01 >= nb00) { |
| |
|
|
| |
|
|
| |
| const int nr = ne01*ne02*ne03; |
|
|
| |
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| void * wdata = params->wdata; |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| |
| const int i03 = ir/(ne02*ne01); |
| const int i02 = (ir - i03*ne02*ne01)/ne01; |
| const int i01 = (ir - i03*ne02*ne01 - i02*ne01); |
|
|
| const int i13 = i03; |
| const int i12 = i02; |
|
|
| const int i0 = i01; |
| const int i2 = i02; |
| const int i3 = i03; |
|
|
| void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); |
| char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_1]); |
|
|
| float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); |
|
|
| assert(ne00 % 32 == 0); |
|
|
| for (int ic = 0; ic < ne11; ++ic) { |
| ggml_v1_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1])/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_1]))); |
| } |
| } |
| } else { |
| |
| |
| |
| |
|
|
| |
| const int nc = ne10; |
|
|
| |
| const int dc = (nc + nth - 1)/nth; |
|
|
| |
| const int ic0 = dc*ith; |
| const int ic1 = MIN(ic0 + dc, nc); |
|
|
| |
| const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; |
| float * const wdata = params->wdata; |
|
|
| for (int i13 = 0; i13 < ne13; ++i13) { |
| for (int i12 = 0; i12 < ne12; ++i12) { |
| for (int i11 = 0; i11 < ne11; ++i11) { |
| |
| const int i1 = i11; |
| const int i2 = i12; |
| const int i3 = i13; |
|
|
| float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; |
|
|
| for (int ic = ic0; ic < ic1; ++ic) { |
| |
| const int i10 = ic; |
|
|
| |
| const int i03 = i13; |
| const int i02 = i12; |
| const int i00 = ic; |
|
|
| assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); |
|
|
| void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); |
| float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); |
|
|
| ggml_v1_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val); |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| } |
|
|
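| // type dispatch for the matrix product. A minimal usage sketch (assuming a |
| // caller that fills in ggml_v1_compute_params the same way the graph-compute |
| // loop does; "node" is a hypothetical graph node): |
| // |
| //     struct ggml_v1_compute_params params = { GGML_V1_TASK_COMPUTE, 0, 1, wsize, wdata }; |
| //     ggml_v1_compute_forward_mul_mat(&params, node->src0, node->src1, node); |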
| static void ggml_v1_compute_forward_mul_mat( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| ggml_v1_compute_forward_mul_mat_q4_0_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| ggml_v1_compute_forward_mul_mat_q4_1_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_mul_mat_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| #if 0 |
| if (src0->type == GGML_V1_TYPE_F16 || src0->type == GGML_V1_TYPE_Q4_1) { |
| static int first = 8; |
| printf("src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); |
| printf("src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); |
| printf("dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); |
| if (first) { |
| --first; |
| } else { |
| for (int k = 0; k < dst->ne[1]; ++k) { |
| for (int j = 0; j < dst->ne[0]/16; ++j) { |
| for (int i = 0; i < 16; ++i) { |
| printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); |
| } |
| printf("\n"); |
| } |
| printf("\n"); |
| } |
| printf("\n"); |
| exit(0); |
| } |
| } else { |
| printf("aaaa src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]); |
| printf("aaaa src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]); |
| printf("aaaa dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]); |
| } |
| #endif |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_scale_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(src0)); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(dst)); |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(src0, dst)); |
| GGML_V1_ASSERT(ggml_v1_is_scalar(src1)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| |
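| // scale factor |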
| const float v = *(float *) src1->data; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nrows(src0); |
|
|
| |
| const int dr = (nr + nth - 1)/nth; |
|
|
| |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| ggml_v1_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_scale( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_scale_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_cpy( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| ggml_v1_compute_forward_dup(params, src0, dst); |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_reshape( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| |
| UNUSED(params); |
| UNUSED(src0); |
| UNUSED(dst); |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_view( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0) { |
| |
| UNUSED(params); |
| UNUSED(src0); |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_permute( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0) { |
| |
| UNUSED(params); |
| UNUSED(src0); |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_transpose( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0) { |
| |
| UNUSED(params); |
| UNUSED(src0); |
| } |
|
|
| |
|
|
| static void ggml_v1_compute_forward_get_rows_q4_0( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nelements(src1); |
|
|
| assert( dst->ne[0] == nc); |
| assert( dst->ne[1] == nr); |
| assert(src0->nb[0] == GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0]); |
|
|
| for (int i = 0; i < nr; ++i) { |
| const int r = ((int32_t *) src1->data)[i]; |
|
|
| dequantize_row_q4_0( |
| (const void *) ((char *) src0->data + r*src0->nb[1]), |
| (float *) ((char *) dst->data + i*dst->nb[1]), nc); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_get_rows_q4_1( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nelements(src1); |
|
|
| assert( dst->ne[0] == nc); |
| assert( dst->ne[1] == nr); |
| assert(src0->nb[0] == GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1]); |
|
|
| for (int i = 0; i < nr; ++i) { |
| const int r = ((int32_t *) src1->data)[i]; |
|
|
| dequantize_row_q4_1( |
| (const void *) ((char *) src0->data + r*src0->nb[1]), |
| (float *) ((char *) dst->data + i*dst->nb[1]), nc); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_get_rows_f16( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nelements(src1); |
|
|
| assert( dst->ne[0] == nc); |
| assert( dst->ne[1] == nr); |
| assert(src0->nb[0] == sizeof(ggml_v1_fp16_t)); |
|
|
| for (int i = 0; i < nr; ++i) { |
| const int r = ((int32_t *) src1->data)[i]; |
|
|
| for (int j = 0; j < nc; ++j) { |
| ggml_v1_fp16_t v = ((ggml_v1_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; |
| ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_V1_FP16_TO_FP32(v); |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_get_rows_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nelements(src1); |
|
|
| assert( dst->ne[0] == nc); |
| assert( dst->ne[1] == nr); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int i = 0; i < nr; ++i) { |
| const int r = ((int32_t *) src1->data)[i]; |
|
|
| ggml_v1_vec_cpy_f32(nc, |
| (float *) ((char *) dst->data + i*dst->nb[1]), |
| (float *) ((char *) src0->data + r*src0->nb[1])); |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_get_rows( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_Q4_0: |
| { |
| ggml_v1_compute_forward_get_rows_q4_0(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_1: |
| { |
| ggml_v1_compute_forward_get_rows_q4_1(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_get_rows_f16(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_get_rows_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
|
|
| } |
|
|
| // ggml_v1_compute_forward_diag_mask_inf |
|
|
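| // diag_mask_inf writes -INF above the shifted diagonal of each 2D slice, |
| // i.e. dst[k][j][i] = -INF for i > n_past + j (the causal attention mask). |
| // For example, with nc = 4 columns, nr = 3 rows and n_past = 1: |
| //   row 0:   v    v  -inf -inf |
| //   row 1:   v    v    v  -inf |
| //   row 2:   v    v    v    v |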
| static void ggml_v1_compute_forward_diag_mask_inf_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(src1->type == GGML_V1_TYPE_I32); |
| assert(ggml_v1_nelements(src1) == 1); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n_past = ((int32_t *) src1->data)[0]; |
|
|
| |
|
|
| const int n = ggml_v1_nrows(src0); |
| const int nc = src0->ne[0]; |
| const int nr = src0->ne[1]; |
| const int nz = n/nr; |
|
|
| assert( dst->nb[0] == sizeof(float)); |
| assert(src0->nb[0] == sizeof(float)); |
|
|
| for (int k = 0; k < nz; k++) { |
| for (int j = 0; j < nr; j++) { |
| for (int i = n_past; i < nc; i++) { |
| if (i > n_past + j) { |
| *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; |
| } |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_diag_mask_inf( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_soft_max |
|
|
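| // soft_max computes a numerically stable softmax over each row: |
| //   p[i] = exp(p[i] - max_j p[j]) / sum_j exp(p[j] - max_j p[j]) |
| // exp() is evaluated through the precomputed table_exp_f16 lookup table, |
| // indexed by the fp16 bit pattern of (p[i] - max); -INF entries map to 0. |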
| static void ggml_v1_compute_forward_soft_max_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(src0)); |
| GGML_V1_ASSERT(ggml_v1_is_contiguous(dst)); |
| GGML_V1_ASSERT(ggml_v1_are_same_shape(src0, dst)); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nc = src0->ne[0]; |
| const int nr = ggml_v1_nrows(src0); |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| float *p = (float *)((char *) dst->data + i1*dst->nb[1]); |
|
|
| #ifndef NDEBUG |
| for (int i = 0; i < nc; ++i) { |
| |
| assert(!isnan(p[i])); |
| } |
| #endif |
|
|
| float max = -INFINITY; |
| ggml_v1_vec_max_f32(nc, &max, p); |
|
|
| ggml_v1_float sum = 0.0; |
|
|
| uint16_t scvt; |
| for (int i = 0; i < nc; i++) { |
| if (p[i] == -INFINITY) { |
| p[i] = 0.0f; |
| } else { |
| |
| ggml_v1_fp16_t s = GGML_V1_FP32_TO_FP16(p[i] - max); |
| memcpy(&scvt, &s, sizeof(scvt)); |
| const float val = GGML_V1_FP16_TO_FP32(table_exp_f16[scvt]); |
| sum += val; |
| p[i] = val; |
| } |
| } |
|
|
| assert(sum > 0.0f); |
|
|
| sum = 1.0/sum; |
| ggml_v1_vec_scale_f32(nc, p, sum); |
|
|
| #ifndef NDEBUG |
| for (int i = 0; i < nc; ++i) { |
| assert(!isnan(p[i])); |
| assert(!isinf(p[i])); |
| } |
| #endif |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_soft_max( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_soft_max_f32(params, src0, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_F16: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_rope |
|
|
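| // rope applies the rotary position embedding: for each feature pair (x0, x1) |
| // at offset i0 and position p, with theta = 10000^(-i0/n_dims), |
| //   x0' = x0*cos(p*theta) - x1*sin(p*theta) |
| //   x1' = x0*sin(p*theta) + x1*cos(p*theta) |
| // src1 carries three I32 parameters: { n_past, n_dims, mode }. |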
| static void ggml_v1_compute_forward_rope_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(src1->type == GGML_V1_TYPE_I32); |
| assert(ggml_v1_nelements(src1) == 3); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n_past = ((int32_t *) src1->data)[0]; |
| const int n_dims = ((int32_t *) src1->data)[1]; |
| const int mode = ((int32_t *) src1->data)[2]; |
|
|
| |
| const int ne1 = src0->ne[1]; |
| const int ne2 = src0->ne[2]; |
| const int ne3 = src0->ne[3]; |
|
|
| const int nb0 = src0->nb[0]; |
| const int nb1 = src0->nb[1]; |
| const int nb2 = src0->nb[2]; |
| const int nb3 = src0->nb[3]; |
|
|
| |
| |
|
|
| assert(nb0 == sizeof(float)); |
|
|
| |
| for (int i3 = 0; i3 < ne3; i3++) { |
| for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { |
| const int p = (mode == 0 ? n_past + i2 : i2); |
| for (int i1 = 0; i1 < ne1; i1++) { |
| for (int i0 = 0; i0 < n_dims; i0 += 2) { |
| const double theta = pow(10000.0, ((double)-i0)/n_dims); |
|
|
| const double cos_theta = cos(p*theta); |
| const double sin_theta = sin(p*theta); |
|
|
| const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); |
| float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); |
|
|
| double x0 = src[0]; |
| double x1 = src[1]; |
|
|
| dst_data[0] = x0*cos_theta - x1*sin_theta; |
| dst_data[1] = x0*sin_theta + x1*cos_theta; |
| } |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_rope_f16( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| assert(params->ith == 0); |
| assert(src1->type == GGML_V1_TYPE_I32); |
| assert(ggml_v1_nelements(src1) == 3); |
|
|
| if (params->type == GGML_V1_TASK_INIT || params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| const int n_past = ((int32_t *) src1->data)[0]; |
| const int n_dims = ((int32_t *) src1->data)[1]; |
| const int mode = ((int32_t *) src1->data)[2]; |
|
|
| |
| const int ne1 = src0->ne[1]; |
| const int ne2 = src0->ne[2]; |
| const int ne3 = src0->ne[3]; |
|
|
| const int nb0 = src0->nb[0]; |
| const int nb1 = src0->nb[1]; |
| const int nb2 = src0->nb[2]; |
| const int nb3 = src0->nb[3]; |
|
|
| |
| |
|
|
| assert(nb0 == sizeof(ggml_v1_fp16_t)); |
|
|
| for (int i3 = 0; i3 < ne3; i3++) { |
| for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { |
| const int p = (mode == 0 ? n_past + i2 : i2); |
| for (int i1 = 0; i1 < ne1; i1++) { |
| for (int i0 = 0; i0 < n_dims; i0 += 2) { |
| const double theta = pow(10000.0, ((double)-i0)/n_dims); |
|
|
| const double cos_theta = cos(p*theta); |
| const double sin_theta = sin(p*theta); |
|
|
| const ggml_v1_fp16_t * const src = (ggml_v1_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); |
| ggml_v1_fp16_t * dst_data = (ggml_v1_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); |
|
|
| double x0 = ggml_v1_fp16_to_fp32(src[0]); |
| double x1 = ggml_v1_fp16_to_fp32(src[1]); |
|
|
| dst_data[0] = ggml_v1_fp32_to_fp16(x0*cos_theta - x1*sin_theta); |
| dst_data[1] = ggml_v1_fp32_to_fp16(x0*sin_theta + x1*cos_theta); |
| } |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_rope( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_rope_f16(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_rope_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_conv_1d_1s |
|
|
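| // conv_1d_1s: 1D convolution with stride 1 and half-width padding nh = nk/2. |
| // The INIT phase packs both operands into params->wdata with row width |
| // ew0 = ggml_v1_up32(ne01): the kernel is stored channel-major per tap and |
| // the source is shifted by nh into a zeroed buffer, so the COMPUTE phase is |
| // one contiguous dot product of length ew0 per kernel tap and output sample. |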
| static void ggml_v1_compute_forward_conv_1d_1s_f16_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(src0->type == GGML_V1_TYPE_F16); |
| GGML_V1_ASSERT(src1->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT( dst->type == GGML_V1_TYPE_F32); |
|
|
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| |
| |
|
|
| |
| const int nb1 = dst->nb[1]; |
| |
| |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nk = ne00; |
| const int nh = nk/2; |
|
|
| const int ew0 = ggml_v1_up32(ne01); |
|
|
| GGML_V1_ASSERT(ne00 % 2 == 1); |
| GGML_V1_ASSERT(nb00 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| // zero the work buffer - the packed source below relies on the zero padding |
| memset(params->wdata, 0, params->wsize); |
|
|
| // prepare kernel data (src0) |
| { |
| ggml_v1_fp16_t * const wdata = (ggml_v1_fp16_t *) params->wdata + 0; |
|
|
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const ggml_v1_fp16_t * const src = (ggml_v1_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); |
| ggml_v1_fp16_t * dst_data = wdata + i02*ew0*ne00; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| dst_data[i00*ew0 + i01] = src[i00]; |
| } |
| } |
| } |
| } |
|
|
| // prepare source data (src1) |
| { |
| ggml_v1_fp16_t * const wdata = (ggml_v1_fp16_t *) params->wdata + ne02*ew0*ne00; |
|
|
| for (int i11 = 0; i11 < ne11; i11++) { |
| const float * const src = (float *)((char *) src1->data + i11*nb11); |
| ggml_v1_fp16_t * dst_data = wdata; |
| for (int i10 = 0; i10 < ne10; i10++) { |
| dst_data[(i10 + nh)*ew0 + i11] = GGML_V1_FP32_TO_FP16(src[i10]); |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // total rows in dst |
| const int nr = ne02; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| float * dst_data = (float *)((char *) dst->data + i1*nb1); |
| for (int i0 = 0; i0 < ne10; ++i0) { |
| dst_data[i0] = 0; |
| for (int k = -nh; k <= nh; k++) { |
| float v = 0.0f; |
| ggml_v1_vec_dot_f16(ew0, &v, |
| (ggml_v1_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, |
| (ggml_v1_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); |
|
|
| dst_data[i0] += v; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_conv_1d_1s_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(src0->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT(src1->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT( dst->type == GGML_V1_TYPE_F32); |
|
|
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| |
| |
|
|
| |
| const int nb1 = dst->nb[1]; |
| |
| |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nk = ne00; |
| const int nh = nk/2; |
|
|
| const int ew0 = ggml_v1_up32(ne01); |
|
|
| GGML_V1_ASSERT(ne00 % 2 == 1); |
| GGML_V1_ASSERT(nb00 == sizeof(float)); |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| // zero the work buffer - the packed source below relies on the zero padding |
| memset(params->wdata, 0, params->wsize); |
|
|
| // prepare kernel data (src0) |
| { |
| float * const wdata = (float *) params->wdata + 0; |
|
|
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); |
| float * dst_data = wdata + i02*ew0*ne00; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| dst_data[i00*ew0 + i01] = src[i00]; |
| } |
| } |
| } |
| } |
|
|
| // prepare source data (src1) |
| { |
| float * const wdata = (float *) params->wdata + ne02*ew0*ne00; |
|
|
| for (int i11 = 0; i11 < ne11; i11++) { |
| const float * const src = (float *)((char *) src1->data + i11*nb11); |
| float * dst_data = wdata; |
| for (int i10 = 0; i10 < ne10; i10++) { |
| dst_data[(i10 + nh)*ew0 + i11] = src[i10]; |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // total rows in dst |
| const int nr = ne02; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| float * dst_data = (float *)((char *) dst->data + i1*nb1); |
| for (int i0 = 0; i0 < ne10; ++i0) { |
| dst_data[i0] = 0; |
| for (int k = -nh; k <= nh; k++) { |
| float v = 0.0f; |
| ggml_v1_vec_dot_f32(ew0, &v, |
| (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, |
| (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); |
|
|
| dst_data[i0] += v; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_conv_1d_1s( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_conv_1d_2s |
|
|
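| // conv_1d_2s is identical to conv_1d_1s except for the output stride: the |
| // COMPUTE loop advances the input position by two (i0 += 2) and writes to |
| // dst_data[i0/2], halving the output length. |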
| static void ggml_v1_compute_forward_conv_1d_2s_f16_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(src0->type == GGML_V1_TYPE_F16); |
| GGML_V1_ASSERT(src1->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT( dst->type == GGML_V1_TYPE_F32); |
|
|
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| |
| |
|
|
| |
| const int nb1 = dst->nb[1]; |
| |
| |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nk = ne00; |
| const int nh = nk/2; |
|
|
| const int ew0 = ggml_v1_up32(ne01); |
|
|
| GGML_V1_ASSERT(ne00 % 2 == 1); |
| GGML_V1_ASSERT(nb00 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| // zero the work buffer - the packed source below relies on the zero padding |
| memset(params->wdata, 0, params->wsize); |
|
|
| // prepare kernel data (src0) |
| { |
| ggml_v1_fp16_t * const wdata = (ggml_v1_fp16_t *) params->wdata + 0; |
|
|
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const ggml_v1_fp16_t * const src = (ggml_v1_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); |
| ggml_v1_fp16_t * dst_data = wdata + i02*ew0*ne00; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| dst_data[i00*ew0 + i01] = src[i00]; |
| } |
| } |
| } |
| } |
|
|
| // prepare source data (src1) |
| { |
| ggml_v1_fp16_t * const wdata = (ggml_v1_fp16_t *) params->wdata + ne02*ew0*ne00; |
|
|
| for (int i11 = 0; i11 < ne11; i11++) { |
| const float * const src = (float *)((char *) src1->data + i11*nb11); |
| ggml_v1_fp16_t * dst_data = wdata; |
| for (int i10 = 0; i10 < ne10; i10++) { |
| dst_data[(i10 + nh)*ew0 + i11] = GGML_V1_FP32_TO_FP16(src[i10]); |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // total rows in dst |
| const int nr = ne02; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| float * dst_data = (float *)((char *) dst->data + i1*nb1); |
| for (int i0 = 0; i0 < ne10; i0 += 2) { |
| dst_data[i0/2] = 0; |
| for (int k = -nh; k <= nh; k++) { |
| float v = 0.0f; |
| ggml_v1_vec_dot_f16(ew0, &v, |
| (ggml_v1_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, |
| (ggml_v1_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); |
|
|
| dst_data[i0/2] += v; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_conv_1d_2s_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| GGML_V1_ASSERT(src0->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT(src1->type == GGML_V1_TYPE_F32); |
| GGML_V1_ASSERT( dst->type == GGML_V1_TYPE_F32); |
|
|
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int ne00 = src0->ne[0]; |
| const int ne01 = src0->ne[1]; |
| const int ne02 = src0->ne[2]; |
| |
|
|
| const int ne10 = src1->ne[0]; |
| const int ne11 = src1->ne[1]; |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| const int nb00 = src0->nb[0]; |
| const int nb01 = src0->nb[1]; |
| const int nb02 = src0->nb[2]; |
| |
|
|
| const int nb10 = src1->nb[0]; |
| const int nb11 = src1->nb[1]; |
| |
| |
|
|
| |
| const int nb1 = dst->nb[1]; |
| |
| |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int nk = ne00; |
| const int nh = nk/2; |
|
|
| const int ew0 = ggml_v1_up32(ne01); |
|
|
| GGML_V1_ASSERT(ne00 % 2 == 1); |
| GGML_V1_ASSERT(nb00 == sizeof(float)); |
| GGML_V1_ASSERT(nb10 == sizeof(float)); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| // zero the work buffer - the packed source below relies on the zero padding |
| memset(params->wdata, 0, params->wsize); |
|
|
| // prepare kernel data (src0) |
| { |
| float * const wdata = (float *) params->wdata + 0; |
|
|
| for (int i02 = 0; i02 < ne02; i02++) { |
| for (int i01 = 0; i01 < ne01; i01++) { |
| const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); |
| float * dst_data = wdata + i02*ew0*ne00; |
| for (int i00 = 0; i00 < ne00; i00++) { |
| dst_data[i00*ew0 + i01] = src[i00]; |
| } |
| } |
| } |
| } |
|
|
| // prepare source data (src1) |
| { |
| float * const wdata = (float *) params->wdata + ne02*ew0*ne00; |
|
|
| for (int i11 = 0; i11 < ne11; i11++) { |
| const float * const src = (float *)((char *) src1->data + i11*nb11); |
| float * dst_data = wdata; |
| for (int i10 = 0; i10 < ne10; i10++) { |
| dst_data[(i10 + nh)*ew0 + i11] = src[i10]; |
| } |
| } |
| } |
|
|
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // total rows in dst |
| const int nr = ne02; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int i1 = ir0; i1 < ir1; i1++) { |
| float * dst_data = (float *)((char *) dst->data + i1*nb1); |
| for (int i0 = 0; i0 < ne10; i0 += 2) { |
| dst_data[i0/2] = 0; |
| for (int k = -nh; k <= nh; k++) { |
| float v = 0.0f; |
| ggml_v1_vec_dot_f32(ew0, &v, |
| (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, |
| (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); |
|
|
| dst_data[i0/2] += v; |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_conv_1d_2s( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * src0, |
| const struct ggml_v1_tensor * src1, |
| struct ggml_v1_tensor * dst) { |
| switch (src0->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_flash_attn |
|
|
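| // flash_attn evaluates dst = softmax(mask(Q*K^T / sqrt(D))) * V one query |
| // row at a time: the score vector S (length M = P + N, padded to Mup for |
| // the unrolled softmax) lives in per-thread scratch, so the full attention |
| // matrix is never materialized. The mask is applied only when 'masked' is set. |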
| static void ggml_v1_compute_forward_flash_attn_f32( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * q, |
| const struct ggml_v1_tensor * k, |
| const struct ggml_v1_tensor * v, |
| const bool masked, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int neq0 = q->ne[0]; |
| const int neq1 = q->ne[1]; |
| const int neq2 = q->ne[2]; |
| const int neq3 = q->ne[3]; |
|
|
| const int nek0 = k->ne[0]; |
| const int nek1 = k->ne[1]; |
| |
| |
|
|
| |
| const int nev1 = v->ne[1]; |
| |
| |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| |
| |
|
|
| const int nbk0 = k->nb[0]; |
| const int nbk1 = k->nb[1]; |
| const int nbk2 = k->nb[2]; |
| const int nbk3 = k->nb[3]; |
|
|
| const int nbq0 = q->nb[0]; |
| const int nbq1 = q->nb[1]; |
| const int nbq2 = q->nb[2]; |
| const int nbq3 = q->nb[3]; |
|
|
| const int nbv0 = v->nb[0]; |
| const int nbv1 = v->nb[1]; |
| const int nbv2 = v->nb[2]; |
| const int nbv3 = v->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int D = neq0; |
| const int N = neq1; |
| const int P = nek1 - N; |
| const int M = P + N; |
|
|
| const int Mup = ggml_v1_up(M, GGML_V1_SOFT_MAX_UNROLL); |
|
|
| GGML_V1_ASSERT(ne0 == D); |
| GGML_V1_ASSERT(ne1 == N); |
| GGML_V1_ASSERT(P >= 0); |
|
|
| GGML_V1_ASSERT(nbq0 == sizeof(float)); |
| GGML_V1_ASSERT(nbk0 == sizeof(float)); |
| GGML_V1_ASSERT(nbv0 == sizeof(float)); |
|
|
| GGML_V1_ASSERT(neq0 == D); |
| GGML_V1_ASSERT(nek0 == D); |
| GGML_V1_ASSERT(nev1 == D); |
|
|
| GGML_V1_ASSERT(neq1 == N); |
| GGML_V1_ASSERT(nek1 == N + P); |
| GGML_V1_ASSERT(nev1 == D); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // parallelize by q rows using ggml_v1_vec_dot_f32 |
|
|
| // total rows in q |
| const int nr = neq1*neq2*neq3; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| const float scale = 1.0/sqrt((double) D); |
|
|
| |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| // q indices |
| const int iq3 = ir/(neq2*neq1); |
| const int iq2 = (ir - iq3*neq2*neq1)/neq1; |
| const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); |
|
|
| float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); |
|
|
| for (int i = M; i < Mup; ++i) { |
| S[i] = -INFINITY; |
| } |
|
|
| for (int ic = 0; ic < nek1; ++ic) { |
| // k indices |
| const int ik3 = iq3; |
| const int ik2 = iq2; |
| const int ik1 = ic; |
|
|
| // S indices |
| const int i1 = ik1; |
|
|
| ggml_v1_vec_dot_f32(neq0, |
| S + i1, |
| (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), |
| (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); |
| } |
|
|
| // scale the scores by 1/sqrt(D) |
| ggml_v1_vec_scale_f32(nek1, S, scale); |
|
|
| if (masked) { |
| for (int i = P; i < M; i++) { |
| if (i > P + iq1) { |
| S[i] = -INFINITY; |
| } |
| } |
| } |
|
|
| // softmax |
| { |
| float max = -INFINITY; |
| ggml_v1_vec_max_f32(M, &max, S); |
|
|
| float sum = 0.0f; |
| { |
| #ifdef GGML_V1_SOFT_MAX_ACCELERATE |
| max = -max; |
| vDSP_vsadd(S, 1, &max, S, 1, Mup); |
| vvexpf(S, S, &Mup); |
| ggml_v1_vec_sum_f32(Mup, &sum, S); |
| #else |
| uint16_t scvt[GGML_V1_SOFT_MAX_UNROLL]; |
| ggml_v1_float sump[GGML_V1_SOFT_MAX_UNROLL] = { 0.0 }; |
|
|
| for (int i = 0; i < Mup; i += GGML_V1_SOFT_MAX_UNROLL) { |
| float * SS = S + i; |
|
|
| for (int j = 0; j < GGML_V1_SOFT_MAX_UNROLL; ++j) { |
| if (SS[j] == -INFINITY) { |
| SS[j] = 0.0f; |
| } else { |
| ggml_v1_fp16_t s = GGML_V1_FP32_TO_FP16(SS[j] - max); |
| memcpy(&scvt[j], &s, sizeof(uint16_t)); |
| const float val = GGML_V1_FP16_TO_FP32(table_exp_f16[scvt[j]]); |
| sump[j] += val; |
| SS[j] = val; |
| } |
| } |
| } |
|
|
| for (int i = 0; i < GGML_V1_SOFT_MAX_UNROLL; i++) { |
| sum += sump[i]; |
| } |
| #endif |
| } |
|
|
| assert(sum > 0.0f); |
|
|
| sum = 1.0/sum; |
| ggml_v1_vec_scale_f32(M, S, sum); |
|
|
| #ifndef NDEBUG |
| for (int i = 0; i < M; ++i) { |
| assert(!isnan(S[i])); |
| assert(!isinf(S[i])); |
| } |
| #endif |
| } |
|
|
| for (int ic = 0; ic < nev1; ++ic) { |
| // dst indices |
| const int i1 = iq1; |
| const int i2 = iq2; |
| const int i3 = iq3; |
|
|
| ggml_v1_vec_dot_f32(nek1, |
| (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), |
| (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), |
| S); |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_flash_attn_f16( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * q, |
| const struct ggml_v1_tensor * k, |
| const struct ggml_v1_tensor * v, |
| const bool masked, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int neq0 = q->ne[0]; |
| const int neq1 = q->ne[1]; |
| const int neq2 = q->ne[2]; |
| const int neq3 = q->ne[3]; |
|
|
| const int nek0 = k->ne[0]; |
| const int nek1 = k->ne[1]; |
| |
| |
|
|
| |
| const int nev1 = v->ne[1]; |
| |
| |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| |
| |
|
|
| const int nbk0 = k->nb[0]; |
| const int nbk1 = k->nb[1]; |
| const int nbk2 = k->nb[2]; |
| const int nbk3 = k->nb[3]; |
|
|
| const int nbq0 = q->nb[0]; |
| const int nbq1 = q->nb[1]; |
| const int nbq2 = q->nb[2]; |
| const int nbq3 = q->nb[3]; |
|
|
| const int nbv0 = v->nb[0]; |
| const int nbv1 = v->nb[1]; |
| const int nbv2 = v->nb[2]; |
| const int nbv3 = v->nb[3]; |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int D = neq0; |
| const int N = neq1; |
| const int P = nek1 - N; |
| const int M = P + N; |
|
|
| const int Mup = ggml_v1_up(M, GGML_V1_SOFT_MAX_UNROLL); |
|
|
| GGML_V1_ASSERT(ne0 == D); |
| GGML_V1_ASSERT(ne1 == N); |
| GGML_V1_ASSERT(P >= 0); |
|
|
| GGML_V1_ASSERT(nbq0 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nbk0 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nbv0 == sizeof(ggml_v1_fp16_t)); |
|
|
| GGML_V1_ASSERT(neq0 == D); |
| GGML_V1_ASSERT(nek0 == D); |
| GGML_V1_ASSERT(nev1 == D); |
|
|
| GGML_V1_ASSERT(neq1 == N); |
| GGML_V1_ASSERT(nek1 == N + P); |
| GGML_V1_ASSERT(nev1 == D); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // parallelize by q rows using ggml_v1_vec_dot_f16 |
|
|
| // total rows in q |
| const int nr = neq1*neq2*neq3; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| const float scale = 1.0/sqrt((double) D); |
|
|
| |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| // q indices |
| const int iq3 = ir/(neq2*neq1); |
| const int iq2 = (ir - iq3*neq2*neq1)/neq1; |
| const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); |
|
|
| float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); |
|
|
| for (int i = M; i < Mup; ++i) { |
| S[i] = -INFINITY; |
| } |
|
|
| if (GGML_V1_VEC_DOT_UNROLL > 2 || nek1 % GGML_V1_VEC_DOT_UNROLL != 0) { |
| for (int ic = 0; ic < nek1; ++ic) { |
| // k indices |
| const int ik3 = iq3; |
| const int ik2 = iq2; |
| const int ik1 = ic; |
|
|
| // S indices |
| const int i1 = ik1; |
|
|
| ggml_v1_vec_dot_f16(neq0, |
| S + i1, |
| (ggml_v1_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), |
| (ggml_v1_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); |
| } |
| } else { |
| for (int ic = 0; ic < nek1; ic += GGML_V1_VEC_DOT_UNROLL) { |
| // k indices |
| const int ik3 = iq3; |
| const int ik2 = iq2; |
| const int ik1 = ic; |
|
|
| // S indices |
| const int i1 = ik1; |
|
|
| ggml_v1_vec_dot_f16_unroll(neq0, nbk1, |
| S + i1, |
| ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), |
| (ggml_v1_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); |
| } |
| } |
|
|
| // scale the scores by 1/sqrt(D) |
| ggml_v1_vec_scale_f32(nek1, S, scale); |
|
|
| if (masked) { |
| for (int i = P; i < M; i++) { |
| if (i > P + iq1) { |
| S[i] = -INFINITY; |
| } |
| } |
| } |
|
|
| // softmax |
| { |
| float max = -INFINITY; |
| ggml_v1_vec_max_f32(M, &max, S); |
|
|
| float sum = 0.0f; |
| { |
| #ifdef GGML_V1_SOFT_MAX_ACCELERATE |
| max = -max; |
| vDSP_vsadd(S, 1, &max, S, 1, Mup); |
| vvexpf(S, S, &Mup); |
| ggml_v1_vec_sum_f32(Mup, &sum, S); |
| #else |
| uint16_t scvt[GGML_V1_SOFT_MAX_UNROLL]; |
| ggml_v1_float sump[GGML_V1_SOFT_MAX_UNROLL] = { 0.0 }; |
|
|
| for (int i = 0; i < Mup; i += GGML_V1_SOFT_MAX_UNROLL) { |
| float * SS = S + i; |
|
|
| for (int j = 0; j < GGML_V1_SOFT_MAX_UNROLL; ++j) { |
| if (SS[j] == -INFINITY) { |
| SS[j] = 0.0f; |
| } else { |
| ggml_v1_fp16_t s = GGML_V1_FP32_TO_FP16(SS[j] - max); |
| memcpy(&scvt[j], &s, sizeof(uint16_t)); |
| const float val = GGML_V1_FP16_TO_FP32(table_exp_f16[scvt[j]]); |
| sump[j] += val; |
| SS[j] = val; |
| } |
| } |
| } |
|
|
| for (int i = 0; i < GGML_V1_SOFT_MAX_UNROLL; i++) { |
| sum += sump[i]; |
| } |
| #endif |
| } |
|
|
| assert(sum > 0.0f); |
|
|
| sum = 1.0/sum; |
| ggml_v1_vec_scale_f32(M, S, sum); |
|
|
| #ifndef NDEBUG |
| for (int i = 0; i < M; ++i) { |
| assert(!isnan(S[i])); |
| assert(!isinf(S[i])); |
| } |
| #endif |
| } |
|
|
| ggml_v1_fp16_t * S16 = (ggml_v1_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); |
|
|
| for (int i = 0; i < M; i++) { |
| S16[i] = GGML_V1_FP32_TO_FP16(S[i]); |
| } |
|
|
| if (GGML_V1_VEC_DOT_UNROLL == 1 || (nev1 % GGML_V1_VEC_DOT_UNROLL != 0)) { |
| for (int ic = 0; ic < nev1; ++ic) { |
| // dst indices |
| const int i1 = iq1; |
| const int i2 = iq2; |
| const int i3 = iq3; |
|
|
| ggml_v1_vec_dot_f16(nek1, |
| (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), |
| (ggml_v1_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), |
| S16); |
| } |
| } else { |
| for (int ic = 0; ic < nev1; ic += GGML_V1_VEC_DOT_UNROLL) { |
| // dst indices |
| const int i1 = iq1; |
| const int i2 = iq2; |
| const int i3 = iq3; |
|
|
| ggml_v1_vec_dot_f16_unroll(nek1, nbv1, |
| (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), |
| ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), |
| S16); |
| } |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_flash_attn( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * q, |
| const struct ggml_v1_tensor * k, |
| const struct ggml_v1_tensor * v, |
| const bool masked, |
| struct ggml_v1_tensor * dst) { |
| switch (q->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| ggml_v1_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward_flash_ff |
|
|
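| // flash_ff is a fused feed-forward block. For each row of a it computes, in |
| // per-thread scratch: |
| //   S   = b0*a + b1      (matrix-vector product plus bias) |
| //   S16 = gelu(S)        (evaluated in fp16) |
| //   dst = c0*S16 + c1    (projection back plus bias) |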
| static void ggml_v1_compute_forward_flash_ff_f16( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * a, |
| const struct ggml_v1_tensor * b0, |
| const struct ggml_v1_tensor * b1, |
| const struct ggml_v1_tensor * c0, |
| const struct ggml_v1_tensor * c1, |
| struct ggml_v1_tensor * dst) { |
| int64_t t0 = ggml_v1_perf_time_us(); |
| UNUSED(t0); |
|
|
| const int nea0 = a->ne[0]; |
| const int nea1 = a->ne[1]; |
| const int nea2 = a->ne[2]; |
| const int nea3 = a->ne[3]; |
|
|
| const int neb00 = b0->ne[0]; |
| const int neb01 = b0->ne[1]; |
| |
| |
|
|
| const int neb10 = b1->ne[0]; |
| const int neb11 = b1->ne[1]; |
| |
| |
|
|
| const int nec00 = c0->ne[0]; |
| const int nec01 = c0->ne[1]; |
| |
| |
|
|
| const int nec10 = c1->ne[0]; |
| const int nec11 = c1->ne[1]; |
| |
| |
|
|
| const int ne0 = dst->ne[0]; |
| const int ne1 = dst->ne[1]; |
| const int ne2 = dst->ne[2]; |
| |
|
|
| const int nba0 = a->nb[0]; |
| const int nba1 = a->nb[1]; |
| const int nba2 = a->nb[2]; |
| const int nba3 = a->nb[3]; |
|
|
| const int nbb00 = b0->nb[0]; |
| const int nbb01 = b0->nb[1]; |
| const int nbb02 = b0->nb[2]; |
| const int nbb03 = b0->nb[3]; |
|
|
| const int nbb10 = b1->nb[0]; |
| |
| |
| |
|
|
| const int nbc00 = c0->nb[0]; |
| const int nbc01 = c0->nb[1]; |
| const int nbc02 = c0->nb[2]; |
| const int nbc03 = c0->nb[3]; |
|
|
| const int nbc10 = c1->nb[0]; |
| |
| |
| |
|
|
| const int nb0 = dst->nb[0]; |
| const int nb1 = dst->nb[1]; |
| const int nb2 = dst->nb[2]; |
| const int nb3 = dst->nb[3]; |
|
|
| const int ith = params->ith; |
| const int nth = params->nth; |
|
|
| const int D = nea0; |
| |
| const int M = neb01; |
|
|
| GGML_V1_ASSERT(ne0 == nea0); |
| GGML_V1_ASSERT(ne1 == nea1); |
| GGML_V1_ASSERT(ne2 == nea2); |
|
|
| GGML_V1_ASSERT(nba0 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nbb00 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nbb10 == sizeof(float)); |
| GGML_V1_ASSERT(nbc00 == sizeof(ggml_v1_fp16_t)); |
| GGML_V1_ASSERT(nbc10 == sizeof(float)); |
|
|
| GGML_V1_ASSERT(neb00 == D); |
| GGML_V1_ASSERT(neb01 == M); |
| GGML_V1_ASSERT(neb10 == M); |
| GGML_V1_ASSERT(neb11 == 1); |
|
|
| GGML_V1_ASSERT(nec00 == M); |
| GGML_V1_ASSERT(nec01 == D); |
| GGML_V1_ASSERT(nec10 == D); |
| GGML_V1_ASSERT(nec11 == 1); |
|
|
| |
| GGML_V1_ASSERT(nb0 == sizeof(float)); |
| GGML_V1_ASSERT(nb0 <= nb1); |
| GGML_V1_ASSERT(nb1 <= nb2); |
| GGML_V1_ASSERT(nb2 <= nb3); |
|
|
| if (params->type == GGML_V1_TASK_INIT) { |
| return; |
| } |
|
|
| if (params->type == GGML_V1_TASK_FINALIZE) { |
| return; |
| } |
|
|
| // parallelize by a rows using ggml_v1_vec_dot_f16 |
|
|
| // total rows in a |
| const int nr = nea1*nea2*nea3; |
|
|
| // rows per thread |
| const int dr = (nr + nth - 1)/nth; |
|
|
| // row range for this thread |
| const int ir0 = dr*ith; |
| const int ir1 = MIN(ir0 + dr, nr); |
|
|
| for (int ir = ir0; ir < ir1; ++ir) { |
| // a indices |
| const int ia3 = ir/(nea2*nea1); |
| const int ia2 = (ir - ia3*nea2*nea1)/nea1; |
| const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); |
|
|
| float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); |
|
|
| for (int ic = 0; ic < neb01; ++ic) { |
| // b0 indices |
| const int ib03 = ia3; |
| const int ib02 = ia2; |
| const int ib01 = ic; |
|
|
| // S indices |
| const int i1 = ib01; |
|
|
| ggml_v1_vec_dot_f16(nea0, |
| S + i1, |
| (ggml_v1_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), |
| (ggml_v1_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); |
| } |
|
|
| ggml_v1_vec_add_f32(neb01, S, S, (float *) b1->data); |
| |
|
|
| ggml_v1_fp16_t * S16 = (ggml_v1_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); |
|
|
| for (int i = 0; i < M; i++) { |
| S16[i] = GGML_V1_FP32_TO_FP16(S[i]); |
| } |
|
|
| ggml_v1_vec_gelu_f16(neb01, S16, S16); |
|
|
| { |
| // dst indices |
| const int i1 = ia1; |
| const int i2 = ia2; |
| const int i3 = ia3; |
|
|
| for (int ic = 0; ic < nec01; ++ic) { |
|
|
| ggml_v1_vec_dot_f16(neb01, |
| (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), |
| (ggml_v1_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), |
| S16); |
| } |
|
|
| ggml_v1_vec_add_f32(nec01, |
| (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), |
| (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), |
| (float *) c1->data); |
| } |
| } |
| } |
|
|
| static void ggml_v1_compute_forward_flash_ff( |
| const struct ggml_v1_compute_params * params, |
| const struct ggml_v1_tensor * a, |
| const struct ggml_v1_tensor * b0, |
| const struct ggml_v1_tensor * b1, |
| const struct ggml_v1_tensor * c0, |
| const struct ggml_v1_tensor * c1, |
| struct ggml_v1_tensor * dst) { |
| switch (b0->type) { |
| case GGML_V1_TYPE_F16: |
| { |
| ggml_v1_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); |
| } break; |
| case GGML_V1_TYPE_F32: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_TYPE_Q4_0: |
| case GGML_V1_TYPE_Q4_1: |
| case GGML_V1_TYPE_I8: |
| case GGML_V1_TYPE_I16: |
| case GGML_V1_TYPE_I32: |
| case GGML_V1_TYPE_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_forward |
|
|
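| // ggml_v1_compute_forward dispatches one graph node to the forward kernel |
| // for its op; params->type selects the INIT / COMPUTE / FINALIZE phase and |
| // params->ith / params->nth identify the calling thread. |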
| static void ggml_v1_compute_forward(struct ggml_v1_compute_params * params, struct ggml_v1_tensor * tensor) { |
| GGML_V1_ASSERT(params); |
|
|
| switch (tensor->op) { |
| case GGML_V1_OP_DUP: |
| { |
| ggml_v1_compute_forward_dup(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_ADD: |
| { |
| ggml_v1_compute_forward_add(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_SUB: |
| { |
| ggml_v1_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_MUL: |
| { |
| ggml_v1_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_DIV: |
| { |
| ggml_v1_compute_forward_div(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_SQR: |
| { |
| ggml_v1_compute_forward_sqr(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_SQRT: |
| { |
| ggml_v1_compute_forward_sqrt(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_SUM: |
| { |
| ggml_v1_compute_forward_sum(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_MEAN: |
| { |
| ggml_v1_compute_forward_mean(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_REPEAT: |
| { |
| ggml_v1_compute_forward_repeat(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_ABS: |
| { |
| ggml_v1_compute_forward_abs(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_SGN: |
| { |
| ggml_v1_compute_forward_sgn(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_NEG: |
| { |
| ggml_v1_compute_forward_neg(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_STEP: |
| { |
| ggml_v1_compute_forward_step(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_RELU: |
| { |
| ggml_v1_compute_forward_relu(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_GELU: |
| { |
| ggml_v1_compute_forward_gelu(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_NORM: |
| { |
| ggml_v1_compute_forward_norm(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_MUL_MAT: |
| { |
| ggml_v1_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_SCALE: |
| { |
| ggml_v1_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_CPY: |
| { |
| ggml_v1_compute_forward_cpy(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_RESHAPE: |
| { |
| ggml_v1_compute_forward_reshape(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_VIEW: |
| { |
| ggml_v1_compute_forward_view(params, tensor->src0); |
| } break; |
| case GGML_V1_OP_PERMUTE: |
| { |
| ggml_v1_compute_forward_permute(params, tensor->src0); |
| } break; |
| case GGML_V1_OP_TRANSPOSE: |
| { |
| ggml_v1_compute_forward_transpose(params, tensor->src0); |
| } break; |
| case GGML_V1_OP_GET_ROWS: |
| { |
| ggml_v1_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_DIAG_MASK_INF: |
| { |
| ggml_v1_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_SOFT_MAX: |
| { |
| ggml_v1_compute_forward_soft_max(params, tensor->src0, tensor); |
| } break; |
| case GGML_V1_OP_ROPE: |
| { |
| ggml_v1_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_CONV_1D_1S: |
| { |
| ggml_v1_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_CONV_1D_2S: |
| { |
| ggml_v1_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); |
| } break; |
| case GGML_V1_OP_FLASH_ATTN: |
| { |
| int32_t t = ggml_v1_get_i32_1d(tensor->opt[1], 0); |
| GGML_V1_ASSERT(t == 0 || t == 1); |
| bool masked = t != 0; |
| ggml_v1_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); |
| } break; |
| case GGML_V1_OP_FLASH_FF: |
| { |
| ggml_v1_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); |
| } break; |
| case GGML_V1_OP_NONE: |
| { |
| // nop |
| } break; |
| case GGML_V1_OP_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| // ggml_v1_compute_backward |
|
|
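| // ggml_v1_compute_backward accumulates gradients into the operands of a |
| // node via the chain rule. For example, for elementwise y = a*b: |
| //   da += b * dy,   db += a * dy |
| // and for y = a/b: |
| //   da += dy / b,   db -= dy * (y / b) |
| // Ops that hit GGML_V1_ASSERT(false) have no backward pass in this version. |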
| static void ggml_v1_compute_backward(struct ggml_v1_context * ctx, struct ggml_v1_tensor * tensor, bool inplace) { |
| struct ggml_v1_tensor * src0 = tensor->src0; |
| struct ggml_v1_tensor * src1 = tensor->src1; |
|
|
| switch (tensor->op) { |
| case GGML_V1_OP_DUP: |
| { |
| if (src0->grad) { |
| src0->grad = ggml_v1_add_impl(ctx, src0->grad, tensor->grad, inplace); |
| } |
| } break; |
| case GGML_V1_OP_ADD: |
| { |
| if (src0->grad) { |
| src0->grad = ggml_v1_add_impl(ctx, src0->grad, tensor->grad, inplace); |
| } |
| if (src1->grad) { |
| src1->grad = ggml_v1_add_impl(ctx, src1->grad, tensor->grad, inplace); |
| } |
| } break; |
| case GGML_V1_OP_SUB: |
| { |
| if (src0->grad) { |
| src0->grad = ggml_v1_add_impl(ctx, src0->grad, tensor->grad, inplace); |
| } |
| if (src1->grad) { |
| src1->grad = ggml_v1_sub_impl(ctx, src1->grad, tensor->grad, inplace); |
| } |
| } break; |
| case GGML_V1_OP_MUL: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_mul(ctx, src1, tensor->grad), |
| inplace); |
| } |
| if (src1->grad) { |
| src1->grad = |
| ggml_v1_add_impl(ctx, |
| src1->grad, |
| ggml_v1_mul(ctx, src0, tensor->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_DIV: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_div(ctx, tensor->grad, src1), |
| inplace); |
| } |
| if (src1->grad) { |
| src1->grad = |
| ggml_v1_sub_impl(ctx, |
| src1->grad, |
| ggml_v1_mul(ctx, |
| tensor->grad, |
| ggml_v1_div(ctx, tensor, src1)), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_SQR: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_mul(ctx, |
| ggml_v1_mul(ctx, src0, tensor->grad), |
| ggml_v1_repeat(ctx, ggml_v1_new_f32(ctx, 2.0f), src0)), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_SQRT: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_div(ctx, |
| ggml_v1_repeat(ctx, ggml_v1_new_f32(ctx, 0.5f), tensor), |
| tensor), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_SUM: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_repeat(ctx, tensor->grad, src0->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_MEAN: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_REPEAT: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_sum(ctx, tensor->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_ABS: |
| { |
| if (src0->grad) { |
| src0->grad = |
| ggml_v1_add_impl(ctx, |
| src0->grad, |
| ggml_v1_mul(ctx, |
| ggml_v1_sgn(ctx, src0), |
| tensor->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_SGN: |
| { |
| if (src0->grad) { |
| // noop - the derivative of sgn(x) is zero almost everywhere |
| } |
| } break; |
| case GGML_V1_OP_NEG: |
| { |
| if (src0->grad) { |
| src0->grad = ggml_v1_sub_impl(ctx, src0->grad, tensor->grad, inplace); |
| } |
| } break; |
| case GGML_V1_OP_STEP: |
| { |
| if (src0->grad) { |
| // noop - the derivative of step(x) is zero almost everywhere |
| } |
| } break; |
| case GGML_V1_OP_RELU: |
| { |
| if (src0->grad) { |
| src0->grad = ggml_v1_sub_impl(ctx, |
| src0->grad, |
| ggml_v1_mul(ctx, |
| ggml_v1_step(ctx, src0), |
| tensor->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_GELU: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_NORM: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_MUL_MAT: |
| { |
| if (src0->grad) { |
| // gradient w.r.t. src0 is not implemented in this version |
| GGML_V1_ASSERT(false); |
| } |
| if (src1->grad) { |
| src1->grad = |
| ggml_v1_add_impl(ctx, |
| src1->grad, |
| // d src1 is accumulated as mul_mat(src0^T, d tensor) |
| ggml_v1_mul_mat(ctx, ggml_v1_transpose(ctx, src0), tensor->grad), |
| inplace); |
| } |
| } break; |
| case GGML_V1_OP_SCALE: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_CPY: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_RESHAPE: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_VIEW: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_PERMUTE: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_TRANSPOSE: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_GET_ROWS: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_DIAG_MASK_INF: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_SOFT_MAX: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_ROPE: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_CONV_1D_1S: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_CONV_1D_2S: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_FLASH_ATTN: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_FLASH_FF: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| case GGML_V1_OP_NONE: |
| { |
| // nop |
| } break; |
| case GGML_V1_OP_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
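| // ggml_v1_visit_parents performs a post-order DFS from the given tensor: |
| // constants (op == NONE, no grad) are recorded as leafs and everything else |
| // as nodes, so cgraph->nodes[] ends up in a valid evaluation order with the |
| // requested tensor last. |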
| static void ggml_v1_visit_parents(struct ggml_v1_cgraph * cgraph, struct ggml_v1_tensor * node) { |
| if (node->grad == NULL) { |
| // this usually happens when we generate intermediate nodes from constants in the backward pass |
| // it can also happen during the forward pass, if the user performs computations with constants |
| if (node->op != GGML_V1_OP_NONE) { |
| //GGML_V1_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); |
| } |
| } |
|
|
| // check if the node is already visited |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| if (cgraph->nodes[i] == node) { |
| return; |
| } |
| } |
|
|
| for (int i = 0; i < cgraph->n_leafs; i++) { |
| if (cgraph->leafs[i] == node) { |
| return; |
| } |
| } |
|
|
| if (node->src0) { |
| ggml_v1_visit_parents(cgraph, node->src0); |
| } |
|
|
| if (node->src1) { |
| ggml_v1_visit_parents(cgraph, node->src1); |
| } |
|
|
| for (int i = 0; i < GGML_V1_MAX_OPT; ++i) { |
| if (node->opt[i]) { |
| ggml_v1_visit_parents(cgraph, node->opt[i]); |
| } |
| } |
|
|
| if (node->op == GGML_V1_OP_NONE && node->grad == NULL) { |
| // reached a leaf node (a constant), not part of the gradient graph |
| GGML_V1_ASSERT(cgraph->n_leafs < GGML_V1_MAX_NODES); |
|
|
| cgraph->leafs[cgraph->n_leafs] = node; |
| cgraph->n_leafs++; |
| } else { |
| GGML_V1_ASSERT(cgraph->n_nodes < GGML_V1_MAX_NODES); |
|
|
| cgraph->nodes[cgraph->n_nodes] = node; |
| cgraph->grads[cgraph->n_nodes] = node->grad; |
| cgraph->n_nodes++; |
| } |
| } |
|
|
| static void ggml_v1_build_forward_impl(struct ggml_v1_cgraph * cgraph, struct ggml_v1_tensor * tensor, bool expand) { |
| if (!expand) { |
| cgraph->n_nodes = 0; |
| cgraph->n_leafs = 0; |
| } |
|
|
| const int n0 = cgraph->n_nodes; |
| UNUSED(n0); |
|
|
| ggml_v1_visit_parents(cgraph, tensor); |
|
|
| const int n_new = cgraph->n_nodes - n0; |
| GGML_V1_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); |
|
|
| if (n_new > 0) { |
| // the last visited node should always be the starting point |
| GGML_V1_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); |
| } |
| } |
|
|
| void ggml_v1_build_forward_expand(struct ggml_v1_cgraph * cgraph, struct ggml_v1_tensor * tensor) { |
| ggml_v1_build_forward_impl(cgraph, tensor, true); |
| } |
|
|
| struct ggml_v1_cgraph ggml_v1_build_forward(struct ggml_v1_tensor * tensor) { |
| struct ggml_v1_cgraph result = { |
| /*.n_nodes      =*/ 0, |
| /*.n_leafs      =*/ 0, |
| /*.n_threads    =*/ 0, |
| /*.work_size    =*/ 0, |
| /*.work         =*/ NULL, |
| /*.nodes        =*/ { NULL }, |
| /*.grads        =*/ { NULL }, |
| /*.leafs        =*/ { NULL }, |
| /*.perf_runs    =*/ 0, |
| /*.perf_cycles  =*/ 0, |
| /*.perf_time_us =*/ 0, |
| }; |
|
|
| ggml_v1_build_forward_impl(&result, tensor, false); |
|
|
| return result; |
| } |
|
|
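| // ggml_v1_build_backward extends a forward graph with gradient nodes. With |
| // keep == true the existing grad tensors are duplicated first, so the |
| // original forward graph stays usable after the backward graph is built. |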
| struct ggml_v1_cgraph ggml_v1_build_backward(struct ggml_v1_context * ctx, struct ggml_v1_cgraph * gf, bool keep) { |
| struct ggml_v1_cgraph result = *gf; |
|
|
| GGML_V1_ASSERT(gf->n_nodes > 0); |
|
|
| // if we are keeping the gradients, detach them from the original graph by duplicating the grad tensors |
| if (keep) { |
| for (int i = 0; i < gf->n_nodes; i++) { |
| struct ggml_v1_tensor * node = gf->nodes[i]; |
|
|
| if (node->grad) { |
| node->grad = ggml_v1_dup_tensor(ctx, node); |
| gf->grads[i] = node->grad; |
| } |
| } |
| } |
|
|
| for (int i = gf->n_nodes - 1; i >= 0; i--) { |
| struct ggml_v1_tensor * node = gf->nodes[i]; |
|
|
| // propagate gradients in reverse topological order |
| if (node->grad) { |
| ggml_v1_compute_backward(ctx, node, keep); |
| } |
| } |
|
|
| for (int i = gf->n_nodes - 1; i >= 0; i--) { |
| struct ggml_v1_tensor * node = gf->nodes[i]; |
|
|
| if (node->is_param) { |
| GGML_V1_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); |
| ggml_v1_build_forward_impl(&result, node->grad, true); |
| } |
| } |
|
|
| return result; |
| } |
|
|
| // |
| // thread data |
| // |
| // synchronization is done via busy loops; the lock macros below are no-ops |
| // |
|
|
| #ifdef __APPLE__ |
|
|
|
|
| typedef int ggml_v1_lock_t; |
|
|
| #define ggml_v1_lock_init(x) UNUSED(x) |
| #define ggml_v1_lock_destroy(x) UNUSED(x) |
| #define ggml_v1_lock_lock(x) UNUSED(x) |
| #define ggml_v1_lock_unlock(x) UNUSED(x) |
|
|
| #define GGML_V1_LOCK_INITIALIZER 0 |
|
|
| typedef pthread_t ggml_v1_thread_t; |
|
|
| #define ggml_v1_thread_create pthread_create |
| #define ggml_v1_thread_join pthread_join |
|
|
| #else |
|
|
|
|
| typedef int ggml_v1_lock_t; |
|
|
| #define ggml_v1_lock_init(x) UNUSED(x) |
| #define ggml_v1_lock_destroy(x) UNUSED(x) |
| #define ggml_v1_lock_lock(x) UNUSED(x) |
| #define ggml_v1_lock_unlock(x) UNUSED(x) |
|
|
| #define GGML_V1_LOCK_INITIALIZER 0 |
|
|
| typedef pthread_t ggml_v1_thread_t; |
|
|
| #define ggml_v1_thread_create pthread_create |
| #define ggml_v1_thread_join pthread_join |
|
|
| #endif |
|
|
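| // Worker coordination is a busy-wait protocol (no mutexes or condvars): |
| //   - n_ready counts threads at the rendezvous; the last thread to arrive |
| //     clears has_work, acknowledging the previous batch of work |
| //   - the main thread publishes per-worker state->node and raises has_work |
| //   - stop asks every worker to exit its loop |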
| struct ggml_v1_compute_state_shared { |
| ggml_v1_lock_t spin; |
|
|
| int n_threads; |
|
|
| // synchronization primitives |
| atomic_int n_ready; |
| atomic_bool has_work; // set when new work is published for the workers |
| atomic_bool stop; // set to make all workers exit |
| }; |
|
|
| struct ggml_v1_compute_state { |
| ggml_v1_thread_t thrd; |
|
|
| struct ggml_v1_compute_params params; |
| struct ggml_v1_tensor * node; |
|
|
| struct ggml_v1_compute_state_shared * shared; |
| }; |
|
|
| static thread_ret_t ggml_v1_graph_compute_thread(void * data) { |
| struct ggml_v1_compute_state * state = (struct ggml_v1_compute_state *) data; |
|
|
| const int n_threads = state->shared->n_threads; |
|
|
| while (true) { |
| if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { |
| atomic_store(&state->shared->has_work, false); |
| } else { |
| while (atomic_load(&state->shared->has_work)) { |
| if (atomic_load(&state->shared->stop)) { |
| return 0; |
| } |
| ggml_v1_lock_lock (&state->shared->spin); |
| ggml_v1_lock_unlock(&state->shared->spin); |
| } |
| } |
|
|
| atomic_fetch_sub(&state->shared->n_ready, 1); |
|
|
| // wait for work |
| while (!atomic_load(&state->shared->has_work)) { |
| if (atomic_load(&state->shared->stop)) { |
| return 0; |
| } |
| ggml_v1_lock_lock (&state->shared->spin); |
| ggml_v1_lock_unlock(&state->shared->spin); |
| } |
|
|
| // check if we should stop |
| if (atomic_load(&state->shared->stop)) { |
| break; |
| } |
|
|
| if (state->node) { |
| if (state->params.ith < state->params.nth) { |
| ggml_v1_compute_forward(&state->params, state->node); |
| } |
|
|
| state->node = NULL; |
| } else { |
| break; |
| } |
| } |
|
|
| return 0; |
| } |
|
|
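| // Typical driver sequence (a sketch; all names are defined in this file): |
| //   struct ggml_v1_cgraph gf = ggml_v1_build_forward(out); |
| //   gf.n_threads = 4;               // values <= 0 fall back to 8 below |
| //   ggml_v1_graph_compute(ctx, &gf); |
| //   // the result is now in out->data |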
| void ggml_v1_graph_compute(struct ggml_v1_context * ctx, struct ggml_v1_cgraph * cgraph) { |
| if (cgraph->n_threads <= 0) { |
| cgraph->n_threads = 8; |
| } |
|
|
| const int n_threads = cgraph->n_threads; |
|
|
| struct ggml_v1_compute_state_shared state_shared = { |
| GGML_V1_LOCK_INITIALIZER, |
| n_threads, |
| 0, |
| false, |
| false, |
| }; |
| struct ggml_v1_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_v1_compute_state)*(n_threads - 1)) : NULL; |
|
|
| // create the worker thread pool |
| if (n_threads > 1) { |
| ggml_v1_lock_init(&state_shared.spin); |
|
|
| atomic_store(&state_shared.has_work, true); |
|
|
| for (int j = 0; j < n_threads - 1; j++) { |
| workers[j] = (struct ggml_v1_compute_state) { |
| .thrd = 0, |
| .params = { |
| .type = GGML_V1_TASK_COMPUTE, |
| .ith = j + 1, |
| .nth = n_threads, |
| .wsize = cgraph->work ? ggml_v1_nbytes(cgraph->work) : 0, |
| .wdata = cgraph->work ? cgraph->work->data : NULL, |
| }, |
| .node = NULL, |
| .shared = &state_shared, |
| }; |
|
|
| int rc = ggml_v1_thread_create(&workers[j].thrd, NULL, ggml_v1_graph_compute_thread, &workers[j]); |
| GGML_V1_ASSERT(rc == 0); |
| UNUSED(rc); |
| } |
| } |
|
|
| // initialize tasks + work buffer |
| { |
| size_t work_size = 0; |
|
|
| // thread scheduling for the different operations |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| struct ggml_v1_tensor * node = cgraph->nodes[i]; |
|
|
| switch (node->op) { |
| case GGML_V1_OP_DUP: |
| { |
| node->n_tasks = 1; |
| } break; |
| case GGML_V1_OP_ADD: |
| { |
| node->n_tasks = n_threads; |
| } break; |
| case GGML_V1_OP_SUB: |
| case GGML_V1_OP_MUL: |
| case GGML_V1_OP_DIV: |
| case GGML_V1_OP_SQR: |
| case GGML_V1_OP_SQRT: |
| case GGML_V1_OP_SUM: |
| case GGML_V1_OP_MEAN: |
| case GGML_V1_OP_REPEAT: |
| case GGML_V1_OP_ABS: |
| case GGML_V1_OP_SGN: |
| case GGML_V1_OP_NEG: |
| case GGML_V1_OP_STEP: |
| case GGML_V1_OP_RELU: |
| { |
| node->n_tasks = 1; |
| } break; |
| case GGML_V1_OP_GELU: |
| { |
| node->n_tasks = n_threads; |
| } break; |
| case GGML_V1_OP_NORM: |
| { |
| node->n_tasks = n_threads; |
| } break; |
| case GGML_V1_OP_MUL_MAT: |
| { |
| node->n_tasks = n_threads; |
|
|
|
|
| size_t cur = 0; |
|
|
| // transposed src0 (nb[1] < nb[0]): stage a dst-sized buffer per task |
| if (node->src0->nb[1] < node->src0->nb[0]) { |
| cur = ggml_v1_nbytes(node)*node->n_tasks; |
| |
| } else { |
| if (node->src0->type == GGML_V1_TYPE_F16 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { |
| node->n_tasks = 1; |
| |
| cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); |
| } else { |
| cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F16]*ggml_v1_nelements(node->src1); |
| } |
| #else |
| cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F16]*ggml_v1_nelements(node->src1); |
| #endif |
| } else if (node->src0->type == GGML_V1_TYPE_F32 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| cur = 0; |
| } else if (node->src0->type == GGML_V1_TYPE_Q4_0 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { |
| node->n_tasks = 1; |
| cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); |
| } else { |
| cur = (GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0]*ggml_v1_nelements(node->src1))/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_0]; |
| } |
| #else |
| cur = (GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_0]*ggml_v1_nelements(node->src1))/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_0]; |
| #endif |
| } else if (node->src0->type == GGML_V1_TYPE_Q4_1 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { |
| node->n_tasks = 1; |
| cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); |
| } else { |
| cur = (GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1]*ggml_v1_nelements(node->src1))/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_1]; |
| } |
| #else |
| cur = (GGML_V1_TYPE_SIZE[GGML_V1_TYPE_Q4_1]*ggml_v1_nelements(node->src1))/GGML_V1_BLCK_SIZE[GGML_V1_TYPE_Q4_1]; |
| #endif |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
| } |
|
|
| work_size = MAX(work_size, cur); |
| } break; |
| case GGML_V1_OP_SCALE: |
| { |
| node->n_tasks = n_threads; |
| } break; |
| case GGML_V1_OP_CPY: |
| case GGML_V1_OP_RESHAPE: |
| case GGML_V1_OP_VIEW: |
| case GGML_V1_OP_PERMUTE: |
| case GGML_V1_OP_TRANSPOSE: |
| case GGML_V1_OP_GET_ROWS: |
| case GGML_V1_OP_DIAG_MASK_INF: |
| { |
| node->n_tasks = 1; |
| } break; |
| case GGML_V1_OP_SOFT_MAX: |
| { |
| node->n_tasks = n_threads; |
| } break; |
| case GGML_V1_OP_ROPE: |
| { |
| node->n_tasks = 1; |
| } break; |
| case GGML_V1_OP_CONV_1D_1S: |
| case GGML_V1_OP_CONV_1D_2S: |
| { |
| node->n_tasks = n_threads; |
|
|
| GGML_V1_ASSERT(node->src0->ne[3] == 1); |
| GGML_V1_ASSERT(node->src1->ne[2] == 1); |
| GGML_V1_ASSERT(node->src1->ne[3] == 1); |
|
|
| size_t cur = 0; |
| const int nk = node->src0->ne[0]; |
|
|
| if (node->src0->type == GGML_V1_TYPE_F16 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| cur = sizeof(ggml_v1_fp16_t)*( |
| nk*ggml_v1_up32(node->src0->ne[1])*node->src0->ne[2] + |
| ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] |
| ); |
| } else if (node->src0->type == GGML_V1_TYPE_F32 && |
| node->src1->type == GGML_V1_TYPE_F32) { |
| cur = sizeof(float)*( |
| nk*ggml_v1_up32(node->src0->ne[1])*node->src0->ne[2] + |
| ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] |
| ); |
| } else { |
| GGML_V1_ASSERT(false); |
| } |
|
|
| work_size = MAX(work_size, cur); |
| } break; |
| case GGML_V1_OP_FLASH_ATTN: |
| { |
| node->n_tasks = n_threads; |
|
|
| size_t cur = 0; |
|
|
| const int ne11 = ggml_v1_up(node->src1->ne[1], GGML_V1_SOFT_MAX_UNROLL); |
|
|
| if (node->src1->type == GGML_V1_TYPE_F32) { |
| cur = sizeof(float)*ne11*node->n_tasks; |
| cur += sizeof(float)*ne11*node->n_tasks; |
| } |
|
|
| if (node->src1->type == GGML_V1_TYPE_F16) { |
| cur = sizeof(float)*ne11*node->n_tasks; |
| cur += sizeof(float)*ne11*node->n_tasks; |
| } |
|
|
| work_size = MAX(work_size, cur); |
| } break; |
| case GGML_V1_OP_FLASH_FF: |
| { |
| node->n_tasks = n_threads; |
|
|
| size_t cur = 0; |
|
|
| if (node->src1->type == GGML_V1_TYPE_F32) { |
| cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; |
| cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; |
| } |
|
|
| if (node->src1->type == GGML_V1_TYPE_F16) { |
| cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; |
| cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; |
| } |
|
|
| work_size = MAX(work_size, cur); |
| } break; |
| case GGML_V1_OP_NONE: |
| { |
| node->n_tasks = 1; |
| } break; |
| case GGML_V1_OP_COUNT: |
| { |
| GGML_V1_ASSERT(false); |
| } break; |
| } |
| } |
|
|
| if (cgraph->work != NULL && work_size > cgraph->work_size) { |
| GGML_V1_ASSERT(false); |
| } |
|
|
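| // pad the work buffer with one cache line per extra thread so that |
| // per-thread scratch regions can be kept on separate cache lines |
| // (intended to avoid false sharing between workers) |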
| if (work_size > 0 && cgraph->work == NULL) { |
| cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); |
|
|
| GGML_V1_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); |
| cgraph->work = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_I8, cgraph->work_size); |
| } |
| } |
|
|
| const int64_t perf_start_cycles = ggml_v1_perf_cycles(); |
| const int64_t perf_start_time_us = ggml_v1_perf_time_us(); |
|
|
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| GGML_V1_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); |
|
|
| struct ggml_v1_tensor * node = cgraph->nodes[i]; |
|
|
| |
| |
| |
| |
|
|
| const int64_t perf_node_start_cycles = ggml_v1_perf_cycles(); |
| const int64_t perf_node_start_time_us = ggml_v1_perf_time_us(); |
|
|
| // INIT: single-threaded initialization pass for this node |
| struct ggml_v1_compute_params params = { |
| GGML_V1_TASK_INIT, |
| 0, |
| node->n_tasks, |
| cgraph->work ? ggml_v1_nbytes(cgraph->work) : 0, |
| cgraph->work ? cgraph->work->data : NULL, |
| }; |
|
|
| ggml_v1_compute_forward(¶ms, node); |
|
|
| // COMPUTE: distribute the node to the thread pool |
| if (node->n_tasks > 1) { |
| if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { |
| atomic_store(&state_shared.has_work, false); |
| } |
|
|
| while (atomic_load(&state_shared.has_work)) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| // launch thread pool |
| for (int j = 0; j < n_threads - 1; j++) { |
| workers[j].params = (struct ggml_v1_compute_params) { |
| .type = GGML_V1_TASK_COMPUTE, |
| .ith = j + 1, |
| .nth = node->n_tasks, |
| .wsize = cgraph->work ? ggml_v1_nbytes(cgraph->work) : 0, |
| .wdata = cgraph->work ? cgraph->work->data : NULL, |
| }; |
| workers[j].node = node; |
| } |
|
|
| atomic_fetch_sub(&state_shared.n_ready, 1); |
|
|
| while (atomic_load(&state_shared.n_ready) > 0) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| atomic_store(&state_shared.has_work, true); |
| } |
|
|
| params.type = GGML_V1_TASK_COMPUTE; |
| ggml_v1_compute_forward(¶ms, node); |
|
|
| // wait for thread pool |
| if (node->n_tasks > 1) { |
| if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { |
| atomic_store(&state_shared.has_work, false); |
| } |
|
|
| while (atomic_load(&state_shared.has_work)) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| atomic_fetch_sub(&state_shared.n_ready, 1); |
|
|
| while (atomic_load(&state_shared.n_ready) != 0) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
| } |
|
|
| // FINALIZE: distribute the node to the thread pool |
| if (node->n_tasks > 1) { |
| if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { |
| atomic_store(&state_shared.has_work, false); |
| } |
|
|
| while (atomic_load(&state_shared.has_work)) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| // launch thread pool |
| for (int j = 0; j < n_threads - 1; j++) { |
| workers[j].params = (struct ggml_v1_compute_params) { |
| .type = GGML_V1_TASK_FINALIZE, |
| .ith = j + 1, |
| .nth = node->n_tasks, |
| .wsize = cgraph->work ? ggml_v1_nbytes(cgraph->work) : 0, |
| .wdata = cgraph->work ? cgraph->work->data : NULL, |
| }; |
| workers[j].node = node; |
| } |
|
|
| atomic_fetch_sub(&state_shared.n_ready, 1); |
|
|
| while (atomic_load(&state_shared.n_ready) > 0) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| atomic_store(&state_shared.has_work, true); |
| } |
|
|
| params.type = GGML_V1_TASK_FINALIZE; |
| ggml_v1_compute_forward(¶ms, node); |
|
|
| // wait for thread pool |
| if (node->n_tasks > 1) { |
| if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { |
| atomic_store(&state_shared.has_work, false); |
| } |
|
|
| while (atomic_load(&state_shared.has_work)) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
|
|
| atomic_fetch_sub(&state_shared.n_ready, 1); |
|
|
| while (atomic_load(&state_shared.n_ready) != 0) { |
| ggml_v1_lock_lock (&state_shared.spin); |
| ggml_v1_lock_unlock(&state_shared.spin); |
| } |
| } |
|
|
| // performance stats (node) |
| { |
| int64_t perf_cycles_cur = ggml_v1_perf_cycles() - perf_node_start_cycles; |
| int64_t perf_time_us_cur = ggml_v1_perf_time_us() - perf_node_start_time_us; |
|
|
| node->perf_runs++; |
| node->perf_cycles += perf_cycles_cur; |
| node->perf_time_us += perf_time_us_cur; |
| } |
| } |
|
|
| // join thread pool |
| if (n_threads > 1) { |
| atomic_store(&state_shared.stop, true); |
| atomic_store(&state_shared.has_work, true); |
|
|
| for (int j = 0; j < n_threads - 1; j++) { |
| int rc = ggml_v1_thread_join(workers[j].thrd, NULL); |
| GGML_V1_ASSERT(rc == 0); |
| UNUSED(rc); |
| } |
|
|
| ggml_v1_lock_destroy(&state_shared.spin); |
| } |
|
|
| // performance stats (graph) |
| { |
| int64_t perf_cycles_cur = ggml_v1_perf_cycles() - perf_start_cycles; |
| int64_t perf_time_us_cur = ggml_v1_perf_time_us() - perf_start_time_us; |
|
|
| cgraph->perf_runs++; |
| cgraph->perf_cycles += perf_cycles_cur; |
| cgraph->perf_time_us += perf_time_us_cur; |
|
|
| GGML_V1_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", |
| __func__, cgraph->perf_runs, |
| (double) perf_cycles_cur / (double) ggml_v1_cycles_per_ms(), |
| (double) cgraph->perf_cycles / (double) ggml_v1_cycles_per_ms() / (double) cgraph->perf_runs, |
| (double) perf_time_us_cur / 1000.0, |
| (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); |
| } |
| } |
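| // Usage sketch (illustrative; assumes the usual graph-building helpers |
| // declared in ggml_v1.h, e.g. ggml_v1_add): |
| // |
| //   struct ggml_v1_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL }; |
| //   struct ggml_v1_context * ctx0 = ggml_v1_init(ip); |
| // |
| //   struct ggml_v1_tensor * a = ggml_v1_new_tensor_1d(ctx0, GGML_V1_TYPE_F32, 4); |
| //   struct ggml_v1_tensor * b = ggml_v1_new_tensor_1d(ctx0, GGML_V1_TYPE_F32, 4); |
| //   struct ggml_v1_tensor * c = ggml_v1_add(ctx0, a, b); |
| // |
| //   struct ggml_v1_cgraph gf = ggml_v1_build_forward(c); |
| //   gf.n_threads = 4; |
| //   ggml_v1_graph_compute(ctx0, &gf); |
| //   ggml_v1_free(ctx0); |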
|
|
| void ggml_v1_graph_reset(struct ggml_v1_cgraph * cgraph) { |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| struct ggml_v1_tensor * grad = cgraph->grads[i]; |
|
|
| if (grad) { |
| ggml_v1_set_zero(grad); |
| } |
| } |
| } |
|
|
| void ggml_v1_graph_print(const struct ggml_v1_cgraph * cgraph) { |
| int64_t perf_total_per_op_us[GGML_V1_OP_COUNT] = {0}; |
|
|
| GGML_V1_PRINT("=== GRAPH ===\n"); |
|
|
| GGML_V1_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); |
| GGML_V1_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size); |
|
|
| GGML_V1_PRINT("n_nodes = %d\n", cgraph->n_nodes); |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| struct ggml_v1_tensor * node = cgraph->nodes[i]; |
|
|
| perf_total_per_op_us[node->op] += node->perf_time_us; |
|
|
| GGML_V1_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", |
| i, |
| node->ne[0], node->ne[1], node->ne[2], |
| GGML_V1_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, |
| (double) node->perf_cycles / (double) ggml_v1_cycles_per_ms(), |
| (double) node->perf_cycles / (double) ggml_v1_cycles_per_ms() / (double) node->perf_runs, |
| (double) node->perf_time_us / 1000.0, |
| (double) node->perf_time_us / 1000.0 / node->perf_runs); |
| } |
|
|
| GGML_V1_PRINT("n_leafs = %d\n", cgraph->n_leafs); |
| for (int i = 0; i < cgraph->n_leafs; i++) { |
| struct ggml_v1_tensor * node = cgraph->leafs[i]; |
|
|
| GGML_V1_PRINT(" - %3d: [ %6d, %6d] %8s\n", |
| i, |
| node->ne[0], node->ne[1], |
| GGML_V1_OP_LABEL[node->op]); |
| } |
|
|
| for (int i = 0; i < GGML_V1_OP_COUNT; i++) { |
| GGML_V1_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_V1_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); |
| } |
|
|
| GGML_V1_PRINT("========================================\n"); |
| } |
|
|
| // check if the node is part of the graph (a NULL graph matches everything) |
| static bool ggml_v1_graph_find(const struct ggml_v1_cgraph * cgraph, const struct ggml_v1_tensor * node) { |
| if (cgraph == NULL) { |
| return true; |
| } |
|
|
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| if (cgraph->nodes[i] == node) { |
| return true; |
| } |
| } |
|
|
| return false; |
| } |
|
|
| static struct ggml_v1_tensor * ggml_v1_graph_get_parent(const struct ggml_v1_cgraph * cgraph, const struct ggml_v1_tensor * node) { |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| struct ggml_v1_tensor * parent = cgraph->nodes[i]; |
|
|
| if (parent->grad == node) { |
| return parent; |
| } |
| } |
|
|
| return NULL; |
| } |
|
|
| void ggml_v1_graph_dump_dot(const struct ggml_v1_cgraph * gb, const struct ggml_v1_cgraph * gf, const char * filename) { |
| char color[16]; |
|
|
| FILE * fp = fopen(filename, "w"); |
| GGML_V1_ASSERT(fp); |
|
|
| fprintf(fp, "digraph G {\n"); |
| fprintf(fp, " newrank = true;\n"); |
| fprintf(fp, " rankdir = LR;\n"); |
|
|
| for (int i = 0; i < gb->n_nodes; i++) { |
| struct ggml_v1_tensor * node = gb->nodes[i]; |
|
|
| if (ggml_v1_graph_get_parent(gb, node) != NULL) { |
| continue; |
| } |
|
|
| if (node->is_param) { |
| snprintf(color, sizeof(color), "yellow"); |
| } else if (node->grad) { |
| if (ggml_v1_graph_find(gf, node)) { |
| snprintf(color, sizeof(color), "green"); |
| } else { |
| snprintf(color, sizeof(color), "lightblue"); |
| } |
| } else { |
| snprintf(color, sizeof(color), "white"); |
| } |
|
|
| fprintf(fp, " \"%p\" [ \ |
| style = filled; fillcolor = %s; shape = record; \ |
| label=\"%d [%d, %d] | <x>%s", |
| (void *) node, color, |
| i, node->ne[0], node->ne[1], |
| GGML_V1_OP_SYMBOL[node->op]); |
|
|
| if (node->grad) { |
| fprintf(fp, " | <g>%s\"; ]\n", GGML_V1_OP_SYMBOL[node->grad->op]); |
| } else { |
| fprintf(fp, "\"; ]\n"); |
| } |
| } |
|
|
| for (int i = 0; i < gb->n_leafs; i++) { |
| struct ggml_v1_tensor * node = gb->leafs[i]; |
|
|
| snprintf(color, sizeof(color), "pink"); |
|
|
| if (ggml_v1_nelements(node) == 1) { |
| fprintf(fp, " \"%p\" [ \ |
| style = filled; fillcolor = %s; shape = record; \ |
| label=\"<x>%.1e\"; ]\n", |
| (void *) node, color, ggml_v1_get_f32_1d(node, 0)); |
| } else { |
| fprintf(fp, " \"%p\" [ \ |
| style = filled; fillcolor = %s; shape = record; \ |
| label=\"<x>CONST %d [%d, %d]\"; ]\n", |
| (void *) node, color, |
| i, node->ne[0], node->ne[1]); |
| } |
| } |
|
|
| for (int i = 0; i < gb->n_nodes; i++) { |
| struct ggml_v1_tensor * node = gb->nodes[i]; |
|
|
| struct ggml_v1_tensor * parent = ggml_v1_graph_get_parent(gb, node); |
|
|
| if (node->src0) { |
| struct ggml_v1_tensor * parent0 = ggml_v1_graph_get_parent(gb, node->src0); |
|
|
| fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", |
| parent0 ? (void *) parent0 : (void *) node->src0, |
| parent0 ? "g" : "x", |
| parent ? (void *) parent : (void *) node, |
| parent ? "g" : "x", |
| parent ? "empty" : "vee", |
| parent ? "dashed" : "solid"); |
| } |
|
|
| if (node->src1) { |
| struct ggml_v1_tensor * parent1 = ggml_v1_graph_get_parent(gb, node->src1); |
|
|
| fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", |
| parent1 ? (void *) parent1 : (void *) node->src1, |
| parent1 ? "g" : "x", |
| parent ? (void *) parent : (void *) node, |
| parent ? "g" : "x", |
| parent ? "empty" : "vee", |
| parent ? "dashed" : "solid"); |
| } |
| } |
|
|
| for (int i = 0; i < gb->n_leafs; i++) { |
| struct ggml_v1_tensor * node = gb->leafs[i]; |
|
|
| if (node->src0) { |
| fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", |
| (void *) node->src0, "x", |
| (void *) node, "x"); |
| } |
|
|
| if (node->src1) { |
| fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", |
| (void *) node->src1, "x", |
| (void *) node, "x"); |
| } |
| } |
|
|
| fprintf(fp, "}\n"); |
|
|
| fclose(fp); |
|
|
| GGML_V1_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); |
| } |
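| // Example: dump the backward graph gb, using the forward graph gf to pick |
| // node colors (yellow = parameter, green = node with a gradient that also |
| // appears in gf, light blue = gradient node not in gf): |
| // |
| //   ggml_v1_graph_dump_dot(&gb, &gf, "graph.dot"); |
| //   // render with: dot -Tpng graph.dot -o graph.png |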
|
|
| // |
| // optimization |
| // |
|
|
| static void ggml_v1_opt_set_params(int np, struct ggml_v1_tensor * const ps[], const float * x) { |
| int i = 0; |
| for (int p = 0; p < np; ++p) { |
| const int ne = ggml_v1_nelements(ps[p]); |
| // copy the next ne values from the flat array x into parameter tensor p |
| for (int j = 0; j < ne; ++j) { |
| ggml_v1_set_f32_1d(ps[p], j, x[i++]); |
| } |
| } |
| } |
|
|
| static void ggml_v1_opt_get_params(int np, struct ggml_v1_tensor * const ps[], float * x) { |
| int i = 0; |
| for (int p = 0; p < np; ++p) { |
| const int ne = ggml_v1_nelements(ps[p]); |
| // copy the elements of parameter tensor p into the flat array x |
| for (int j = 0; j < ne; ++j) { |
| x[i++] = ggml_v1_get_f32_1d(ps[p], j); |
| } |
| } |
| } |
|
|
| static void ggml_v1_opt_get_grad(int np, struct ggml_v1_tensor * const ps[], float * g) { |
| int i = 0; |
| for (int p = 0; p < np; ++p) { |
| const int ne = ggml_v1_nelements(ps[p]); |
| // copy the elements of the gradient of parameter tensor p into the flat array g |
| for (int j = 0; j < ne; ++j) { |
| g[i++] = ggml_v1_get_f32_1d(ps[p]->grad, j); |
| } |
| } |
| } |
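| // The optimizers below work on a flat view of the trainable parameters: |
| // x and g concatenate the elements of every is_param tensor in graph order, |
| // so nx is the sum of ggml_v1_nelements() over those tensors. |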
|
|
| // |
| // ADAM optimizer |
| // |
| //   ref: "Adam: A Method for Stochastic Optimization", Kingma & Ba, 2014 |
| //        https://arxiv.org/abs/1412.6980 |
| // |
|
|
| static enum ggml_v1_opt_result ggml_v1_opt_adam( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_opt_params params, |
| struct ggml_v1_tensor * f, |
| struct ggml_v1_cgraph * gf, |
| struct ggml_v1_cgraph * gb) { |
| GGML_V1_ASSERT(ggml_v1_is_scalar(f)); |
|
|
| gf->n_threads = params.n_threads; |
| gb->n_threads = params.n_threads; |
|
|
| // these will store the parameters we want to optimize |
| struct ggml_v1_tensor * ps[GGML_V1_MAX_PARAMS]; |
|
|
| int np = 0; |
| int nx = 0; |
| for (int i = 0; i < gf->n_nodes; ++i) { |
| if (gf->nodes[i]->is_param) { |
| GGML_V1_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); |
|
|
| GGML_V1_ASSERT(np < GGML_V1_MAX_PARAMS); |
|
|
| ps[np++] = gf->nodes[i]; |
| nx += ggml_v1_nelements(gf->nodes[i]); |
| } |
| } |
|
|
| // constants |
| const float alpha = params.adam.alpha; |
| const float beta1 = params.adam.beta1; |
| const float beta2 = params.adam.beta2; |
| const float eps = params.adam.eps; |
|
|
| float * x = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // view of the parameters |
| float * g1 = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // gradient |
| float * g2 = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // gradient squared |
| float * m = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // first moment |
| float * v = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // second moment |
| float * mh = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // first moment hat |
| float * vh = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // second moment hat |
|
|
| float * pf = params.past > 0 ? ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, params.past)->data : NULL; // past function values |
|
|
| // initialize the first and second moments to zero |
| ggml_v1_vec_set_f32(nx, m, 0.0f); |
| ggml_v1_vec_set_f32(nx, v, 0.0f); |
|
|
| // copy the current parameter values into the flat view x |
| ggml_v1_opt_get_params(np, ps, x); |
|
|
| // compute the function value and its gradient |
| ggml_v1_graph_reset (gf); |
| ggml_v1_set_f32 (f->grad, 1.0f); |
| ggml_v1_graph_compute(ctx, gb); |
|
|
| float fx_prev = ggml_v1_get_f32_1d(f, 0); |
| if (pf) { |
| pf[0] = fx_prev; |
| } |
|
|
| int n_no_improvement = 0; |
| float fx_best = fx_prev; |
|
|
| // run the optimizer iterations |
| for (int t = 0; t < params.adam.n_iter; ++t) { |
| GGML_V1_PRINT_DEBUG ("=== iter %d ===\n", t); |
|
|
| GGML_V1_PRINT_DEBUG ("f = %10.6f\n", ggml_v1_get_f32_1d(f, 0)); |
| GGML_V1_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_v1_get_f32_1d(ps[0]->grad, 0)); |
| GGML_V1_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_v1_get_f32_1d(ps[1]->grad, 0)); |
|
|
| for (int i = 0; i < np; ++i) { |
| GGML_V1_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, |
| ggml_v1_get_f32_1d(ps[i], 0), ggml_v1_get_f32_1d(ps[i]->grad, 0)); |
| } |
|
|
| const int64_t t_start_wall = ggml_v1_time_us(); |
| const int64_t t_start_cpu = ggml_v1_cycles(); |
| UNUSED(t_start_wall); |
| UNUSED(t_start_cpu); |
|
|
| { |
| // copy the current gradient into g1 |
| ggml_v1_opt_get_grad(np, ps, g1); |
|
|
| // m_t = beta1*m_{t-1} + (1 - beta1)*g_t |
| ggml_v1_vec_scale_f32(nx, m, beta1); |
| ggml_v1_vec_mad_f32 (nx, m, g1, 1.0f - beta1); |
|
|
| // g2 = g1^2 |
| ggml_v1_vec_sqr_f32 (nx, g2, g1); |
|
|
| // v_t = beta2*v_{t-1} + (1 - beta2)*g_t^2 |
| ggml_v1_vec_scale_f32(nx, v, beta2); |
| ggml_v1_vec_mad_f32 (nx, v, g2, 1.0f - beta2); |
|
|
| // m_hat = m_t / (1 - beta1^t) |
| // v_hat = v_t / (1 - beta2^t) |
| // x_t = x_{t-1} - alpha*m_hat/(sqrt(v_hat) + eps) |
| ggml_v1_vec_cpy_f32 (nx, mh, m); |
| ggml_v1_vec_cpy_f32 (nx, vh, v); |
|
|
| ggml_v1_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); |
| ggml_v1_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); |
|
|
| ggml_v1_vec_sqrt_f32 (nx, vh, vh); |
| ggml_v1_vec_acc1_f32 (nx, vh, eps); |
|
|
| ggml_v1_vec_div_f32 (nx, mh, mh, vh); |
| ggml_v1_vec_sub_f32 (nx, x, x, mh); |
|
|
| // write the updated parameters back into the graph tensors |
| ggml_v1_opt_set_params(np, ps, x); |
| } |
|
|
| ggml_v1_graph_reset (gf); |
| ggml_v1_set_f32 (f->grad, 1.0f); |
| ggml_v1_graph_compute(ctx, gb); |
|
|
| const float fx = ggml_v1_get_f32_1d(f, 0); |
|
|
| // check for convergence of the function value |
| if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { |
| GGML_V1_PRINT_DEBUG("converged\n"); |
|
|
| return GGML_V1_OPT_OK; |
| } |
|
|
| // delta-based convergence test |
| if (pf != NULL) { |
| // need at least params.past iterations to start checking for convergence |
| if (params.past <= t) { |
| const float rate = (pf[t%params.past] - fx)/fx; |
|
|
| if (fabs(rate) < params.delta) { |
| return GGML_V1_OPT_OK; |
| } |
| } |
|
|
| pf[t%params.past] = fx; |
| } |
|
|
| // check for improvement |
| if (params.max_no_improvement > 0) { |
| if (fx_best > fx) { |
| fx_best = fx; |
| n_no_improvement = 0; |
| } else { |
| ++n_no_improvement; |
|
|
| if (n_no_improvement >= params.max_no_improvement) { |
| return GGML_V1_OPT_OK; |
| } |
| } |
| } |
|
|
| fx_prev = fx; |
|
|
| { |
| const int64_t t_end_cpu = ggml_v1_cycles(); |
| GGML_V1_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); |
| UNUSED(t_end_cpu); |
|
|
| const int64_t t_end_wall = ggml_v1_time_us(); |
| GGML_V1_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); |
| UNUSED(t_end_wall); |
| } |
| } |
|
|
| return GGML_V1_OPT_DID_NOT_CONVERGE; |
| } |
|
|
| // |
| // L-BFGS optimizer |
| // |
| //   limited-memory BFGS with backtracking line search, structured after the |
| //   classic liblbfgs implementation: https://github.com/chokkan/liblbfgs |
| // |
|
|
| struct ggml_v1_lbfgs_iteration_data { |
| float alpha; // alpha_j cached from the first loop of the two-loop recursion |
| float ys; // y^T s (= 1/rho) |
| float * s; // s_k = x_{k+1} - x_k |
| float * y; // y_k = g_{k+1} - g_k |
| }; |
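| // ggml_v1_opt_lbfgs below keeps an array of m of these records and uses it |
| // as a ring buffer of the most recent (s, y) curvature pairs. |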
|
|
| static enum ggml_v1_opt_result linesearch_backtracking( |
| struct ggml_v1_context * ctx, |
| const struct ggml_v1_opt_params * params, |
| int nx, |
| float * x, |
| float * fx, |
| float * g, |
| float * d, |
| float * step, |
| const float * xp, |
| struct ggml_v1_tensor * f, |
| struct ggml_v1_cgraph * gf, |
| struct ggml_v1_cgraph * gb, |
| const int np, |
| struct ggml_v1_tensor * ps[]) { |
| int count = 0; |
|
|
| float width = 0.0f; |
| float dg = 0.0f; |
| float finit = 0.0f; |
| float dginit = 0.0f; |
| float dgtest = 0.0f; |
|
|
| const float dec = 0.5f; |
| const float inc = 2.1f; |
|
|
| if (*step <= 0.) { |
| return GGML_V1_LINESEARCH_INVALID_PARAMETERS; |
| } |
|
|
| // compute the initial gradient in the search direction |
| ggml_v1_vec_dot_f32(nx, &dginit, g, d); |
|
|
| // make sure that d points to a descent direction |
| if (0 < dginit) { |
| return GGML_V1_LINESEARCH_FAIL; |
| } |
|
|
| // initialize local variables |
| finit = *fx; |
| dgtest = params->lbfgs.ftol*dginit; |
|
|
| while (true) { |
| ggml_v1_vec_cpy_f32(nx, x, xp); |
| ggml_v1_vec_mad_f32(nx, x, d, *step); |
|
|
| // evaluate the function and its gradient at x = xp + (*step)*d |
| { |
| ggml_v1_opt_set_params(np, ps, x); |
|
|
| ggml_v1_graph_reset (gf); |
| ggml_v1_set_f32 (f->grad, 1.0f); |
| ggml_v1_graph_compute(ctx, gb); |
|
|
| ggml_v1_opt_get_grad(np, ps, g); |
|
|
| *fx = ggml_v1_get_f32_1d(f, 0); |
| } |
|
|
| ++count; |
|
|
| if (*fx > finit + (*step)*dgtest) { |
| width = dec; |
| } else { |
| // the Armijo (sufficient decrease) condition is satisfied |
| if (params->lbfgs.linesearch == GGML_V1_LINESEARCH_BACKTRACKING_ARMIJO) { |
| return count; |
| } |
|
|
| ggml_v1_vec_dot_f32(nx, &dg, g, d); |
|
|
| // check the Wolfe curvature condition |
| if (dg < params->lbfgs.wolfe * dginit) { |
| width = inc; |
| } else { |
| if(params->lbfgs.linesearch == GGML_V1_LINESEARCH_BACKTRACKING_WOLFE) { |
| // regular Wolfe conditions satisfied |
| return count; |
| } |
|
|
| if(dg > -params->lbfgs.wolfe*dginit) { |
| width = dec; |
| } else { |
| // strong Wolfe condition satisfied |
| return count; |
| } |
| } |
| } |
|
|
| if (*step < params->lbfgs.min_step) { |
| return GGML_V1_LINESEARCH_MINIMUM_STEP; |
| } |
| if (*step > params->lbfgs.max_step) { |
| return GGML_V1_LINESEARCH_MAXIMUM_STEP; |
| } |
| if (params->lbfgs.max_linesearch <= count) { |
| return GGML_V1_LINESEARCH_MAXIMUM_ITERATIONS; |
| } |
|
|
| (*step) *= width; |
| } |
|
|
| return GGML_V1_LINESEARCH_FAIL; |
| } |
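| // Summary of the conditions used above, with phi(a) = f(xp + a*d): |
| //   sufficient decrease (Armijo): phi(a) <= phi(0) + ftol*a*phi'(0) |
| //   Wolfe (curvature):            phi'(a) >= wolfe*phi'(0) |
| //   strong Wolfe:                 |phi'(a)| <= wolfe*|phi'(0)| |
| // On success the routine returns the (positive) number of function |
| // evaluations; failures are reported as negative ggml_v1_opt_result codes. |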
|
|
| static enum ggml_v1_opt_result ggml_v1_opt_lbfgs( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_opt_params params, |
| struct ggml_v1_tensor * f, |
| struct ggml_v1_cgraph * gf, |
| struct ggml_v1_cgraph * gb) { |
| if (params.lbfgs.linesearch == GGML_V1_LINESEARCH_BACKTRACKING_WOLFE || |
| params.lbfgs.linesearch == GGML_V1_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { |
| if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1. <= params.lbfgs.wolfe) { |
| return GGML_V1_OPT_INVALID_WOLFE; |
| } |
| } |
|
|
| gf->n_threads = params.n_threads; |
| gb->n_threads = params.n_threads; |
|
|
| const int m = params.lbfgs.m; |
|
|
| // these will store the parameters we want to optimize |
| struct ggml_v1_tensor * ps[GGML_V1_MAX_PARAMS]; |
|
|
| int np = 0; |
| int nx = 0; |
| for (int i = 0; i < gf->n_nodes; ++i) { |
| if (gf->nodes[i]->is_param) { |
| GGML_V1_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); |
|
|
| GGML_V1_ASSERT(np < GGML_V1_MAX_PARAMS); |
|
|
| ps[np++] = gf->nodes[i]; |
| nx += ggml_v1_nelements(gf->nodes[i]); |
| } |
| } |
|
|
| float * x = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // current parameters |
| float * xp = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // previous parameters |
| float * g = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // current gradient |
| float * gp = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // previous gradient |
| float * d = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; // search direction |
|
|
| float * pf = params.past > 0 ? ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, params.past)->data : NULL; // past function values |
|
|
| float fx = 0.0f; |
| float xnorm = 0.0f; |
| float gnorm = 0.0f; |
| float step = 0.0f; |
|
|
| // copy the current parameter values into the flat view x |
| ggml_v1_opt_get_params(np, ps, x); |
|
|
| // the L-BFGS memory |
| struct ggml_v1_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_v1_lbfgs_iteration_data)*m); |
|
|
| for (int i = 0; i < m; ++i) { |
| lm[i].alpha = 0.0f; |
| lm[i].ys = 0.0f; |
| lm[i].s = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; |
| lm[i].y = ggml_v1_new_tensor_1d(ctx, GGML_V1_TYPE_F32, nx)->data; |
| } |
|
|
| // evaluate the function value and its gradient |
| { |
| ggml_v1_opt_set_params(np, ps, x); |
|
|
| ggml_v1_graph_reset (gf); |
| ggml_v1_set_f32 (f->grad, 1.0f); |
| ggml_v1_graph_compute(ctx, gb); |
|
|
| ggml_v1_opt_get_grad(np, ps, g); |
|
|
| fx = ggml_v1_get_f32_1d(f, 0); |
| } |
|
|
| if (pf) { |
| pf[0] = fx; |
| } |
|
|
| float fx_best = fx; |
|
|
| // initial search direction: steepest descent |
| ggml_v1_vec_neg_f32(nx, d, g); |
|
|
| |
| ggml_v1_vec_norm_f32(nx, &xnorm, x); |
| ggml_v1_vec_norm_f32(nx, &gnorm, g); |
|
|
| if (xnorm < 1.0f) { |
| xnorm = 1.0f; |
| } |
|
|
| // already optimized |
| if (gnorm/xnorm <= params.lbfgs.eps) { |
| return GGML_V1_OPT_OK; |
| } |
|
|
| // compute the initial step: step = 1 / ||d|| |
| ggml_v1_vec_norm_inv_f32(nx, &step, d); |
|
|
| int j = 0; |
| int k = 1; |
| int ls = 0; |
| int end = 0; |
| int bound = 0; |
| int n_no_improvement = 0; |
|
|
| float ys = 0.0f; |
| float yy = 0.0f; |
| float beta = 0.0f; |
|
|
| while (true) { |
| // store the current position and gradient vectors |
| ggml_v1_vec_cpy_f32(nx, xp, x); |
| ggml_v1_vec_cpy_f32(nx, gp, g); |
|
|
| ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); |
|
|
| if (ls < 0) { |
| // line search failed - go back to the previous point and return |
| ggml_v1_vec_cpy_f32(nx, x, xp); |
| ggml_v1_vec_cpy_f32(nx, g, gp); |
|
|
| return ls; |
| } |
|
|
| ggml_v1_vec_norm_f32(nx, &xnorm, x); |
| ggml_v1_vec_norm_f32(nx, &gnorm, g); |
|
|
| GGML_V1_PRINT_DEBUG("f = %10.6f\n", ggml_v1_get_f32_1d(f, 0)); |
|
|
| if (xnorm < 1.0) { |
| xnorm = 1.0; |
| } |
| if (gnorm/xnorm <= params.lbfgs.eps) { |
| // converged |
| return GGML_V1_OPT_OK; |
| } |
|
|
| // delta-based convergence test |
| if (pf != NULL) { |
| // need at least params.past iterations to start checking for convergence |
| if (params.past <= k) { |
| const float rate = (pf[k%params.past] - fx)/fx; |
|
|
| if (fabs(rate) < params.delta) { |
| return GGML_V1_OPT_OK; |
| } |
| } |
|
|
| pf[k%params.past] = fx; |
| } |
|
|
| // check for improvement |
| if (params.max_no_improvement > 0) { |
| if (fx < fx_best) { |
| fx_best = fx; |
| n_no_improvement = 0; |
| } else { |
| n_no_improvement++; |
|
|
| if (n_no_improvement >= params.max_no_improvement) { |
| return GGML_V1_OPT_OK; |
| } |
| } |
| } |
|
|
| if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { |
| // reached the maximum number of iterations |
| return GGML_V1_OPT_DID_NOT_CONVERGE; |
| } |
|
|
| // update the vectors s and y: |
| //   s_{k+1} = x_{k+1} - x_{k} = step * d_{k} |
| //   y_{k+1} = g_{k+1} - g_{k} |
| ggml_v1_vec_sub_f32(nx, lm[end].s, x, xp); |
| ggml_v1_vec_sub_f32(nx, lm[end].y, g, gp); |
|
|
| // compute the scalars ys and yy: |
| //   ys = y^T s (inverse of the curvature rho) |
| //   yy = y^T y |
| ggml_v1_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); |
| ggml_v1_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); |
|
|
| lm[end].ys = ys; |
|
|
| // find the new search direction using the L-BFGS two-loop recursion |
| //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS |
|
|
| bound = (m <= k) ? m : k; |
| k++; |
| end = (end + 1)%m; |
|
|
| // initialize the search direction with -g |
| ggml_v1_vec_neg_f32(nx, d, g); |
|
|
| j = end; |
| for (int i = 0; i < bound; ++i) { |
| j = (j + m - 1) % m; |
| // alpha_j = rho_j * (s_j . q) |
| ggml_v1_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); |
| lm[j].alpha /= lm[j].ys; |
| // q = q - alpha_j * y_j |
| ggml_v1_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); |
| } |
|
|
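| // scale by gamma = (y.s)/(y.y), the standard initial Hessian |
| // approximation H0 = gamma*I for the two-loop recursion |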
| ggml_v1_vec_scale_f32(nx, d, ys/yy); |
|
|
| for (int i = 0; i < bound; ++i) { |
| // beta_j = rho_j * (y_j . d) |
| ggml_v1_vec_dot_f32(nx, &beta, lm[j].y, d); |
| beta /= lm[j].ys; |
| // d = d + (alpha_j - beta_j) * s_j |
| ggml_v1_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); |
| j = (j + 1)%m; |
| } |
|
|
| step = 1.0; |
| } |
|
|
| return GGML_V1_OPT_DID_NOT_CONVERGE; |
| } |
|
|
| struct ggml_v1_opt_params ggml_v1_opt_default_params(enum ggml_v1_opt_type type) { |
| struct ggml_v1_opt_params result; |
|
|
| switch (type) { |
| case GGML_V1_OPT_ADAM: |
| { |
| result = (struct ggml_v1_opt_params) { |
| .type = GGML_V1_OPT_ADAM, |
| .n_threads = 1, |
| .past = 0, |
| .delta = 1e-5f, |
|
|
| .max_no_improvement = 100, |
|
|
| .print_forward_graph = true, |
| .print_backward_graph = true, |
|
|
| .adam = { |
| .n_iter = 10000, |
| .alpha = 0.001f, |
| .beta1 = 0.9f, |
| .beta2 = 0.999f, |
| .eps = 1e-8f, |
| .eps_f = 1e-5f, |
| .eps_g = 1e-3f, |
| }, |
| }; |
| } break; |
| case GGML_V1_OPT_LBFGS: |
| { |
| result = (struct ggml_v1_opt_params) { |
| .type = GGML_V1_OPT_LBFGS, |
| .n_threads = 1, |
| .past = 0, |
| .delta = 1e-5f, |
|
|
| .max_no_improvement = 0, |
|
|
| .print_forward_graph = true, |
| .print_backward_graph = true, |
|
|
| .lbfgs = { |
| .m = 6, |
| .n_iter = 100, |
| .max_linesearch = 20, |
|
|
| .eps = 1e-5f, |
| .ftol = 1e-4f, |
| .wolfe = 0.9f, |
| .min_step = 1e-20f, |
| .max_step = 1e+20f, |
|
|
| .linesearch = GGML_V1_LINESEARCH_DEFAULT, |
| }, |
| }; |
| } break; |
| } |
|
|
| return result; |
| } |
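| // Usage sketch (illustrative): start from the defaults and override only the |
| // fields of interest before calling ggml_v1_opt below: |
| // |
| //   struct ggml_v1_opt_params opt_params = ggml_v1_opt_default_params(GGML_V1_OPT_ADAM); |
| //   opt_params.n_threads   = 4; |
| //   opt_params.adam.n_iter = 500; |
| // |
| //   // passing a NULL context makes ggml_v1_opt allocate a temporary one |
| //   enum ggml_v1_opt_result res = ggml_v1_opt(NULL, opt_params, f); |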
|
|
| enum ggml_v1_opt_result ggml_v1_opt( |
| struct ggml_v1_context * ctx, |
| struct ggml_v1_opt_params params, |
| struct ggml_v1_tensor * f) { |
| bool free_ctx = false; |
| if (ctx == NULL) { |
| struct ggml_v1_init_params params_ctx; |
| params_ctx.mem_size = 16*1024*1024; |
| params_ctx.mem_buffer = NULL; |
|
|
|
|
| ctx = ggml_v1_init(params_ctx); |
| if (ctx == NULL) { |
| return GGML_V1_OPT_NO_CONTEXT; |
| } |
|
|
| free_ctx = true; |
| } |
|
|
| enum ggml_v1_opt_result result = GGML_V1_OPT_OK; |
|
|
| // build forward + backward compute graphs |
| struct ggml_v1_cgraph gf = ggml_v1_build_forward (f); |
| struct ggml_v1_cgraph gb = ggml_v1_build_backward(ctx, &gf, false); |
|
|
| switch (params.type) { |
| case GGML_V1_OPT_ADAM: |
| { |
| result = ggml_v1_opt_adam(ctx, params, f, &gf, &gb); |
| } break; |
| case GGML_V1_OPT_LBFGS: |
| { |
| result = ggml_v1_opt_lbfgs(ctx, params, f, &gf, &gb); |
| } break; |
| } |
|
|
| if (params.print_forward_graph) { |
| ggml_v1_graph_print (&gf); |
| ggml_v1_graph_dump_dot(&gf, NULL, "opt-forward.dot"); |
| } |
|
|
| if (params.print_backward_graph) { |
| ggml_v1_graph_print (&gb); |
| ggml_v1_graph_dump_dot(&gb, &gf, "opt-backward.dot"); |
| } |
|
|
| if (free_ctx) { |
| ggml_v1_free(ctx); |
| } |
|
|
| return result; |
| } |
|
|
| // |
| // compile-time CPU feature detection |
| // |
|
|
| int ggml_v1_cpu_has_avx(void) { |
| #if defined(__AVX__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_avx2(void) { |
| #if defined(__AVX2__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_avx512(void) { |
| #if defined(__AVX512F__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_fma(void) { |
| #if defined(__FMA__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_neon(void) { |
| #if defined(__ARM_NEON) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_arm_fma(void) { |
| #if defined(__ARM_FEATURE_FMA) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_f16c(void) { |
| #if defined(__F16C__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_fp16_va(void) { |
| #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_wasm_simd(void) { |
| #if defined(__wasm_simd128__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_blas(void) { |
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_sse3(void) { |
| #if defined(__SSE3__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| int ggml_v1_cpu_has_vsx(void) { |
| #if defined(__POWER9_VECTOR__) |
| return 1; |
| #else |
| return 0; |
| #endif |
| } |
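| // Example (illustrative): assemble a one-line report from the probes above; |
| // note that these reflect compile-time flags, not runtime CPU detection: |
| // |
| //   printf("AVX=%d AVX2=%d AVX512=%d FMA=%d NEON=%d F16C=%d BLAS=%d\n", |
| //          ggml_v1_cpu_has_avx(), ggml_v1_cpu_has_avx2(), ggml_v1_cpu_has_avx512(), |
| //          ggml_v1_cpu_has_fma(), ggml_v1_cpu_has_neon(), ggml_v1_cpu_has_f16c(), |
| //          ggml_v1_cpu_has_blas()); |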
|
|
| |
|
|