#include #include #include /********* generated code snippet *********/ #define N 16 static void fdct_1d(float *dst, const float *src, int stridea, int strideb) { int i; for (i = 0; i < N; i++) { const float x0_0 = src[ 0*stridea] + src[15*stridea]; const float x0_1 = src[ 1*stridea] + src[14*stridea]; const float x0_2 = src[ 2*stridea] + src[13*stridea]; const float x0_3 = src[ 3*stridea] + src[12*stridea]; const float x0_4 = src[ 4*stridea] + src[11*stridea]; const float x0_5 = src[ 5*stridea] + src[10*stridea]; const float x0_6 = src[ 6*stridea] + src[ 9*stridea]; const float x0_7 = src[ 7*stridea] + src[ 8*stridea]; const float x0_8 = src[ 0*stridea] - src[15*stridea]; const float x0_9 = src[ 1*stridea] - src[14*stridea]; const float x0_a = src[ 2*stridea] - src[13*stridea]; const float x0_b = src[ 3*stridea] - src[12*stridea]; const float x0_c = src[ 4*stridea] - src[11*stridea]; const float x0_d = src[ 5*stridea] - src[10*stridea]; const float x0_e = src[ 6*stridea] - src[ 9*stridea]; const float x0_f = src[ 7*stridea] - src[ 8*stridea]; const float x2_0 = x0_0 + x0_7; const float x2_1 = x0_1 + x0_6; const float x2_2 = x0_2 + x0_5; const float x2_3 = x0_3 + x0_4; const float x2_4 = x0_0 - x0_7; const float x2_5 = x0_1 - x0_6; const float x2_6 = x0_2 - x0_5; const float x2_7 = x0_3 - x0_4; const float x4_0 = x2_0 + x2_3; const float x4_1 = x2_1 + x2_2; const float x4_2 = x2_0 - x2_3; const float x4_3 = x2_1 - x2_2; const float x5_0 = x4_0 + x4_1; const float x5_1 = x4_0 - x4_1; const float x5_2 = 1.30656296487638*x4_2 + 0.541196100146197*x4_3; const float x5_3 = 0.541196100146197*x4_2 - 1.30656296487638*x4_3; const float x6_0 = 1.38703984532215*x2_4 + 0.275899379282943*x2_7; const float x6_1 = 1.17587560241936*x2_5 + 0.785694958387102*x2_6; const float x6_2 = -0.785694958387102*x2_5 + 1.17587560241936*x2_6; const float x6_3 = 0.275899379282943*x2_4 - 1.38703984532215*x2_7; const float x7_0 = x6_0 + x6_1; const float x7_1 = x6_0 - x6_1; const float x7_2 = x6_2 + x6_3; const float x7_3 = x6_2 - x6_3; const float x3_5 = 0.707106781186547*x7_1 - 0.707106781186547*x7_3; const float x3_6 = 0.707106781186547*x7_1 + 0.707106781186547*x7_3; const float x8_0 = 1.40740373752638*x0_8 + 0.138617169199091*x0_f; const float x8_1 = 1.35331800117435*x0_9 + 0.410524527522357*x0_e; const float x8_2 = 1.24722501298667*x0_a + 0.666655658477747*x0_d; const float x8_3 = 1.09320186700176*x0_b + 0.897167586342636*x0_c; const float x8_4 = -0.897167586342636*x0_b + 1.09320186700176*x0_c; const float x8_5 = 0.666655658477747*x0_a - 1.24722501298667*x0_d; const float x8_6 = -0.410524527522357*x0_9 + 1.35331800117435*x0_e; const float x8_7 = 0.138617169199091*x0_8 - 1.40740373752638*x0_f; const float xa_0 = x8_0 + x8_3; const float xa_1 = x8_1 + x8_2; const float xa_2 = x8_0 - x8_3; const float xa_3 = x8_1 - x8_2; const float xb_0 = xa_0 + xa_1; const float xb_1 = xa_0 - xa_1; const float xb_2 = 1.30656296487638*xa_2 + 0.541196100146197*xa_3; const float xb_3 = 0.541196100146197*xa_2 - 1.30656296487638*xa_3; const float xc_0 = x8_4 + x8_7; const float xc_1 = x8_5 + x8_6; const float xc_2 = x8_4 - x8_7; const float xc_3 = x8_5 - x8_6; const float xd_0 = xc_0 + xc_1; const float xd_1 = xc_0 - xc_1; const float xd_2 = 1.30656296487638*xc_2 + 0.541196100146197*xc_3; const float xd_3 = 0.541196100146197*xc_2 - 1.30656296487638*xc_3; const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3; const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3; const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1; const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1; const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2; const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2; dst[ 0*stridea] = 0.25*x5_0; dst[ 1*stridea] = 0.25*xb_0; dst[ 2*stridea] = 0.25*x7_0; dst[ 3*stridea] = 0.25*x1_9; dst[ 4*stridea] = 0.25*x5_2; dst[ 5*stridea] = 0.25*x1_a; dst[ 6*stridea] = 0.25*x3_5; dst[ 7*stridea] = 0.25*x1_b; dst[ 8*stridea] = 0.25*x5_1; dst[ 9*stridea] = 0.25*x1_c; dst[10*stridea] = 0.25*x3_6; dst[11*stridea] = 0.25*x1_d; dst[12*stridea] = 0.25*x5_3; dst[13*stridea] = 0.25*x1_e; dst[14*stridea] = 0.25*x7_2; dst[15*stridea] = 0.25*xd_0; dst += strideb; src += strideb; } } static void fdct(float *dst, const float *src) { float tmp[N*N]; fdct_1d(tmp, src, 1, N); fdct_1d(dst, tmp, N, 1); } static void idct_1d(float *dst, const float *src, int stridea, int strideb) { int i; for (i = 0; i < N; i++) { const float xe_0 = 1.4142135623731*src[ 0*stridea]; const float xe_1 = 1.40740373752638*src[ 1*stridea] + 0.138617169199091*src[15*stridea]; const float xe_2 = 1.38703984532215*src[ 2*stridea] + 0.275899379282943*src[14*stridea]; const float xe_3 = 1.35331800117435*src[ 3*stridea] + 0.410524527522357*src[13*stridea]; const float xe_4 = 1.30656296487638*src[ 4*stridea] + 0.541196100146197*src[12*stridea]; const float xe_5 = 1.24722501298667*src[ 5*stridea] + 0.666655658477747*src[11*stridea]; const float xe_6 = 1.17587560241936*src[ 6*stridea] + 0.785694958387102*src[10*stridea]; const float xe_7 = 1.09320186700176*src[ 7*stridea] + 0.897167586342636*src[ 9*stridea]; const float xe_8 = 1.4142135623731*src[ 8*stridea]; const float xe_9 = -0.897167586342636*src[ 7*stridea] + 1.09320186700176*src[ 9*stridea]; const float xe_a = 0.785694958387102*src[ 6*stridea] - 1.17587560241936*src[10*stridea]; const float xe_b = -0.666655658477747*src[ 5*stridea] + 1.24722501298667*src[11*stridea]; const float xe_c = 0.541196100146197*src[ 4*stridea] - 1.30656296487638*src[12*stridea]; const float xe_d = -0.410524527522357*src[ 3*stridea] + 1.35331800117435*src[13*stridea]; const float xe_e = 0.275899379282943*src[ 2*stridea] - 1.38703984532215*src[14*stridea]; const float xe_f = -0.138617169199091*src[ 1*stridea] + 1.40740373752638*src[15*stridea]; const float x10_0 = xe_0 + xe_8; const float x10_1 = xe_1 + xe_7; const float x10_2 = xe_2 + xe_6; const float x10_3 = xe_3 + xe_5; const float x10_4 = 1.4142135623731*xe_4; const float x10_5 = xe_0 - xe_8; const float x10_6 = xe_1 - xe_7; const float x10_7 = xe_2 - xe_6; const float x10_8 = xe_3 - xe_5; const float x12_0 = x10_0 + x10_4; const float x12_1 = x10_1 + x10_3; const float x12_2 = 1.4142135623731*x10_2; const float x12_3 = x10_0 - x10_4; const float x12_4 = x10_1 - x10_3; const float x13_0 = 0.5*x12_0 + 0.707106781186548*x12_1 + 0.5*x12_2; const float x13_1 = 0.707106781186548*x12_0 - 0.707106781186548*x12_2; const float x13_2 = 0.5*x12_0 - 0.707106781186548*x12_1 + 0.5*x12_2; const float x13_3 = 0.707106781186547*x12_3 + 0.707106781186547*x12_4; const float x13_4 = 0.707106781186547*x12_3 - 0.707106781186547*x12_4; const float x14_0 = 1.4142135623731*x10_5; const float x14_1 = 1.30656296487638*x10_6 + 0.541196100146197*x10_8; const float x14_2 = 1.4142135623731*x10_7; const float x14_3 = -0.541196100146197*x10_6 + 1.30656296487638*x10_8; const float x15_0 = 0.5*x14_0 + 0.707106781186548*x14_1 + 0.5*x14_2; const float x15_1 = 0.707106781186548*x14_0 - 0.707106781186548*x14_2; const float x15_2 = 0.5*x14_0 - 0.707106781186548*x14_1 + 0.5*x14_2; const float x11_6 = 0.707106781186547*x15_1 - 0.707106781186547*x14_3; const float x11_7 = 0.707106781186547*x15_1 + 0.707106781186547*x14_3; const float x16_0 = 1.4142135623731*xe_c; const float x16_1 = xe_b + xe_d; const float x16_2 = xe_a + xe_e; const float x16_3 = xe_9 + xe_f; const float x16_4 = xe_9 - xe_f; const float x16_5 = xe_a - xe_e; const float x16_6 = xe_b - xe_d; const float x18_0 = 1.4142135623731*x16_0; const float x18_1 = 1.30656296487638*x16_1 + 0.541196100146197*x16_3; const float x18_2 = 1.4142135623731*x16_2; const float x18_3 = -0.541196100146197*x16_1 + 1.30656296487638*x16_3; const float x19_0 = 0.5*x18_0 + 0.707106781186548*x18_1 + 0.5*x18_2; const float x19_1 = 0.707106781186548*x18_0 - 0.707106781186548*x18_2; const float x19_2 = 0.5*x18_0 - 0.707106781186548*x18_1 + 0.5*x18_2; const float x17_1 = 0.707106781186547*x19_1 - 0.707106781186547*x18_3; const float x17_2 = 0.707106781186547*x19_1 + 0.707106781186547*x18_3; const float x1a_0 = 1.4142135623731*x16_5; const float x1a_1 = x16_4 + x16_6; const float x1a_2 = x16_4 - x16_6; const float x1b_0 = 0.707106781186547*x1a_0 + 0.707106781186547*x1a_1; const float x1b_1 = 0.707106781186547*x1a_0 - 0.707106781186547*x1a_1; const float x17_6 = -x1b_1; const float xf_b = -x17_1; const float xf_f = -x19_2; dst[ 0*stridea] = 0.353553390593274*x13_0; dst[ 1*stridea] = 0.25*x15_0 - 0.25*xf_f; dst[ 2*stridea] = 0.25*x15_0 + 0.25*xf_f; dst[ 3*stridea] = 0.25*x13_3 + 0.25*x17_6; dst[ 4*stridea] = 0.25*x13_3 - 0.25*x17_6; dst[ 5*stridea] = 0.25*x11_6 - 0.25*x17_2; dst[ 6*stridea] = 0.25*x11_6 + 0.25*x17_2; dst[ 7*stridea] = 0.25*x13_1 + 0.25*x1a_2; dst[ 8*stridea] = 0.25*x13_1 - 0.25*x1a_2; dst[ 9*stridea] = 0.25*x11_7 - 0.25*xf_b; dst[10*stridea] = 0.25*x11_7 + 0.25*xf_b; dst[11*stridea] = 0.25*x13_4 + 0.25*x1b_0; dst[12*stridea] = 0.25*x13_4 - 0.25*x1b_0; dst[13*stridea] = 0.25*x15_2 - 0.25*x19_0; dst[14*stridea] = 0.25*x15_2 + 0.25*x19_0; dst[15*stridea] = 0.353553390593274*x13_2; dst += strideb; src += strideb; } } static void idct(float *dst, const float *src) { float tmp[N*N]; idct_1d(tmp, src, 1, N); idct_1d(dst, tmp, N, 1); } /********* slow reference dct code ********/ static float dct_matrix [N*N]; static float dct_trp_matrix[N*N]; void init_dct(void) { int i, j; // dct matrix for (i = 0; i < N; i++) for (j = 0; j < N; j++) if (i == 0) dct_matrix[i*N + j] = 1 / sqrt(N); else dct_matrix[i*N + j] = sqrt(2./N) * cos(((2*j+1)*i*M_PI) / (2*N)); // dct matrix transposed for (i = 0; i < N; i++) for (j = 0; j < N; j++) dct_trp_matrix[i*N + j] = dct_matrix[j*N + i]; } static void dct_1d_ref(float *dst, const float *src, int stridea, int strideb, const float *matrix) { int x; for (x = 0; x < N; x++) { int i, j; for (j = 0; j < N; j++) { float sum = 0.; for (i = 0; i < N; i++) sum += matrix[j*N + i] * src[i*stridea]; dst[j*stridea] = sum; } dst += strideb; src += strideb; } } static void fdct_ref(float *dst, const float *src) { float tmp[N*N]; dct_1d_ref(tmp, src, 1, N, dct_matrix); dct_1d_ref(dst, tmp, N, 1, dct_matrix); } static void idct_ref(float *dst, const float *src) { float tmp[N*N]; dct_1d_ref(tmp, src, 1, N, dct_trp_matrix); dct_1d_ref(dst, tmp, N, 1, dct_trp_matrix); } /********** test **************************/ static int check_output(const char *name, const float *ref, const float *out) { int i, ret = 0; for (i = 0; i < N*N; i++) { const int ok = fabs(ref[i] - out[i]) < 0.0005; if (!ok) { printf("%s ref:%9.3f out:%9.3f diff:%9.3f\n", name, ref[i], out[i], ref[i] - out[i]); ret = -1; } } return ret; } int main() { int i; float src[N*N]; float ref_fdct[N*N], ref_idct[N*N]; float out_fdct[N*N], out_idct[N*N]; for (i = 0; i < N*N; i++) src[i] = random() % 256; init_dct(); fdct_ref(ref_fdct, src); fdct(out_fdct, src); if (check_output("FDCT", ref_fdct, out_fdct) < 0) return 1; idct_ref(ref_idct, ref_fdct); idct(out_idct, out_fdct); if (check_output("IDCT", ref_idct, out_idct) < 0) return 1; return 0; }