[mad-dev] short imdct

Rafael Ávila de Espíndola rafael.espindola@ic.unicamp.br
Sat, 28 Feb 2004 16:45:18 -0300


--Boundary-00=_O/OQA81J7Mejltb
Content-Type: Text/Plain;
  charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Content-Description: clearsigned data
Content-Disposition: inline

=2D----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Is there any reason for not using the Szu-Wei Lee's algorithm for the short=
=20
IMDCT (size 12)?

Annexed is a patch witch implements it.

Rafael =C1vila de Esp=EDndola

=2D----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (GNU/Linux)

iD8DBQFAQO/RLlrfGJ8JUHwRAogGAJ9c8dZFc+vHM5SJ94FJGgJ8g2IJWQCfbyr5
qczH/qDxC7XBLkbkXn6mGcQ=3D
=3DgjVQ
=2D----END PGP SIGNATURE-----

--Boundary-00=_O/OQA81J7Mejltb
Content-Type: text/x-diff;
  charset="us-ascii";
  name="imdct12.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="imdct12.patch"

--- libmad-0.15.1b/layer3.c	2004-01-23 07:41:32.000000000 -0200
+++ libmad-0.15.1b_new/layer3.c	2004-02-28 17:35:53.000000000 -0300
@@ -1577,7 +1577,7 @@
 # else
 #  if 1
 static
-void fastsdct(mad_fixed_t const x[9], mad_fixed_t y[18])
+void sdct9(mad_fixed_t const x[9], mad_fixed_t y[18])
 {
   mad_fixed_t a0,  a1,  a2,  a3,  a4,  a5,  a6,  a7,  a8,  a9,  a10, a11, a12;
   mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
@@ -1644,59 +1644,75 @@
   y[16] = a22 + m7;
 }
 
+static
+void sdct3(mad_fixed_t const x[3], mad_fixed_t y[6]){
+  mad_fixed_t a0,a1;
+  static const mad_fixed_t d0=MAD_F(0x1bb67ae8); /*2*cos(M_PI/6)*/;
+  a0=x[0]+x[2];
+  a1=x[0]-x[2];
+  y[0]=x[1]+a0;
+  y[2]=mad_f_mul(d0,a1);
+  y[4]=a0-2*x[1];
+}
+
 static inline
-void sdctII(mad_fixed_t const x[18], mad_fixed_t X[18])
+void sdctII(int N, mad_fixed_t const x[18], mad_fixed_t X[18])
 {
-  mad_fixed_t tmp[9];
+  mad_fixed_t tmp[N/2];
   int i;
 
-  /* scale[i] = 2 * cos(PI * (2 * i + 1) / (2 * 18)) */
-  static mad_fixed_t const scale[9] = {
+  /* scale[i] = 2 * cos(PI * (2 * i + 1) / (2 * N)) */
+  static mad_fixed_t const scale36[9] = {
     MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930),
     MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8),
     MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
   };
+  static mad_fixed_t const scale12[3] = {
+    MAD_F(0x1ee8dd47), MAD_F(0x16a09e66), MAD_F(0x08483ee1)
+  };
 
   /* divide the 18-point SDCT-II into two 9-point SDCT-IIs */
 
   /* even input butterfly */
 
-  for (i = 0; i < 9; i += 3) {
-    tmp[i + 0] = x[i + 0] + x[18 - (i + 0) - 1];
-    tmp[i + 1] = x[i + 1] + x[18 - (i + 1) - 1];
-    tmp[i + 2] = x[i + 2] + x[18 - (i + 2) - 1];
+  for (i = 0; i < N/2; i ++) {
+    tmp[i + 0] = x[i + 0] + x[N - (i + 0) - 1];
   }
 
-  fastsdct(tmp, &X[0]);
+  if(N==18)
+    sdct9(tmp, &X[0]);
+  else
+    sdct3(tmp, &X[0]);
 
   /* odd input butterfly and scaling */
 
-  for (i = 0; i < 9; i += 3) {
-    tmp[i + 0] = mad_f_mul(x[i + 0] - x[18 - (i + 0) - 1], scale[i + 0]);
-    tmp[i + 1] = mad_f_mul(x[i + 1] - x[18 - (i + 1) - 1], scale[i + 1]);
-    tmp[i + 2] = mad_f_mul(x[i + 2] - x[18 - (i + 2) - 1], scale[i + 2]);
+  for (i = 0; i < N/2; i ++) {
+    if(N==18)
+      tmp[i + 0] = mad_f_mul(x[i + 0] - x[N - (i + 0) - 1], scale36[i + 0]);
+    else
+      tmp[i + 0] = mad_f_mul(x[i + 0] - x[N - (i + 0) - 1], scale12[i + 0]);
   }
 
-  fastsdct(tmp, &X[1]);
+  if(N==18)
+    sdct9(tmp, &X[1]);
+  else
+    sdct3(tmp, &X[1]);
 
   /* output accumulation */
 
-  for (i = 3; i < 18; i += 8) {
+  for (i = 3; i < N; i += 2) {
     X[i + 0] -= X[(i + 0) - 2];
-    X[i + 2] -= X[(i + 2) - 2];
-    X[i + 4] -= X[(i + 4) - 2];
-    X[i + 6] -= X[(i + 6) - 2];
   }
 }
 
 static inline
-void dctIV(mad_fixed_t const y[18], mad_fixed_t X[18])
+void dctIV(int N, mad_fixed_t const y[18], mad_fixed_t X[18])
 {
-  mad_fixed_t tmp[18];
+  mad_fixed_t tmp[N];
   int i;
 
-  /* scale[i] = 2 * cos(PI * (2 * i + 1) / (4 * 18)) */
-  static mad_fixed_t const scale[18] = {
+  /* scale[i] = 2 * cos(PI * (2 * i + 1) / (4 * N)) */
+  static mad_fixed_t const scale36[18] = {
     MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120),
     MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b),
     MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4),
@@ -1704,61 +1720,56 @@
     MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5),
     MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
   };
+  static mad_fixed_t const scale12[6]={
+    MAD_F(0x1fb9ea93), MAD_F(0x1d906bcf), MAD_F(0x1963268b),
+    MAD_F(0x137af940), MAD_F(0x0c3ef153), MAD_F(0x042d4544)
+  };
 
   /* scaling */
 
-  for (i = 0; i < 18; i += 3) {
-    tmp[i + 0] = mad_f_mul(y[i + 0], scale[i + 0]);
-    tmp[i + 1] = mad_f_mul(y[i + 1], scale[i + 1]);
-    tmp[i + 2] = mad_f_mul(y[i + 2], scale[i + 2]);
+  for (i = 0; i < N; i ++) {
+    if(N==18)
+      tmp[i + 0] = mad_f_mul(y[i + 0], scale36[i + 0]);
+    else
+      tmp[i + 0] = mad_f_mul(y[i + 0], scale12[i + 0]);
   }
 
   /* SDCT-II */
 
-  sdctII(tmp, X);
+  sdctII(N, tmp, X);
 
   /* scale reduction and output accumulation */
 
   X[0] /= 2;
-  for (i = 1; i < 17; i += 4) {
+  for (i = 1; i < N; i ++) {
     X[i + 0] = X[i + 0] / 2 - X[(i + 0) - 1];
-    X[i + 1] = X[i + 1] / 2 - X[(i + 1) - 1];
-    X[i + 2] = X[i + 2] / 2 - X[(i + 2) - 1];
-    X[i + 3] = X[i + 3] / 2 - X[(i + 3) - 1];
   }
-  X[17] = X[17] / 2 - X[16];
 }
 
 /*
- * NAME:	imdct36
- * DESCRIPTION:	perform X[18]->x[36] IMDCT using Szu-Wei Lee's fast algorithm
+ * NAME:	imdct
+ * DESCRIPTION:	perform X[N/2]->x[N] IMDCT using Szu-Wei Lee's fast algorithm
  */
-static inline
-void imdct36(mad_fixed_t const x[18], mad_fixed_t y[36])
+static
+void imdct(int N, mad_fixed_t const x[18], mad_fixed_t y[36])
 {
-  mad_fixed_t tmp[18];
+  mad_fixed_t tmp[N/2];
   int i;
 
   /* DCT-IV */
 
-  dctIV(x, tmp);
+  dctIV(N/2, x, tmp);
 
-  /* convert 18-point DCT-IV to 36-point IMDCT */
+  /* convert N/2-point DCT-IV to N-point IMDCT */
 
-  for (i =  0; i <  9; i += 3) {
-    y[i + 0] =  tmp[9 + (i + 0)];
-    y[i + 1] =  tmp[9 + (i + 1)];
-    y[i + 2] =  tmp[9 + (i + 2)];
-  }
-  for (i =  9; i < 27; i += 3) {
-    y[i + 0] = -tmp[36 - (9 + (i + 0)) - 1];
-    y[i + 1] = -tmp[36 - (9 + (i + 1)) - 1];
-    y[i + 2] = -tmp[36 - (9 + (i + 2)) - 1];
-  }
-  for (i = 27; i < 36; i += 3) {
-    y[i + 0] = -tmp[(i + 0) - 27];
-    y[i + 1] = -tmp[(i + 1) - 27];
-    y[i + 2] = -tmp[(i + 2) - 27];
+  for (i =  0; i <  N/4; i ++) {
+    y[i + 0] =  tmp[N/4 + (i + 0)];
+  }
+  for (i =  N/4; i < 3*N/4; i ++) {
+    y[i + 0] = -tmp[N - (N/4 + (i + 0)) - 1];
+  }
+  for (i = 3*N/4; i < N; i ++) {
+    y[i + 0] = -tmp[(i + 0) - 3*N/4];
   }
 }
 #  else
@@ -2066,7 +2077,7 @@
 
   /* IMDCT */
 
-  imdct36(X, z);
+  imdct(36, X, z);
 
   /* windowing */
 
@@ -2159,36 +2170,7 @@
   yptr = &y[0];
 
   for (w = 0; w < 3; ++w) {
-    register mad_fixed_t const (*s)[6];
-
-    s = imdct_s;
-
-    for (i = 0; i < 3; ++i) {
-      MAD_F_ML0(hi, lo, X[0], (*s)[0]);
-      MAD_F_MLA(hi, lo, X[1], (*s)[1]);
-      MAD_F_MLA(hi, lo, X[2], (*s)[2]);
-      MAD_F_MLA(hi, lo, X[3], (*s)[3]);
-      MAD_F_MLA(hi, lo, X[4], (*s)[4]);
-      MAD_F_MLA(hi, lo, X[5], (*s)[5]);
-
-      yptr[i + 0] = MAD_F_MLZ(hi, lo);
-      yptr[5 - i] = -yptr[i + 0];
-
-      ++s;
-
-      MAD_F_ML0(hi, lo, X[0], (*s)[0]);
-      MAD_F_MLA(hi, lo, X[1], (*s)[1]);
-      MAD_F_MLA(hi, lo, X[2], (*s)[2]);
-      MAD_F_MLA(hi, lo, X[3], (*s)[3]);
-      MAD_F_MLA(hi, lo, X[4], (*s)[4]);
-      MAD_F_MLA(hi, lo, X[5], (*s)[5]);
-
-      yptr[ i + 6] = MAD_F_MLZ(hi, lo);
-      yptr[11 - i] = yptr[i + 6];
-
-      ++s;
-    }
-
+    imdct(12, X, yptr);
     yptr += 12;
     X    += 6;
   }

--Boundary-00=_O/OQA81J7Mejltb--