# https://tmkk.undo.jp/lame/lame-3.100-altivec-20171014.diff

--- libmp3lame/fft.c.orig	2017-09-07 04:33:36.000000000 +0900
+++ libmp3lame/fft.c	2017-10-14 18:02:08.000000000 +0900
@@ -38,6 +38,12 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include "lame.h"
 #include "machine.h"
 #include "encoder.h"
@@ -66,6 +72,17 @@ fht(FLOAT * fz, int n)
     int     k4;
     FLOAT  *fi, *gi;
     FLOAT const *fn;
+#ifdef ALTIVEC
+    float csvec[16] __attribute__ ((aligned (16)));
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector float vfi0,vfi1,vfi2,vfi3,vgi0,vgi1,vgi2,vgi3,vf0,vf1,vf2,vf3,vg0,vg1,vg2,vg3;
+    vector float vprev1,vprev2,vprev3,vprev4,vc1,vc2,vs1,vs2,vzero;
+    vector unsigned char vperm1,vperm2;
+    
+    vperm1 = (vector unsigned char)VINIT16(16,17,18,19,12,13,14,15,8,9,10,11,4,5,6,7);
+    vperm2 = (vector unsigned char)VINIT16(16,17,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
+    vzero = vec_xor(vzero,vzero);
+#endif
 
     n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
     fn = fz + n;
@@ -103,6 +120,238 @@ fht(FLOAT * fz, int n)
         } while (fi < fn);
         c1 = tri[0];
         s1 = tri[1];
+#ifdef ALTIVEC
+        if(kx < 4) {
+            for (i = 1; i < kx; i++) {
+                FLOAT   c2, s2;
+                c2 = 1 - (2 * s1) * s1;
+                s2 = (2 * s1) * c1;
+                fi = fz + i;
+                gi = fz + k1 - i;
+                do {
+                    FLOAT   a, b, g0, f0, f1, g1, f2, g2, f3, g3;
+                    b = s2 * fi[k1] - c2 * gi[k1];
+                    a = c2 * fi[k1] + s2 * gi[k1];
+                    f1 = fi[0] - a;
+                    f0 = fi[0] + a;
+                    g1 = gi[0] - b;
+                    g0 = gi[0] + b;
+                    b = s2 * fi[k3] - c2 * gi[k3];
+                    a = c2 * fi[k3] + s2 * gi[k3];
+                    f3 = fi[k2] - a;
+                    f2 = fi[k2] + a;
+                    g3 = gi[k2] - b;
+                    g2 = gi[k2] + b;
+                    b = s1 * f2 - c1 * g3;
+                    a = c1 * f2 + s1 * g3;
+                    fi[k2] = f0 - a;
+                    fi[0] = f0 + a;
+                    gi[k3] = g1 - b;
+                    gi[k1] = g1 + b;
+                    b = c1 * g2 - s1 * f3;
+                    a = s1 * g2 + c1 * f3;
+                    gi[k2] = g0 - a;
+                    gi[0] = g0 + a;
+                    fi[k3] = f1 - b;
+                    fi[k1] = f1 + b;
+                    gi += k4;
+                    fi += k4;
+                } while (fi < fn);
+                c2 = c1;
+                c1 = c2 * tri[0] - s1 * tri[1];
+                s1 = c2 * tri[1] + s1 * tri[0];
+            }
+        }
+        else {
+            FLOAT   c2, s2;
+            for(i = 1; i < 4; i++) {
+                c2 = 1 - (2*s1)*s1;
+                s2 = (2*s1)*c1;
+                csvec[i] = c1;
+                csvec[i+4] = c2;
+                csvec[i+8] = s1;
+                csvec[i+12] = s2;
+                c2 = c1;
+                c1 = c2 * tri[0] - s1 * tri[1];
+                s1 = c2 * tri[1] + s1 * tri[0];
+            }
+            vc1 = vec_ld(0,csvec);
+            vc2 = vec_ld(16,csvec);
+            vs1 = vec_ld(32,csvec);
+            vs2 = vec_ld(48,csvec);
+            fi = fz;
+            gi = fz + k1;
+            do {
+                vfi0 = vec_ld(0,fi);
+                vfi1 = vec_ld(0,fi+k1);
+                vfi2 = vec_ld(0,fi+k2);
+                vfi3 = vec_ld(0,fi+k3);
+                vprev1 = vec_ld(0,gi-4);
+                vprev2 = vec_ld(0,gi+k1-4);
+                vprev3 = vec_ld(0,gi+k2-4);
+                vprev4 = vec_ld(0,gi+k3-4);
+                vgi0 = vec_perm(vprev1,vprev1,vperm1);
+                vgi1 = vec_perm(vprev2,vprev2,vperm1);
+                vgi2 = vec_perm(vprev3,vprev3,vperm1);
+                vgi3 = vec_perm(vprev4,vprev4,vperm1);
+                
+                v1 = vec_madd(vfi1,vc2,vzero);
+                v2 = vec_madd(vfi1,vs2,vzero);
+                v3 = vec_madd(vfi3,vc2,vzero);
+                v4 = vec_madd(vfi3,vs2,vzero);
+                v5 = vec_madd(vgi1,vs2,v1);
+                v6 = vec_nmsub(vgi1,vc2,v2);
+                v7 = vec_madd(vgi3,vs2,v3);
+                v8 = vec_nmsub(vgi3,vc2,v4);
+                
+                vf0 = vec_add(vfi0,v5);
+                vf1 = vec_sub(vfi0,v5);
+                vg0 = vec_add(vgi0,v6);
+                vg1 = vec_sub(vgi0,v6);
+                vf2 = vec_add(vfi2,v7);
+                vf3 = vec_sub(vfi2,v7);
+                vg2 = vec_add(vgi2,v8);
+                vg3 = vec_sub(vgi2,v8);
+                
+                v1 = vec_madd(vf2,vc1,vzero);
+                v2 = vec_madd(vf2,vs1,vzero);
+                v3 = vec_madd(vg2,vs1,vzero);
+                v4 = vec_madd(vg2,vc1,vzero);
+                v5 = vec_madd(vg3,vs1,v1);
+                v6 = vec_nmsub(vg3,vc1,v2);
+                v7 = vec_madd(vf3,vc1,v3);
+                v8 = vec_nmsub(vf3,vs1,v4);
+                
+                v9 = vec_add(vf0,v5);
+                v10 = vec_sub(vf0,v5);
+                v11 = vec_add(vg1,v6);
+                v12 = vec_sub(vg1,v6);
+                v13 = vec_add(vg0,v7);
+                v14 = vec_sub(vg0,v7);
+                v15 = vec_add(vf1,v8);
+                v16 = vec_sub(vf1,v8);
+                
+                v1 = vec_perm(v9,vfi0,vperm2);
+                v2 = vec_perm(v10,vfi2,vperm2);
+                v3 = vec_perm(v15,vfi1,vperm2);
+                v4 = vec_perm(v16,vfi3,vperm2);
+                vec_st(v1,0,fi);
+                vec_st(v2,0,fi+k2);
+                vec_st(v3,0,fi+k1);
+                vec_st(v4,0,fi+k3);
+                
+                v1 = vec_perm(v11,vprev2,vperm1);
+                v2 = vec_perm(v12,vprev4,vperm1);
+                v3 = vec_perm(v13,vprev1,vperm1);
+                v4 = vec_perm(v14,vprev3,vperm1);
+                vec_st(v1,0,gi+k1-4);
+                vec_st(v2,0,gi+k3-4);
+                vec_st(v3,0,gi-4);
+                vec_st(v4,0,gi+k2-4);
+                
+                gi += k4;
+                fi += k4;
+            } while (fi<fn);
+            
+            /* rest loop */
+            
+            for (i = 4; i < kx; i+=4) {
+                int j;
+                for(j = 0; j < 4; j++) {
+                    c2 = 1 - (2*s1)*s1;
+                    s2 = (2*s1)*c1;
+                    csvec[j] = c1;
+                    csvec[j+4] = c2;
+                    csvec[j+8] = s1;
+                    csvec[j+12] = s2;
+                    c2 = c1;
+                    c1 = c2 * tri[0] - s1 * tri[1];
+                    s1 = c2 * tri[1] + s1 * tri[0];
+                }
+                vc1 = vec_ld(0,csvec);
+                vc2 = vec_ld(16,csvec);
+                vs1 = vec_ld(32,csvec);
+                vs2 = vec_ld(48,csvec);
+                fi = fz + i;
+                gi = fz + k1 - i;
+                do {
+                    vfi0 = vec_ld(0,fi);
+                    vfi1 = vec_ld(0,fi+k1);
+                    vfi2 = vec_ld(0,fi+k2);
+                    vfi3 = vec_ld(0,fi+k3);
+                    vprev1 = vec_ld(0,gi-4);
+                    v1 = vec_ld(0,gi);
+                    vprev2 = vec_ld(0,gi+k1-4);
+                    v2 = vec_ld(0,gi+k1);
+                    vprev3 = vec_ld(0,gi+k2-4);
+                    v3 = vec_ld(0,gi+k2);
+                    vprev4 = vec_ld(0,gi+k3-4);
+                    v4 = vec_ld(0,gi+k3);
+                    vgi0 = vec_perm(vprev1,v1,vperm1);
+                    vgi1 = vec_perm(vprev2,v2,vperm1);
+                    vgi2 = vec_perm(vprev3,v3,vperm1);
+                    vgi3 = vec_perm(vprev4,v4,vperm1);
+                    
+                    v1 = vec_madd(vfi1,vc2,vzero);
+                    v2 = vec_madd(vfi1,vs2,vzero);
+                    v3 = vec_madd(vfi3,vc2,vzero);
+                    v4 = vec_madd(vfi3,vs2,vzero);
+                    v5 = vec_madd(vgi1,vs2,v1);
+                    v6 = vec_nmsub(vgi1,vc2,v2);
+                    v7 = vec_madd(vgi3,vs2,v3);
+                    v8 = vec_nmsub(vgi3,vc2,v4);
+                    
+                    vf0 = vec_add(vfi0,v5);
+                    vf1 = vec_sub(vfi0,v5);
+                    vg0 = vec_add(vgi0,v6);
+                    vg1 = vec_sub(vgi0,v6);
+                    vf2 = vec_add(vfi2,v7);
+                    vf3 = vec_sub(vfi2,v7);
+                    vg2 = vec_add(vgi2,v8);
+                    vg3 = vec_sub(vgi2,v8);
+                    
+                    v1 = vec_madd(vf2,vc1,vzero);
+                    v2 = vec_madd(vf2,vs1,vzero);
+                    v3 = vec_madd(vg2,vs1,vzero);
+                    v4 = vec_madd(vg2,vc1,vzero);
+                    v5 = vec_madd(vg3,vs1,v1);
+                    v6 = vec_nmsub(vg3,vc1,v2);
+                    v7 = vec_madd(vf3,vc1,v3);
+                    v8 = vec_nmsub(vf3,vs1,v4);
+                    
+                    v9 = vec_add(vf0,v5);
+                    v10 = vec_sub(vf0,v5);
+                    v11 = vec_add(vg1,v6);
+                    v12 = vec_sub(vg1,v6);
+                    v13 = vec_add(vg0,v7);
+                    v14 = vec_sub(vg0,v7);
+                    v15 = vec_add(vf1,v8);
+                    v16 = vec_sub(vf1,v8);
+                    
+                    vec_st(v9,0,fi);
+                    vec_st(v10,0,fi+k2);
+                    vec_st(v15,0,fi+k1);
+                    vec_st(v16,0,fi+k3);
+                    
+                    v1 = vec_perm(v11,vprev2,vperm1);
+                    v2 = vec_perm(v12,vprev4,vperm1);
+                    v3 = vec_perm(v13,vprev1,vperm1);
+                    v4 = vec_perm(v14,vprev3,vperm1);
+                    vec_st(v1,0,gi+k1-4);
+                    vec_ste(v11,0,gi+k1);
+                    vec_st(v2,0,gi+k3-4);
+                    vec_ste(v12,0,gi+k3);
+                    vec_st(v3,0,gi-4);
+                    vec_ste(v13,0,gi);
+                    vec_st(v4,0,gi+k2-4);
+                    vec_ste(v14,0,gi+k2);
+                    
+                    gi += k4;
+                    fi += k4;
+                } while (fi<fn);
+            }
+        }
+#else
         for (i = 1; i < kx; i++) {
             FLOAT   c2, s2;
             c2 = 1 - (2 * s1) * s1;
@@ -142,6 +391,7 @@ fht(FLOAT * fz, int n)
             c1 = c2 * tri[0] - s1 * tri[1];
             s1 = c2 * tri[1] + s1 * tri[0];
         }
+#endif
         tri += 2;
     } while (k4 < n);
 }
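
The fft.c hunk above replaces the scalar Hartley-transform butterfly with an AltiVec path: when kx >= 4, the (c1, c2, s1, s2) coefficients of four consecutive i iterations are packed into csvec, loaded as vc1/vc2/vs1/vs2, and the butterfly arithmetic runs four lanes at a time with vec_madd/vec_nmsub. Below is a minimal sketch of how one scalar pair maps onto those intrinsics; it ignores the reversed gi ordering that the patch handles with vec_perm, assumes 16-byte-aligned pointers (vec_ld ignores the low address bits, as the patch assumes for fz), and the helper name butterfly4 is illustrative, not part of the patch.

#include <altivec.h>

/* Scalar reference, per lane (from the kx < 4 branch above):
 *     a = c2 * fi[k1] + s2 * gi[k1];
 *     b = s2 * fi[k1] - c2 * gi[k1];
 * vec_nmsub(x, y, z) computes z - x*y, which yields the b term directly. */
static void butterfly4(const float *fi_k1, const float *gi_k1,
                       vector float vc2, vector float vs2,
                       vector float *a, vector float *b)
{
    vector float vzero = (vector float) vec_splat_u32(0);
    vector float vfi1  = vec_ld(0, fi_k1);           /* four consecutive fi[k1] lanes */
    vector float vgi1  = vec_ld(0, gi_k1);           /* four consecutive gi[k1] lanes */
    vector float t1    = vec_madd(vfi1, vc2, vzero); /* c2 * fi[k1] */
    vector float t2    = vec_madd(vfi1, vs2, vzero); /* s2 * fi[k1] */
    *a = vec_madd(vgi1, vs2, t1);                    /* c2 * fi[k1] + s2 * gi[k1] */
    *b = vec_nmsub(vgi1, vc2, t2);                   /* s2 * fi[k1] - c2 * gi[k1] */
}
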
--- libmp3lame/gain_analysis.c.orig	2017-10-11 04:08:39.000000000 +0900
+++ libmp3lame/gain_analysis.c	2017-10-14 18:04:59.000000000 +0900
@@ -92,6 +92,12 @@
 #include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -109,6 +115,67 @@
 
 
 /*lint -save -e736 loss of precision */
+#ifdef ALTIVEC
+static const Float_t ABYule[9][2 * YULE_ORDER + 1 + 3] __attribute__ ((aligned (16))) = {
+    {0.03857599435200, -3.84664617118067, -0.02160367184185, 7.81501653005538, -0.00123395316851,
+     -11.34170355132042, -0.00009291677959, 13.05504219327545, -0.01655260341619,
+     -12.28759895145294, 0.02161526843274, 9.48293806319790, -0.02074045215285, -5.87257861775999,
+     0.00594298065125, 2.75465861874613, 0.00306428023191, -0.86984376593551, 0.00012025322027,
+     0.13919314567432, 0.00288463683916, 0.0, 0.0, 0.0},
+    {0.05418656406430, -3.47845948550071, -0.02911007808948, 6.36317777566148, -0.00848709379851,
+     -8.54751527471874, -0.00851165645469, 9.47693607801280, -0.00834990904936, -8.81498681370155,
+     0.02245293253339, 6.85401540936998, -0.02596338512915, -4.39470996079559, 0.01624864962975,
+     2.19611684890774, -0.00240879051584, -0.75104302451432, 0.00674613682247, 0.13149317958808,
+     -0.00187763777362, 0.0, 0.0, 0.0},
+    {0.15457299681924, -2.37898834973084, -0.09331049056315, 2.84868151156327, -0.06247880153653,
+     -2.64577170229825, 0.02163541888798, 2.23697657451713, -0.05588393329856, -1.67148153367602,
+     0.04781476674921, 1.00595954808547, 0.00222312597743, -0.45953458054983, 0.03174092540049,
+     0.16378164858596, -0.01390589421898, -0.05032077717131, 0.00651420667831, 0.02347897407020,
+     -0.00881362733839, 0.0, 0.0, 0.0},
+    {0.30296907319327, -1.61273165137247, -0.22613988682123, 1.07977492259970, -0.08587323730772,
+     -0.25656257754070, 0.03282930172664, -0.16276719120440, -0.00915702933434, -0.22638893773906,
+     -0.02364141202522, 0.39120800788284, -0.00584456039913, -0.22138138954925, 0.06276101321749,
+     0.04500235387352, -0.00000828086748, 0.02005851806501, 0.00205861885564, 0.00302439095741,
+     -0.02950134983287, 0.0, 0.0, 0.0},
+    {0.33642304856132, -1.49858979367799, -0.25572241425570, 0.87350271418188, -0.11828570177555,
+     0.12205022308084, 0.11921148675203, -0.80774944671438, -0.07834489609479, 0.47854794562326,
+     -0.00469977914380, -0.12453458140019, -0.00589500224440, -0.04067510197014, 0.05724228140351,
+     0.08333755284107, 0.00832043980773, -0.04237348025746, -0.01635381384540, 0.02977207319925,
+     -0.01760176568150, 0.0, 0.0, 0.0},
+    {0.44915256608450, -0.62820619233671, -0.14351757464547, 0.29661783706366, -0.22784394429749,
+     -0.37256372942400, -0.01419140100551, 0.00213767857124, 0.04078262797139, -0.42029820170918,
+     -0.12398163381748, 0.22199650564824, 0.04097565135648, 0.00613424350682, 0.10478503600251,
+     0.06747620744683, -0.01863887810927, 0.05784820375801, -0.03193428438915, 0.03222754072173,
+     0.00541907748707, 0.0, 0.0, 0.0},
+    {0.56619470757641, -1.04800335126349, -0.75464456939302, 0.29156311971249, 0.16242137742230,
+     -0.26806001042947, 0.16744243493672, 0.00819999645858, -0.18901604199609, 0.45054734505008,
+     0.30931782841830, -0.33032403314006, -0.27562961986224, 0.06739368333110, 0.00647310677246,
+     -0.04784254229033, 0.08647503780351, 0.01639907836189, -0.03788984554840, 0.01807364323573,
+     -0.00588215443421, 0.0, 0.0, 0.0},
+    {0.58100494960553, -0.51035327095184, -0.53174909058578, -0.31863563325245, -0.14289799034253,
+     -0.20256413484477, 0.17520704835522, 0.14728154134330, 0.02377945217615, 0.38952639978999,
+     0.15558449135573, -0.23313271880868, -0.25344790059353, -0.05246019024463, 0.01628462406333,
+     -0.02505961724053, 0.06920467763959, 0.02442357316099, -0.03721611395801, 0.01818801111503,
+     -0.00749618797172, 0.0, 0.0, 0.0},
+    {0.53648789255105, -0.25049871956020, -0.42163034350696, -0.43193942311114, -0.00275953611929,
+     -0.03424681017675, 0.04267842219415, -0.04678328784242, -0.10214864179676, 0.26408300200955,
+     0.14590772289388, 0.15113130533216, -0.02459864859345, -0.17556493366449, -0.11202315195388,
+     -0.18823009262115, -0.04060034127000, 0.05477720428674, 0.04788665548180, 0.04704409688120,
+     -0.02217936801134, 0.0, 0.0, 0.0}
+};
+
+static const Float_t ABButter[9][2 * BUTTER_ORDER + 1 + 3] __attribute__ ((aligned (16))) = {
+    {0.98621192462708, -1.97223372919527, -1.97242384925416, 0.97261396931306, 0.98621192462708, 0.0, 0.0, 0.0},
+    {0.98500175787242, -1.96977855582618, -1.97000351574484, 0.97022847566350, 0.98500175787242, 0.0, 0.0, 0.0},
+    {0.97938932735214, -1.95835380975398, -1.95877865470428, 0.95920349965459, 0.97938932735214, 0.0, 0.0, 0.0},
+    {0.97531843204928, -1.95002759149878, -1.95063686409857, 0.95124613669835, 0.97531843204928, 0.0, 0.0, 0.0},
+    {0.97316523498161, -1.94561023566527, -1.94633046996323, 0.94705070426118, 0.97316523498161, 0.0, 0.0, 0.0},
+    {0.96454515552826, -1.92783286977036, -1.92909031105652, 0.93034775234268, 0.96454515552826, 0.0, 0.0, 0.0},
+    {0.96009142950541, -1.91858953033784, -1.92018285901082, 0.92177618768381, 0.96009142950541, 0.0, 0.0, 0.0},
+    {0.95856916599601, -1.91542108074780, -1.91713833199203, 0.91885558323625, 0.95856916599601, 0.0, 0.0, 0.0},
+    {0.94597685600279, -1.88903307939452, -1.89195371200558, 0.89487434461664, 0.94597685600279, 0.0, 0.0, 0.0}
+};
+#else
 static const Float_t ABYule[9][multiple_of(4, 2 * YULE_ORDER + 1)] = {
     /* 20                 18                 16                 14                 12                 10                 8                  6                  4                  2                 0                 19                 17                 15                 13                 11                 9                  7                  5                  3                  1              */
     { 0.00288463683916,  0.00012025322027,  0.00306428023191,  0.00594298065125, -0.02074045215285,  0.02161526843274, -0.01655260341619, -0.00009291677959, -0.00123395316851, -0.02160367184185, 0.03857599435200, 0.13919314567432, -0.86984376593551,  2.75465861874613, -5.87257861775999,  9.48293806319790,-12.28759895145294, 13.05504219327545,-11.34170355132042,  7.81501653005538, -3.84664617118067},
@@ -134,6 +201,7 @@ static const Float_t ABButter[9][multipl
     {0.95856916599601, 0.91885558323625, -1.91713833199203, -1.91542108074780, 0.95856916599601},
     {0.94597685600279, 0.89487434461664, -1.89195371200558, -1.88903307939452, 0.94597685600279}
 };
+#endif
 
 /*lint -restore */
 
@@ -143,6 +211,191 @@ static const Float_t ABButter[9][multipl
 
 /* When calling this procedure, make sure that ip[-order] and op[-order] point to real data! */
 
+#ifdef ALTIVEC
+
+static void
+filterIntegrated (const Float_t* input, Float_t* output, Float_t* output2, size_t nSamples, const Float_t* kernel, const Float_t* kernel2)
+{
+	vector float v1,v2,v3,v4,v5,v6,vbase;
+	vector float vmask1,vmask2,vout1,vout2,vout3,vout4,vzero,vkernel1,vkernel2,vkernel3,vkernel4,vkernel5,vkernel6,vkernel7,vkernel8;
+	vector float vo1, vo2, vo3, vo4, vi2, vi3;
+	vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm1,vperm2,vperm4,vperm5,vperm6;
+	
+	vbase = (vector float)VINIT4ALL(1e-10f);
+	vperm1 = (vector unsigned char)VINIT16(24,25,26,27,16,17,18,19,8,9,10,11,0,1,2,3);
+	vperm2 = (vector unsigned char)VINIT16(28,29,30,31,20,21,22,23,12,13,14,15,4,5,6,7);
+	vc1 = vec_splat_u8(1);
+	vc2 = vec_splat_u8(5);
+	vc3 = vec_sl(vc1,vc2);
+	vc4 = vec_sl(vc3,vc1);
+	vc5 = vec_or(vc3,vc4);
+	v1 = (vector float)vec_splat_s32(-1);
+	vmask1 = vec_sro(v1,vc3);
+	vmask2 = vec_sro(v1,vc4);
+	vzero = vec_xor(vzero,vzero);
+	
+	v1 = vec_ld(0,kernel);
+	v2 = vec_ld(16,kernel);
+	v3 = vec_ld(32,kernel);
+	v4 = vec_ld(48,kernel);
+	v5 = vec_ld(64,kernel);
+	v6 = vec_ld(80,kernel);
+	vkernel1 = vec_perm(v1,v2,vperm1);
+	vkernel2 = vec_perm(v1,v2,vperm2);
+	vkernel3 = vec_perm(v3,v4,vperm1);
+	vkernel4 = vec_perm(v3,v4,vperm2);
+	vkernel5 = vec_perm(v5,v6,vperm1);
+	vkernel6 = vec_perm(v5,v6,vperm2);
+	vkernel5 = vec_and(vkernel5,vmask1);
+	vkernel6 = vec_and(vkernel6,vmask2);
+	
+	v1 = vec_ld(0,kernel2);
+	v2 = vec_ld(16,kernel2);
+	vkernel7 = vec_perm(v1,v2,vperm1);
+	vkernel8 = vec_perm(v1,v2,vperm2);
+	vkernel7 = vec_and(vkernel7,vmask1);
+	vkernel8 = vec_and(vkernel8,vmask2);
+	
+	vperm4 = vec_lvsl(0,input-7);
+	vperm5 = vec_lvsl(0,output-4);
+	
+	v1 = vec_ld(15,input-7);
+	v2 = vec_ld(0,input-7);
+	v3 = vec_ld(0,input-10);
+	v4 = vec_ld(15,input-11);
+	vi2 = vec_perm(v2,v1,vperm4);
+	vi3 = vec_perm(v3,v4,vec_lvsl(0,input-10));
+	vi3 = vec_sro(vi3,vc3);
+	
+	v1 = vec_ld(15,output-4);
+	v2 = vec_ld(0,output-4);
+	v3 = vec_ld(0,output-8);
+	v4 = vec_ld(0,output-10);
+	v5 = vec_ld(15,output-10);
+	vo1 = vec_perm(v2,v1,vperm5);
+	vo2 = vec_perm(v3,v2,vperm5);
+	vo3 = vec_perm(v4,v5,vec_lvsl(0,output-10));
+	vo3 = vec_sro(vo3,vc4);
+	
+	v1 = vec_ld(15,output2-2);
+	v2 = vec_ld(0,output2-2);
+	vo4 = vec_perm(v2,v1,vec_lvsl(0,output2-2));
+	vo4 = vec_sro(vo4,vc4);
+	
+	vperm4 = vec_lvsl(0,input-3);
+	vperm5 = vec_lvsr(0,output);
+	
+	/* 1st loop */
+	v1 = vec_ld(15,input-3);
+	v3 = vec_ld(0,input-3);
+	v5 = vec_perm(v3,v1,vperm4);
+		
+	vout1 = vec_madd(v5,vkernel1,vbase);
+	vout2 = vec_madd(vo1,vkernel2,vbase);
+
+	vout1 = vec_madd(vi2,vkernel3,vout1);
+	vout2 = vec_madd(vo2,vkernel4,vout2);
+
+	vout1 = vec_madd(vi3,vkernel5,vout1);
+	vout2 = vec_madd(vo3,vkernel6,vout2);
+		
+	vi3 = vec_sld(vi3,vi2,4);
+	vi2 = vec_sld(vi2,v5,4);
+		
+	vout1 = vec_sub(vout1,vout2);
+	
+	v1 = vec_slo(vout1,vc3);
+	v2 = vec_slo(vout1,vc4);
+	v3 = vec_slo(vout1,vc5);
+	vout1 = vec_add(vout1,v1);
+	vout2 = vec_add(v2,v3);
+	vout1 = vec_add(vout1,vout2);
+		
+	vo3 = vec_sld(vo3,vo2,4);
+	vo2 = vec_sld(vo2,vo1,4);
+	vo1 = vec_sld(vo1,vout1,4);
+		
+	vout2 = vec_perm(vout1,vout1,vperm5);
+	vec_ste(vout2,0,output);
+		
+	++output;
+	++input;
+	--nSamples;
+	
+	while(nSamples--) {
+		vperm4 = vec_lvsl(0,input-3);
+		vperm5 = vec_lvsr(0,output);
+		vperm6 = vec_lvsr(0,output2);
+		
+		v1 = vec_ld(15,input-3);
+		v3 = vec_ld(0,input-3);
+		v5 = vec_perm(v3,v1,vperm4);
+		
+		vout1 = vec_madd(v5,vkernel1,vbase);
+		vout2 = vec_madd(vo1,vkernel2,vbase);
+
+		vout1 = vec_madd(vi2,vkernel3,vout1);
+		vout2 = vec_madd(vo2,vkernel4,vout2);
+
+		vout1 = vec_madd(vi3,vkernel5,vout1);
+		vout2 = vec_madd(vo3,vkernel6,vout2);
+		
+		vout3 = vec_nmsub(vo4,vkernel8,vzero);
+		vout4 = vec_madd(vo1,vkernel7,vout3);
+		
+		vi3 = vec_sld(vi3,vi2,4);
+		vi2 = vec_sld(vi2,v5,4);
+		
+		vout1 = vec_sub(vout1,vout2);
+		
+		v1 = vec_slo(vout1,vc3);
+		v2 = vec_slo(vout1,vc4);
+		v3 = vec_slo(vout1,vc5);
+		vout1 = vec_add(vout1,v1);
+		vout2 = vec_add(v2,v3);
+		vout1 = vec_add(vout1,vout2);
+		
+		vo3 = vec_sld(vo3,vo2,4);
+		vo2 = vec_sld(vo2,vo1,4);
+		vo1 = vec_sld(vo1,vout1,4);
+		
+		v4 = vec_slo(vout4,vc3);
+		v5 = vec_slo(vout4,vc4);
+		v6 = vec_slo(vout4,vc5);
+		vout4 = vec_add(vout4,v4);
+		vout3 = vec_add(v5,v6);
+		vout3 = vec_add(vout3,vout4);
+		
+		vo4 = vec_sld(vo4,vout3,4);
+		
+		vout2 = vec_perm(vout1,vout1,vperm5);
+		vout4 = vec_perm(vout3,vout3,vperm6);
+		vec_ste(vout2,0,output);
+		vec_ste(vout4,0,output2);
+		
+		++output;
+		++output2;
+		++input;
+	}
+	
+	vperm6 = vec_lvsr(0,output2);
+	
+	vout3 = vec_nmsub(vo4,vkernel8,vzero);
+	vout4 = vec_madd(vo1,vkernel7,vout3);
+	
+	v1 = vec_slo(vout4,vc3);
+	v2 = vec_slo(vout4,vc4);
+	v3 = vec_slo(vout4,vc5);
+	vout4 = vec_add(vout4,v1);
+	vout3 = vec_add(v2,v3);
+	vout3 = vec_add(vout3,vout4);
+	
+	vout4 = vec_perm(vout3,vout3,vperm6);
+	vec_ste(vout4,0,output2);
+}
+
+#else
+
 static void
 filterYule(const Float_t * input, Float_t * output, size_t nSamples, const Float_t * const kernel)
 {
@@ -189,7 +442,7 @@ filterButter(const Float_t * input, Floa
     }
 }
 
-
+#endif
 
 static int ResetSampleFrequency(replaygain_t * rgData, long samplefreq);
 
@@ -323,6 +576,10 @@ AnalyzeSamples(replaygain_t * rgData, co
             curright = right_samples + cursamplepos;
         }
 
+#ifdef ALTIVEC
+        filterIntegrated(curleft, rgData->lstep + rgData->totsamp, rgData->lout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]);
+        filterIntegrated(curright, rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]);
+#else
         YULE_FILTER(curleft, rgData->lstep + rgData->totsamp, cursamples,
                     ABYule[rgData->freqindex]);
         YULE_FILTER(curright, rgData->rstep + rgData->totsamp, cursamples,
@@ -332,6 +589,7 @@ AnalyzeSamples(replaygain_t * rgData, co
                       ABButter[rgData->freqindex]);
         BUTTER_FILTER(rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples,
                       ABButter[rgData->freqindex]);
+#endif
 
         curleft = rgData->lout + rgData->totsamp; /* Get the squared values */
         curright = rgData->rout + rgData->totsamp;
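
In gain_analysis.c the patch pads the ABYule/ABButter rows to a multiple of four floats so they can be fetched with vec_ld, and replaces the separate YULE_FILTER/BUTTER_FILTER passes with a single filterIntegrated() that produces both the Yule-filtered intermediate (lstep/rstep) and the Butterworth output (lout/rout) in one sweep over the samples. The following scalar restatement of that fused loop is a sketch only: it assumes Float_t is a 32-bit float (which the AltiVec path requires) and uses a textbook direct-form-I b[0..N]/a[1..N] layout and sign convention rather than the interleaved, padded layout the vector code actually loads. As with the original filters, the caller must guarantee valid history before each buffer (ip[-order] and op[-order] must point to real data).

#include <stddef.h>

/* Illustrative scalar equivalent of filterIntegrated(); not the patch's code. */
static void filter_integrated_ref(const float *in, float *step, float *out, size_t n,
                                  const float *yule_b, const float *yule_a,     /* b[0..10], a[1..10] */
                                  const float *butter_b, const float *butter_a) /* b[0..2],  a[1..2]  */
{
    for (size_t i = 0; i < n; i++) {
        const float *x = in + i;
        float  *s = step + i;
        float  *o = out + i;
        double  y = 1e-10;   /* denormal guard, the same constant the patch folds into vbase */
        double  b = 0.0;
        int     k;
        for (k = 0; k <= 10; k++) y += x[-k] * yule_b[k];   /* 10th-order Yule-Walker feed-forward */
        for (k = 1; k <= 10; k++) y -= s[-k] * yule_a[k];   /* ...and feedback */
        s[0] = (float) y;
        for (k = 0; k <= 2; k++)  b += s[-k] * butter_b[k]; /* 2nd-order Butterworth stage */
        for (k = 1; k <= 2; k++)  b -= o[-k] * butter_a[k];
        o[0] = (float) b;
    }
}
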
--- libmp3lame/lame.c.orig	2017-10-11 04:08:39.000000000 +0900
+++ libmp3lame/lame.c	2017-10-14 18:02:08.000000000 +0900
@@ -30,6 +30,11 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
 
 #include "lame.h"
 #include "machine.h"
@@ -603,7 +608,12 @@ lame_init_params(lame_global_flags * gfp
         gfc->CPU_features.SSE = 0;
         gfc->CPU_features.SSE2 = 0;
     }
-
+#ifdef ALTIVEC
+    /* turn off JAVA mode explicitly */
+    vector unsigned short vscr = vec_mfvscr();
+    vscr = vec_or(vscr,(vector unsigned short)VINIT8(0,0,0,0,0,0,1,0));
+    vec_mtvscr(vscr);
+#endif
 
     cfg->vbr = gfp->VBR;
     cfg->error_protection = gfp->error_protection;
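
The lame.c hunk sets the VSCR non-Java (NJ) bit once during lame_init_params(). VSCR occupies the low 32 bits of the vector returned by vec_mfvscr(); viewed as eight 16-bit elements, setting element 6 to 1 sets NJ (mask 0x00010000 of the 32-bit register), which is why the patch ORs in VINIT8(0,0,0,0,0,0,1,0). With NJ set, AltiVec flushes denormal operands and results to zero, avoiding the performance penalty denormals can incur in Java mode on some PowerPC implementations. The same operation as a tiny standalone helper (the function name is illustrative; in the patch this is done inline using the VINIT8 macro from machine.h):

#include <altivec.h>

static void altivec_set_non_java_mode(void)
{
    vector unsigned short vscr = vec_mfvscr();             /* VSCR sits in the low 32 bits */
    vector unsigned short nj   = {0, 0, 0, 0, 0, 0, 1, 0}; /* element 6 == 1 -> VSCR[NJ] */
    vec_mtvscr(vec_or(vscr, nj));                          /* flush denormals to zero from now on */
}
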
--- libmp3lame/machine.h.orig	2017-10-11 04:08:39.000000000 +0900
+++ libmp3lame/machine.h	2017-10-14 18:02:08.000000000 +0900
@@ -184,6 +184,24 @@ typedef FLOAT sample_t;
 #  endif
 #endif
 
+#ifdef ALTIVEC
+#ifdef __APPLE_CC__
+#define VINIT4(a,b,c,d)                          (a,b,c,d)
+#define VINIT8(a,b,c,d,e,f,g,h)                  (a,b,c,d,e,f,g,h)
+#define VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) (a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)
+#define VINIT4ALL(a)                             (a,a,a,a)
+#define VINIT8ALL(a)                             (a,a,a,a,a,a,a,a)
+#define VINIT16ALL(a)                            (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a)
+#else
+#define VINIT4(a,b,c,d)                          {a,b,c,d}
+#define VINIT8(a,b,c,d,e,f,g,h)                  {a,b,c,d,e,f,g,h}
+#define VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}
+#define VINIT4ALL(a)                             {a,a,a,a}
+#define VINIT8ALL(a)                             {a,a,a,a,a,a,a,a}
+#define VINIT16ALL(a)                            {a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a}
+#endif
+#endif
+
 #endif
 
 /* end of machine.h */
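
machine.h gains the VINIT* helper macros so the rest of the patch can write vector literals portably: Apple's GCC wants the parenthesized form (vector float)(a,b,c,d), while standard AltiVec C uses the braced initializer (vector float){a,b,c,d}. Usage looks like the sketch below; the two functions are illustrative, but the literal patterns are exactly the ones used in the fft.c and newmdct.c hunks.

#ifdef ALTIVEC
#ifndef __APPLE_CC__
#include <altivec.h>
#endif

static vector float splat_half(void)
{
    return (vector float)VINIT4ALL(0.5f);                /* {0.5f, 0.5f, 0.5f, 0.5f} */
}

static vector unsigned char reverse_words_perm(void)
{
    /* permute pattern that reverses the four 32-bit words of a vector (vperm5 in newmdct.c) */
    return (vector unsigned char)VINIT16(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
}
#endif
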
--- libmp3lame/newmdct.c.orig	2011-05-08 01:05:17.000000000 +0900
+++ libmp3lame/newmdct.c	2017-10-14 18:02:08.000000000 +0900
@@ -30,6 +30,12 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include "lame.h"
 #include "machine.h"
 #include "encoder.h"
@@ -39,7 +45,7 @@
 
 
 #ifndef USE_GOGO_SUBBAND
-static const FLOAT enwindow[] = {
+static const FLOAT enwindow[] __attribute__ ((aligned (16))) = {
     -4.77e-07 * 0.740951125354959 / 2.384e-06, 1.03951e-04 * 0.740951125354959 / 2.384e-06,
     9.53674e-04 * 0.740951125354959 / 2.384e-06, 2.841473e-03 * 0.740951125354959 / 2.384e-06,
     3.5758972e-02 * 0.740951125354959 / 2.384e-06, 3.401756e-03 * 0.740951125354959 / 2.384e-06, 9.83715e-04 * 0.740951125354959 / 2.384e-06, 9.9182e-05 * 0.740951125354959 / 2.384e-06, /* 15 */
@@ -230,7 +236,7 @@ static const FLOAT enwindow[] = {
 #define NS 12
 #define NL 36
 
-static const FLOAT win[4][NL] = {
+static const FLOAT win[4][NL] __attribute__ ((aligned (16))) = {
     {
      2.382191739347913e-13,
      6.423305872147834e-13,
@@ -435,6 +441,443 @@ window_subband(const sample_t * x1, FLOA
 
     const sample_t *x2 = &x1[238 - 14 - 286];
 
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector float vw1,vw2,vw3,vw4,vw5,vw6,vw7,vw8,vs,vt,vzero;
+    vector unsigned char vperm2,vperm3,vperm4,vperm5;
+    vzero = vec_xor(vzero,vzero);
+    vperm5 = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
+    vperm2 = vec_lvsl(0,wp+8);
+    vperm3 = (vector unsigned char)VINIT16(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+    vperm4 = vec_lvsl(0,x1+1);
+    vperm4 = vec_perm(vperm4,vperm4,vperm5);
+    
+    for(i=0;i<3;i++) {
+        v1 = vec_ld(0,wp-10);
+        v2 = vec_ld(16,wp-10);
+        v5 = vec_ld(0,wp+8);
+        v6 = vec_ld(16,wp+8);
+        v7 = vec_ld(32,wp+8);
+        v3 = vec_ld(0,wp+26);
+        v4 = vec_ld(16,wp+26);
+        v8 = vec_ld(0,wp+44);
+        v9 = vec_ld(16,wp+44);
+        v10 = vec_ld(32,wp+44);
+        
+        v5 = vec_perm(v5,v6,vperm2);
+        v6 = vec_perm(v6,v7,vperm2);
+        v7 = vec_perm(v8,v9,vperm2);
+        v8 = vec_perm(v9,v10,vperm2);
+        v9 = vec_mergeh(v1,v3);
+        v10 = vec_mergeh(v2,v4);
+        v11 = vec_mergeh(v5,v7);
+        v12 = vec_mergeh(v6,v8);
+        v13 = vec_mergel(v1,v3);
+        v14 = vec_mergel(v2,v4);
+        v15 = vec_mergel(v5,v7);
+        v16 = vec_mergel(v6,v8);
+        vw1 = vec_mergeh(v9,v11);
+        vw5 = vec_mergeh(v10,v12);
+        vw2 = vec_mergel(v9,v11);
+        vw6 = vec_mergel(v10,v12);
+        vw3 = vec_mergeh(v13,v15);
+        vw7 = vec_mergeh(v14,v16);
+        vw4 = vec_mergel(v13,v15);
+        vw8 = vec_mergel(v14,v16);
+        
+        v3 = vec_ld(0,x2-224);
+        vs = vec_madd(vw1,v3,vzero);
+        v4 = vec_ld(16,x1+221);
+        v5 = vec_ld(0,x1+221);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw1,v6,vzero);
+        
+        v3 = vec_ld(0,x2-160);
+        vs = vec_madd(vw2,v3,vs);
+        v4 = vec_ld(16,x1+157);
+        v5 = vec_ld(0,x1+157);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw2,v6,vt);
+        
+        v3 = vec_ld(0,x2-96);
+        vs = vec_madd(vw3,v3,vs);
+        v4 = vec_ld(16,x1+93);
+        v5 = vec_ld(0,x1+93);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw3,v6,vt);
+        
+        v3 = vec_ld(0,x2-32);
+        vs = vec_madd(vw4,v3,vs);
+        v4 = vec_ld(16,x1+29);
+        v5 = vec_ld(0,x1+29);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw4,v6,vt);
+        
+        
+        v3 = vec_ld(0,x2+32);
+        vs = vec_madd(vw5,v3,vs);
+        v4 = vec_ld(16,x1-35);
+        v5 = vec_ld(0,x1-35);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw5,v6,vt);
+        
+        v3 = vec_ld(0,x2+96);
+        vs = vec_madd(vw6,v3,vs);
+        v4 = vec_ld(16,x1-99);
+        v5 = vec_ld(0,x1-99);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw6,v6,vt);
+        
+        v3 = vec_ld(0,x2+160);
+        vs = vec_madd(vw7,v3,vs);
+        v4 = vec_ld(16,x1-163);
+        v5 = vec_ld(0,x1-163);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw7,v6,vt);
+        
+        v3 = vec_ld(0,x2+224);
+        vs = vec_madd(vw8,v3,vs);
+        v4 = vec_ld(16,x1-227);
+        v5 = vec_ld(0,x1-227);
+        v6 = vec_perm(v5,v4,vperm4);
+        vt = vec_madd(vw8,v6,vt);
+        
+        
+        v1 = vec_ld(0,wp-2);
+        v2 = vec_ld(16,wp-2);
+        v5 = vec_ld(0,wp+16);
+        v6 = vec_ld(16,wp+16);
+        v7 = vec_ld(32,wp+16);
+        v3 = vec_ld(0,wp+34);
+        v4 = vec_ld(16,wp+34);
+        v8 = vec_ld(0,wp+52);
+        v9 = vec_ld(16,wp+52);
+        v10 = vec_ld(32,wp+52);
+        
+        v5 = vec_perm(v5,v6,vperm2);
+        v6 = vec_perm(v6,v7,vperm2);
+        v7 = vec_perm(v8,v9,vperm2);
+        v8 = vec_perm(v9,v10,vperm2);
+        v9 = vec_mergeh(v1,v3);
+        v10 = vec_mergeh(v2,v4);
+        v11 = vec_mergeh(v5,v7);
+        v12 = vec_mergeh(v6,v8);
+        v13 = vec_mergel(v1,v3);
+        v14 = vec_mergel(v2,v4);
+        v15 = vec_mergel(v5,v7);
+        v16 = vec_mergel(v6,v8);
+        vw1 = vec_mergeh(v9,v11);
+        vw5 = vec_mergeh(v10,v12);
+        vw2 = vec_mergel(v9,v11);
+        vw6 = vec_mergel(v10,v12);
+        vw3 = vec_mergeh(v13,v15);
+        vw7 = vec_mergeh(v14,v16);
+        vw4 = vec_mergel(v13,v15);
+        vw8 = vec_mergel(v14,v16);
+        
+        v3 = vec_ld(0,x2+256);
+        vt = vec_nmsub(vw1,v3,vt);
+        v4 = vec_ld(16,x1-259);
+        v5 = vec_ld(0,x1-259);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw1,v6,vs);
+        
+        v3 = vec_ld(0,x2+192);
+        vt = vec_nmsub(vw2,v3,vt);
+        v4 = vec_ld(16,x1-195);
+        v5 = vec_ld(0,x1-195);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw2,v6,vs);
+        
+        v3 = vec_ld(0,x2+128);
+        vt = vec_nmsub(vw3,v3,vt);
+        v4 = vec_ld(16,x1-131);
+        v5 = vec_ld(0,x1-131);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw3,v6,vs);
+        
+        v3 = vec_ld(0,x2+64);
+        vt = vec_nmsub(vw4,v3,vt);
+        v4 = vec_ld(16,x1-67);
+        v5 = vec_ld(0,x1-67);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw4,v6,vs);
+        
+        
+        v3 = vec_ld(0,x2);
+        vt = vec_nmsub(vw5,v3,vt);
+        v4 = vec_ld(16,x1-3);
+        v5 = vec_ld(0,x1-3);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw5,v6,vs);
+        
+        v3 = vec_ld(0,x2-64);
+        vt = vec_nmsub(vw6,v3,vt);
+        v4 = vec_ld(16,x1+61);
+        v5 = vec_ld(0,x1+61);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw6,v6,vs);
+        
+        v3 = vec_ld(0,x2-128);
+        vt = vec_nmsub(vw7,v3,vt);
+        v4 = vec_ld(16,x1+125);
+        v5 = vec_ld(0,x1+125);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw7,v6,vs);
+        
+        v3 = vec_ld(0,x2-192);
+        vt = vec_nmsub(vw8,v3,vt);
+        v4 = vec_ld(16,x1+189);
+        v5 = vec_ld(0,x1+189);
+        v6 = vec_perm(v5,v4,vperm4);
+        vs = vec_madd(vw8,v6,vs);
+        
+        /*end*/
+        
+        v3 = vec_ld(0,wp+6);
+        
+        v4 = vec_ld(0,wp+24);
+        v5 = vec_ld(16,wp+24);
+        v6 = vec_perm(v4,v5,vperm2);
+        
+        v9 = vec_ld(0,wp+42);
+        
+        v10 = vec_ld(0,wp+60);
+        v11 = vec_ld(16,wp+60);
+        v12 = vec_perm(v10,v11,vperm2);
+        
+        v13 = vec_mergeh(v3,v9);
+        v14 = vec_mergeh(v6,v12);
+        vw1 = vec_mergeh(v13,v14);
+        vw2 = vec_mergel(v13,v14);
+        
+        vs = vec_madd(vs,vw1,vzero);
+        v1 = vec_sub(vt,vs);
+        v2 = vec_add(vt,vs);
+        v3 = vec_madd(vw2,v1,vzero);
+        v4 = vec_mergeh(v2,v3);
+        v5 = vec_mergel(v2,v3);
+        vec_st(v4,0,a+i*8);
+        vec_st(v5,16,a+i*8);
+        
+        wp += 72;
+        x1-=4;
+        x2+=4;
+    }
+    
+    v1 = vec_ld(0,wp-10);
+    v2 = vec_ld(16,wp-10);
+    v5 = vec_ld(0,wp+8);
+    v6 = vec_ld(16,wp+8);
+    v7 = vec_ld(32,wp+8);
+    v3 = vec_ld(0,wp+26);
+    v4 = vec_ld(16,wp+26);
+    v8 = vec_ld(0,wp+44);
+    v9 = vec_ld(16,wp+44);
+    v10 = vec_ld(32,wp+44);
+    
+    v5 = vec_perm(v5,v6,vperm2);
+    v6 = vec_perm(v6,v7,vperm2);
+    v7 = vec_perm(v8,v9,vperm2);
+    v8 = vec_perm(v9,v10,vperm2);
+    v9 = vec_mergeh(v1,v3);
+    v10 = vec_mergeh(v2,v4);
+    v11 = vec_mergeh(v5,v7);
+    v12 = vec_mergeh(v6,v8);
+    v13 = vec_mergel(v1,v3);
+    v14 = vec_mergel(v2,v4);
+    v15 = vec_mergel(v5,v7);
+    v16 = vec_mergel(v6,v8);
+    vw1 = vec_mergeh(v9,v11);
+    vw5 = vec_mergeh(v10,v12);
+    vw2 = vec_mergel(v9,v11);
+    vw6 = vec_mergel(v10,v12);
+    vw3 = vec_mergeh(v13,v15);
+    vw7 = vec_mergeh(v14,v16);
+    vw4 = vec_mergel(v13,v15);
+    vw8 = vec_mergel(v14,v16);
+    
+    v3 = vec_ld(0,x2-224);
+    vs = vec_madd(vw1,v3,vzero);
+    v4 = vec_ld(16,x1+221);
+    v5 = vec_ld(0,x1+221);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw1,v6,vzero);
+    
+    v3 = vec_ld(0,x2-160);
+    vs = vec_madd(vw2,v3,vs);
+    v4 = vec_ld(16,x1+157);
+    v5 = vec_ld(0,x1+157);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw2,v6,vt);
+    
+    v3 = vec_ld(0,x2-96);
+    vs = vec_madd(vw3,v3,vs);
+    v4 = vec_ld(16,x1+93);
+    v5 = vec_ld(0,x1+93);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw3,v6,vt);
+    
+    v3 = vec_ld(0,x2-32);
+    vs = vec_madd(vw4,v3,vs);
+    v4 = vec_ld(16,x1+29);
+    v5 = vec_ld(0,x1+29);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw4,v6,vt);
+    
+    
+    v3 = vec_ld(0,x2+32);
+    vs = vec_madd(vw5,v3,vs);
+    v4 = vec_ld(16,x1-35);
+    v5 = vec_ld(0,x1-35);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw5,v6,vt);
+    
+    v3 = vec_ld(0,x2+96);
+    vs = vec_madd(vw6,v3,vs);
+    v4 = vec_ld(16,x1-99);
+    v5 = vec_ld(0,x1-99);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw6,v6,vt);
+    
+    v3 = vec_ld(0,x2+160);
+    vs = vec_madd(vw7,v3,vs);
+    v4 = vec_ld(16,x1-163);
+    v5 = vec_ld(0,x1-163);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw7,v6,vt);
+    
+    v3 = vec_ld(0,x2+224);
+    vs = vec_madd(vw8,v3,vs);
+    v4 = vec_ld(16,x1-227);
+    v5 = vec_ld(0,x1-227);
+    v6 = vec_perm(v5,v4,vperm4);
+    vt = vec_madd(vw8,v6,vt);
+    
+    
+    v1 = vec_ld(0,wp-2);
+    v2 = vec_ld(16,wp-2);
+    v5 = vec_ld(0,wp+16);
+    v6 = vec_ld(16,wp+16);
+    v7 = vec_ld(32,wp+16);
+    v3 = vec_ld(0,wp+34);
+    v4 = vec_ld(16,wp+34);
+    v8 = vec_ld(0,wp+52);
+    v9 = vec_ld(16,wp+52);
+    v10 = vec_ld(32,wp+52);
+    
+    v5 = vec_perm(v5,v6,vperm2);
+    v6 = vec_perm(v6,v7,vperm2);
+    v7 = vec_perm(v8,v9,vperm2);
+    v8 = vec_perm(v9,v10,vperm2);
+    v9 = vec_mergeh(v1,v3);
+    v10 = vec_mergeh(v2,v4);
+    v11 = vec_mergeh(v5,v7);
+    v12 = vec_mergeh(v6,v8);
+    v13 = vec_mergel(v1,v3);
+    v14 = vec_mergel(v2,v4);
+    v15 = vec_mergel(v5,v7);
+    v16 = vec_mergel(v6,v8);
+    vw1 = vec_mergeh(v9,v11);
+    vw5 = vec_mergeh(v10,v12);
+    vw2 = vec_mergel(v9,v11);
+    vw6 = vec_mergel(v10,v12);
+    vw3 = vec_mergeh(v13,v15);
+    vw7 = vec_mergeh(v14,v16);
+    vw4 = vec_mergel(v13,v15);
+    vw8 = vec_mergel(v14,v16);
+    
+    v3 = vec_ld(0,x2+256);
+    vt = vec_nmsub(vw1,v3,vt);
+    v4 = vec_ld(16,x1-259);
+    v5 = vec_ld(0,x1-259);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw1,v6,vs);
+    
+    v3 = vec_ld(0,x2+192);
+    vt = vec_nmsub(vw2,v3,vt);
+    v4 = vec_ld(16,x1-195);
+    v5 = vec_ld(0,x1-195);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw2,v6,vs);
+    
+    v3 = vec_ld(0,x2+128);
+    vt = vec_nmsub(vw3,v3,vt);
+    v4 = vec_ld(16,x1-131);
+    v5 = vec_ld(0,x1-131);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw3,v6,vs);
+    
+    v3 = vec_ld(0,x2+64);
+    vt = vec_nmsub(vw4,v3,vt);
+    v4 = vec_ld(16,x1-67);
+    v5 = vec_ld(0,x1-67);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw4,v6,vs);
+    
+    
+    v3 = vec_ld(0,x2);
+    vt = vec_nmsub(vw5,v3,vt);
+    v4 = vec_ld(16,x1-3);
+    v5 = vec_ld(0,x1-3);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw5,v6,vs);
+    
+    v3 = vec_ld(0,x2-64);
+    vt = vec_nmsub(vw6,v3,vt);
+    v4 = vec_ld(16,x1+61);
+    v5 = vec_ld(0,x1+61);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw6,v6,vs);
+    
+    v3 = vec_ld(0,x2-128);
+    vt = vec_nmsub(vw7,v3,vt);
+    v4 = vec_ld(16,x1+125);
+    v5 = vec_ld(0,x1+125);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw7,v6,vs);
+    
+    v3 = vec_ld(0,x2-192);
+    vt = vec_nmsub(vw8,v3,vt);
+    v4 = vec_ld(16,x1+189);
+    v5 = vec_ld(0,x1+189);
+    v6 = vec_perm(v5,v4,vperm4);
+    vs = vec_madd(vw8,v6,vs);
+    
+    /*end*/
+    
+    v3 = vec_ld(0,wp+6);
+    
+    v4 = vec_ld(0,wp+24);
+    v5 = vec_ld(16,wp+24);
+    v6 = vec_perm(v4,v5,vperm2);
+    
+    v9 = vec_ld(0,wp+42);
+    
+    v10 = vec_ld(0,wp+60);
+    v11 = vec_ld(16,wp+60);
+    v12 = vec_perm(v10,v11,vperm2);
+    
+    v13 = vec_mergeh(v3,v9);
+    v14 = vec_mergeh(v6,v12);
+    vw1 = vec_mergeh(v13,v14);
+    vw2 = vec_mergel(v13,v14);
+    
+    vs = vec_madd(vs,vw1,vzero);
+    v1 = vec_sub(vt,vs);
+    v2 = vec_add(vt,vs);
+    v3 = vec_madd(vw2,v1,vzero);
+    v4 = vec_ld(16,a+24);
+    v5 = vec_mergeh(v2,v3);
+    v6 = vec_mergel(v2,v3);
+    v7 = vec_perm(v6,v4,vperm3);
+    vec_st(v5,0,a+24);
+    vec_st(v7,16,a+24);
+    
+    wp += 54;
+    x1-=3;
+    x2+=3;
+#else
     for (i = -15; i < 0; i++) {
         FLOAT   w, s, t;
 
@@ -501,6 +944,7 @@ window_subband(const sample_t * x1, FLOA
         x1--;
         x2++;
     }
+#endif
     {
         FLOAT   s, t, u, v;
         t = x1[-16] * wp[-10];
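
The window_subband() hunk processes four iterations of the scalar i-loop at a time: each enwindow coefficient (e.g. wp[-10]) for four consecutive iterations lies 18 floats apart, so the code loads those strided rows (wp-10, wp+8, wp+26, wp+44, ...) and transposes them into the coefficient vectors vw1..vw8 with vec_mergeh/vec_mergel before accumulating s and t via vec_madd/vec_nmsub. The core of that shuffle is the standard AltiVec 4x4 transpose idiom, shown in isolation below (a sketch; transpose4x4 is not a function in the patch).

#include <altivec.h>

/* Two rounds of merge-high/merge-low turn four row vectors into four column vectors. */
static void transpose4x4(vector float r0, vector float r1,
                         vector float r2, vector float r3,
                         vector float out[4])
{
    vector float t0 = vec_mergeh(r0, r2);  /* r0[0] r2[0] r0[1] r2[1] */
    vector float t1 = vec_mergeh(r1, r3);  /* r1[0] r3[0] r1[1] r3[1] */
    vector float t2 = vec_mergel(r0, r2);  /* r0[2] r2[2] r0[3] r2[3] */
    vector float t3 = vec_mergel(r1, r3);  /* r1[2] r3[2] r1[3] r3[3] */
    out[0] = vec_mergeh(t0, t1);           /* column 0: r0[0] r1[0] r2[0] r3[0] */
    out[1] = vec_mergel(t0, t1);           /* column 1 */
    out[2] = vec_mergeh(t2, t3);           /* column 2 */
    out[3] = vec_mergel(t2, t3);           /* column 3 */
}
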
--- libmp3lame/psymodel.c.orig	2017-09-07 04:38:23.000000000 +0900
+++ libmp3lame/psymodel.c	2017-10-14 18:10:00.000000000 +0900
@@ -146,6 +146,12 @@ blocktype_d[2]        block type to use 
 
 #include <float.h>
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include "lame.h"
 #include "machine.h"
 #include "encoder.h"
@@ -164,6 +170,48 @@ blocktype_d[2]        block type to use 
 #define  LN_TO_LOG10  0.2302585093
 #endif
 
+#ifdef ALTIVEC
+static inline vector float fast_log10_altivec_2(vector float v3)
+{
+    vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
+    vector float v1,v2,v4,v5,v6,v7,v8,vz,vz2,vlog;
+    vector unsigned int vconst1,vconst2,vshamt;
+    vector signed int vconst3;
+    
+    va = (vector float)VINIT4ALL(0.8685890659);
+    vb = (vector float)VINIT4ALL(0.2894672153);
+    vc = (vector float)VINIT4ALL(0.1793365895);
+    vhalf = (vector float)VINIT4ALL(0.15051499783);
+    vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
+    vconst4 = (vector float)VINIT4ALL(0.301029995664);
+    vzero = vec_xor(vzero,vzero);
+    vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9));
+    vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7));
+    vconst2 = vec_nor(vconst2,vconst2);
+    vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7));
+    vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7));
+    vshamt = vec_add(vshamt,vec_splat_u32(7));
+    vconst2 = vec_sl((vector unsigned int)vconst3,vshamt);
+    
+    v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1);
+    v5 = vec_add(v4,vsqrt2);
+    v6 = vec_sub(v4,vsqrt2);
+    v7 = vec_re(v5);
+    vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero);
+    v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt);
+    vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0);
+    
+    vz2 = vec_madd(vz,vz,vzero);
+    vlog = vec_madd(vlog,vconst4,vhalf);
+    
+    v1 = vec_madd(vz2,vc,vb);
+    v2 = vec_madd(vz2,v1,va);
+    vlog = vec_madd(vz,v2,vlog);
+    
+    return vlog;
+}
+#endif
+
 
 /*
    L3psycho_anal.  Compute psycho acoustics.
@@ -253,6 +301,11 @@ static const FLOAT ma_max_i1 = 3.6517412
 static const FLOAT ma_max_i2 = 31.622776601683793;
 /* pow(10, (MLIMIT) / 10.0); */
 static const FLOAT ma_max_m  = 31.622776601683793;
+#ifdef ALTIVEC
+static const vector float vmamax1 = (vector float)VINIT4ALL(3.651741);
+static const vector float vmamax2 = (vector float)VINIT4ALL(31.622777);
+#endif
+
 
     /*This is the masking table:
        According to tonality, values are going from 0dB (TMN)
@@ -666,6 +719,14 @@ static void
 vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * const buffer[2], int chn,
                      int gr_out, FLOAT fftenergy[HBLKSIZE], FLOAT(*wsamp_l)[BLKSIZE])
 {
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2;
+    vector unsigned char vperm;
+    vhalf = vec_ctf(vec_splat_s32(1),1);
+    vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001);
+    vzero = vec_xor(vzero,vzero);
+    vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23);
+#endif
     SessionConfig_t const *const cfg = &gfc->cfg;
     PsyStateVar_t *psv = &gfc->sv_psy;
     plotting_data *plt = cfg->analysis ? gfc->pinfo : 0;
@@ -675,19 +736,80 @@ vbrpsy_compute_fft_l(lame_internal_flags
         fft_long(gfc, *wsamp_l, chn, buffer);
     }
     else if (chn == 2) {
-        FLOAT const sqrt2_half = SQRT2 * 0.5f;
         /* FFT data for mid and side channel is derived from L & R */
+#ifdef ALTIVEC
+        for(j = 0; j < BLKSIZE; j += 8) {
+            v1 = vec_ld(0,wsamp_l[0]+j);
+            v2 = vec_ld(0,wsamp_l[1]+j);
+            v3 = vec_ld(16,wsamp_l[0]+j);
+            v4 = vec_ld(16,wsamp_l[1]+j);
+            
+            v5 = vec_add(v1,v2);
+            v6 = vec_sub(v1,v2);
+            v7 = vec_add(v3,v4);
+            v8 = vec_sub(v3,v4);
+            v9 = vec_madd(v5,vsqrt2,vzero);
+            v10 = vec_madd(v6,vsqrt2,vzero);
+            v11 = vec_madd(v7,vsqrt2,vzero);
+            v12 = vec_madd(v8,vsqrt2,vzero);
+            
+            vec_st(v9,0,wsamp_l[0]+j);
+            vec_st(v10,0,wsamp_l[1]+j);
+            vec_st(v11,16,wsamp_l[0]+j);
+            vec_st(v12,16,wsamp_l[1]+j);
+        }
+#else
+        FLOAT const sqrt2_half = SQRT2 * 0.5f;
         for (j = BLKSIZE - 1; j >= 0; --j) {
             FLOAT const l = wsamp_l[0][j];
             FLOAT const r = wsamp_l[1][j];
             wsamp_l[0][j] = (l + r) * sqrt2_half;
             wsamp_l[1][j] = (l - r) * sqrt2_half;
         }
+#endif
     }
 
     /*********************************************************************
     *  compute energies
     *********************************************************************/
+#ifdef ALTIVEC
+    vprev = vec_ld(0,(*wsamp_l));
+    for(j = 0; j < BLKSIZE/2; j += 16) {
+        v1 = vec_ld(0,(*wsamp_l)+j);
+        v2 = vec_ld(16,(*wsamp_l)+j);
+        v3 = vec_ld(32,(*wsamp_l)+j);
+        v4 = vec_ld(48,(*wsamp_l)+j);
+        v5 = vec_ld(48,(*wsamp_l)+1008-j);
+        v6 = vec_ld(32,(*wsamp_l)+1008-j);
+        v7 = vec_ld(16,(*wsamp_l)+1008-j);
+        v8 = vec_ld(0,(*wsamp_l)+1008-j);
+        v9 = vec_perm(vprev,v5,vperm);
+        v10 = vec_perm(v5,v6,vperm);
+        v11 = vec_perm(v6,v7,vperm);
+        v12 = vec_perm(v7,v8,vperm);
+        vprev = v8;
+        v1 = vec_madd(v1,v1,vzero);
+        v2 = vec_madd(v2,v2,vzero);
+        v3 = vec_madd(v3,v3,vzero);
+        v4 = vec_madd(v4,v4,vzero);
+        v5 = vec_madd(v9,v9,v1);
+        v6 = vec_madd(v10,v10,v2);
+        v7 = vec_madd(v11,v11,v3);
+        v8 = vec_madd(v12,v12,v4);
+        v9 = vec_madd(v5,vhalf,vzero);
+        v10 = vec_madd(v6,vhalf,vzero);
+        v11 = vec_madd(v7,vhalf,vzero);
+        v12 = vec_madd(v8,vhalf,vzero);
+        
+        vec_st(v9,0,fftenergy+j);
+        vec_st(v10,16,fftenergy+j);
+        vec_st(v11,32,fftenergy+j);
+        vec_st(v12,48,fftenergy+j);
+    }
+    
+    v1 = vec_madd(vprev,vprev,vzero);
+    vec_ste(v1,0,fftenergy+j);
+#else
     fftenergy[0] = wsamp_l[0][0];
     fftenergy[0] *= fftenergy[0];
 
@@ -696,13 +818,51 @@ vbrpsy_compute_fft_l(lame_internal_flags
         FLOAT const im = (*wsamp_l)[BLKSIZE / 2 + j];
         fftenergy[BLKSIZE / 2 - j] = (re * re + im * im) * 0.5f;
     }
+#endif
     /* total energy */
     {
+#ifdef ALTIVEC
+#ifdef ALTIVEC_970
+        v5 = vec_ld(0,fftenergy+8);
+        v6 = vec_ld(0,fftenergy+508);
+        v7 = vec_ld(0,fftenergy+512);
+        v8 = vec_xor(v8,v8);
+        v5 = vec_sld(v5,v8,12);
+        v7 = vec_sld(v8,v7,4);
+#else
+        v5 = vec_lde(0,fftenergy+11);
+        v6 = vec_ld(0,fftenergy+508);
+        v7 = vec_lde(0,fftenergy+512);
+        v8 = vec_xor(v8,v8);
+#endif
+        for(j=12;j<508;j+=16) {
+            v1 = vec_ld(0,fftenergy+j);
+            v2 = vec_ld(16,fftenergy+j);
+            v3 = vec_ld(32,fftenergy+j);
+            v4 = vec_ld(48,fftenergy+j);
+            v5 = vec_add(v1,v5);
+            v6 = vec_add(v2,v6);
+            v7 = vec_add(v3,v7);
+            v8 = vec_add(v4,v8);
+        }
+        v5 = vec_add(v5,v6);
+        v7 = vec_add(v7,v8);
+        v5 = vec_add(v5,v7);
+        v6 = vec_sld(v5,v5,4);
+        v7 = vec_sld(v5,v5,8);
+        v8 = vec_sld(v5,v5,12);
+        v5 = vec_add(v5,v6);
+        v7 = vec_add(v7,v8);
+        v5 = vec_add(v5,v7);
+        v5 = vec_perm(v5,v5,vec_lvsr(0, psv->tot_ener+chn));
+        vec_ste(v5,0,psv->tot_ener+chn);
+#else
         FLOAT   totalenergy = 0.0f;
         for (j = 11; j < HBLKSIZE; j++)
             totalenergy += fftenergy[j];
 
         psv->tot_ener[chn] = totalenergy;
+#endif
     }
 
     if (plt) {
@@ -716,27 +876,96 @@ vbrpsy_compute_fft_l(lame_internal_flags
 
 static void
 vbrpsy_compute_fft_s(lame_internal_flags const *gfc, const sample_t * const buffer[2], int chn,
-                     int sblock, FLOAT(*fftenergy_s)[HBLKSIZE_s], FLOAT(*wsamp_s)[3][BLKSIZE_s])
+                     int sblock, FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT(*wsamp_s)[3][BLKSIZE_s])
 {
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2;
+    vector unsigned char vperm;
+    vhalf = vec_ctf(vec_splat_s32(1),1);
+    vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001);
+    vzero = vec_xor(vzero,vzero);
+    vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23);
+#endif
     int     j;
 
     if (sblock == 0 && chn < 2) {
         fft_short(gfc, *wsamp_s, chn, buffer);
     }
     if (chn == 2) {
-        FLOAT const sqrt2_half = SQRT2 * 0.5f;
         /* FFT data for mid and side channel is derived from L & R */
+#ifdef ALTIVEC
+        for(j = 0; j < BLKSIZE_s; j += 8) {
+            v1 = vec_ld(0,wsamp_s[0][sblock]+j);
+            v2 = vec_ld(0,wsamp_s[1][sblock]+j);
+            v3 = vec_ld(16,wsamp_s[0][sblock]+j);
+            v4 = vec_ld(16,wsamp_s[1][sblock]+j);
+            
+            v5 = vec_add(v1,v2);
+            v6 = vec_sub(v1,v2);
+            v7 = vec_add(v3,v4);
+            v8 = vec_sub(v3,v4);
+            v9 = vec_madd(v5,vsqrt2,vzero);
+            v10 = vec_madd(v6,vsqrt2,vzero);
+            v11 = vec_madd(v7,vsqrt2,vzero);
+            v12 = vec_madd(v8,vsqrt2,vzero);
+            
+            vec_st(v9,0,wsamp_s[0][sblock]+j);
+            vec_st(v10,0,wsamp_s[1][sblock]+j);
+            vec_st(v11,16,wsamp_s[0][sblock]+j);
+            vec_st(v12,16,wsamp_s[1][sblock]+j);
+        }
+#else
+        FLOAT const sqrt2_half = SQRT2 * 0.5f;
         for (j = BLKSIZE_s - 1; j >= 0; --j) {
             FLOAT const l = wsamp_s[0][sblock][j];
             FLOAT const r = wsamp_s[1][sblock][j];
             wsamp_s[0][sblock][j] = (l + r) * sqrt2_half;
             wsamp_s[1][sblock][j] = (l - r) * sqrt2_half;
         }
+#endif
     }
 
     /*********************************************************************
     *  compute energies
     *********************************************************************/
+#ifdef ALTIVEC
+    vprev = vec_ld(0,(*wsamp_s)[sblock]);
+    for(j = 0; j < BLKSIZE_s/2; j += 16) {
+        v1 = vec_ld(0,(*wsamp_s)[sblock]+j);
+        v2 = vec_ld(16,(*wsamp_s)[sblock]+j);
+        v3 = vec_ld(32,(*wsamp_s)[sblock]+j);
+        v4 = vec_ld(48,(*wsamp_s)[sblock]+j);
+        v5 = vec_ld(48,(*wsamp_s)[sblock]+240-j);
+        v6 = vec_ld(32,(*wsamp_s)[sblock]+240-j);
+        v7 = vec_ld(16,(*wsamp_s)[sblock]+240-j);
+        v8 = vec_ld(0,(*wsamp_s)[sblock]+240-j);
+        v9 = vec_perm(vprev,v5,vperm);
+        v10 = vec_perm(v5,v6,vperm);
+        v11 = vec_perm(v6,v7,vperm);
+        v12 = vec_perm(v7,v8,vperm);
+        vprev = v8;
+        v1 = vec_madd(v1,v1,vzero);
+        v2 = vec_madd(v2,v2,vzero);
+        v3 = vec_madd(v3,v3,vzero);
+        v4 = vec_madd(v4,v4,vzero);
+        v5 = vec_madd(v9,v9,v1);
+        v6 = vec_madd(v10,v10,v2);
+        v7 = vec_madd(v11,v11,v3);
+        v8 = vec_madd(v12,v12,v4);
+        v9 = vec_madd(v5,vhalf,vzero);
+        v10 = vec_madd(v6,vhalf,vzero);
+        v11 = vec_madd(v7,vhalf,vzero);
+        v12 = vec_madd(v8,vhalf,vzero);
+        
+        vec_st(v9,0,fftenergy_s[sblock]+j);
+        vec_st(v10,16,fftenergy_s[sblock]+j);
+        vec_st(v11,32,fftenergy_s[sblock]+j);
+        vec_st(v12,48,fftenergy_s[sblock]+j);
+    }
+    
+    v1 = vec_madd(vprev,vprev,vzero);
+    vec_ste(v1,0,fftenergy_s[sblock]+j);
+#else
     fftenergy_s[sblock][0] = (*wsamp_s)[sblock][0];
     fftenergy_s[sblock][0] *= fftenergy_s[sblock][0];
     for (j = BLKSIZE_s / 2 - 1; j >= 0; --j) {
@@ -744,6 +973,7 @@ vbrpsy_compute_fft_s(lame_internal_flags
         FLOAT const im = (*wsamp_s)[sblock][BLKSIZE_s / 2 + j];
         fftenergy_s[sblock][BLKSIZE_s / 2 - j] = (re * re + im * im) * 0.5f;
     }
+#endif
 }
 
 
@@ -772,7 +1002,24 @@ vbrpsy_attack_detection(lame_internal_fl
                         FLOAT energy[4], FLOAT sub_short_factor[4][3], int ns_attacks[4][4],
                         int uselongblock[2])
 {
-    FLOAT   ns_hpfsmpl[2][576];
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector float vsum,vsum1,vsum2,vsuma,vsumb,vsumc,vsumd,vmaska,vmaskb,vmaskc,vmaskd;
+    vector unsigned char vmask1,vmask2,vmask3,vmask4,vmask1inv,vmask2inv,vmask3inv,vmask4inv,vperm,vs4,vs8,vs12;
+    
+    vperm = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
+    v1 = (vector float)vec_splat_u8(1);
+    v2 = (vector float)vec_splat_u8(5);
+    vs4 = vec_sl((vector unsigned char)v1,(vector unsigned char)v2);
+    vs8 = vec_sl(vs4,(vector unsigned char)v1);
+    vs12 = vec_or(vs4,vs8);
+    v3 = (vector float)vec_splat_s32(-1);
+    vmaska = vec_slo(v3,vs12);
+    vmaskb = vec_sro(vmaska,vs4);
+    vmaskc = vec_sro(vmaska,vs8);
+    vmaskd = vec_sro(vmaska,vs12);
+#endif
+    FLOAT   ns_hpfsmpl[2][576] __attribute__ ((aligned (16)));
     SessionConfig_t const *const cfg = &gfc->cfg;
     PsyStateVar_t *const psv = &gfc->sv_psy;
     plotting_data *plt = cfg->analysis ? gfc->pinfo : 0;
@@ -785,14 +1032,142 @@ vbrpsy_attack_detection(lame_internal_fl
     /* Don't copy the input buffer into a temporary buffer */
     /* unroll the loop 2 times */
     for (chn = 0; chn < n_chn_out; chn++) {
-        static const FLOAT fircoef[] = {
+        static const FLOAT fircoef[] __attribute__ ((aligned (16))) = {
             -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2,
             -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2,
-            -5.52212e-17 * 2, -0.313819 * 2
+            -5.52212e-17 * 2, -0.313819 * 2, 0.0, 0.0
         };
         /* apply high pass filter of fs/4 */
         const sample_t *const firbuf = &buffer[chn][576 - 350 - NSFIRLEN + 192];
-        assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2));
+        //assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2));
+#ifdef ALTIVEC
+        v1 = vec_ld(0, firbuf+10);
+        vmask1 = vec_lvsl(0, firbuf);
+        vmask2 = vec_lvsl(0, firbuf+1);
+        vmask3 = vec_lvsl(0, firbuf+2);
+        vmask4 = vec_lvsl(0, firbuf+3);
+        vmask1inv = vec_perm(vmask1,vmask1,vperm);
+        vmask2inv = vec_perm(vmask2,vmask2,vperm);
+        vmask3inv = vec_perm(vmask3,vmask3,vperm);
+        vmask4inv = vec_perm(vmask4,vmask4,vperm);
+        for(i=0;i<576;) {
+            v2 = vec_ld(16,firbuf+i+10);
+            vsum1 = vec_perm(v1, v2, vmask3);
+            v1 = v2;
+            
+            vsum2 = vec_splat(vsum1, 0);
+            vsum = vec_and(vsum2, vmaska);
+            v3 = vec_ld(0, firbuf+i);
+            v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
+            for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
+                v5 = vec_ld(16, firbuf+i+j);
+                v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
+                v7 = vec_perm(v3,v5,vmask1);
+                v8 = vec_perm(v6,v4,vmask3inv);
+                v3 = v5;
+                v4 = v6;
+                v10 = vec_ld(0,fircoef+j);
+                v11 = vec_add(v7,v8);
+                vsum = vec_madd(v10,v11,vsum);
+            }
+            
+            v12 = vec_slo(vsum,vs4);
+            v13 = vec_slo(vsum,vs8);
+            v14 = vec_slo(vsum,vs12);
+            v15 = vec_add(vsum,v12);
+            v16 = vec_add(v13,v14);
+            vsuma = vec_add(v15,v16);
+            vsuma = vec_and(vsuma,vmaska);
+            
+            i++;
+            
+            vsum2 = vec_splat(vsum1, 1);
+            vsum = vec_and(vsum2, vmaska);
+            v3 = vec_ld(0, firbuf+i);
+            v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
+            vmask2 = vec_lvsl(0, firbuf+i);
+            for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
+                v5 = vec_ld(16, firbuf+i+j);
+                v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
+                v7 = vec_perm(v3,v5,vmask2);
+                v8 = vec_perm(v6,v4,vmask4inv);
+                v3 = v5;
+                v4 = v6;
+                v10 = vec_ld(0,fircoef+j);
+                v11 = vec_add(v7,v8);
+                vsum = vec_madd(v10,v11,vsum);
+            }
+            
+            v12 = vec_sro(vsum,vs4);
+            v13 = vec_slo(vsum,vs4);
+            v14 = vec_slo(vsum,vs8);
+            v15 = vec_add(vsum,v12);
+            v16 = vec_add(v13,v14);
+            vsumb = vec_add(v15,v16);
+            vsumb = vec_and(vsumb,vmaskb);
+            
+            i++;
+            
+            vsum2 = vec_splat(vsum1, 2);
+            vsum = vec_and(vsum2, vmaska);
+            v3 = vec_ld(0, firbuf+i);
+            v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
+            vmask2 = vec_lvsl(0, firbuf+i);
+            for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
+                v5 = vec_ld(16, firbuf+i+j);
+                v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
+                v7 = vec_perm(v3,v5,vmask3);
+                v8 = vec_perm(v6,v4,vmask1inv);
+                v3 = v5;
+                v4 = v6;
+                v10 = vec_ld(0,fircoef+j);
+                v11 = vec_add(v7,v8);
+                vsum = vec_madd(v10,v11,vsum);
+            }
+            
+            v12 = vec_sro(vsum,vs4);
+            v13 = vec_sro(vsum,vs8);
+            v14 = vec_slo(vsum,vs4);
+            v15 = vec_add(vsum,v12);
+            v16 = vec_add(v13,v14);
+            vsumc = vec_add(v15,v16);
+            vsumc = vec_and(vsumc,vmaskc);
+            
+            i++;
+            
+            vsum2 = vec_splat(vsum1, 3);
+            vsum = vec_and(vsum2, vmaska);
+            v3 = vec_ld(0, firbuf+i);
+            v4 = vec_ld(16,firbuf+i+NSFIRLEN-3);
+            vmask2 = vec_lvsl(0, firbuf+i);
+            for(j=0;j<(NSFIRLEN-1)/2;j+=4) {
+                v5 = vec_ld(16, firbuf+i+j);
+                v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j);
+                v7 = vec_perm(v3,v5,vmask4);
+                v8 = vec_perm(v6,v4,vmask2inv);
+                v3 = v5;
+                v4 = v6;
+                v10 = vec_ld(0,fircoef+j);
+                v11 = vec_add(v7,v8);
+                vsum = vec_madd(v10,v11,vsum);
+            }
+            
+            v12 = vec_sro(vsum,vs4);
+            v13 = vec_sro(vsum,vs8);
+            v14 = vec_sro(vsum,vs12);
+            v15 = vec_add(vsum,v12);
+            v16 = vec_add(v13,v14);
+            vsumd = vec_add(v15,v16);
+            vsumd = vec_and(vsumd,vmaskd);
+            
+            vsum1 = vec_or(vsuma,vsumb);
+            vsum2 = vec_or(vsumc,vsumd);
+            vsum = vec_or(vsum1,vsum2);
+            
+            i++;
+            vec_st(vsum,0,ns_hpfsmpl[chn]+i-4);
+        }
+#else
         for (i = 0; i < 576; i++) {
             FLOAT   sum1, sum2;
             sum1 = firbuf[i + 10];
@@ -803,6 +1178,7 @@ vbrpsy_attack_detection(lame_internal_fl
             }
             ns_hpfsmpl[chn][i] = sum1 + sum2;
         }
+#endif
         masking_ratio[gr_out][chn].en = psv->en[chn];
         masking_ratio[gr_out][chn].thm = psv->thm[chn];
         if (n_chn_psy > 2) {
@@ -841,9 +1217,28 @@ vbrpsy_attack_detection(lame_internal_fl
         for (i = 0; i < 9; i++) {
             FLOAT const *const pfe = pf + 576 / 9;
             FLOAT   p = 1.;
+#ifdef ALTIVEC
+            FLOAT vmax[4] __attribute__ ((aligned (16)));
+            v1 = (vector float)vec_splat_s32(1);
+            v2 = vec_ctf((vector signed int)v1,0);
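+            /* running maximum starts at 1.0f (int 1 converted to float), matching the
+               scalar p = 1.; the vec_slo shifts below move the vector by 4/8/12 bytes
+               so the final vec_max folds all four lanes into element 0 */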
+            for (; pf < pfe; pf+=4) {
+                v3 = vec_ld(0,pf);
+                v4 = vec_abs(v3);
+                v2 = vec_max(v2,v4);
+            }
+            v5 = vec_slo(v2,vs4);
+            v6 = vec_slo(v2,vs8);
+            v7 = vec_slo(v2,vs12);
+            v8 = vec_max(v2,v5);
+            v9 = vec_max(v6,v7);
+            v10 = vec_max(v8,v9);
+            vec_st(v10,0,vmax);
+            p = vmax[0];
+#else
             for (; pf < pfe; pf++)
                 if (p < fabs(*pf))
                     p = fabs(*pf);
+#endif
             psv->last_en_subshort[chn][i] = en_subshort[i + 3] = p;
             en_short[1 + i / 3] += p;
             if (p > en_subshort[i + 3 - 2]) {
@@ -1039,7 +1434,7 @@ vbrpsy_calc_mask_index_s(lame_internal_f
 
 
 static void
-vbrpsy_compute_masking_s(lame_internal_flags * gfc, const FLOAT(*fftenergy_s)[HBLKSIZE_s],
+vbrpsy_compute_masking_s(lame_internal_flags * gfc, const FLOAT(*fftenergy_s)[HBLKSIZE_s+3],
                          FLOAT * eb, FLOAT * thr, int chn, int sblock)
 {
     PsyStateVar_t *const psv = &gfc->sv_psy;
@@ -1147,24 +1542,286 @@ vbrpsy_compute_masking_l(lame_internal_f
 {
     PsyStateVar_t *const psv = &gfc->sv_psy;
     PsyConst_CB2SB_t const *const gdl = &gfc->cd_psy->l;
-    FLOAT   max[CBANDS], avg[CBANDS];
-    unsigned char mask_idx_l[CBANDS + 2];
+    FLOAT   max[CBANDS] __attribute__ ((aligned (16))), avg[CBANDS];
+    unsigned char mask_idx_l[CBANDS + 2] __attribute__ ((aligned (16)));
     int     k, b;
+#ifdef ALTIVEC
+    float tmp[4] __attribute__ ((aligned (16)));
+    const vector unsigned char v31 = (vector unsigned char)VINIT16ALL(31);
+    const vector unsigned int vmask1 = (vector unsigned int)VINIT4ALL(0xff);
+    const vector signed int vone = (vector signed int)VINIT4ALL(1);
+    const vector unsigned int vtab1 = (vector unsigned int)VINIT4(0x3f800000,0x3f4b5936,0x3f218698,0x3f218698);
+    const vector unsigned int vtab2 = (vector unsigned int)VINIT4(0x3f218698,0x3f218698,0x3f218698,0x3e809bfa);
+    const vector unsigned int vtab3 = (vector unsigned int)VINIT4(0x3df09e99,0,0,0);
+    const vector unsigned int vtable1 = (vector unsigned int)VINIT4(0x3fe39e89,0x3fec53e5,0x3ff55ea7,0x3ff9149b);
+    const vector unsigned int vtable2 = (vector unsigned int)VINIT4(0x3ffcd90e,0x3fea8f7b,0x3fd997da,0x3fbf84e2);
+    const vector unsigned int vtable3 = (vector unsigned int)VINIT4(0x3fa8917c,0x3f800000,0,0);
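+    /* IEEE-754 bit patterns of small lookup tables: vtab1-3 hold the spreading-weight
+       table tab[] = {1.0, 0.79433, 0.63096, ..., 0.25119, 0.11749}, and vtable1-3
+       presumably hold the mask_add() weighting factors; keeping them as integer
+       vectors lets the code gather entries with vec_perm */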
+    const vector float vzero = vec_xor(vzero,vzero);
+#endif
 
  /*********************************************************************
     *    Calculate the energy and the tonality of each partition.
  *********************************************************************/
     calc_energy(gdl, fftenergy, eb_l, max, avg);
     calc_mask_index_l(gfc, max, avg, mask_idx_l);
+#ifdef ALTIVEC
+    const vector unsigned char vmaskidx1 = vec_ld(0,mask_idx_l); //needs to be aligned
+    const vector unsigned char vmaskidx2 = vec_ld(16,mask_idx_l);
+    const vector unsigned char vmaskidx3 = vec_ld(32,mask_idx_l);
+    const vector unsigned char vmaskidx4 = vec_ld(48,mask_idx_l);
+    tmp[0] = gfc->sv_qnt.masking_lower;
+    vector float vmasking_lower_coeff = vec_ld(0,tmp);
+    vmasking_lower_coeff = vec_splat(vmasking_lower_coeff,0);
+#endif
 
  /*********************************************************************
     *      convolve the partitioned energy and unpredictability
     *      with the spreading function, s3_l[b][k]
  ********************************************************************/
     k = 0;
-    for (b = 0; b < gdl->npart; b++) {
+#ifdef ALTIVEC
+    for (b = 0; b < gdl->npart-3; b+=4) {
+        vector signed int v1,v2,v3,v4,v5,vkk,vkk2,vlast,vdd,vdd_n,vk,vk2;
+        vector float vf1,vf2,vf3,vf4,vecb,vx,veb,vavgmask,vmasking_lower;
+        vmasking_lower = vec_ld(0,gdl->masking_lower+b);
+        vmasking_lower = vec_madd(vmasking_lower,vmasking_lower_coeff,vzero);
+        int tmp2[4] __attribute__ ((aligned (16)));
+        int tmp3[4] __attribute__ ((aligned (16)));
+        
+        v1 = vec_ld(0,gdl->s3ind[b]); //needs to be aligned
+        v2 = vec_ld(0,gdl->s3ind[b+1]);
+        v3 = vec_ld(0,gdl->s3ind[b+2]);
+        v4 = vec_ld(0,gdl->s3ind[b+3]);
+        v1 = vec_mergeh(v1,v3);
+        v2 = vec_mergeh(v2,v4);
+        vkk = vec_mergeh(v1,v2);
+        vlast = vec_mergel(v1,v2);
+        
+        v1 = vec_sub(vlast,vkk);
+        v1 = vec_sel(v1,(vector signed int)vzero,vec_cmpgt((vector signed int)vzero,v1));
+        vec_st(v1,0,tmp2);
+        
+        tmp3[0] = k;
+        tmp3[1] = k+tmp2[0]+1;
+        tmp3[2] = k+tmp2[0]+tmp2[1]+2;
+        tmp3[3] = k+tmp2[0]+tmp2[1]+tmp2[2]+3;
+        k = k+tmp2[0]+tmp2[1]+tmp2[2]+tmp2[3]+4;
+        vk = vec_ld(0,tmp3);
+        
+        v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk);
+        v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk);
+        vdd = vec_sel(v1,v2,vec_cmpgt(vkk,(vector signed int)VINIT4ALL(31)));
+        vdd = vec_and(vdd,(vector signed int)vmask1);
+        vdd_n = vone;
+        
+        tmp[0] = gdl->s3[tmp3[0]];
+        tmp[1] = gdl->s3[tmp3[1]];
+        tmp[2] = gdl->s3[tmp3[2]];
+        tmp[3] = gdl->s3[tmp3[3]];
+        vf1 = vec_ld(0,tmp);
+        
+        vec_st(vkk,0,tmp2);
+        tmp[0] = eb_l[tmp2[0]];
+        tmp[1] = eb_l[tmp2[1]];
+        tmp[2] = eb_l[tmp2[2]];
+        tmp[3] = eb_l[tmp2[3]];
+        veb = vec_ld(0,tmp);
+        
+        vecb = vec_madd(vf1,veb,vzero);
+        
+        v1 = vec_sl(vdd,vec_splat_u32(2));
+        v2 = vec_add(v1,vec_splat_s32(1));
+        v3 = vec_add(v1,vec_splat_s32(2));
+        v4 = vec_add(v2,vec_splat_s32(2));
+        v1 = vec_sl(v1,vec_splat_u32(-8));
+        v2 = vec_sl(v2,vec_splat_u32(-16));
+        v3 = vec_sl(v3,vec_splat_u32(8));
+        v1 = vec_or(v1,v2);
+        v3 = vec_or(v3,v4);
+        v1 = vec_or(v1,v3);
+        
+        vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
+        vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
+        vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
+        vecb = vec_madd(vecb,vf2,vzero);
+        
+        vkk = vec_add(vkk,vone);
+        vk = vec_add(vk,vone);
+        while(vec_any_le(vkk,vlast)) {
+            vkk2 = vec_sel(vkk,vlast,vec_cmpgt(vkk,vlast));
+            vk2 = vec_sel(vk,(vector signed int)vzero,vec_cmpgt(vkk,vlast));
+            v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk2);
+            v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk2);
+            v1 = vec_sel(v1,v2,vec_cmpgt(vkk2,(vector signed int)VINIT4ALL(31)));
+            v1 = vec_and(v1,(vector signed int)vmask1);
+            v2 = (vector signed int)vec_cmpgt(vkk,vlast);
+            v2 = vec_nor(v2,v2);
+            v5 = vec_and(v1,v2);
+            v2 = vec_and(vone,v2);
+            vdd = vec_add(vdd,v5);
+            vdd_n = vec_add(vdd_n,v2);
+            
+            vec_st(vk2,0,tmp2);
+            tmp[0] = gdl->s3[tmp2[0]];
+            tmp[1] = gdl->s3[tmp2[1]];
+            tmp[2] = gdl->s3[tmp2[2]];
+            tmp[3] = gdl->s3[tmp2[3]];
+            vf1 = vec_ld(0,tmp);
+            
+            vec_st(vkk,0,tmp2);
+            tmp[0] = eb_l[tmp2[0]];
+            tmp[1] = eb_l[tmp2[1]];
+            tmp[2] = eb_l[tmp2[2]];
+            tmp[3] = eb_l[tmp2[3]];
+            veb = vec_ld(0,tmp);
+            
+            vx = vec_madd(vf1,veb,vzero);
+            
+            v1 = vec_sl(v5,vec_splat_u32(2));
+            v2 = vec_add(v1,vec_splat_s32(1));
+            v3 = vec_add(v1,vec_splat_s32(2));
+            v4 = vec_add(v2,vec_splat_s32(2));
+            v1 = vec_sl(v1,vec_splat_u32(-8));
+            v2 = vec_sl(v2,vec_splat_u32(-16));
+            v3 = vec_sl(v3,vec_splat_u32(8));
+            v1 = vec_or(v1,v2);
+            v3 = vec_or(v3,v4);
+            v1 = vec_or(v1,v3);
+            
+            vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
+            vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
+            vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
+            vx = vec_madd(vx,vf2,vzero);
+            {
+                vector float vratio,vout,vf5;
+                vf1 = vec_sel(vecb,vzero,vec_cmplt(vecb,vzero));
+                vf2 = vec_sel(vx,vzero,vec_cmplt(vx,vzero));
+                vf3 = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1));
+                vf4 = vec_sel(vf2,vf1,vec_cmpgt(vf2,vf1));
+                vf5 = vec_re(vf4);
+                vratio = vec_madd(vf3,vec_madd(vec_nmsub(vf4,vf5,(vector float)VINIT4ALL(1.0)),vf5,vf5),vzero);
+                
+                tmp2[0] = b;
+                tmp2[1] = b+1;
+                tmp2[2] = b+2;
+                tmp2[3] = b+3;
+                tmp3[0] = mask_add_delta(mask_idx_l[b]);
+                tmp3[1] = mask_add_delta(mask_idx_l[b+1]);
+                tmp3[2] = mask_add_delta(mask_idx_l[b+2]);
+                tmp3[3] = mask_add_delta(mask_idx_l[b+3]);
+                v1 = vec_ld(0,tmp2);
+                v1 = vec_sub(vkk2,v1);
+                v2 = vec_ld(0,tmp3);
+                v1 = vec_abs(v1);
+                v5 = (vector signed int)vec_cmpgt(v1,v2);
+                v3 = (vector signed int)vec_cmpge(vratio,vmamax1);
+                
+                vf4 = vec_add(vf1,vf2);
+                if(vec_any_eq(vec_or(v5,v3),(vector signed int)vzero)) {
+                    vf3 = fast_log10_altivec_2(vratio);
+                    v1 = vec_cts(vf3,4);
+                    v1 = vec_sl(v1,vec_splat_u32(2));
+                    v2 = vec_add(v1,vec_splat_s32(1));
+                    v3 = vec_add(v1,vec_splat_s32(2));
+                    v4 = vec_add(v2,vec_splat_s32(2));
+                    v1 = vec_sl(v1,vec_splat_u32(-8));
+                    v2 = vec_sl(v2,vec_splat_u32(-16));
+                    v3 = vec_sl(v3,vec_splat_u32(8));
+                    v1 = vec_or(v1,v2);
+                    v3 = vec_or(v3,v4);
+                    v1 = vec_or(v1,v3);
+                    vf3 = (vector float)vec_perm(vtable1,vtable2,(vector unsigned char)v1);
+                    vf5 = (vector float)vec_perm(vtable3,vtable2,(vector unsigned char)v1);
+                    vf5 = vec_sel(vf3,vf5,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
+                    vf5 = vec_madd(vf4,vf5,vzero);
+                    vf5 = vec_sel(vf5,vf4,vec_cmpge(vratio,vmamax1));
+                }
+                else vf5 = vf4;
+                
+                vout = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1));
+                vout = vec_sel(vout,vf4,vec_cmpgt(vmamax2,vratio));
+                vout = vec_sel(vf5,vout,(vector unsigned int)v5);
+                vout = vec_sel(vout,vecb,(vector unsigned int)vec_cmple(vx,vzero));
+                vout = vec_sel(vout,vx,(vector unsigned int)vec_cmple(vecb,vzero));
+                vecb = vec_sel(vout,vecb,vec_cmpgt(vkk,vlast));
+            }
+            vkk = vec_add(vkk,vone);
+            vk = vec_add(vk,vone);
+        }
+        vdd = vec_sl(vdd,(vector unsigned int)vone);
+        vdd_n = vec_sl(vdd_n,(vector unsigned int)vone);
+        vdd = vec_add(vdd,vone);
+        vf1 = vec_ctf(vdd,0);
+        vf2 = vec_ctf(vdd_n,0);
+        vf2 = vec_re(vf2);
+        vf1 = vec_madd(vf1,vf2,vzero);
+        vdd = vec_cts(vf1,0);
+        
+        v1 = vec_sl(vdd,vec_splat_u32(2));
+        v2 = vec_add(v1,vec_splat_s32(1));
+        v3 = vec_add(v1,vec_splat_s32(2));
+        v4 = vec_add(v2,vec_splat_s32(2));
+        v1 = vec_sl(v1,vec_splat_u32(-8));
+        v2 = vec_sl(v2,vec_splat_u32(-16));
+        v3 = vec_sl(v3,vec_splat_u32(8));
+        v1 = vec_or(v1,v2);
+        v3 = vec_or(v3,v4);
+        v1 = vec_or(v1,v3);
+        
+        vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1);
+        vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1);
+        vf1 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31));
+        vf2 = vec_ctf(vone,1);
+        vavgmask = vec_madd(vf1,vf2,vzero);
+        vecb = vec_madd(vecb,vavgmask,vzero);
+        
+        vf4 = vec_ld(0,eb_l+b); //needs to be aligned
+        if (psv->blocktype_old[chn & 0x01] == SHORT_TYPE) {
+            vf1 = vec_ld(0,psv->nb_l1[chn]+b); //needs to be aligned
+            vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero);
+            vf2 = vec_madd(vf4,(vector float)VINIT4ALL(NS_PREECHO_ATT2),vzero);
+            vf3 = vec_sel(vf2,vf3,vec_cmpgt(vf3,vzero));
+            vf3 = vec_min(vecb,vf3);
+            //vec_st(vf3,0,thr+b); //needs to be aligned
+        }
+        else {
+            vf1 = vec_ld(0,psv->nb_l1[chn]+b); //needs to be aligned
+            vf2 = vec_ld(0,psv->nb_l2[chn]+b); //needs to be aligned
+            vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero);
+            vf2 = vec_madd(vf2,(vector float)VINIT4ALL(rpelev2),vzero);
+            vf3 = vec_sel(vzero,vf3,vec_cmpgt(vf3,vzero));
+            vf2 = vec_sel(vzero,vf2,vec_cmpgt(vf2,vzero));
+            if (psv->blocktype_old[chn & 0x01] == NORM_TYPE) {
+                vf3 = vec_min(vf3,vf2);
+            }
+            vf3 = vec_min(vecb,vf3);
+            //vec_st(vf3,0,thr+b); //needs to be aligned
+        }
+        vec_st(vf1,0,psv->nb_l2[chn]+b); //needs to be aligned
+        vec_st(vecb,0,psv->nb_l1[chn]+b); //needs to be aligned
+        {
+            vx = vec_ld(0,max+b); //needs to be aligned
+            vf1 = vec_ld(0,gdl->minval+b);
+            vx = vec_madd(vx,vf1,vzero);
+            vx = vec_madd(vx,vavgmask,vzero);
+            vf3 = vec_sel(vf3,vx,vec_cmpgt(vf3,vx));
+            //vec_st(vf3,0,thr+b); //needs to be aligned
+        }
+        v1 = (vector signed int)vec_cmpgt(vmasking_lower,(vector float)VINIT4ALL(1.0f));
+        vf1 = vec_madd(vf3,vmasking_lower,vzero);
+        vf3 = vec_sel(vf3,vf1,v1);
+        vf3 = vec_sel(vf3,vf4,vec_cmpgt(vf3,vf4));
+        vf1 = vec_madd(vf3,vmasking_lower,vzero);
+        vf3 = vec_sel(vf1,vf3,v1);
+        vec_st(vf3,0,thr+b); //needs to be aligned
+    }
+#else
+    b=0;
+#endif
+    for (; b < gdl->npart; b++) {
         FLOAT   x, ecb, avg_mask, t;
         FLOAT const masking_lower = gdl->masking_lower[b] * gfc->sv_qnt.masking_lower;
         /* convolve the partitioned energy with the spreading function */
         int     kk = gdl->s3ind[b][0];
         int const last = gdl->s3ind[b][1];
@@ -1423,11 +2080,11 @@ L3psycho_anal_vbr(lame_internal_flags * 
     /* fft and energy calculation   */
     FLOAT(*wsamp_l)[BLKSIZE];
     FLOAT(*wsamp_s)[3][BLKSIZE_s];
-    FLOAT   fftenergy[HBLKSIZE];
-    FLOAT   fftenergy_s[3][HBLKSIZE_s];
-    FLOAT   wsamp_L[2][BLKSIZE];
-    FLOAT   wsamp_S[2][3][BLKSIZE_s];
-    FLOAT   eb[4][CBANDS], thr[4][CBANDS];
+    FLOAT   fftenergy[HBLKSIZE] __attribute__ ((aligned (16)));
+    FLOAT   fftenergy_s[3][HBLKSIZE_s+3] __attribute__ ((aligned (16)));
+    FLOAT   wsamp_L[2][BLKSIZE] __attribute__ ((aligned (16)));
+    FLOAT   wsamp_S[2][3][BLKSIZE_s] __attribute__ ((aligned (16)));
+    FLOAT   eb[4][CBANDS] __attribute__ ((aligned (16))), thr[4][CBANDS] __attribute__ ((aligned (16)));
 
     FLOAT   sub_short_factor[4][3];
     FLOAT   thmm;
@@ -1436,7 +2093,7 @@ L3psycho_anal_vbr(lame_internal_flags * 
         (cfg->msfix > 0.f) ? (cfg->ATH_offset_factor * gfc->ATH->adjust_factor) : 1.f;
 
     const   FLOAT(*const_eb)[CBANDS] = (const FLOAT(*)[CBANDS]) eb;
-    const   FLOAT(*const_fftenergy_s)[HBLKSIZE_s] = (const FLOAT(*)[HBLKSIZE_s]) fftenergy_s;
+    const   FLOAT(*const_fftenergy_s)[HBLKSIZE_s+3] = (const FLOAT(*)[HBLKSIZE_s+3]) fftenergy_s;
 
     /* block type  */
     int     ns_attacks[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} };
@@ -1824,7 +2481,7 @@ compute_bark_values(PsyConst_CB2SB_t con
 }
 
 static int
-init_s3_values(FLOAT ** p, int (*s3ind)[2], int npart,
+init_s3_values(FLOAT ** p, int (*s3ind)[4], int npart,
                FLOAT const *bval, FLOAT const *bval_width, FLOAT const *norm)
 {
     FLOAT   s3[CBANDS][CBANDS];
--- libmp3lame/quantize.c.orig	2017-08-15 22:40:45.000000000 +0900
+++ libmp3lame/quantize.c	2017-10-14 18:02:08.000000000 +0900
@@ -28,6 +28,12 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include "lame.h"
 #include "machine.h"
 #include "encoder.h"
@@ -42,7 +48,26 @@
 #endif
 
 
-
+#ifdef PPC_FRSQRTE
+static inline double __frsqrte(double number)
+{
+    double y;
+    asm("frsqrte %0,%1" : "=f" (y) : "f" (number));
+    return y;
+}
+
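+/* sqrt(x) from the PowerPC reciprocal-square-root estimate: frsqrte gives a rough
+   1/sqrt(x), three Newton-Raphson steps (y = y*(1.5 - 0.5*x*y*y)) refine it, and the
+   final multiply by x yields sqrt(x); the x == 0.0 check avoids 0 * inf = NaN */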
+static inline double ppc_sqrt(double x) {
+    double y;
+    const double halfx = 0.5 * x;
+    y = __frsqrte(x);
+    y *= 1.5 - halfx * y * y;
+    y *= 1.5 - halfx * y * y;
+    y *= 1.5 - halfx * y * y;
+    //y *= 1.5 - halfx * y * y;
+    y *= x;
+    return (x == 0.0) ? 0 : y;
+}
+#endif
 
 /* convert from L/R <-> Mid/Side */
 static void
@@ -72,9 +97,162 @@ ms_convert(III_side_info_t * l3_side, in
 static void
 init_xrpow_core_c(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
 {
+#ifdef ALTIVEC
+    vector float v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20;
+    vector float vsum,vsum2,vsum3,vsum4,vmax,vmax2,vmax3,vmax4,vzero;
+    vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm;
+    vector float vconst1 = (vector float)VINIT4ALL(0.25);
+    vector float vconst2 = (vector float)VINIT4ALL(1.25);
+#endif
     int     i;
     FLOAT   tmp;
     *sum = 0;
+#ifdef ALTIVEC
+    vc1 = vec_splat_u8(1);
+    vc2 = vec_splat_u8(5);
+    vc3 = vec_sl(vc1,vc2);
+    vc4 = vec_sl(vc3,vc1);
+    vc5 = vec_or(vc3,vc4);
+    vsum = vec_xor(vsum,vsum);
+    vzero = vec_xor(vzero,vzero);
+    vmax = vec_xor(vmax,vmax);
+    vsum2 = vec_xor(vsum2,vsum2);
+    vmax2 = vec_xor(vmax2,vmax2);
+    vsum3 = vec_xor(vsum3,vsum3);
+    vmax3 = vec_xor(vmax3,vmax3);
+    vsum4 = vec_xor(vsum4,vsum4);
+    vmax4 = vec_xor(vmax4,vmax4);
+    
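+    /* 16 values per pass, computing xrpow[i] = |xr[i]|^(3/4): two vec_rsqrte plus
+       vec_re give a rough x^(-1/4), one Newton step y*(1.25 - 0.25*x*y^4) refines it,
+       and multiplying by x yields x^(3/4); zero inputs are masked off to avoid NaN,
+       while four partial sum and max vectors are reduced after the loop */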
+    v0 = vec_ld(0,(cod_info->xr));
+    vperm = vec_lvsl(0,(cod_info->xr));
+    for (i = 0; i <= upper-15; i+=16) {
+        v1 = vec_ld(16,(cod_info->xr)+i);
+        v2 = vec_ld(32,(cod_info->xr)+i);
+        v3 = vec_ld(48,(cod_info->xr)+i);
+        v4 = vec_ld(64,(cod_info->xr)+i);
+        v5 = vec_perm(v0,v1,vperm);
+        v6 = vec_perm(v1,v2,vperm);
+        v7 = vec_perm(v2,v3,vperm);
+        v8 = vec_perm(v3,v4,vperm);
+        v0 = v4;
+        v9 = vec_abs(v5);
+        v10 = vec_abs(v6);
+        v11 = vec_abs(v7);
+        v12 = vec_abs(v8);
+        vsum = vec_add(vsum,v9);
+        vsum2 = vec_add(vsum2,v10);
+        vsum3 = vec_add(vsum3,v11);
+        vsum4 = vec_add(vsum4,v12);
+        v1 = vec_re(vec_rsqrte(vec_rsqrte(v9)));
+        v2 = vec_re(vec_rsqrte(vec_rsqrte(v10)));
+        v3 = vec_re(vec_rsqrte(vec_rsqrte(v11)));
+        v4 = vec_re(vec_rsqrte(vec_rsqrte(v12)));
+        v5 = (vector float)vec_cmpeq(vzero,v9);
+        v6 = (vector float)vec_cmpeq(vzero,v10);
+        v7 = (vector float)vec_cmpeq(vzero,v11);
+        v8 = (vector float)vec_cmpeq(vzero,v12);
+        v13 = vec_madd(v1,v1,vzero);
+        v14 = vec_madd(v2,v2,vzero);
+        v15 = vec_madd(v3,v3,vzero);
+        v16 = vec_madd(v4,v4,vzero);
+        v13 = vec_madd(v13,v13,vzero);
+        v14 = vec_madd(v14,v14,vzero);
+        v15 = vec_madd(v15,v15,vzero);
+        v16 = vec_madd(v16,v16,vzero);
+        v17 = vec_madd(v9,vconst1,vzero);
+        v18 = vec_madd(v10,vconst1,vzero);
+        v19 = vec_madd(v11,vconst1,vzero);
+        v20 = vec_madd(v12,vconst1,vzero);
+        v13 = vec_nmsub(v13,v17,vconst2);
+        v14 = vec_nmsub(v14,v18,vconst2);
+        v15 = vec_nmsub(v15,v19,vconst2);
+        v16 = vec_nmsub(v16,v20,vconst2);
+        v1 = vec_madd(v13,v1,vzero);
+        v2 = vec_madd(v14,v2,vzero);
+        v3 = vec_madd(v15,v3,vzero);
+        v4 = vec_madd(v16,v4,vzero);
+        v1 = vec_sel(v1,vzero,(vector unsigned int)v5);
+        v2 = vec_sel(v2,vzero,(vector unsigned int)v6);
+        v3 = vec_sel(v3,vzero,(vector unsigned int)v7);
+        v4 = vec_sel(v4,vzero,(vector unsigned int)v8);
+        v17 = vec_madd(v1,v9,vzero);
+        v18 = vec_madd(v2,v10,vzero);
+        v19 = vec_madd(v3,v11,vzero);
+        v20 = vec_madd(v4,v12,vzero);
+        vec_st(v17,0,xrpow+i);
+        vec_st(v18,16,xrpow+i);
+        vec_st(v19,32,xrpow+i);
+        vec_st(v20,48,xrpow+i);
+        vmax = vec_max(v17,vmax);
+        vmax2 = vec_max(v18,vmax2);
+        vmax3 = vec_max(v19,vmax3);
+        vmax4 = vec_max(v20,vmax4);
+    }
+    vmax = vec_max(vmax,vmax2);
+    vmax3 = vec_max(vmax3,vmax4);
+    vmax = vec_max(vmax,vmax3);
+    vsum = vec_add(vsum,vsum2);
+    vsum3 = vec_add(vsum3,vsum4);
+    vsum = vec_add(vsum,vsum3);
+    v1 = vec_slo(vmax,vc3);
+    v2 = vec_slo(vsum,vc3);
+    v3 = vec_max(v1,vmax);
+    v4 = vec_add(v2,vsum);
+    v5 = vec_slo(v3,vc4);
+    v6 = vec_slo(v4,vc4);
+    vmax = vec_max(v3,v5);
+    vsum = vec_add(v4,v6);
+    vmax = vec_perm(vmax,vmax,vec_lvsr(0,&(cod_info->xrpow_max)));
+    vsum = vec_perm(vsum,vsum,vec_lvsr(0,sum));
+    vec_ste(vmax,0,&(cod_info->xrpow_max));
+    vec_ste(vsum,0,sum);
+    
+    for (; i <= upper; i++) {
+        tmp = fabs(cod_info->xr[i]);
+        *sum += tmp;
+        xrpow[i] = sqrt(tmp * sqrt(tmp));
+        
+        if (xrpow[i] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i];
+    }
+#else
+#ifdef PPC_FRSQRTE
+    FLOAT   tmp2,tmp3,tmp4;
+    
+    for (i = 0; i <= upper-3; i+=4) {
+        tmp = fabs (cod_info->xr[i]);
+        tmp2 = fabs (cod_info->xr[i+1]);
+        tmp3 = fabs (cod_info->xr[i+2]);
+        tmp4 = fabs (cod_info->xr[i+3]);
+        *sum += tmp;
+        *sum += tmp2;
+        *sum += tmp3;
+        *sum += tmp4;
+        
+        xrpow[i] = ppc_sqrt (tmp * ppc_sqrt(tmp));
+        xrpow[i+1] = ppc_sqrt (tmp2 * ppc_sqrt(tmp2));
+        xrpow[i+2] = ppc_sqrt (tmp3 * ppc_sqrt(tmp3));
+        xrpow[i+3] = ppc_sqrt (tmp4 * ppc_sqrt(tmp4));
+        
+        if (xrpow[i] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i];
+        if (xrpow[i+1] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i+1];
+        if (xrpow[i+2] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i+2];
+        if (xrpow[i+3] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i+3];
+    }
+    
+    for (; i <= upper; i++) {
+        tmp = fabs(cod_info->xr[i]);
+        *sum += tmp;
+        xrpow[i] = ppc_sqrt(tmp * ppc_sqrt(tmp));
+
+        if (xrpow[i] > cod_info->xrpow_max)
+            cod_info->xrpow_max = xrpow[i];
+    }
+#else
     for (i = 0; i <= upper; ++i) {
         tmp = fabs(cod_info->xr[i]);
         *sum += tmp;
@@ -83,6 +261,8 @@ init_xrpow_core_c(gr_info * const cod_in
         if (xrpow[i] > cod_info->xrpow_max)
             cod_info->xrpow_max = xrpow[i];
     }
+#endif
+#endif
 }
 
 
@@ -1495,7 +1675,7 @@ VBR_old_iteration_loop(lame_internal_fla
     EncResult_t *const eov = &gfc->ov_enc;
     FLOAT   l3_xmin[2][2][SFBMAX];
 
-    FLOAT   xrpow[576];
+    FLOAT   xrpow[576] __attribute__ ((aligned (16)));
     int     bands[2][2];
     int     frameBits[15];
     int     used_bits;
@@ -1650,7 +1830,7 @@ VBR_new_iteration_loop(lame_internal_fla
     EncResult_t *const eov = &gfc->ov_enc;
     FLOAT   l3_xmin[2][2][SFBMAX];
 
-    FLOAT   xrpow[2][2][576];
+    FLOAT   xrpow[2][2][576] __attribute__ ((aligned (16)));
     int     frameBits[15];
     int     used_bits;
     int     max_bits[2][2];
@@ -1904,7 +2084,7 @@ ABR_iteration_loop(lame_internal_flags *
     SessionConfig_t const *const cfg = &gfc->cfg;
     EncResult_t *const eov = &gfc->ov_enc;
     FLOAT   l3_xmin[SFBMAX];
-    FLOAT   xrpow[576];
+    FLOAT   xrpow[576] __attribute__ ((aligned (16)));
     int     targ_bits[2][2];
     int     mean_bits, max_frame_bits;
     int     ch, gr, ath_over;
@@ -1991,7 +2171,7 @@ CBR_iteration_loop(lame_internal_flags *
 {
     SessionConfig_t const *const cfg = &gfc->cfg;
     FLOAT   l3_xmin[SFBMAX];
-    FLOAT   xrpow[576];
+    FLOAT   xrpow[576] __attribute__ ((aligned (16)));
     int     targ_bits[2];
     int     mean_bits, max_bits;
     int     gr, ch;
--- libmp3lame/quantize_pvt.c.orig	2017-09-07 04:33:36.000000000 +0900
+++ libmp3lame/quantize_pvt.c	2017-10-14 18:02:08.000000000 +0900
@@ -27,6 +27,13 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#undef TAKEHIRO_IEEE754_HACK
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 
 #include "lame.h"
 #include "machine.h"
@@ -751,6 +758,39 @@ calc_xmin(lame_internal_flags const *gfc
 static  FLOAT
 calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT step)
 {
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,va,vb,vstep,vzero,vnoise1,vnoise2,vix01;
+    vector unsigned char vperm1,vperm2,vperm5,vperm6;
+    vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vx7,vshamt,vone;
+#ifdef ALTIVEC_970
+    vector unsigned int vmask1,vmask2,vmask3;
+    vector float v10,v11,v12,v13,v14,v15,v16,v17;
+#else
+    vector unsigned char vc1,vc2,vc3,vc4,vc5,vc6,vperm3,vperm4,vmask;
+#endif
+    float temp[4] __attribute__ ((aligned (16)));
+    
+    temp[0] = step;
+    vstep = vec_ld(0,temp);
+    vzero = vec_xor(vzero,vzero);
+    vperm6 = (vector unsigned char)VINIT16(0,0,3,19,0,0,7,23,0,0,11,27,0,0,15,31);
+    vperm5 = vec_sld(vperm6,vperm6,2);
+#ifdef ALTIVEC_970
+    vmask1 = vec_splat_u32(-1);
+    vmask2 = vec_sld((vector unsigned int)vzero,vmask1,8);
+    vmask3 = vec_sld((vector unsigned int)vzero,vmask1,4);
+    vmask1 = vec_sld((vector unsigned int)vzero,vmask1,12);
+#else
+    vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19);
+    vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8);
+    vmask = (vector unsigned char)VINIT16ALL(16);
+#endif
+    vstep = vec_splat(vstep,0);
+    vnoise1 = vec_xor(vnoise1,vnoise1);
+    vnoise2 = vec_xor(vnoise2,vnoise2);
+    vone = vec_splat_s32(1);
+    vshamt = vec_splat_s32(2);
+#endif
     FLOAT   noise = 0;
     int     j = *startline;
     const int *const ix = cod_info->l3_enc;
@@ -767,9 +807,55 @@ calc_noise_core_c(const gr_info * const 
         }
     }
     else if (j > cod_info->big_values) {
-        FLOAT   ix01[2];
+        FLOAT   ix01[4] __attribute__ ((aligned (16)));
         ix01[0] = 0;
         ix01[1] = step;
+#ifdef ALTIVEC
+        vix01 = vec_ld(0,ix01);
+        v1 = vec_ld(0,cod_info->xr+j);
+        vperm1 = vec_lvsl(0,cod_info->xr+j);
+        vx1 = vec_ld(0,ix+j);
+        vperm2 = vec_lvsl(0,ix+j);
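+        /* quantized values here are only 0 or 1, so ix01[ix[j]] is fetched by turning
+           each ix word into the four byte offsets of either element 0 (0.0) or
+           element 1 (step) of vix01 and gathering with vec_perm */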
+        for(;l>1;l-=2) {
+            v2 = vec_ld(16,cod_info->xr+j);
+            vx2 = vec_ld(16,ix+j);
+            v3 = vec_perm(v1,v2,vperm1);
+            vx3 = vec_perm(vx1,vx2,vperm2);
+            va = vec_abs(v3);
+            v1 = v2;
+            vx1 = vx2;
+            
+            vx4 = vec_sl(vx3,(vector unsigned int)vshamt);
+            vx5 = vec_add(vx4,vone);
+            vx6 = vec_add(vx4,vshamt);
+            vx7 = vec_add(vx5,vshamt);
+            vx2 = vec_perm(vx4,vx5,vperm5);
+            vx3 = vec_perm(vx6,vx7,vperm6);
+            vx4 = vec_or(vx2,vx3);
+            
+            v2 = vec_perm(vix01,vix01,(vector unsigned char)vx4);
+            va = vec_sub(va,v2);
+            
+            vnoise1 = vec_madd(va,va,vnoise1);
+            
+            j += 4;
+        }
+        v1 = vec_sld(vnoise1,vnoise1,8);
+        v2 = vec_add(vnoise1,v1);
+        v3 = vec_sld(v2,v2,4);
+        v4 = vec_add(v2,v3);
+        v5 = vec_perm(v4,v4,vec_lvsr(0,&noise));
+        vec_ste(v5,0,&noise);
+        if(l) {
+            FLOAT   temp;
+            temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
+            j++;
+            noise += temp * temp;
+            temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
+            j++;
+            noise += temp * temp;
+        }
+#else
         while (l--) {
             FLOAT   temp;
             temp = fabs(cod_info->xr[j]) - ix01[ix[j]];
@@ -779,8 +865,138 @@ calc_noise_core_c(const gr_info * const 
             j++;
             noise += temp * temp;
         }
+#endif
     }
     else {
+#ifdef ALTIVEC
+        vperm1 = vec_lvsl(0,cod_info->xr+j);
+        v1 = vec_ld(0,cod_info->xr+j);
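+        /* |xr[j]| is taken vector-wide while pow43[ix[j]] is gathered element by
+           element (vec_lde + vec_lvsl), then noise += (|xr[j]| - pow43[ix[j]]*step)^2
+           is accumulated four lanes at a time with vec_nmsub/vec_madd */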
+        for(;l>3;l-=4) {
+            v2 = vec_ld(16,cod_info->xr+j);
+            v3 = vec_ld(32,cod_info->xr+j);
+            v4 = vec_perm(v1,v2,vperm1);
+            v5 = vec_perm(v2,v3,vperm1);
+            va = vec_abs(v4);
+            vb = vec_abs(v5);
+            v1 = v3;
+            
+#ifdef ALTIVEC_970
+            v2 = vec_lde(0,pow43+ix[j]);
+            v6 = vec_lde(0,pow43+ix[j+1]);
+            v10 = vec_lde(0,pow43+ix[j+2]);
+            v14 = vec_lde(0,pow43+ix[j+3]);
+            v4 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j]));
+            v8 = vec_perm(v6,v6,vec_lvsl(-4,pow43+ix[j+1]));
+            v12 = vec_perm(v10,v10,vec_lvsl(-8,pow43+ix[j+2]));
+            v16 = vec_perm(v14,v14,vec_lvsl(-12,pow43+ix[j+3]));
+            v4 = vec_sel(v4,v8,vmask1);
+            v4 = vec_sel(v4,v12,vmask2);
+            v4 = vec_sel(v4,v16,vmask3);
+            va = vec_nmsub(v4,vstep,va);
+#else
+            vc1 = vec_lvsl(0,pow43+ix[j]);
+            vc2 = vec_lvsl(0,pow43+ix[j+1]);
+            vc3 = vec_lvsl(0,pow43+ix[j+2]);
+            vc4 = vec_lvsl(0,pow43+ix[j+3]);
+            vc2 = vec_or(vc2,vmask);
+            vc4 = vec_or(vc4,vmask);
+            v2 = vec_lde(0,pow43+ix[j]);
+            v3 = vec_lde(0,pow43+ix[j+1]);
+            v4 = vec_lde(0,pow43+ix[j+2]);
+            v5 = vec_lde(0,pow43+ix[j+3]);
+            vc5 = vec_perm(vc1,vc2,vperm3);
+            vc6 = vec_perm(vc3,vc4,vperm4);
+            v6 = vec_perm(v2,v3,vc5);
+            v7 = vec_perm(v4,v5,vc6);
+            v8 = vec_sld(v6,v7,8);
+            va = vec_nmsub(v8,vstep,va);
+#endif
+            j+=4;
+            
+#ifdef ALTIVEC_970
+            v3 = vec_lde(0,pow43+ix[j]);
+            v7 = vec_lde(0,pow43+ix[j+1]);
+            v11 = vec_lde(0,pow43+ix[j+2]);
+            v15 = vec_lde(0,pow43+ix[j+3]);
+            v5 = vec_perm(v3,v3,vec_lvsl(0,pow43+ix[j]));
+            v9 = vec_perm(v7,v7,vec_lvsl(-4,pow43+ix[j+1]));
+            v13 = vec_perm(v11,v11,vec_lvsl(-8,pow43+ix[j+2]));
+            v17 = vec_perm(v15,v15,vec_lvsl(-12,pow43+ix[j+3]));
+            v5 = vec_sel(v5,v9,vmask1);
+            v5 = vec_sel(v5,v13,vmask2);
+            v5 = vec_sel(v5,v17,vmask3);
+            vb = vec_nmsub(v5,vstep,vb);
+#else
+            vc1 = vec_lvsl(0,pow43+ix[j]);
+            vc2 = vec_lvsl(0,pow43+ix[j+1]);
+            vc3 = vec_lvsl(0,pow43+ix[j+2]);
+            vc4 = vec_lvsl(0,pow43+ix[j+3]);
+            vc2 = vec_or(vc2,vmask);
+            vc4 = vec_or(vc4,vmask);
+            v2 = vec_lde(0,pow43+ix[j]);
+            v3 = vec_lde(0,pow43+ix[j+1]);
+            v4 = vec_lde(0,pow43+ix[j+2]);
+            v5 = vec_lde(0,pow43+ix[j+3]);
+            vc5 = vec_perm(vc1,vc2,vperm3);
+            vc6 = vec_perm(vc3,vc4,vperm4);
+            v6 = vec_perm(v2,v3,vc5);
+            v7 = vec_perm(v4,v5,vc6);
+            v8 = vec_sld(v6,v7,8);
+            vb = vec_nmsub(v8,vstep,vb);
+#endif
+            
+            vnoise1 = vec_madd(va,va,vnoise1);
+            vnoise2 = vec_madd(vb,vb,vnoise2);
+            
+            j+=4;
+        }
+        vnoise1 = vec_add(vnoise1,vnoise2);
+        
+        for(;l>1;l-=2) {
+            v2 = vec_ld(16,cod_info->xr+j);
+            v4 = vec_perm(v1,v2,vperm1);
+            va = vec_abs(v4);
+            v1 = v2;
+            
+            v2 = vec_lde(0,pow43+ix[j]);
+            v3 = vec_lde(0,pow43+ix[j+1]);
+            v4 = vec_lde(0,pow43+ix[j+2]);
+            v5 = vec_lde(0,pow43+ix[j+3]);
+            v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j]));
+            v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+ix[j+1]));
+            v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+ix[j+2]));
+            v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+ix[j+3]));
+#ifdef ALTIVEC_970
+            v6 = vec_sel(v6,v7,vmask1);
+            v6 = vec_sel(v6,v8,vmask2);
+            v6 = vec_sel(v6,v9,vmask3);
+#else
+            v6 = vec_or(v6,v7);
+            v6 = vec_or(v6,v8);
+            v6 = vec_or(v6,v9);
+#endif
+            va = vec_nmsub(v6,vstep,va);
+            
+            vnoise1 = vec_madd(va,va,vnoise1);
+            
+            j += 4;
+        }
+        v1 = vec_sld(vnoise1,vnoise1,8);
+        v2 = vec_add(vnoise1,v1);
+        v3 = vec_sld(v2,v2,4);
+        v4 = vec_add(v2,v3);
+        v5 = vec_perm(v4,v4,vec_lvsr(0,&noise));
+        vec_ste(v5,0,&noise);
+        if(l) {
+            FLOAT   temp;
+            temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
+            j++;
+            noise += temp * temp;
+            temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
+            j++;
+            noise += temp * temp;
+        }
+#else
         while (l--) {
             FLOAT   temp;
             temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step;
@@ -790,6 +1006,7 @@ calc_noise_core_c(const gr_info * const 
             j++;
             noise += temp * temp;
         }
+#endif
     }
 
     *startline = j;
--- libmp3lame/tables.c.orig	2011-05-08 01:05:17.000000000 +0900
+++ libmp3lame/tables.c	2017-10-14 18:02:08.000000000 +0900
@@ -406,7 +406,7 @@ const uint8_t t33l[] = {
 };
 
 
-const struct huffcodetab ht[HTN] = {
+const struct huffcodetab ht[HTN] __attribute__ ((aligned (16))) = {
     /* xlen, linmax, table, hlen */
     {0, 0, NULL, NULL},
     {2, 0, t1HB, t1l},
--- libmp3lame/takehiro.c.orig	2017-09-07 04:33:36.000000000 +0900
+++ libmp3lame/takehiro.c	2017-10-14 18:02:08.000000000 +0900
@@ -26,6 +26,12 @@
 # include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#undef TAKEHIRO_IEEE754_HACK
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
 
 #include "lame.h"
 #include "machine.h"
@@ -222,6 +228,150 @@ quantize_lines_xrpow(unsigned int l, FLO
 static void
 quantize_lines_xrpow(unsigned int l, FLOAT istep, const FLOAT * xr, int *ix)
 {
+#ifdef ALTIVEC
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,va,vb,vistep,vzero;
+    vector signed int vx1,vx2,vx3,vx4,vprev;
+    vector unsigned char vperm1,vperm2;
+    const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0);
+    const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0);
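+    /* const1/const2 look like the coefficients of a rational approximation of the
+       adj43[] rounding offset: with f = floor(x*istep), the main loop below adds
+       (5.465*f^2 + 3.404*f + 0.4054) / (10.93*f^2 + 7.719*f + 1.0) to x*istep before
+       truncating (note adj43[0] = 1 - 0.5^0.75 = 0.4054) */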
+#ifndef ALTIVEC_970
+    unsigned int temp[4] __attribute__ ((aligned (16)));
+#endif
+    float temp2[4] __attribute__ ((aligned (16)));
+    temp2[0] = istep;
+    vistep = vec_ld(0,temp2);
+    vzero = vec_xor(vzero,vzero);
+    vistep = vec_splat(vistep,0);
+    
+    l = l >> 1;
+    
+    vperm1 = vec_lvsl(0,xr);
+    vperm2 = vec_lvsr(0,ix);
+    v1 = vec_ld(0,xr);
+    vx1 = vec_ld(-16,ix);
+    vx2 = vec_ld(0,ix);
+    vprev = vec_perm(vx1,vx2,vec_lvsl(0,ix));
+    for(;l>3;l-=4) {
+        v2 = vec_ld(16,xr);
+        v3 = vec_ld(32,xr);
+        v4 = vec_perm(v1,v2,vperm1);
+        v5 = vec_perm(v2,v3,vperm1);
+        va = vec_madd(v4,vistep,vzero);
+        vb = vec_madd(v5,vistep,vzero);
+        v1 = v3;
+        
+        v2 = vec_floor(va);
+        v3 = vec_floor(vb);
+        v4 = vec_splat(const1,2);
+        v5 = vec_splat(const1,1);
+        v6 = vec_splat(const2,1);
+        v7 = vec_splat(const2,0);
+        v8 = vec_madd(v2,v4,v5);
+        v9 = vec_madd(v3,v4,v5);
+        v10 = vec_madd(v2,v6,v7);
+        v11 = vec_madd(v3,v6,v7);
+        v4 = vec_splat(const1,0);
+        v5 = vec_splat(const1,3);
+        v8 = vec_madd(v8,v2,v4);
+        v9 = vec_madd(v9,v3,v4);
+        v10 = vec_madd(v10,v2,v5);
+        v11 = vec_madd(v11,v3,v5);
+        v6 = vec_re(v10);
+        v7 = vec_re(v11);
+        v10 = vec_nmsub(v10,v6,v5);
+        v11 = vec_nmsub(v11,v7,v5);
+        v10 = vec_madd(v10,v6,v6);
+        v11 = vec_madd(v11,v7,v7);
+        va = vec_madd(v8,v10,va);
+        vb = vec_madd(v9,v11,vb);
+        
+        vx1 = vec_cts(va,0);
+        vx2 = vec_cts(vb,0);
+        
+        vx3 = vec_perm(vprev,vx1,vperm2);
+        vx4 = vec_perm(vx1,vx2,vperm2);
+        vec_st(vx3,0,ix);
+        vec_st(vx4,16,ix);
+        vprev = vx2;
+        xr += 8;
+        ix += 8;
+    }
+    vx1 = vec_ld(0,ix);
+    vx2 = vec_ld(16,ix);
+    vx3 = vec_perm(vx1,vx2,vec_lvsl(0,ix));
+    vx4 = vec_perm(vprev,vx3,vperm2);
+    vec_st(vx4,0,ix);
+    
+#ifdef ALTIVEC_970
+    for(;l>1;l-=2) {
+        FLOAT   x0, x1, x2, x3;
+        int     rx0, rx1, rx2, rx3;
+
+        x0 = *xr++ * istep;
+        x1 = *xr++ * istep;
+        XRPOW_FTOI(x0, rx0);
+        x2 = *xr++ * istep;
+        XRPOW_FTOI(x1, rx1);
+        x3 = *xr++ * istep;
+        XRPOW_FTOI(x2, rx2);
+        x0 += QUANTFAC(rx0);
+        XRPOW_FTOI(x3, rx3);
+        x1 += QUANTFAC(rx1);
+        XRPOW_FTOI(x0, *ix++);
+        x2 += QUANTFAC(rx2);
+        XRPOW_FTOI(x1, *ix++);
+        x3 += QUANTFAC(rx3);
+        XRPOW_FTOI(x2, *ix++);
+        XRPOW_FTOI(x3, *ix++);
+    }
+#else
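+    /* remaining pairs: ix = (int)(x*istep), gather adj43[ix] element by element with
+       vec_lde/vec_lvsl, add it and truncate again - the vector equivalent of
+       XRPOW_FTOI(x, rx); x += QUANTFAC(rx); XRPOW_FTOI(x, *ix) */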
+    for(;l>1;l-=2) {
+        v2 = vec_ld(16,xr);
+        v4 = vec_perm(v1,v2,vperm1);
+        v1 = v2;
+        va = vec_madd(v4,vistep,vzero);
+        vx1 = vec_cts(va,0);
+        vec_st((vector unsigned int)vx1,0,temp);
+        v2 = vec_lde(0,adj43+temp[0]);
+        v3 = vec_lde(0,adj43+temp[1]);
+        v4 = vec_lde(0,adj43+temp[2]);
+        v5 = vec_lde(0,adj43+temp[3]);
+        v6 = vec_perm(v2,v2,vec_lvsl(0,adj43+temp[0]));
+        v7 = vec_perm(v3,v3,vec_lvsl(-4,adj43+temp[1]));
+        v8 = vec_perm(v4,v4,vec_lvsl(-8,adj43+temp[2]));
+        v9 = vec_perm(v5,v5,vec_lvsl(-12,adj43+temp[3]));
+        v6 = vec_or(v6,v7);
+        v6 = vec_or(v6,v8);
+        v6 = vec_or(v6,v9);
+        va = vec_add(va,v6);
+        vx1 = vec_cts(va,0);
+        vx3 = vec_perm(vprev,vx1,vperm2);
+        vec_st(vx3,0,ix);
+        vprev = vx1;
+        xr += 4;
+        ix += 4;
+    }
+    vx1 = vec_ld(0,ix);
+    vx2 = vec_ld(16,ix);
+    vx3 = vec_perm(vx1,vx2,vec_lvsl(0,ix));
+    vx4 = vec_perm(vprev,vx3,vperm2);
+    vec_st(vx4,0,ix);
+#endif
+    
+    if (l) {
+        FLOAT   x0, x1;
+        int     rx0, rx1;
+
+        x0 = *xr++ * istep;
+        x1 = *xr++ * istep;
+        XRPOW_FTOI(x0, rx0);
+        XRPOW_FTOI(x1, rx1);
+        x0 += QUANTFAC(rx0);
+        x1 += QUANTFAC(rx1);
+        XRPOW_FTOI(x0, *ix++);
+        XRPOW_FTOI(x1, *ix++);
+    }
+#else
     unsigned int remaining;
 
     assert(l > 0);
@@ -263,7 +413,7 @@ quantize_lines_xrpow(unsigned int l, FLO
         XRPOW_FTOI(x0, *ix++);
         XRPOW_FTOI(x1, *ix++);
     }
-
+#endif
 }
 
 
@@ -420,6 +570,60 @@ quantize_xrpow(const FLOAT * xp, int *pi
 /*	      ix_max							 */
 /*************************************************************************/
 
+#ifdef ALTIVEC
+int
+ix_max_vec(const int *ix, const int *end)
+{
+    int vresult[4] __attribute__ ((aligned (16)));
+    int max1=0, max2=0;
+    vector signed int v1, v2, v3, v4, v5, v6, v7, vmax;
+    vector unsigned char vmask,vc1,vc2,vc3,vc4;
+    
+    if(end - ix < 8) goto normal;
+    int i = (end-ix)/4;
+    int remain = (end-ix)%4;
+    vc1 = vec_splat_u8(1);
+    vc2 = vec_splat_u8(5);
+    vc3 = vec_sl(vc1,vc2);
+    vc4 = vec_sl(vc3,vc1);
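+    /* vc3 = 32 and vc4 = 64: vec_slo takes its shift count from bits 121:124 of the
+       control vector, so these shift the running maximum by 4 and 8 bytes for the
+       horizontal reduction after the loop */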
+    
+    v1 = vec_ld(0, ix);
+    vmask = vec_lvsl(0, ix);
+    vmax = vec_xor(vmax, vmax);
+    
+    while(i--) {
+        v2 = vec_ld(16, ix);
+        v3 = vec_perm(v1, v2, vmask);
+        v1 = v2;
+        vmax = vec_max(vmax,v3);
+        ix += 4;
+    }
+    
+    v4 = vec_slo(vmax,vc3);
+    v5 = vec_max(vmax,v4);
+    v6 = vec_slo(v5,vc4);
+    v7 = vec_max(v5,v6);
+    vec_st(v7,0,vresult);
+    
+    max1 = vresult[0];
+    if(!remain) return max1;
+    //max2 = vresult[2];
+    /*if(vresult[2] > max1) max1 = vresult[2];
+    if(vresult[3] > max2) max2 = vresult[3];*/
+    
+  normal:
+    
+    do{
+        int x1 = *ix++;
+        int x2 = *ix++;
+        if (max1 < x1) max1 = x1;
+        if (max2 < x2) max2 = x2;
+    } while (ix < end);
+    if(max1 < max2) max1 = max2;
+    
+    return max1;
+}
+#else
 static int
 ix_max(const int *ix, const int *end)
 {
@@ -438,14 +642,14 @@ ix_max(const int *ix, const int *end)
         max1 = max2;
     return max1;
 }
+#endif
 
 
 
 
 
 
-
-
+#if !defined(ALTIVEC) || (defined(ALTIVEC) && !defined(ALTIVEC_970))
 static int
 count_bit_ESC(const int *ix, const int *const end, int t1, const int t2, unsigned int *const s)
 {
@@ -481,6 +685,7 @@ count_bit_ESC(const int *ix, const int *
     *s += sum;
     return t1;
 }
+#endif
 
 
 static int
@@ -507,6 +712,7 @@ static const int huf_tbl_noESC[] = {
 };
 
 
+#if !defined(ALTIVEC)
 static int
 count_bit_noESC_from2(const int *ix, const int *end, int max, unsigned int *s)
 {
@@ -533,6 +739,7 @@ count_bit_noESC_from2(const int *ix, con
     *s += sum;
     return t1;
 }
+#endif
 
 
 inline static int
@@ -572,6 +779,651 @@ count_bit_noESC_from3(const int *ix, con
     return t;  
 }
 
+#ifdef ALTIVEC
+#ifdef ALTIVEC_970
+static int
+count_bit_ESC_altivec(const int *ix, const int *const end, int t1, const int t2, int *const s)
+{
+    /* ESC-table is used */
+    int const linbits = ht[t1].xlen * 65536 + ht[t2].xlen;
+    int     sum = 0, sum2;
+    vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector unsigned int vsum;
+    vector unsigned char vmask,vperm1,vperm2,vshamt;
+    vector unsigned char vzero,vs1,vs2,vs3,vs4,vs5,vs6,vlimit1,vlimit2,vone;
+    unsigned char tmp[16] __attribute__ ((aligned (16)));
+    unsigned int tmp2[4] __attribute__ ((aligned (16)));
+    
+    vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
+    vlimit1 = vec_splat_u8(14);
+    vlimit2 = vec_splat_u8(15);
+    vone = vec_splat_u8(1);
+    vshamt = vec_splat_u8(4);
+    vzero = vec_xor(vzero,vzero);
+    vsum = vec_xor(vsum,vsum);
+    
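+    /* 32 values per pass: each x,y pair is packed to a byte and clamped to 15 while
+       vec_sum4s counts how many values needed linbits, and the byte (x<<4)+y indexes
+       the scalar largetbl[] lookups; counts for t1 and t2 travel in the high/low
+       16-bit halves of sum, as in the scalar version */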
+    if((int)(end - ix) < 32) goto normal;
+    v0 = vec_ld(0,ix);
+    vmask = vec_lvsl(0,ix);
+    do {
+        v1 = vec_ld(16,ix);
+        v2 = vec_ld(32,ix);
+        v3 = vec_ld(48,ix);
+        v4 = vec_ld(64,ix);
+        v5 = vec_ld(80,ix);
+        v6 = vec_ld(96,ix);
+        v7 = vec_ld(112,ix);
+        v8 = vec_ld(128,ix);
+        v9 = vec_perm(v0,v1,vmask);
+        v10 = vec_perm(v1,v2,vmask);
+        v11 = vec_perm(v2,v3,vmask);
+        v12 = vec_perm(v3,v4,vmask);
+        v13 = vec_perm(v4,v5,vmask);
+        v14 = vec_perm(v5,v6,vmask);
+        v15 = vec_perm(v6,v7,vmask);
+        v16 = vec_perm(v7,v8,vmask);
+        v0 = v8;
+        v1 = vec_perm(v9,v10,vperm1);
+        v2 = vec_perm(v9,v10,vperm2);
+        v3 = vec_perm(v11,v12,vperm1);
+        v4 = vec_perm(v11,v12,vperm2);
+        v5 = vec_perm(v13,v14,vperm1);
+        v6 = vec_perm(v13,v14,vperm2);
+        v7 = vec_perm(v15,v16,vperm1);
+        v8 = vec_perm(v15,v16,vperm2);
+        
+        v1 = (vector signed int)vec_packs(v1,v3);
+        v2 = (vector signed int)vec_packs(v2,v4);
+        v3 = (vector signed int)vec_packs(v5,v7);
+        v4 = (vector signed int)vec_packs(v6,v8);
+        vs1 = vec_packs((vector unsigned short)v1,(vector unsigned short)v3);
+        vs2 = vec_packs((vector unsigned short)v2,(vector unsigned short)v4);
+        vs3 = vec_sel(vs1,vlimit2,vec_cmpgt(vs1,vlimit1));
+        vs4 = vec_sel(vs2,vlimit2,vec_cmpgt(vs2,vlimit1));
+        vs5 = vec_sel(vzero,vone,vec_cmpgt(vs1,vlimit1));
+        vs6 = vec_sel(vzero,vone,vec_cmpgt(vs2,vlimit1));
+        vs5 = vec_add(vs5,vs6);
+        vsum = vec_sum4s(vs5,vsum);
+        vs3 = vec_sl(vs3,vshamt);
+        vs3 = vec_add(vs3,vs4);
+        vec_st(vs3,0,tmp);
+        
+        sum += largetbl[tmp[0]];
+        sum += largetbl[tmp[1]];
+        sum += largetbl[tmp[2]];
+        sum += largetbl[tmp[3]];
+        sum += largetbl[tmp[4]];
+        sum += largetbl[tmp[5]];
+        sum += largetbl[tmp[6]];
+        sum += largetbl[tmp[7]];
+        sum += largetbl[tmp[8]];
+        sum += largetbl[tmp[9]];
+        sum += largetbl[tmp[10]];
+        sum += largetbl[tmp[11]];
+        sum += largetbl[tmp[12]];
+        sum += largetbl[tmp[13]];
+        sum += largetbl[tmp[14]];
+        sum += largetbl[tmp[15]];
+        
+        ix += 32;
+    } while(ix < end-31);
+    
+    vsum = (vector unsigned int)vec_sums((vector signed int)vsum,(vector signed int)vzero);
+    vec_st(vsum,0,tmp2);
+    sum += tmp2[3] * linbits;
+    
+    while (ix < end) {
+        unsigned int x = *ix++;
+        unsigned int y = *ix++;
+
+        if (x >= 15u) {
+            x = 15u;
+            sum += linbits;
+        }
+        if (y >= 15u) {
+            y = 15u;
+            sum += linbits;
+        }
+        x <<= 4u;
+        x += y;
+        sum += largetbl[x];
+    }
+    goto end;
+    
+normal:
+    do {
+        unsigned int x = *ix++;
+        unsigned int y = *ix++;
+
+        if (x >= 15u) {
+            x = 15u;
+            sum += linbits;
+        }
+        if (y >= 15u) {
+            y = 15u;
+            sum += linbits;
+        }
+        x <<= 4u;
+        x += y;
+        sum += largetbl[x];
+    } while (ix < end);
+
+end:
+    sum2 = sum & 0xffffu;
+    sum >>= 16u;
+
+    if (sum > sum2) {
+        sum = sum2;
+        t1 = t2;
+    }
+
+    *s += sum;
+    return t1;
+}
+#endif
+
+inline static int
+count_bit_noESC_from2_altivec1(const int *ix, const int *end, int max, unsigned int *s)
+{
+    int t1 = huf_tbl_noESC[max - 1];
+    /* No ESC-words */
+    unsigned int sum = 0;
+    int sum1, sum2;
+    const unsigned int xlen = 3;
+    const unsigned int *table = table23;
+    vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2;
+    vector unsigned char vmask,vperm1,vperm2,vx;
+    vector unsigned char vhlen1,vhlen2;
+    vector signed char vs1,vs2;
+    
+    vhlen1 = (vector unsigned char)VINIT16(1,4,7,4,5,7,6,7,8,0,0,0,0,0,0,0);
+    vhlen2 = (vector unsigned char)VINIT16(2,3,7,4,4,7,6,7,8,0,0,0,0,0,0,0);
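+    /* vhlen1/vhlen2 are the code-length tables of Huffman tables 2 and 3 (t2l/t3l),
+       laid out so that vec_perm can do sixteen table lookups at once; the _altivec2
+       variant below does the same for tables 5 and 6 */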
+    vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
+    vxlen = vec_splat_s32(3);
+    vzero = vec_xor(vzero,vzero);
+    vsum1 = vec_xor(vsum1,vsum1);
+    vsum2 = vec_xor(vsum2,vsum2);
+    
+    if((int)(end - ix) < 32) goto normal;
+    v0 = vec_ld(0,ix);
+    vmask = vec_lvsl(0,ix);
+    do {
+        v1 = vec_ld(16,ix);
+        v2 = vec_ld(32,ix);
+        v3 = vec_ld(48,ix);
+        v4 = vec_ld(64,ix);
+        v5 = vec_ld(80,ix);
+        v6 = vec_ld(96,ix);
+        v7 = vec_ld(112,ix);
+        v8 = vec_ld(128,ix);
+        v9 = vec_perm(v0,v1,vmask);
+        v10 = vec_perm(v1,v2,vmask);
+        v11 = vec_perm(v2,v3,vmask);
+        v12 = vec_perm(v3,v4,vmask);
+        v13 = vec_perm(v4,v5,vmask);
+        v14 = vec_perm(v5,v6,vmask);
+        v15 = vec_perm(v6,v7,vmask);
+        v16 = vec_perm(v7,v8,vmask);
+        v0 = v8;
+        v1 = vec_perm(v9,v10,vperm1);
+        v2 = vec_perm(v9,v10,vperm2);
+        v3 = vec_perm(v11,v12,vperm1);
+        v4 = vec_perm(v11,v12,vperm2);
+        v5 = vec_perm(v13,v14,vperm1);
+        v6 = vec_perm(v13,v14,vperm2);
+        v7 = vec_perm(v15,v16,vperm1);
+        v8 = vec_perm(v15,v16,vperm2);
+        vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2);
+        vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
+        vx3 = (vector signed int)vec_pack(vx1,vx2);
+        vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
+        vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
+        vx6 = (vector signed int)vec_pack(vx4,vx5);
+        vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6);
+        
+        vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx);
+        vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx);
+        
+        vsum1 = vec_sum4s(vs1,vsum1);
+        vsum2 = vec_sum4s(vs2,vsum2);
+        
+        ix += 32;
+    } while(ix < end-31);
+    
+    vsum1 = vec_sums(vsum1,vzero);
+    vsum2 = vec_sums(vsum2,vzero);
+    
+    vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1));
+    vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
+    vec_ste(vsum1,0,&sum1);
+    vec_ste(vsum2,0,&sum2);
+    
+    while (ix < end) {
+        unsigned int const x0 = *ix++;
+        unsigned int const x1 = *ix++;
+        sum += table[ x0 * xlen + x1 ];
+    }
+    
+    sum2 += sum & 0xffffu;
+    sum = (sum>>16u) + sum1;
+    
+    goto end;
+    
+normal:
+    do {
+        unsigned int const x0 = *ix++;
+        unsigned int const x1 = *ix++;
+        sum += table[ x0 * xlen + x1 ];
+    } while (ix < end);
+
+    sum2 = sum & 0xffffu;
+    sum >>= 16u;
+
+end:
+    if (sum > sum2) {
+        sum = sum2;
+        t1++;
+    }
+
+    *s += sum;
+    return t1;
+}
+
+inline static int
+count_bit_noESC_from2_altivec2(const int *ix, const int *end, int max, unsigned int *s)
+{
+    int t1 = huf_tbl_noESC[max - 1];
+    /* No ESC-words */
+    unsigned int sum = 0;
+    int sum1, sum2;
+    const unsigned int xlen = 4;
+    const unsigned int *table = table56;
+    vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2;
+    vector unsigned char vmask,vperm1,vperm2,vx;
+    vector unsigned char vhlen1,vhlen2;
+    vector signed char vs1,vs2;
+    
+    vhlen1 = (vector unsigned char)VINIT16(1,4,7,8,4,5,8,9,7,8,9,10,8,8,9,10);
+    vhlen2 = (vector unsigned char)VINIT16(3,4,6,8,4,4,6,7,5,6,7,8,7,7,8,9);
+    vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
+    vxlen = vec_splat_s32(4);
+    vzero = vec_xor(vzero,vzero);
+    vsum1 = vec_xor(vsum1,vsum1);
+    vsum2 = vec_xor(vsum2,vsum2);
+    
+    if((int)(end - ix) < 32) goto normal;
+    v0 = vec_ld(0,ix);
+    vmask = vec_lvsl(0,ix);
+    do {
+        v1 = vec_ld(16,ix);
+        v2 = vec_ld(32,ix);
+        v3 = vec_ld(48,ix);
+        v4 = vec_ld(64,ix);
+        v5 = vec_ld(80,ix);
+        v6 = vec_ld(96,ix);
+        v7 = vec_ld(112,ix);
+        v8 = vec_ld(128,ix);
+        v9 = vec_perm(v0,v1,vmask);
+        v10 = vec_perm(v1,v2,vmask);
+        v11 = vec_perm(v2,v3,vmask);
+        v12 = vec_perm(v3,v4,vmask);
+        v13 = vec_perm(v4,v5,vmask);
+        v14 = vec_perm(v5,v6,vmask);
+        v15 = vec_perm(v6,v7,vmask);
+        v16 = vec_perm(v7,v8,vmask);
+        v0 = v8;
+        v1 = vec_perm(v9,v10,vperm1);
+        v2 = vec_perm(v9,v10,vperm2);
+        v3 = vec_perm(v11,v12,vperm1);
+        v4 = vec_perm(v11,v12,vperm2);
+        v5 = vec_perm(v13,v14,vperm1);
+        v6 = vec_perm(v13,v14,vperm2);
+        v7 = vec_perm(v15,v16,vperm1);
+        v8 = vec_perm(v15,v16,vperm2);
+        
+        vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2);
+        vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
+        vx3 = (vector signed int)vec_pack(vx1,vx2);
+        vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
+        vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
+        vx6 = (vector signed int)vec_pack(vx4,vx5);
+        vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6);
+        
+        vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx);
+        vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx);
+        
+        vsum1 = vec_sum4s(vs1,vsum1);
+        vsum2 = vec_sum4s(vs2,vsum2);
+        
+        ix += 32;
+    } while(ix < end-31);
+    
+    vsum1 = vec_sums(vsum1,vzero);
+    vsum2 = vec_sums(vsum2,vzero);
+    
+    vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1));
+    vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
+    vec_ste(vsum1,0,&sum1);
+    vec_ste(vsum2,0,&sum2);
+
+    while (ix < end) {
+        unsigned int const x0 = *ix++;
+        unsigned int const x1 = *ix++;
+        sum += table[ x0 * xlen + x1 ];
+    }
+    
+    sum2 += sum & 0xffffu;
+    sum = (sum>>16u) + sum1;
+    
+    goto end;
+    
+normal:
+    do {
+        unsigned int const x0 = *ix++;
+        unsigned int const x1 = *ix++;
+        sum += table[ x0 * xlen + x1 ];
+    } while (ix < end);
+
+    sum2 = sum & 0xffffu;
+    sum >>= 16u;
+
+end:
+    if (sum > sum2) {
+        sum = sum2;
+        t1++;
+    }
+
+    *s += sum;
+    return t1;
+}
+
+inline static int
+count_bit_noESC_from3_altivec1(const int *ix, const int *const end, int max, unsigned int *s)
+{
+    int t1 = huf_tbl_noESC[max - 1];
+    /* No ESC-words */
+    unsigned int sum1 = 0;
+    unsigned int sum2 = 0;
+    unsigned int sum3 = 0;
+    const unsigned int xlen = 6;
+    const uint8_t *const hlen1 = ht[7].hlen;
+    const uint8_t *const hlen2 = ht[8].hlen;
+    const uint8_t *const hlen3 = ht[9].hlen;
+    int     t;
+    vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3;
+    vector unsigned char vmask,vperm1,vperm2,vx,v31;
+    vector unsigned char vhlen11,vhlen12,vhlen13,vhlen21,vhlen22,vhlen23,vhlen31,vhlen32,vhlen33;
+    vector signed char vs1,vs2,vs3;
+    
+    vhlen11 = (vector unsigned char)VINIT16(1,4,7,9,9,10,4,6,8,9,9,10,7,7,9,10);
+    vhlen12 = (vector unsigned char)VINIT16(10,11,8,9,10,11,11,11,8,9,10,11,11,12,9,10);
+    vhlen13 = (vector unsigned char)VINIT16(11,12,12,12,0,0,0,0,0,0,0,0,0,0,0,0);
+    vhlen21 = (vector unsigned char)VINIT16(2,4,7,9,9,10,4,4,6,10,10,10,7,6,8,10);
+    vhlen22 = (vector unsigned char)VINIT16(10,11,9,10,10,11,11,12,9,9,10,11,12,12,10,10);
+    vhlen23 = (vector unsigned char)VINIT16(11,11,13,13,0,0,0,0,0,0,0,0,0,0,0,0);
+    vhlen31 = (vector unsigned char)VINIT16(3,4,6,7,9,10,4,5,6,7,8,10,5,6,7,8);
+    vhlen32 = (vector unsigned char)VINIT16(9,10,7,7,8,9,9,10,8,8,9,9,10,11,9,9);
+    vhlen33 = (vector unsigned char)VINIT16(10,10,11,11,0,0,0,0,0,0,0,0,0,0,0,0);
+    vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
+    v31 = (vector unsigned char)VINIT16ALL(31);
+    vxlen = vec_splat_s32(6);
+    vzero = vec_xor(vzero,vzero);
+    vsum1 = vec_xor(vsum1,vsum1);
+    vsum2 = vec_xor(vsum2,vsum2);
+    vsum3 = vec_xor(vsum3,vsum3);
+    
+    if((int)(end - ix) < 32) goto normal;
+    //int *end2 = ix + 32*((int)(end - ix)/32);
+    v0 = vec_ld(0,ix);
+    vmask = vec_lvsl(0,ix);
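+    /*
+     * Each pass consumes 32 coefficients (16 (x0,x1) pairs): the eight
+     * loads plus vec_perm(vmask) realign an arbitrarily aligned ix,
+     * vperm1/vperm2 split the pairs into x0 and x1 vectors, vec_mladd and
+     * two modulo packs reduce x0*xlen + x1 to byte indices, vec_perm looks
+     * up the code lengths (vec_cmpgt/vec_sel switches to the overflow part
+     * of each table for indices >= 32), and vec_sum4s accumulates the
+     * lengths per candidate table.
+     */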
+    do {
+        v1 = vec_ld(16,ix);
+        v2 = vec_ld(32,ix);
+        v3 = vec_ld(48,ix);
+        v4 = vec_ld(64,ix);
+        v5 = vec_ld(80,ix);
+        v6 = vec_ld(96,ix);
+        v7 = vec_ld(112,ix);
+        v8 = vec_ld(128,ix);
+        v9 = vec_perm(v0,v1,vmask);
+        v10 = vec_perm(v1,v2,vmask);
+        v11 = vec_perm(v2,v3,vmask);
+        v12 = vec_perm(v3,v4,vmask);
+        v13 = vec_perm(v4,v5,vmask);
+        v14 = vec_perm(v5,v6,vmask);
+        v15 = vec_perm(v6,v7,vmask);
+        v16 = vec_perm(v7,v8,vmask);
+        v0 = v8;
+        v1 = vec_perm(v9,v10,vperm1);
+        v2 = vec_perm(v9,v10,vperm2);
+        v3 = vec_perm(v11,v12,vperm1);
+        v4 = vec_perm(v11,v12,vperm2);
+        v5 = vec_perm(v13,v14,vperm1);
+        v6 = vec_perm(v13,v14,vperm2);
+        v7 = vec_perm(v15,v16,vperm1);
+        v8 = vec_perm(v15,v16,vperm2);
+        vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2);
+        vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
+        vx3 = (vector signed int)vec_pack(vx1,vx2);
+        vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
+        vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
+        vx6 = (vector signed int)vec_pack(vx4,vx5);
+        vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6);
+        
+        v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx);
+        v2 = (vector signed int)vec_perm(vhlen13,vhlen13,vx);
+        v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx);
+        v4 = (vector signed int)vec_perm(vhlen23,vhlen23,vx);
+        v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx);
+        v6 = (vector signed int)vec_perm(vhlen33,vhlen33,vx);
+        v7 = (vector signed int)vec_cmpgt(vx,v31);
+        vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7);
+        vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7);
+        vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7);
+        
+        vsum1 = vec_sum4s(vs1,vsum1);
+        vsum2 = vec_sum4s(vs2,vsum2);
+        vsum3 = vec_sum4s(vs3,vsum3);
+        
+        ix += 32;
+    } while(ix < end-31);
+    
+    vsum1 = vec_sums(vsum1,vzero);
+    vsum2 = vec_sums(vsum2,vzero);
+    vsum3 = vec_sums(vsum3,vzero);
+    
+    vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1));
+    vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
+    vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3));
+    vec_ste(vsum1,0,&sum1);
+    vec_ste(vsum2,0,&sum2);
+    vec_ste(vsum3,0,&sum3);
+    
+    while (ix < end) {
+        int x = ix[0] * xlen + ix[1];
+        ix += 2;
+        sum1 += hlen1[x];
+        sum2 += hlen2[x];
+        sum3 += hlen3[x];
+    }
+    goto end;
+    
+  normal:
+    
+    do {
+        int x = ix[0] * xlen + ix[1];
+        ix += 2;
+        sum1 += hlen1[x];
+        sum2 += hlen2[x];
+        sum3 += hlen3[x];
+    } while (ix < end);
+    
+  end:
+    
+    t = t1;
+    if (sum1 > sum2) {
+        sum1 = sum2;
+        t++;
+    }
+    if (sum1 > sum3) {
+        sum1 = sum3;
+        t = t1+2;
+    }
+    *s += sum1;
+    
+    return t;
+}
+
+inline static int
+count_bit_noESC_from3_altivec2(const int *ix, const int *const end, int max, unsigned int *s)
+{
+    int t1 = huf_tbl_noESC[max - 1];
+    /* No ESC-words */
+    unsigned int sum1 = 0;
+    unsigned int sum2 = 0;
+    unsigned int sum3 = 0;
+    const unsigned int xlen = 8;
+    const uint8_t *const hlen1 = ht[10].hlen;
+    const uint8_t *const hlen2 = ht[11].hlen;
+    const uint8_t *const hlen3 = ht[12].hlen;
+    int     t;
+    vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
+    vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3;
+    vector unsigned char vmask,vperm1,vperm2,vx,v31;
+    vector unsigned char vhlen11,vhlen12,vhlen13,vhlen14,vhlen21,vhlen22,vhlen23,vhlen24,vhlen31,vhlen32,vhlen33,vhlen34;
+    vector signed char vs1,vs2,vs3;
+    
+    vhlen11 = (vector unsigned char)VINIT16( 1,  4,  7,  9, 10, 10, 10, 11, 4,  6,  8,  9, 10, 11, 10, 10);
+    vhlen12 = (vector unsigned char)VINIT16( 7,  8,  9, 10, 11, 12, 11, 11, 8,  9, 10, 11, 12, 12, 11, 12);
+    vhlen13 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 12, 12,10, 11, 12, 12, 13, 13, 12, 13);
+    vhlen14 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 13, 13,10, 10, 11, 12, 12, 13, 13, 13);
+    vhlen21 = (vector unsigned char)VINIT16( 2,  4,  6,  8,  9, 10,  9, 10, 4,  5,  6,  8, 10, 10,  9, 10);
+    vhlen22 = (vector unsigned char)VINIT16( 6,  7,  8,  9, 10, 11, 10, 10, 8,  8,  9, 11, 10, 12, 10, 11);
+    vhlen23 = (vector unsigned char)VINIT16( 9, 10, 10, 11, 11, 12, 11, 12, 9, 10, 11, 12, 12, 13, 12, 13);
+    vhlen24 = (vector unsigned char)VINIT16( 9,  9,  9, 10, 11, 12, 12, 12, 9,  9, 10, 11, 12, 12, 12, 12);
+    vhlen31 = (vector unsigned char)VINIT16( 4,  4,  6,  8,  9, 10, 10, 10, 4,  5,  6,  7,  9,  9, 10, 10);
+    vhlen32 = (vector unsigned char)VINIT16( 6,  6,  7,  8,  9, 10,  9, 10, 7,  7,  8,  8,  9, 10, 10, 10);
+    vhlen33 = (vector unsigned char)VINIT16( 8,  8,  9,  9, 10, 10, 10, 11, 9,  9, 10, 10, 10, 11, 10, 11);
+    vhlen34 = (vector unsigned char)VINIT16( 9,  9,  9, 10, 10, 11, 11, 12,10, 10, 10, 11, 11, 11, 11, 12);
+    vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
+    v31 = (vector unsigned char)VINIT16ALL(31);
+    vxlen = vec_splat_s32(8);
+    vzero = vec_xor(vzero,vzero);
+    vsum1 = vec_xor(vsum1,vsum1);
+    vsum2 = vec_xor(vsum2,vsum2);
+    vsum3 = vec_xor(vsum3,vsum3);
+    
+    if((int)(end - ix) < 32) goto normal;
+    //int *end2 = ix + 32*((int)(end - ix)/32);
+    v0 = vec_ld(0,ix);
+    vmask = vec_lvsl(0,ix);
+    do {
+        v1 = vec_ld(16,ix);
+        v2 = vec_ld(32,ix);
+        v3 = vec_ld(48,ix);
+        v4 = vec_ld(64,ix);
+        v5 = vec_ld(80,ix);
+        v6 = vec_ld(96,ix);
+        v7 = vec_ld(112,ix);
+        v8 = vec_ld(128,ix);
+        v9 = vec_perm(v0,v1,vmask);
+        v10 = vec_perm(v1,v2,vmask);
+        v11 = vec_perm(v2,v3,vmask);
+        v12 = vec_perm(v3,v4,vmask);
+        v13 = vec_perm(v4,v5,vmask);
+        v14 = vec_perm(v5,v6,vmask);
+        v15 = vec_perm(v6,v7,vmask);
+        v16 = vec_perm(v7,v8,vmask);
+        v0 = v8;
+        v1 = vec_perm(v9,v10,vperm1);
+        v2 = vec_perm(v9,v10,vperm2);
+        v3 = vec_perm(v11,v12,vperm1);
+        v4 = vec_perm(v11,v12,vperm2);
+        v5 = vec_perm(v13,v14,vperm1);
+        v6 = vec_perm(v13,v14,vperm2);
+        v7 = vec_perm(v15,v16,vperm1);
+        v8 = vec_perm(v15,v16,vperm2);
+        
+        vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2);
+        vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4);
+        vx3 = (vector signed int)vec_pack(vx1,vx2);
+        vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6);
+        vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8);
+        vx6 = (vector signed int)vec_pack(vx4,vx5);
+        vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6);
+        
+        v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx);
+        v2 = (vector signed int)vec_perm(vhlen13,vhlen14,vx);
+        v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx);
+        v4 = (vector signed int)vec_perm(vhlen23,vhlen24,vx);
+        v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx);
+        v6 = (vector signed int)vec_perm(vhlen33,vhlen34,vx);
+        v7 = (vector signed int)vec_cmpgt(vx,v31);
+        vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7);
+        vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7);
+        vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7);
+        
+        vsum1 = vec_sum4s(vs1,vsum1);
+        vsum2 = vec_sum4s(vs2,vsum2);
+        vsum3 = vec_sum4s(vs3,vsum3);
+        
+        ix += 32;
+    } while(ix < end-31);
+    
+    vsum1 = vec_sums(vsum1,vzero);
+    vsum2 = vec_sums(vsum2,vzero);
+    vsum3 = vec_sums(vsum3,vzero);
+    
+    vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1));
+    vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2));
+    vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3));
+    vec_ste(vsum1,0,&sum1);
+    vec_ste(vsum2,0,&sum2);
+    vec_ste(vsum3,0,&sum3);
+    
+    while (ix < end) {
+        int x = ix[0] * xlen + ix[1];
+        ix += 2;
+        sum1 += hlen1[x];
+        sum2 += hlen2[x];
+        sum3 += hlen3[x];
+    }
+    goto end;
+    
+  normal:
+    
+    do {
+        int x = ix[0] * xlen + ix[1];
+        ix += 2;
+        sum1 += hlen1[x];
+        sum2 += hlen2[x];
+        sum3 += hlen3[x];
+    } while (ix < end);
+    
+  end:
+    
+    t = t1;
+    if (sum1 > sum2) {
+        sum1 = sum2;
+        t++;
+    }
+    if (sum1 > sum3) {
+        sum1 = sum3;
+        t = t1+2;
+    }
+    *s += sum1;
+    
+    return t;
+}
+#endif
 
 /*************************************************************************/
 /*	      choose table						 */
@@ -599,12 +1451,21 @@ typedef int (*count_fnc)(const int* ix, 
 static const count_fnc count_fncs[] = 
 { &count_bit_null
 , &count_bit_noESC
+#ifdef ALTIVEC
+, &count_bit_noESC_from2_altivec1
+, &count_bit_noESC_from2_altivec2
+, &count_bit_noESC_from3_altivec1
+, &count_bit_noESC_from3_altivec1
+, &count_bit_noESC_from3_altivec2
+, &count_bit_noESC_from3_altivec2
+#else
 , &count_bit_noESC_from2
 , &count_bit_noESC_from2
 , &count_bit_noESC_from3
 , &count_bit_noESC_from3
 , &count_bit_noESC_from3
 , &count_bit_noESC_from3
+#endif
 , &count_bit_noESC_from3
 , &count_bit_noESC_from3
 , &count_bit_noESC_from3
@@ -621,7 +1482,11 @@ choose_table_nonMMX(const int *ix, const
     unsigned int* s = (unsigned int*)_s;
     unsigned int  max;
     int     choice, choice2;
+#ifdef ALTIVEC
+    max = ix_max_vec(ix, end);
+#else
     max = ix_max(ix, end);
+#endif
 
     if (max <= 15) {
       return count_fncs[max](ix, end, max, s);
@@ -643,7 +1508,11 @@ choose_table_nonMMX(const int *ix, const
             break;
         }
     }
+#if defined(ALTIVEC) && defined(ALTIVEC_970)
+    return count_bit_ESC_altivec(ix, end, choice, choice2, s);
+#else
     return count_bit_ESC(ix, end, choice, choice2, s);
+#endif
 }
 
 
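For reference, the table lookups in the AltiVec bit counters above rely on vec_perm using only the low five bits of each selector byte; the per-pair indexing itself matches the scalar "normal:" paths. A minimal scalar sketch of that lookup (the helper name and the flat 64-entry layout are illustrative only, not part of the patch):

    /* idx = x0*xlen + x1, delivered as byte values by vec_mladd/vec_pack */
    static unsigned char lookup_hlen(const unsigned char hlen[64], unsigned idx)
    {
        unsigned char lo = hlen[idx & 31];        /* vec_perm(tblA, tblB, vx)   */
        unsigned char hi = hlen[32 + (idx & 31)]; /* vec_perm(tblC, tblD, vx)   */
        return idx > 31 ? hi : lo;                /* vec_cmpgt(vx, 31), vec_sel */
    }
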
--- libmp3lame/util.c.orig	2017-09-07 04:33:36.000000000 +0900
+++ libmp3lame/util.c	2017-10-14 18:11:48.000000000 +0900
@@ -26,6 +26,12 @@
 # include <config.h>
 #endif
 
+#if defined(ALTIVEC) && !defined(ALTIVEC_970)
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
+
 #include <float.h>
 #include "lame.h"
 #include "machine.h"
@@ -954,6 +960,108 @@ disable_FPE(void)
  *
  ***********************************************************************/
 
+#if defined(ALTIVEC) && !defined(ALTIVEC_970)
+
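+/*
+ * Vectorized log10 approximation.  The splat/shift sequence below only
+ * builds constants without memory loads: vconst1 = 0x007fffff (mantissa
+ * mask), vconst3 = 127 (exponent bias), vshamt = 23, vconst2 = 0x3f800000
+ * (the bit pattern of 1.0f).  The mantissa is forced into [1,2),
+ * z = (m - sqrt2)/(m + sqrt2) is formed with a Newton-refined vec_re(),
+ * and the result is e*log10(2) + log10(sqrt2) + z*(a + b*z^2 + c*z^4).
+ */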
+inline ieee754_float32_t fast_log10_altivec(ieee754_float32_t x)
+{
+    vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog;
+    vector unsigned int vconst1,vconst2,vshamt;
+    vector signed int vconst3;
+    float out __attribute__ ((aligned (16)));
+    
+    va = (vector float)VINIT4ALL(0.8685890659);
+    vb = (vector float)VINIT4ALL(0.2894672153);
+    vc = (vector float)VINIT4ALL(0.1793365895);
+    vhalf = (vector float)VINIT4ALL(0.15051499783);
+    vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
+    vconst4 = (vector float)VINIT4ALL(0.301029995664);
+    vzero = vec_xor(vzero,vzero);
+    vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9));
+    vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7));
+    vconst2 = vec_nor(vconst2,vconst2);
+    vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7));
+    vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7));
+    vshamt = vec_add(vshamt,vec_splat_u32(7));
+    vconst2 = vec_sl((vector unsigned int)vconst3,vshamt);
+    
+    v1 = vec_ld(0,&x);
+    v2 = vec_perm(v1,v1,vec_lvsl(0,&x));
+    v3 = vec_splat(v2,0);
+    
+    v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1);
+    v5 = vec_add(v4,vsqrt2);
+    v6 = vec_sub(v4,vsqrt2);
+    v7 = vec_re(v5);
+    vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero);
+    v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt);
+    vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0);
+    
+    vz2 = vec_madd(vz,vz,vzero);
+    vlog = vec_madd(vlog,vconst4,vhalf);
+    
+    v1 = vec_madd(vz2,vc,vb);
+    v2 = vec_madd(vz2,v1,va);
+    vlog = vec_madd(vz,v2,vlog);
+    
+    vec_ste(vlog,0,&out);
+    
+    return out;
+}
+
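+/* Same reduction as fast_log10_altivec above, with natural-log constants. */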
+inline ieee754_float32_t fast_loge_altivec(ieee754_float32_t x)
+{
+    vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4;
+    vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog;
+    vector unsigned int vconst1,vconst2,vshamt;
+    vector signed int vconst3;
+    float out __attribute__ ((aligned (16)));
+    
+    va = (vector float)VINIT4ALL(2.0000006209);
+    vb = (vector float)VINIT4ALL(0.6664778517);
+    vc = (vector float)VINIT4ALL(0.4139745860);
+    vhalf = (vector float)VINIT4ALL(0.34657359028);
+    vsqrt2 = (vector float)VINIT4ALL(1.4142135623731);
+    vconst4 = (vector float)VINIT4ALL(0.6931471805599);
+    vzero = vec_xor(vzero,vzero);
+    vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9));
+    vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7));
+    vconst2 = vec_nor(vconst2,vconst2);
+    vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7));
+    vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7));
+    vshamt = vec_add(vshamt,vec_splat_u32(7));
+    vconst2 = vec_sl((vector unsigned int)vconst3,vshamt);
+    
+    v1 = vec_ld(0,&x);
+    v2 = vec_perm(v1,v1,vec_lvsl(0,&x));
+    v3 = vec_splat(v2,0);
+    
+    v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1);
+    v5 = vec_add(v4,vsqrt2);
+    v6 = vec_sub(v4,vsqrt2);
+    v7 = vec_re(v5);
+    vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero);
+    v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt);
+    vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0);
+    
+    vz2 = vec_madd(vz,vz,vzero);
+    vlog = vec_madd(vlog,vconst4,vhalf);
+    
+    v1 = vec_madd(vz2,vc,vb);
+    v2 = vec_madd(vz2,v1,va);
+    vlog = vec_madd(vz,v2,vlog);
+    
+    vec_ste(vlog,0,&out);
+    
+    return out;
+}
+
+void
+init_log_table(void)
+{
+}
+
+#else
 
 #define LOG2_SIZE       (512)
 #define LOG2_SIZE_L2    (9)
@@ -1004,6 +1112,8 @@ fast_log2(ieee754_float32_t x)
     return log2val;
 }
 
+#endif
+
 #else /* Don't use FAST_LOG */
 
 
--- libmp3lame/util.h.orig	2017-09-07 04:33:36.000000000 +0900
+++ libmp3lame/util.h	2017-10-14 18:02:08.000000000 +0900
@@ -93,10 +93,17 @@ extern  "C" {
 
 /* log/log10 approximations */
 #ifdef USE_FAST_LOG
+#if defined(ALTIVEC) && !defined(ALTIVEC_970)
+#define         FAST_LOG10(x)       (fast_log10_altivec(x))
+#define         FAST_LOG(x)         (fast_loge_altivec(x))
+#define         FAST_LOG10_X(x,y)   (fast_log10_altivec(x)*(y))
+#define         FAST_LOG_X(x,y)     (fast_loge_altivec(x)*(y))
+#else
 #define         FAST_LOG10(x)       (fast_log2(x)*(LOG2/LOG10))
 #define         FAST_LOG(x)         (fast_log2(x)*LOG2)
 #define         FAST_LOG10_X(x,y)   (fast_log2(x)*(LOG2/LOG10*(y)))
 #define         FAST_LOG_X(x,y)     (fast_log2(x)*(LOG2*(y)))
+#endif
 #else
 #define         FAST_LOG10(x)       log10(x)
 #define         FAST_LOG(x)         log(x)
@@ -186,14 +193,14 @@ extern  "C" {
      */
 
     typedef struct {
-        FLOAT   masking_lower[CBANDS];
+        FLOAT   masking_lower[CBANDS] __attribute__ ((aligned (16)));
         FLOAT   minval[CBANDS];
         FLOAT   rnumlines[CBANDS];
         FLOAT   mld_cb[CBANDS];
         FLOAT   mld[Max(SBMAX_l,SBMAX_s)];
         FLOAT   bo_weight[Max(SBMAX_l,SBMAX_s)]; /* band weight long scalefactor bands, at transition */
         FLOAT   attack_threshold; /* short block tuning */
-        int     s3ind[CBANDS][2];
+        int     s3ind[CBANDS][4] __attribute__ ((aligned (16)));
         int     numlines[CBANDS];
         int     bm[Max(SBMAX_l,SBMAX_s)];
         int     bo[Max(SBMAX_l,SBMAX_s)];
@@ -219,7 +226,7 @@ extern  "C" {
 
     typedef struct {
 
-        FLOAT   nb_l1[4][CBANDS], nb_l2[4][CBANDS];
+        FLOAT   nb_l1[4][CBANDS] __attribute__ ((aligned (16))), nb_l2[4][CBANDS] __attribute__ ((aligned (16)));
         FLOAT   nb_s1[4][CBANDS], nb_s2[4][CBANDS];
 
         III_psy_xmin thm[4];
@@ -246,7 +253,7 @@ extern  "C" {
     /* variables used by encoder.c */
     typedef struct {
         /* variables for newmdct.c */
-        FLOAT   sb_sample[2][2][18][SBLIMIT];
+        FLOAT   sb_sample[2][2][18][SBLIMIT] __attribute__ ((aligned (16)));
         FLOAT   amp_filter[32];
 
         /* variables used by util.c */
@@ -293,7 +300,7 @@ extern  "C" {
 #ifndef  MFSIZE
 # define MFSIZE  ( 3*1152 + ENCDELAY - MDCTDELAY )
 #endif
-        sample_t mfbuf[2][MFSIZE];
+        sample_t mfbuf[2][MFSIZE] __attribute__ ((aligned (16)));
 
         int     mf_samples_to_encode;
         int     mf_size;
@@ -567,7 +574,12 @@ extern  "C" {
 
 /* log/log10 approximations */
     extern void init_log_table(void);
+#if defined(ALTIVEC) && !defined(ALTIVEC_970)
+    extern ieee754_float32_t fast_log10_altivec(ieee754_float32_t x);
+    extern ieee754_float32_t fast_loge_altivec(ieee754_float32_t x);
+#else
     extern ieee754_float32_t fast_log2(ieee754_float32_t x);
+#endif
 
     int     isResamplingNecessary(SessionConfig_t const* cfg);
 
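For reference, a scalar model of the reduction evaluated by fast_log10_altivec/fast_loge_altivec above (the function name is illustrative and not part of the patch; the constants are those loaded into va/vb/vc/vhalf/vconst4, and the model, like the vector code, assumes a normalized x > 0):

    #include <stdint.h>
    #include <string.h>

    static float log10_model(float x)
    {
        uint32_t bits, mbits;
        float m, z, z2;
        int e;

        memcpy(&bits, &x, sizeof bits);
        e = (int)(bits >> 23) - 127;                /* unbiased exponent         */
        mbits = (bits & 0x007fffffu) | 0x3f800000u; /* force mantissa into [1,2) */
        memcpy(&m, &mbits, sizeof m);
        z  = (m - 1.4142135623731f) / (m + 1.4142135623731f);
        z2 = z * z;
        return e * 0.301029995664f + 0.15051499783f /* e*log10(2) + log10(sqrt2) */
             + z * (0.8685890659f + z2 * (0.2894672153f + z2 * 0.1793365895f));
    }

fast_loge_altivec follows the same formula with the natural-log constants 0.6931471805599, 0.34657359028, 2.0000006209, 0.6664778517 and 0.4139745860.
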
--- libmp3lame/vbrquantize.c.orig	2012-02-07 22:36:35.000000000 +0900
+++ libmp3lame/vbrquantize.c	2017-10-14 18:02:08.000000000 +0900
@@ -26,6 +26,12 @@
 #  include <config.h>
 #endif
 
+#ifdef ALTIVEC
+#undef TAKEHIRO_IEEE754_HACK
+#ifndef __APPLE_CC__
+#include <altivec.h>
+#endif
+#endif
 
 #include "lame.h"
 #include "machine.h"
@@ -217,8 +223,23 @@ k_34_4(DOUBLEX x[4], int l3[4])
 static  FLOAT
 calc_sfb_noise_x34(const FLOAT * xr, const FLOAT * xr34, unsigned int bw, uint8_t sf)
 {
+#ifdef ALTIVEC
+    float vpow[8] __attribute__ ((aligned (16)));
+    vector float v0, v1, v2, v3, v4, v5, v6,v7,v8,v9,v10,v11,v12,v13;
+    vector unsigned char vperm1, vperm2,vc1,vc2,vc3;
+    vector signed int vl1,vl2,vl3;
+    vector float vxfsf, vsfpow, vsfpow34, vabs, vzero;
+    unsigned int s1,s2,s3,s4,s5,s6,s7,s8;
+    const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0);
+    const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0);
+#ifdef ALTIVEC_970
+    vector unsigned int vmask1,vmask2,vmask3;
+#else
+    vector unsigned char vperm3,vperm4,vc4,vc5,vc6,vmask;
+#endif
+#endif
     DOUBLEX x[4];
-    int     l3[4];
+    int     l3[4] __attribute__ ((aligned (16)));
     const FLOAT sfpow = pow20[sf + Q_MAX2]; /*pow(2.0,sf/4.0); */
     const FLOAT sfpow34 = ipow20[sf]; /*pow(sfpow,-3.0/4.0); */
 
@@ -226,6 +247,239 @@ calc_sfb_noise_x34(const FLOAT * xr, con
     unsigned int i = bw >> 2u;
     unsigned int const remaining = (bw & 0x03u);
 
+#ifdef ALTIVEC
+    vpow[0] = sfpow;
+    vpow[1] = sfpow34;
+    vsfpow = vec_ld(0,vpow);
+    vxfsf = vec_xor(vxfsf,vxfsf);
+    vsfpow34 = vec_splat(vsfpow,1);
+    vsfpow = vec_splat(vsfpow,0);
+    vperm1 = vec_lvsl(0,xr);
+    vperm2 = vec_lvsl(0,xr34);
+    v0 = vec_ld(0,xr);
+    v1 = vec_ld(0,xr34);
+    vabs = (vector float)vec_splat_s32(-1);
+    vabs = (vector float)vec_sl((vector unsigned int)vabs, (vector unsigned int)vabs);
+    vzero = vec_xor(vzero,vzero);
+#ifdef ALTIVEC_970
+    vc1 = vec_splat_u8(1);
+    vc2 = vec_splat_u8(5);
+    vc3 = vec_sl(vc1,vc2);
+    vmask1 = (vector unsigned int)vec_splat_s32(-1);
+    vmask1 = vec_sro(vmask1,vc3);
+    vmask2 = vec_sro(vmask1,vc3);
+    vmask3 = vec_sro(vmask2,vc3);
+#else
+    vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19);
+    vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8);
+    vmask = (vector unsigned char)VINIT16ALL(16);
+#endif
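+    /*
+     * Main loop, eight coefficients per pass: xr34 is scaled by sfpow34 and
+     * quantized to l3[] (truncation plus a rational correction term, the
+     * vector analogue of k_34_4()), the eight 16-bit results are unpacked
+     * into s1..s8, pow43[] is gathered for them (vec_sel masks on the 970,
+     * lvsl-built permutes otherwise), and (|xr| - sfpow*pow43[l3])^2 is
+     * accumulated in vxfsf.
+     */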
+    for (; i > 1; i -= 2) {
+        
+        v2 = vec_ld(16,xr34);
+        v3 = vec_ld(32,xr34);
+        v4 = vec_perm(v1,v2,vperm2);
+        v5 = vec_perm(v2,v3,vperm2);
+        v12 = vec_madd(v4,vsfpow34,vzero);
+        v13 = vec_madd(v5,vsfpow34,vzero);
+        v1 = v3;
+        
+        v2 = vec_floor(v12);
+        v3 = vec_floor(v13);
+        v4 = vec_splat(const1,2);
+        v5 = vec_splat(const1,1);
+        v6 = vec_splat(const2,1);
+        v7 = vec_splat(const2,0);
+        v8 = vec_madd(v2,v4,v5);
+        v9 = vec_madd(v3,v4,v5);
+        v10 = vec_madd(v2,v6,v7);
+        v11 = vec_madd(v3,v6,v7);
+        v4 = vec_splat(const1,0);
+        v5 = vec_splat(const1,3);
+        v8 = vec_madd(v8,v2,v4);
+        v9 = vec_madd(v9,v3,v4);
+        v10 = vec_madd(v10,v2,v5);
+        v11 = vec_madd(v11,v3,v5);
+        v6 = vec_re(v10);
+        v7 = vec_re(v11);
+        v10 = vec_nmsub(v10,v6,v5);
+        v11 = vec_nmsub(v11,v7,v5);
+        v10 = vec_madd(v10,v6,v6);
+        v11 = vec_madd(v11,v7,v7);
+        v10 = vec_madd(v8,v10,v12);
+        v11 = vec_madd(v9,v11,v13);
+        
+        vl1 = vec_cts(v10,0);
+        vl2 = vec_cts(v11,0);
+        vl3 = (vector signed int)vec_pack(vl1,vl2);
+        vec_st(vl3,0,l3);
+        
+        s1 = l3[0] >> 16;
+        s2 = l3[0] & 0xffff;
+        s3 = l3[1] >> 16;
+        s4 = l3[1] & 0xffff;
+        s5 = l3[2] >> 16;
+        s6 = l3[2] & 0xffff;
+        s7 = l3[3] >> 16;
+        s8 = l3[3] & 0xffff;
+        
+#ifdef ALTIVEC_970
+        v2 = vec_lde(0,pow43+s1);
+        v3 = vec_lde(0,pow43+s2);
+        v4 = vec_lde(0,pow43+s3);
+        v5 = vec_lde(0,pow43+s4);
+        v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s1));
+        v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s2));
+        v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s3));
+        v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s4));
+        v12 = vec_sel(v2,v3,vmask1);
+        v12 = vec_sel(v12,v4,vmask2);
+        v12 = vec_sel(v12,v5,vmask3);
+        
+        v2 = vec_lde(0,pow43+s5);
+        v3 = vec_lde(0,pow43+s6);
+        v4 = vec_lde(0,pow43+s7);
+        v5 = vec_lde(0,pow43+s8);
+        v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s5));
+        v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s6));
+        v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s7));
+        v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s8));
+        v13 = vec_sel(v2,v3,vmask1);
+        v13 = vec_sel(v13,v4,vmask2);
+        v13 = vec_sel(v13,v5,vmask3);
+#else
+        vc1 = vec_lvsl(0,pow43+s1);
+        vc2 = vec_lvsl(0,pow43+s2);
+        vc3 = vec_lvsl(0,pow43+s3);
+        vc4 = vec_lvsl(0,pow43+s4);
+        vc2 = vec_or(vc2,vmask);
+        vc4 = vec_or(vc4,vmask);
+        v2 = vec_lde(0,pow43+s1);
+        v3 = vec_lde(0,pow43+s2);
+        v4 = vec_lde(0,pow43+s3);
+        v5 = vec_lde(0,pow43+s4);
+        vc5 = vec_perm(vc1,vc2,vperm3);
+        vc6 = vec_perm(vc3,vc4,vperm4);
+        v6 = vec_perm(v2,v3,vc5);
+        v7 = vec_perm(v4,v5,vc6);
+        v12 = vec_sld(v6,v7,8);
+        
+        vc1 = vec_lvsl(0,pow43+s5);
+        vc2 = vec_lvsl(0,pow43+s6);
+        vc3 = vec_lvsl(0,pow43+s7);
+        vc4 = vec_lvsl(0,pow43+s8);
+        vc2 = vec_or(vc2,vmask);
+        vc4 = vec_or(vc4,vmask);
+        v2 = vec_lde(0,pow43+s5);
+        v3 = vec_lde(0,pow43+s6);
+        v4 = vec_lde(0,pow43+s7);
+        v5 = vec_lde(0,pow43+s8);
+        vc5 = vec_perm(vc1,vc2,vperm3);
+        vc6 = vec_perm(vc3,vc4,vperm4);
+        v6 = vec_perm(v2,v3,vc5);
+        v7 = vec_perm(v4,v5,vc6);
+        v13 = vec_sld(v6,v7,8);
+#endif
+        
+        v2 = vec_ld(16, xr);
+        v3 = vec_ld(32, xr);
+        v6 = vec_perm(v0,v2,vperm1);
+        v7 = vec_perm(v2,v3,vperm1);
+        v0 = v3;
+        v8 = vec_andc(v6,vabs);
+        v9 = vec_andc(v7,vabs);
+        v10 = vec_nmsub(vsfpow, v12, v8);
+        v11 = vec_nmsub(vsfpow, v13, v9);
+        vxfsf = vec_madd(v10, v10, vxfsf);
+        vxfsf = vec_madd(v11, v11, vxfsf);
+        
+        xr += 8;
+        xr34 += 8;
+    }
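+    /* at most one group of four coefficients remains; the 970 path reuses
+       the scalar k_34_4() rounding, the generic path rounds with adj43[] */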
+    if (i) {
+#ifdef ALTIVEC_970
+        x[0] = sfpow34 * xr34[0];
+        x[1] = sfpow34 * xr34[1];
+        x[2] = sfpow34 * xr34[2];
+        x[3] = sfpow34 * xr34[3];
+
+        k_34_4(x, l3);
+
+        vpow[0] = pow43[l3[0]];
+        vpow[1] = pow43[l3[1]];
+        vpow[2] = pow43[l3[2]];
+        vpow[3] = pow43[l3[3]];
+        v1 = vec_ld(0, vpow);
+        v2 = vec_ld(16, xr);
+        v3 = vec_perm(v0,v2,vperm1);
+        v4 = vec_andc(v3,vabs);
+        v5 = vec_nmsub(vsfpow, v1, v4);
+        vxfsf = vec_madd(v5, v5, vxfsf);
+#else
+        v2 = vec_ld(16,xr34);
+        v3 = vec_perm(v1,v2,vperm2);
+        v4 = vec_madd(v3,vsfpow34,vzero);
+        vl1 = vec_cts(v4,0);
+        vec_st(vl1,0,l3);
+        
+        v5 = vec_lde(0,adj43+l3[0]);
+        v6 = vec_lde(0,adj43+l3[1]);
+        v7 = vec_lde(0,adj43+l3[2]);
+        v8 = vec_lde(0,adj43+l3[3]);
+        v9 = vec_perm(v5,v5,vec_lvsl(0,adj43+l3[0]));
+        v10 = vec_perm(v6,v6,vec_lvsl(-4,adj43+l3[1]));
+        v11 = vec_perm(v7,v7,vec_lvsl(-8,adj43+l3[2]));
+        v12 = vec_perm(v8,v8,vec_lvsl(-12,adj43+l3[3]));
+        v9 = vec_or(v9,v10);
+        v9 = vec_or(v9,v11);
+        v9 = vec_or(v9,v12);
+        
+        v10 = vec_add(v4,v9);
+        vl1 = vec_cts(v10,0);
+        vec_st(vl1,0,l3);
+        
+        v2 = vec_lde(0,pow43+l3[0]);
+        v3 = vec_lde(0,pow43+l3[1]);
+        v4 = vec_lde(0,pow43+l3[2]);
+        v5 = vec_lde(0,pow43+l3[3]);
+        v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+l3[0]));
+        v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+l3[1]));
+        v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+l3[2]));
+        v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+l3[3]));
+        v6 = vec_or(v6,v7);
+        v6 = vec_or(v6,v8);
+        v6 = vec_or(v6,v9);
+        
+        v2 = vec_ld(16, xr);
+        v3 = vec_perm(v0,v2,vperm1);
+        v4 = vec_andc(v3,vabs);
+        v5 = vec_nmsub(vsfpow, v6, v4);
+        vxfsf = vec_madd(v5, v5, vxfsf);
+#endif
+        xr += 4;
+        xr34 += 4;
+    }
+    if (remaining) {
+        x[0] = x[1] = x[2] = x[3] = 0;
+        switch( remaining ) {
+        case 3: x[2] = sfpow34 * xr34[2];
+        case 2: x[1] = sfpow34 * xr34[1];
+        case 1: x[0] = sfpow34 * xr34[0];
+        }
+
+        k_34_4(x, l3);
+        x[0] = x[1] = x[2] = x[3] = 0;
+
+        switch( remaining ) {
+        case 3: x[2] = fabsf(xr[2]) - sfpow * pow43[l3[2]];
+        case 2: x[1] = fabsf(xr[1]) - sfpow * pow43[l3[1]];
+        case 1: x[0] = fabsf(xr[0]) - sfpow * pow43[l3[0]];
+        }
+        xfsf += (x[0] * x[0] + x[1] * x[1]) + (x[2] * x[2] + x[3] * x[3]);
+    }
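+    /* fold the four partial squared-error sums in vxfsf into the scalar result */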
+    vec_st(vxfsf,0,vpow);
+    return xfsf + vpow[0] + vpow[1] + vpow[2] + vpow[3];
+#else
     while (i-- > 0) {
         x[0] = sfpow34 * xr34[0];
         x[1] = sfpow34 * xr34[1];
@@ -262,6 +516,7 @@ calc_sfb_noise_x34(const FLOAT * xr, con
         xfsf += (x[0] * x[0] + x[1] * x[1]) + (x[2] * x[2] + x[3] * x[3]);
     }
     return xfsf;
+#endif
 }
 
 