/*
 * A vector quantizer for texture compression
 *
 * Copyright (c) 2012 Jens Ogniewski, Information Coding Group,
 *                    Linköpings University, Sweden
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <math.h>
#include "vec_quant.h"
#include <gsl_linalg.h>


/*
 * Per-selector weight tables, indexed by the 2-bit colour selector (0..3).
 * In this file they are referenced only from the commented-out interpolation
 * code in vector_quantization_decode(), where entry [sel] multiplies the
 * "min" endpoint and (8.0 - entry) resp. (4.0 - entry) multiplies the "max"
 * endpoint.  modvecRB serves the red/blue channels, modvecG the green one.
 */
const float modvecRB[4] = { 8.0, 16.0 / 3.0, 8.0 / 3.0, 0.0 };
const float modvecG[4]  = { 4.0,  8.0 / 3.0, 4.0 / 3.0, 0.0 };

/*
 * dsvd - dominant direction of a set of 3-component samples.
 *
 * Builds the 3x3 matrix  M[i][j] = sum_k in[k][i] * in[k][j]  from rows
 * 0..15 of `in`, runs GSL's singular value decomposition on it and returns
 * the column of the decomposed matrix that belongs to the largest singular
 * value.
 *
 * in:      array of at least 16 row pointers, each row holding at least 3
 *          doubles.  Exactly 16 rows are read regardless of how many samples
 *          the caller actually has.
 *          NOTE(review): vector_quantization() passes more than 16 rows when
 *          deblock > 0; the extra rows are ignored here - confirm intended.
 * returns: a malloc'ed array of 3 doubles which the caller has to free(),
 *          or NULL when an allocation fails.  If the decomposition itself
 *          reports an error the returned vector is all zero.
 *
 * The function is deliberately not declared `inline`: a plain `inline`
 * definition without an external definition may fail to link under C99/C11
 * inline semantics when the compiler decides not to inline the call.
 */
double *dsvd(double **in)
{
  enum { DSVD_DIM = 3, DSVD_ROWS = 16 };
  double *ret = malloc(DSVD_DIM * sizeof *ret);
  gsl_matrix *a = gsl_matrix_alloc(DSVD_DIM, DSVD_DIM);
  gsl_matrix *v = gsl_matrix_alloc(DSVD_DIM, DSVD_DIM);
  gsl_vector *s = gsl_vector_alloc(DSVD_DIM);
  gsl_vector *work = gsl_vector_alloc(DSVD_DIM);
  double s0, s1, s2;
  int best = 0;
  int i, j, k;

  if (ret == NULL || a == NULL || v == NULL || s == NULL || work == NULL) {
    free(ret);
    ret = NULL;
    goto done;
  }

  /* Accumulate the (unnormalised) second-moment matrix of the samples. */
  for (i = 0; i < DSVD_DIM; i++) {
    for (j = 0; j < DSVD_DIM; j++) {
      double acc = 0.0;
      for (k = 0; k < DSVD_ROWS; k++) {
        acc = acc + in[k][i] * in[k][j];
      }
      gsl_matrix_set(a, i, j, acc);
    }
  }

  /* On success `a` is overwritten with the left singular vectors. */
  if (gsl_linalg_SV_decomp(a, v, s, work) != 0) {
    ret[0] = 0.0;
    ret[1] = 0.0;
    ret[2] = 0.0;
    goto done;
  }

  s0 = gsl_vector_get(s, 0);
  s1 = gsl_vector_get(s, 1);
  s2 = gsl_vector_get(s, 2);

  /* Diagnostic kept from the original: singular values must not be negative. */
  if (s0 < 0) printf("NEJ!\n");
  if (s1 < 0) printf("NEJ!\n");
  if (s2 < 0) printf("NEJ!\n");

  /* Pick the column of the strictly largest singular value (ties keep 0). */
  if ((s1 > s2) && (s1 > s0)) best = 1;
  if ((s2 > s1) && (s2 > s0)) best = 2;

  for (i = 0; i < DSVD_DIM; i++) {
    ret[i] = gsl_matrix_get(a, i, best);
  }

done:
  /* The GSL work objects were leaked by the original implementation. */
  if (work != NULL) gsl_vector_free(work);
  if (s != NULL) gsl_vector_free(s);
  if (v != NULL) gsl_matrix_free(v);
  if (a != NULL) gsl_matrix_free(a);
  return ret;
}

/*
 * Weighted squared distance between two colours, measured after converting
 * both from RGB to YUV.
 *
 * c1, c2:  pointers to at least 3 bytes each (R, G, B); neither is modified.
 * returns: 2*(dY)^2 + (dU)^2 + (dV)^2, i.e. the luma difference counts twice
 *          as much as each chroma difference.
 *
 * The function is deliberately not declared `inline`: a plain `inline`
 * definition without an external definition may fail to link under C99/C11
 * inline semantics when the compiler decides not to inline the call.
 */
double calculate_yuv_cost(unsigned char *c1, unsigned char *c2) {
  const double r1 = (double)c1[0], g1 = (double)c1[1], b1 = (double)c1[2];
  const double r2 = (double)c2[0], g2 = (double)c2[1], b2 = (double)c2[2];

  /* Each colour is converted on its own before subtracting, exactly as the
   * original did, so the floating-point results stay unchanged. */
  const double y1 =    0.299*r1 +   0.587*g1 +   0.114*b1;
  const double u1 = -0.14713*r1 - 0.28886*g1 +   0.436*b1;
  const double v1 =    0.615*r1 - 0.51499*g1 - 0.10001*b1;

  const double y2 =    0.299*r2 +   0.587*g2 +   0.114*b2;
  const double u2 = -0.14713*r2 - 0.28886*g2 +   0.436*b2;
  const double v2 =    0.615*r2 - 0.51499*g2 - 0.10001*b2;

  const double dy = y1 - y2;
  const double du = u1 - u2;
  const double dv = v1 - v2;

  return dy*dy*2.0 + du*du + dv*dv;
}

/*
 * Squared Euclidean distance between two RGB colours.
 *
 * c1, c2:  pointers to at least 3 bytes each (R, G, B); neither is modified.
 * returns: (dR)^2 + (dG)^2 + (dB)^2, computed in integer arithmetic and
 *          returned as a double.
 *
 * The function is deliberately not declared `inline`: a plain `inline`
 * definition without an external definition may fail to link under C99/C11
 * inline semantics when the compiler decides not to inline the call.
 */
double calculate_euclidean_cost(unsigned char *c1, unsigned char *c2) {
  int total = 0;
  int ch;

  for (ch = 0; ch < 3; ch++) {
    const int diff = (int)c1[ch] - (int)c2[ch];
    total += diff * diff;
  }
  return total;
}

/* "Infinite" starting value used by every cost minimisation below. */
static const double vq_cost_max = 1073741823.0;

/* Largest quantised endpoint value per channel (R, G, B). */
static const int vq_level_max[3] = {31, 63, 31};

/* Factor that expands a quantised endpoint value back to the 0..255 range. */
static const int vq_level_scale[3] = {8, 4, 8};

/*
 * Pixel window analysed for one 4x4 block.  Bounds are half-open
 * ([xmin,xmax) x [ymin,ymax)); in wraparound mode they may lie outside the
 * image and are folded back by vq_pixel_at().
 */
struct vq_window {
  unsigned char *pixels;   /* source image, 4 bytes per pixel */
  unsigned int width;
  unsigned int height;
  int xmin, xmax;
  int ymin, ymax;
};

/* Fold a coordinate that is at most one image size out of range back in. */
static int vq_wrap(int v, unsigned int size)
{
  if (v < 0) v = (int)((unsigned int)v + size);
  if ((unsigned int)v > size - 1) v = (int)((unsigned int)v - size);
  return v;
}

/* Address of the (wrapped) pixel at window coordinate (x, y). */
static unsigned char *vq_pixel_at(const struct vq_window *win, int x, int y)
{
  const size_t wx = (size_t)vq_wrap(x, win->width);
  const size_t wy = (size_t)vq_wrap(y, win->height);
  return &win->pixels[(wy * win->width + wx) * 4];
}

/* Per-channel mean of the RGB values inside the window. */
static void vq_window_mean_rgb(const struct vq_window *win, int count, double mean[3])
{
  int x, y, c;

  mean[0] = 0.0;
  mean[1] = 0.0;
  mean[2] = 0.0;
  for (y = win->ymin; y < win->ymax; y++) {
    for (x = win->xmin; x < win->xmax; x++) {
      const unsigned char *px = vq_pixel_at(win, x, y);
      for (c = 0; c < 3; c++) mean[c] = mean[c] + (double)px[c];
    }
  }
  for (c = 0; c < 3; c++) mean[c] = mean[c] / (double)count;
}

/* Quantise the point base + t*dir to clamped endpoint levels. */
static void vq_quantize(const double dir[3], const double base[3], double t, int out[3])
{
  int c;

  for (c = 0; c < 3; c++) {
    int q = (int)rint((dir[c] * t + base[c]) / (double)vq_level_scale[c]);
    if (q < 0) q = 0;
    if (q > vq_level_max[c]) q = vq_level_max[c];
    out[c] = q;
  }
}

/* Build the four block colours: lo, (2*lo+hi)/3, (lo+2*hi)/3 and hi. */
static void vq_palette(const int lo[3], const int hi[3], unsigned char pal[4][3])
{
  int c, l;

  for (c = 0; c < 3; c++) {
    const int a = vq_level_scale[c] * lo[c];
    const int b = vq_level_scale[c] * hi[c];
    int v[4];

    v[0] = a;
    v[1] = (int)rint((double)(2 * a + b) / 3.0);
    v[2] = (int)rint((double)(a + 2 * b) / 3.0);
    v[3] = b;
    for (l = 0; l < 4; l++) pal[l][c] = (unsigned char)(v[l] > 255 ? 255 : v[l]);
  }
}

/*
 * Cost of the palette entry closest to one pixel.  If `best` is not NULL it
 * receives the index of that entry (first one wins on ties).  `round_yuv`
 * rounds each YUV cost to an integer before comparing.
 */
static double vq_pixel_cost(unsigned char *px, unsigned char pal[4][3], int use_yuv, int round_yuv, int *best)
{
  double best_cost = vq_cost_max;
  int l;

  for (l = 0; l < 4; l++) {
    double cost;
    if (use_yuv > 0) {
      cost = calculate_yuv_cost(px, pal[l]);
      if (round_yuv) cost = rint(cost);
    } else {
      cost = calculate_euclidean_cost(px, pal[l]);
    }
    if (cost < best_cost) {
      best_cost = cost;
      if (best != NULL) *best = l;
    }
  }
  return best_cost;
}

/* Sum of vq_pixel_cost() over every pixel of the window. */
static double vq_window_cost(const struct vq_window *win, unsigned char pal[4][3], int use_yuv, int round_yuv)
{
  double total = 0.0;
  int x, y;

  for (y = win->ymin; y < win->ymax; y++) {
    for (x = win->xmin; x < win->xmax; x++) {
      total = total + vq_pixel_cost(vq_pixel_at(win, x, y), pal, use_yuv, round_yuv, NULL);
    }
  }
  return total;
}

/* Non-zero when both level triples are identical. */
static int vq_same3(const int a[3], const int b[3])
{
  return (a[0] == b[0]) && (a[1] == b[1]) && (a[2] == b[2]);
}

/* dst = src for a level triple. */
static void vq_copy3(int dst[3], const int src[3])
{
  dst[0] = src[0];
  dst[1] = src[1];
  dst[2] = src[2];
}

/*
 * Encode an image into 4x4 blocks of two endpoint colours plus a 2-bit
 * selector per pixel.
 *
 * For every block the pixels of a surrounding window are fitted with a line
 * in colour space (mean + principal direction from dsvd()), all pairs of
 * quantised endpoints along that line are tried, and optionally a local
 * +/-1 search around the best pair is run.
 *
 * in:             source image, 4 bytes per pixel; bytes 0..2 of a pixel are
 *                 used as R, G, B, byte 3 is ignored.
 * width, height:  image size in pixels.  Only complete 4x4 blocks are
 *                 encoded (width>>2 by height>>2 blocks).
 * min_out,
 * max_out:        receive malloc'ed arrays with 4 bytes per block: the
 *                 quantised endpoint (R 0..31, G 0..63, B 0..31) followed by
 *                 0xff.
 * colsel:         receives a malloc'ed array of width*height bytes; for each
 *                 pixel inside a complete block the palette index 0..3.
 *                 Bytes of pixels outside complete blocks are not written.
 * use_yuv_weight: > 0 fits and measures errors in weighted YUV space,
 *                 otherwise plain RGB.
 * do_refinement:  > 0 enables the local endpoint search (step 3).
 * deblock:        number of extra pixels added on each side of the block to
 *                 form the analysis window.  Presumably expected to be >= 0.
 * wraparound:     > 0 lets the window wrap around the image borders,
 *                 otherwise a side that would leave the image is cut back to
 *                 the block edge.
 *
 * On allocation failure an error is printed, everything allocated so far is
 * released and *min_out, *max_out and *colsel are set to NULL.  Otherwise the
 * three output arrays are presumably owned (and freed) by the caller.
 *
 * NOTE(review): dsvd() only reads the first 16 samples, so with deblock > 0
 * the line direction is derived from a part of the window only.
 */
void vector_quantization(unsigned char *in, unsigned int width, unsigned int height, unsigned char **min_out, unsigned char **max_out, unsigned char **colsel, int use_yuv_weight, int do_refinement, int deblock, int wraparound) {
  const size_t blocks_x = width >> 2;
  const size_t blocks_y = height >> 2;
  double **coltmp = NULL;
  double *samples = NULL;
  int c, g, h, j, k;

  *max_out = malloc(blocks_x * blocks_y * 4);
  *min_out = malloc(blocks_x * blocks_y * 4);
  *colsel  = malloc((size_t)width * height);

  if ((*colsel == NULL) || (*min_out == NULL) || (*max_out == NULL)) goto fail;

  for (g = 0; g < (int)blocks_y; g++) {
    for (h = 0; h < (int)blocks_x; h++) {
      struct vq_window win;
      double mincost = vq_cost_max;
      double avg[3];
      double dir[3];                          /* line direction in RGB */
      double yuv_dir[3] = {0.0, 0.0, 0.0};    /* line direction in weighted YUV */
      double *principal;
      double len;
      int minvec[3] = {0, 0, 0}, maxvec[3] = {0, 0, 0};
      int tmpminvec[3], tmpmaxvec[3], oldminvec[3], oldmaxvec[3];
      unsigned char tmpcol[4][3];
      int ws = (4 + 2 * deblock) * (4 + 2 * deblock);
      int cols, rows;

      // 1.   Determine the line
      win.pixels = in;
      win.width = width;
      win.height = height;
      win.xmin = h * 4 - deblock;
      win.ymin = g * 4 - deblock;
      win.xmax = h * 4 + 4 + deblock;
      win.ymax = g * 4 + 4 + deblock;
      if (wraparound <= 0) {
        if (win.xmin < 0) win.xmin = h * 4;
        if (win.ymin < 0) win.ymin = g * 4;
        if ((unsigned int)win.xmax > width - 1) win.xmax = h * 4 + 4;
        if ((unsigned int)win.ymax > height - 1) win.ymax = g * 4 + 4;
        ws = (win.xmax - win.xmin) * (win.ymax - win.ymin);
      }
      cols = win.xmax - win.xmin;

      /* One contiguous sample buffer plus row pointers for dsvd().  dsvd()
       * always reads 16 rows, so never hand it fewer; surplus rows stay zero
       * and do not contribute to its sums. */
      rows = (ws < 16) ? 16 : ws;
      coltmp = malloc((size_t)rows * sizeof *coltmp);
      samples = calloc((size_t)rows * 3, sizeof *samples);
      if ((coltmp == NULL) || (samples == NULL)) goto fail;
      for (k = 0; k < rows; k++) coltmp[k] = samples + 3 * (size_t)k;

      if (use_yuv_weight > 0) {
        // 1.1. calculate intercept (samples in YUV, luma weighted by sqrt(2))
        avg[0] = 0.0;
        avg[1] = 0.0;
        avg[2] = 0.0;
        for (j = win.ymin; j < win.ymax; j++) {
          for (k = win.xmin; k < win.xmax; k++) {
            const unsigned char *px = vq_pixel_at(&win, k, j);
            double *row = coltmp[(j - win.ymin) * cols + k - win.xmin];
            const double red = (double)px[0];
            const double grn = (double)px[1];
            const double blu = (double)px[2];

            row[0] = (0.299 * red + 0.587 * grn + 0.114 * blu) * sqrt(2.0);
            row[1] = -0.14713 * red - 0.28886 * grn + 0.436 * blu;
            row[2] =  0.615 * red - 0.51499 * grn - 0.10001 * blu;
            for (c = 0; c < 3; c++) avg[c] = avg[c] + row[c];
          }
        }
        for (c = 0; c < 3; c++) avg[c] = avg[c] / (double)ws;
        for (j = 0; j < ws; j++) {
          for (c = 0; c < 3; c++) coltmp[j][c] = coltmp[j][c] - avg[c];
        }
        // 1.2. find slope
        principal = dsvd(coltmp);
        if (principal == NULL) goto fail;
        for (c = 0; c < 3; c++) yuv_dir[c] = principal[c];
        free(principal);

        /* Back to RGB: R = Y + 1.13983 V, G = Y - 0.39465 U - 0.5806 V,
         * B = Y + 2.03211 U.  (The original used V for the blue channel.) */
        dir[0] = yuv_dir[0] / sqrt(2.0)                        + 1.13983 * yuv_dir[2];
        dir[1] = yuv_dir[0] / sqrt(2.0) - 0.39465 * yuv_dir[1] - 0.5806  * yuv_dir[2];
        dir[2] = yuv_dir[0] / sqrt(2.0) + 2.03211 * yuv_dir[1];

        /* The candidate endpoints are generated in RGB, so the intercept has
         * to be the RGB mean. */
        vq_window_mean_rgb(&win, ws, avg);
      } else {
        // 1.1. calculate intercept
        vq_window_mean_rgb(&win, ws, avg);
        for (j = win.ymin; j < win.ymax; j++) {
          for (k = win.xmin; k < win.xmax; k++) {
            const unsigned char *px = vq_pixel_at(&win, k, j);
            double *row = coltmp[(j - win.ymin) * cols + k - win.xmin];
            for (c = 0; c < 3; c++) row[c] = (double)px[c] - avg[c];
          }
        }
        // 1.2. find slope
        principal = dsvd(coltmp);
        if (principal == NULL) goto fail;
        for (c = 0; c < 3; c++) dir[c] = principal[c];
        free(principal);
      }

      len = sqrt((dir[0] * dir[0]) + (dir[1] * dir[1]) + (dir[2] * dir[2]));
      /* `> 0` instead of `!= 0` also sends a NaN direction to the flat path. */
      if (len > 0.0) {
        const double *axis = (use_yuv_weight > 0) ? yuv_dir : dir;
        double mind = 1073741823, maxd = -1073741823;
        double x1, x2, d, e;
        int mini, maxi;

        dir[0] = dir[0] / len;
        dir[1] = dir[1] / len;
        dir[2] = dir[2] / len;

        // 2.   Find optimal points on the line
        // 2.1. Calculate search space
        for (j = 0; j < ws; j++) {
          const double tmpd = coltmp[j][0] * axis[0] + coltmp[j][1] * axis[1] + coltmp[j][2] * axis[2];
          if (tmpd > maxd) maxd = tmpd;
          if (tmpd < mind) mind = tmpd;
        }
        if (use_yuv_weight > 0) {
          /* A step of t along the YUV direction is a step of t*len along the
           * normalised RGB direction.  The original obtained this factor as
           * (unnormalised red slope)/(normalised red slope), which is 0/0
           * whenever the red slope vanishes. */
          maxd = maxd * len;
          mind = mind * len;
        }
        x1 = 2.0 * mind - 1.0 * maxd;
        if (x1 > mind) x1 = mind;
        x2 = 2.0 * maxd - 1.0 * mind;
        if (x2 < maxd) x2 = maxd;
        mini = (int)floor(x1);
        maxi = (int)ceil(x2);
        if ((mini == 0) && (maxi == 0)) { mini = -1; maxi = 1; }

        // 2.2. Test all possible candidates
        /* Both loops walk the line in unit steps but skip positions whose
         * quantised colour equals the previous one.  The inner do-loops
         * advance d / e themselves; the leading d-- / e++ undo the step the
         * for-statement has just applied. */
        oldminvec[0] = -1;
        oldminvec[1] = -1;
        oldminvec[2] = -1;
        for (d = mini + 1; d <= maxi + 1; d = d + 1.0) {
          d--;
          do {
            vq_quantize(dir, avg, d, tmpminvec);
            d++;
            if (d == maxi + 1) break;
          } while (vq_same3(oldminvec, tmpminvec));
          vq_copy3(oldminvec, tmpminvec);
          oldmaxvec[0] = -1;
          oldmaxvec[1] = -1;
          oldmaxvec[2] = -1;

          for (e = maxi - 1; e >= d - 1; e = e - 1.0) {
            double stepcost;
            e++;
            do {
              vq_quantize(dir, avg, e, tmpmaxvec);
              e--;
              if (e == d - 1) break;
            } while (vq_same3(oldmaxvec, tmpmaxvec));
            vq_copy3(oldmaxvec, tmpmaxvec);

            vq_palette(tmpminvec, tmpmaxvec, tmpcol);
            stepcost = vq_window_cost(&win, tmpcol, use_yuv_weight, 0);
            if (stepcost < mincost) {
              vq_copy3(minvec, tmpminvec);
              vq_copy3(maxvec, tmpmaxvec);
              mincost = stepcost;
            }
          }
        }
      } else {
        /* Flat window: both endpoints are the quantised mean colour. */
        static const double no_dir[3] = {0.0, 0.0, 0.0};

        printf("0\n");   /* NOTE(review): looks like leftover debug output */
        vq_quantize(no_dir, avg, 0.0, minvec);
        vq_copy3(maxvec, minvec);
        /* All four palette entries coincide here, so this is the summed cost
         * against the single mean colour. */
        vq_palette(minvec, maxvec, tmpcol);
        mincost = vq_window_cost(&win, tmpcol, use_yuv_weight, 0);
      }

      //3.   Find local optimum
      if (do_refinement > 0) {
        int chg;
        do {
          int c1[3], c2[3], c1min[3], c2min[3], c1max[3], c2max[3];

          chg = 0;
          vq_copy3(tmpminvec, minvec);
          vq_copy3(tmpmaxvec, maxvec);
          /* Offsets -1..+1 per component, cut so the levels stay in range. */
          for (c = 0; c < 3; c++) {
            c1min[c] = (minvec[c] > 0) ? -1 : 0;
            c1max[c] = (minvec[c] < vq_level_max[c]) ? 1 : 0;
            c2min[c] = (maxvec[c] > 0) ? -1 : 0;
            c2max[c] = (maxvec[c] < vq_level_max[c]) ? 1 : 0;
          }
          /* The original stepped c1[0] by 2 and thereby never tried "red
           * unchanged"; every offset is now visited. */
          for (c1[0] = c1min[0]; c1[0] <= c1max[0]; c1[0]++) {
            for (c1[1] = c1min[1]; c1[1] <= c1max[1]; c1[1]++) {
              for (c1[2] = c1min[2]; c1[2] <= c1max[2]; c1[2]++) {
                for (c2[0] = c2min[0]; c2[0] <= c2max[0]; c2[0]++) {
                  for (c2[1] = c2min[1]; c2[1] <= c2max[1]; c2[1]++) {
                    for (c2[2] = c2min[2]; c2[2] <= c2max[2]; c2[2]++) {
                      double stepcost;
                      int lo[3], hi[3];

                      for (c = 0; c < 3; c++) {
                        lo[c] = tmpminvec[c] + c1[c];
                        hi[c] = tmpmaxvec[c] + c2[c];
                      }
                      vq_palette(lo, hi, tmpcol);
                      /* NOTE(review): YUV costs are rounded per pixel here but
                       * not in step 2; kept as in the original - confirm. */
                      stepcost = vq_window_cost(&win, tmpcol, use_yuv_weight, 1);
                      if (stepcost < mincost) {
                        mincost = stepcost;
                        chg = 1;
                        vq_copy3(minvec, lo);
                        vq_copy3(maxvec, hi);
                      }
                    }
                  }
                }
              }
            }
          }
        } while (chg);
      }

      // Optimal point found -> calculate the indices for all pixels in the block
      {
        const size_t block = ((size_t)g * blocks_x + (size_t)h) * 4;
        for (c = 0; c < 3; c++) {
          (*min_out)[block + c] = (unsigned char)minvec[c];
          (*max_out)[block + c] = (unsigned char)maxvec[c];
        }
        (*min_out)[block + 3] = 0xff;
        (*max_out)[block + 3] = 0xff;
      }
      vq_palette(minvec, maxvec, tmpcol);
      for (j = 0; j < 4; j++) {
        for (k = 0; k < 4; k++) {
          const size_t pix = ((size_t)(g * 4 + j)) * width + (size_t)(h * 4 + k);
          int best = -1;

          vq_pixel_cost(&in[pix * 4], tmpcol, use_yuv_weight, 0, &best);
          if (best >= 0) (*colsel)[pix] = (unsigned char)best;
        }
      }

      free(samples);
      free(coltmp);
      samples = NULL;
      coltmp = NULL;
    }
  }
  return;

fail:
  printf("\n\nVector quantizer: error allocating memory!\n\n");
  free(samples);
  free(coltmp);
  free(*min_out);
  free(*max_out);
  free(*colsel);
  *min_out = NULL;
  *max_out = NULL;
  *colsel = NULL;
}

/*
 * Rebuild an image from the output of vector_quantization().
 *
 * min, max:       4 bytes per 4x4 block; bytes 0..2 are the quantised R, G, B
 *                 endpoint levels, byte 3 is ignored.
 * colsel:         one byte per pixel (width*height); the low two bits select
 *                 one of the four block colours.
 * width, height:  image size in pixels.  Only complete 4x4 blocks
 *                 (width>>2 by height>>2) are decoded; pixels outside them
 *                 are left unwritten.
 * decoded_image:  receives a malloc'ed buffer of width*height*4 bytes
 *                 (R, G, B, 0xff per pixel), or NULL if the allocation fails,
 *                 in which case an error is printed and nothing is decoded.
 *
 * The four block colours are lo, (2*lo+hi)/3, (lo+2*hi)/3 and hi, where
 * lo/hi are the endpoint levels scaled by 8 (R, B) or 4 (G).  The thirds are
 * rounded to nearest; for a non-negative integer x that is exactly
 * (x + 1) / 3 in integer arithmetic, which replaces the former
 * rint(x / 3.0).
 */
void vector_quantization_decode(unsigned char *min, unsigned char *max, unsigned char *colsel, unsigned int width, unsigned int height, unsigned char **decoded_image) {
  static const int scale[3] = {8, 4, 8};
  const size_t blocks_x = width >> 2;
  const size_t blocks_y = height >> 2;
  size_t bx, by;
  int c, j, k;

  *decoded_image = malloc((size_t)height * width * 4);

  /* The original tested `decoded_image` itself, which can never detect a
   * failed allocation, and then carried on regardless. */
  if (*decoded_image == NULL) {
    printf("\n\nVector quantizer: error allocating memory!\n\n");
    return;
  }

  for (by = 0; by < blocks_y; by++) {
    for (bx = 0; bx < blocks_x; bx++) {
      const unsigned char *lo = &min[(by * blocks_x + bx) * 4];
      const unsigned char *hi = &max[(by * blocks_x + bx) * 4];
      int palette[4][3];

      for (c = 0; c < 3; c++) {
        const int a = scale[c] * lo[c];
        const int b = scale[c] * hi[c];

        palette[0][c] = a;
        palette[1][c] = (2 * a + b + 1) / 3;
        palette[2][c] = (a + 2 * b + 1) / 3;
        palette[3][c] = b;
      }

      for (j = 0; j < 4; j++) {
        for (k = 0; k < 4; k++) {
          const size_t pix = (by * 4 + (size_t)j) * width + bx * 4 + (size_t)k;
          unsigned char *out = &(*decoded_image)[pix * 4];
          /* Mask the selector so a corrupt value cannot index past the
           * four-entry palette. */
          const int *entry = palette[colsel[pix] & 3];

          for (c = 0; c < 3; c++) out[c] = (unsigned char)entry[c];
          out[3] = 0xff;
        }
      }
    }
  }
}
