/*****************************************************************************
 * $Id: vop-deint-weave.c,v 1.1 2004/09/18 16:46:55 alainjj Exp $
 * Program under GNU General Public License (see ../COPYING)
 * Deinterlace routines for xine by Miguel Freitas
 * based on DScaler project sources (deinterlace.sourceforge.net)
 *****************************************************************************/
#include <stdio.h>
#include "config.h"

#ifdef ARCH_X86

#include "colorspace.h"
#include "vop.h"
#include "memcpy.h"
#include "cpu_accel.h"
/* Declared here, defined elsewhere; not referenced by any code shown in this file. */
extern int debug;

/*
 * Motion-adaptive deinterlacer for packed 2-bytes-per-pixel frames (MMX).
 *
 * Even output lines are copied straight from the even lines of src.  Each
 * odd output line is built 4 pixels at a time: for every pixel the odd-line
 * value of the current frame is kept ("weave") when its luma is close to the
 * even line above, the even line below, or the same line in the buffer
 * v->bufs[(buf_cur + nbufs - 1) % nbufs] (presumably the previous frame --
 * confirm against the vop2 owner); otherwise the average of the two
 * neighbouring even lines is used ("bob").
 *
 * v       filter state; only bufs, buf_cur and nbufs are read here.
 * dest    output frame, written with a line length of width * 2 bytes.
 * src     input frame, read with the same line length.
 * width   frame width in pixels.
 * height  frame height in lines; only 2 * (height / 2) lines are produced.
 *
 * Returns 1 in every case (no other value is ever returned by this routine).
 *
 * NOTE(review): the MMX loop handles LineLength >> 3 quadwords, so when
 * width is not a multiple of 4 the last (width * 2) & 7 bytes of every
 * interpolated line are left unwritten.
 */
static int deinterlace_weave_yuv_mmx(vop2 *v, unsigned char *dest, unsigned char *src,
                                     int width, int height) {
  int Line;
  int FieldHeight;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *YVal4;
  uint64_t *Dest;
  uint8_t *pEvenLines = src;
  uint8_t *pOddLines = src + 2 * width;
  uint8_t *pPrevLines;

  int LineLength = width * 2;    /* bytes in one scanline              */
  int SourcePitch = width * 4;   /* bytes between lines of same parity */

  /* This port always treats the frame as an odd field; the even-field
   * paths below are kept so the flag can be flipped without rewriting. */
  const int IsOdd = 1;

  const long TemporalTolerance = 300;
  const long SpatialTolerance = 600;
  const long SimilarityThreshold = 25;

  /* Multiplying by this constant copies a 16-bit value into all four words
   * of a quadword; it equals q + (q << 16) + (q << 32) + (q << 48). */
  const uint64_t WordReplicate = 0x0001000100010001ULL;

  int n;

  uint64_t qwSpatialTolerance;
  uint64_t qwTemporalTolerance;
  uint64_t qwThreshold;

  /* Selects the low byte of every 16-bit word (bytes 0, 2, 4, 6). */
  static mmx_t YMask = {.ub = {0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0}};
  /* Clears bit 0 of every byte so a word shift by 1 halves each byte
   * without leaking a bit into its neighbour. */
  static mmx_t Mask = {.ub = {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}};

  /* Nothing sensible can be produced for an empty frame, and the copies
   * below would be handed negative lengths. */
  if (width <= 0 || height <= 0)
    return 1;

  if (IsOdd)
    pPrevLines = v->bufs[(v->buf_cur + v->nbufs - 1) % v->nbufs] + 2 * width;
  else
    pPrevLines = v->bufs[(v->buf_cur + v->nbufs - 1) % v->nbufs];

  // Since the code uses MMX to process 4 pixels at a time, we need our constants
  // to be represented 4 times per quadword.
  qwSpatialTolerance = (uint64_t)SpatialTolerance * WordReplicate;
  qwTemporalTolerance = (uint64_t)TemporalTolerance * WordReplicate;
  qwThreshold = (uint64_t)SimilarityThreshold * WordReplicate;

  // Copy the first even line no matter what, and the first odd line if we're
  // processing an even field.
  fast_memcpy(dest, pEvenLines, LineLength);
  if (!IsOdd)
    fast_memcpy(dest + LineLength, pOddLines, LineLength);

  FieldHeight = height / 2;
  for (Line = 0; Line < FieldHeight - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + Line * SourcePitch);
      Dest = (uint64_t *)(dest + (Line * 2 + 1) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(dest + (Line * 2 + 2) * LineLength);
    }

    // The comments below are written for the odd-field case (IsOdd != 0).
    // For an even field the same processing applies with the roles of the
    // odd and even lines swapped.

    // Copy the even scanline below this one to the output, since the current
    // scanline is adapted to the even lines surrounding it.  The scanline
    // above was already copied by the previous pass (or by the first-line
    // copy before the loop).
    fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;   // number of whole quadwords (4 pixels each)
    while (n--)
    {
      movq_m2r ( *YVal1++, mm0 );    // mm0 = E1 (even line above)
      movq_m2r ( *YVal2++, mm1 );    // mm1 = O  (odd line, current frame)
      movq_m2r ( *YVal3++, mm2 );    // mm2 = E2 (even line below)

      movq_r2r ( mm0, mm3 );       // mm3 = intensity(E1) once masked
      movq_r2r ( mm1, mm4 );       // mm4 = intensity(O)  once masked
      movq_r2r ( mm2, mm6 );       // mm6 = intensity(E2) once masked

      pand_m2r ( YMask, mm3 );
      pand_m2r ( YMask, mm4 );
      pand_m2r ( YMask, mm6 );

      // Average E1 and E2 for interpolated bobbing; result is left in mm0.
      pand_m2r ( Mask, mm0 );   // mm0 = E1 with the low bit of every byte cleared
      pand_m2r ( Mask, mm2 );   // mm2 = E2 with the low bit of every byte cleared
      psrlw_i2r ( 01, mm0 );    // mm0 = E1 / 2 (per byte)
      psrlw_i2r ( 01, mm2 );    // mm2 = E2 / 2 (per byte)
      paddb_r2r ( mm2, mm0 );   // mm0 = E1 / 2 + E2 / 2

      // The meat of the work is done here.  We want to see whether this pixel is
      // close in luminosity to ANY of: its top neighbor, its bottom neighbor,
      // or its predecessor.  To do this without branching, we use MMX's
      // unsigned saturation, which gives us Z(x) = x if x >= 0, or 0 if x < 0.
      //
      // With d(a) = (a - O) / 2, the value computed is
      //    Z(ST - d(E1)^2) + Z(ST - d(E2)^2) + Z(TT - d(Oold)^2)
      // where ST is the spatial tolerance and TT the temporal tolerance.  If
      // a pixel is similar to none of its neighbors the result is low,
      // probably zero; a high value indicates a similar neighbor.  The pixel
      // at the same position in the older buffer (Oold) counts as a neighbor
      // so that 1-pixel-high horizontal lines can still be displayed.

      movq_m2r ( *&qwSpatialTolerance, mm7 );
      movq_r2r ( mm3, mm5 );     // mm5 = E1
      psubsw_r2r ( mm4, mm5 );   // mm5 = E1 - O
      psraw_i2r ( 1, mm5 );      // mm5 = (E1 - O) / 2
      pmullw_r2r ( mm5, mm5 );   // mm5 = ((E1 - O) / 2) ^ 2
      psubusw_r2r ( mm5, mm7 );  // mm7 = ST - mm5, or 0 if that's negative

      movq_m2r ( *&qwSpatialTolerance, mm3 );
      movq_r2r ( mm6, mm5 );    // mm5 = E2
      psubsw_r2r ( mm4, mm5 );  // mm5 = E2 - O
      psraw_i2r ( 1, mm5 );     // mm5 = (E2 - O) / 2
      pmullw_r2r ( mm5, mm5 );  // mm5 = ((E2 - O) / 2) ^ 2
      psubusw_r2r ( mm5, mm3 ); // mm3 = ST - mm5, or 0 if that's negative
      paddusw_r2r ( mm3, mm7 ); // mm7 = sum of both spatial terms

      movq_m2r ( *&qwTemporalTolerance, mm3 );
      movq_m2r ( *YVal4++, mm5 ); // mm5 = Oold
      pand_m2r ( YMask, mm5 );
      psubsw_r2r ( mm4, mm5 );  // mm5 = Oold - O
      psraw_i2r ( 1, mm5 );     // mm5 = (Oold - O) / 2
      pmullw_r2r ( mm5, mm5 );  // mm5 = ((Oold - O) / 2) ^ 2
      psubusw_r2r ( mm5, mm3 ); // mm3 = TT - mm5, or 0 if that's negative
      paddusw_r2r ( mm3, mm7 ); // mm7 = our magic number

      /*
       * Now compare the similarity totals against our threshold.  pcmpgtw
       * fills each word with 1s where the comparison is true and with 0s
       * where it is false.  An AND, an AND-NOT and an OR later, we have
       * bobbed values for pixels at or under the similarity threshold and
       * weaved ones for pixels over it.
       */

      pcmpgtw_m2r( *&qwThreshold, mm7 ); // mm7 = 0xffff where we're greater than the threshold, 0 elsewhere
      movq_r2r ( mm7, mm6 );  // mm6 = copy of that mask
      pand_r2r ( mm1, mm7 );  // mm7 = weaved data where over the threshold, 0 elsewhere
      pandn_r2r ( mm0, mm6 ); // mm6 = bobbed data where not over the threshold, 0 elsewhere
      por_r2r ( mm6, mm7 );   // mm7 = bobbed and weaved data

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy the last odd line if we're processing an odd field.  With fewer than
  // two input lines there is no odd line, and the offsets below would point
  // one line before dest and before the first odd source line.
  if (IsOdd && FieldHeight > 0)
  {
    fast_memcpy(dest + (FieldHeight * 2 - 1) * LineLength,
                pOddLines + (FieldHeight - 1) * SourcePitch,
                LineLength);
  }

  // Clear out the MMX registers ready for doing floating point again.
  emms();

  return 1;
}

/*
 * Descriptor that exposes deinterlace_weave_yuv_mmx as the "deintweave"
 * video operation.  The initializer is positional; the per-field notes
 * below are the only description of the fields available in this file
 * (the vop type is presumably declared in vop.h -- confirm there).
 */
vop vop_deint_weave = {
  "deintweave",  /* name */
  2,           /* 2 images are needed */
  VIDEO_YUYV,  /* input format  */
  VIDEO_YUYV,  /* output format */
  deinterlace_weave_yuv_mmx,  /* The TREATMENT function */
  NULL,        /* No reinitialization function */
  0,           /* the width of the input is equal to the output width*/
  0,           /* idem for the height */
  -1,          /* the destination height is 480 for ntsc, 576 for pal */
  1            /* preferably the last treatment */
};

#endif

