// This file is a part of MPDN Extensions.
// https://github.com/zachsaw/MPDN_Extensions
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3.0 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this library.
// 
// Texture sampled once at the output coordinate in main(); its .x channel is the
// reference value the fit is steered by (presumably full-resolution luma - confirm
// against the host render script).
sampler YSampler : register(s0);
// Texture sampled on the input grid through GetYuv(); main() reads .xyz as the
// sample values and .w as a per-sample variance term.
// NOTE(review): no s1 binding is visible in this file - confirm the slot layout
// expected by the host.
sampler YuvSampler : register(s2);
// Packed shader constants; the macros below give the individual fields names:
//   c0: xy = inputSize,    zw = inputPixSize
//   c1: xy = outputSize,   zw = outputPixSize
//   c2: xy = chromaOffset, z  = power
float4 floatConsts1 : register(c0);
float4 floatConsts2 : register(c1);
float4 floatConsts3 : register(c2);
// ---- Named views onto the packed constant registers ------------------------
#define inputSize (floatConsts1.xy)
#define inputPixSize (floatConsts1.zw)
#define outputSize (floatConsts2.xy)
#define outputPixSize (floatConsts2.zw)
#define chromaOffset (floatConsts3.xy)
#define power (floatConsts3.z)

// ---- Small helpers and tuning constants ------------------------------------
// Squared length of x; for a scalar argument this is simply x * x.
#define sqr(x) dot(x, x)
// Standard deviation added (squared) to every fit-weight denominator in main().
#define noise (0.05)
// Half of 1/255; its square is used as a variance floor in main().
#define bitnoise (1.0 / (2.0 * 255.0))
// Not referenced anywhere in this file.
#define radius 0.5
// Kernel support; with taps = 4 the loop bounds below evaluate to -1 and 2,
// i.e. four sample offsets per axis.
#define taps 4
#define loopStart (1 - ceil(taps / 2.0))
#define loopStop (floor(taps / 2.0))
// Output-to-input pixel size ratio; not referenced anywhere in this file.
#define factor (outputPixSize / inputPixSize)
// Replacement list is parenthesised so it stays a single operand wherever it is pasted.
#define pi (acos(-1))

// ---- Kernels ---------------------------------------------------------------
// Weighting window for the fit. Labelled "Hann kernel" in the original source;
// as written it is a single cosine lobe that reaches zero at |x| = taps / 2.
#define Kernel(x) (cos(pi * (x) / taps))
// Whole expression parenthesised so that e.g. 1 / sinc(x) divides by the full quotient.
#define sinc(x) (sin(pi * (x)) / (x))
// BC-spline (Mitchell-Netravali family) weight for a non-negative distance x.
// Every parameter use is parenthesised so compound arguments (a + b, p / q, ...)
// keep their intended grouping. x is evaluated several times: pass side-effect-free
// expressions only.
#define BCWeights(B, C, x) ((x) > 2.0 ? 0 : (x) <= 1.0 ? ((2 - 1.5 * (B) - (C)) * (x) + (-3 + 2 * (B) + (C))) * (x) * (x) + (1 - (B) / 3.0) : ((((-(B) / 6.0 - (C)) * (x) + ((B) + 5 * (C))) * (x) + (-2 * (B) - 8 * (C))) * (x) + ((4.0 / 3.0) * (B) + 4 * (C)))
// Interpolation kernel used for the plain (non-fitted) estimate: B = C = 1/3.
#define IntKernel(x) (BCWeights(1.0 / 3.0, 1.0 / 3.0, abs(x)))
// #define IntKernel(x) (cos(0.5 * pi * saturate(abs(x))))

// Fetches the input-grid sample at integer offset (x, y) from the base texel.
// NOTE: this macro captures a variable named `pos` from the calling scope; it
// must hold the integer base texel coordinate.
#define GetYuv(x, y) tex2D(YuvSampler, inputPixSize * (pos + int2(x, y) + 0.5))

// Pixel shader entry point.
//
// For the output coordinate `tex` it gathers the input-grid samples at offsets
// loopStart..loopStop on each axis (-1..2 for taps = 4) and builds two
// estimates of the yuv.yz channels:
//   * an interpolated estimate, weighted purely by IntKernel;
//   * a fitted estimate: a linear regression of yuv.yz on yuv.x, with weights
//     that fall off with distance (Kernel) and with the squared difference
//     between the sample's .x and the reference c0.x.
// The two are blended per channel by a strength derived from their estimated
// errors and the `power` constant.
//
// Returns: xy = blended yz estimate, zw = fit-weighted mean of yuv.yz.
float4 main(float2 tex : TEXCOORD0) : COLOR
{
  // Reference sample at the output position; only its .x drives the maths below.
  float4 c0 = tex2D(YSampler, tex);

  // Position on the input grid, split into integer base texel (pos) and
  // fractional part. `pos` must keep this name: GetYuv reads it.
  float2 pos = tex * inputSize - chromaOffset - 0.5;
  float2 subPos = pos - floor(pos);
  pos -= subPos;

  const float noiseVar = sqr(noise);
  const float bitVar = sqr(bitnoise);

  // Weighted accumulators. The .w lanes carry sum(w) in the *Sum1 vectors and
  // sum(w * w) in the *Sum2 vectors.
  float4 fitSum1 = 0;   // sum w * (yuv.xyz, 1)
  float4 fitSum2 = 0;   // sum w * (second moments, w)
  float4 fitCross = 0;  // sum w * (yuv.x * yuv.yz, var, 0)
  float4 intSum1 = 0;
  float4 intSum2 = 0;

  [loop] for (int ix = loopStart; ix <= loopStop; ix++)
  {
    [loop] for (int iy = loopStart; iy <= loopStop; iy++)
    {
      float4 yuv = GetYuv(ix, iy);
      float2 dist = float2(ix, iy) - subPos;

      float diffSq = sqr(yuv.x - c0.x);
      float sampleVar = yuv.w + bitVar;
      // Per-channel second moment: value squared plus that channel's variance term.
      float3 moment2 = float3(sampleVar, bitVar, bitVar) + yuv.xyz * yuv.xyz;

      // Fit weight: separable spatial window, damped where the sample's .x
      // differs from the reference or the sample itself is uncertain.
      float2 k = Kernel(dist);
      float w = k.x * k.y / (diffSq + sampleVar + noiseVar);
      fitSum1 += w * float4(yuv.xyz, 1);
      fitSum2 += w * float4(moment2, w);
      fitCross += w * float4(yuv.x * yuv.yz, sampleVar, 0);

      // Interpolation weight: spatial kernel only.
      k = IntKernel(dist);
      w = k.x * k.y;
      intSum1 += w * float4(yuv.xyz, 1);
      intSum2 += w * float4(moment2, w);
    }
  }

  // Normalise the fit statistics into mean, variance and covariance with .x.
  float fitW = fitSum1.w;
  float fitW2 = fitSum2.w;
  float4 fitAvg = fitSum1 / fitW;
  float3 fitVariance = fitSum2.xyz / fitW - fitAvg.xyz * fitAvg.xyz;
  float2 fitCovar = fitCross.xy / fitW - fitAvg.x * fitAvg.yz;

  // Same for the interpolated estimate.
  float intW = intSum1.w;
  float intW2 = intSum2.w;
  float4 intAvg = intSum1 / intW;
  float3 intVariance = intSum2.xyz / intW - intAvg.xyz * intAvg.xyz;

  // Squared correlation between .x and each of .y / .z, clamped to [0, 1].
  float2 R2 = saturate((fitCovar * fitCovar) / (fitVariance.x * fitVariance.yz));

  // Error estimates for both predictors.
  float fitSpread = fitW2 / sqr(fitW);
  float fitResidual = c0.x - fitAvg.x;
  float intResidual = c0.x - intAvg.x;
  float2 errFit = (1 - R2) * (fitSpread + sqr(fitResidual) / fitVariance.x) / (1 - fitSpread);
  float2 errInt = lerp(intVariance.yz / fitVariance.yz * intW2 / sqr(intW),
                       (sqr(intResidual) + intVariance.x) / fitVariance.x,
                       R2);

  // Blend factor: 0 keeps the interpolated value, 1 takes the regression value.
  float2 strength = saturate(power * errInt / lerp(errFit, errInt, power));

  float2 fitted = fitAvg.yz + (fitResidual * fitCovar / fitVariance.x);
  float2 blended = lerp(intAvg.yz, fitted, strength);
  return float4(blended, fitAvg.yz);
}
