/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
*******************************************************************************
* @file
*  isvce_downscaler.c
*
* @brief
*  Contains downscaler functions required by the SVC encoder
*
* @author
*  ittiam
*
* @par List of Functions:
*  - isvce_get_downscaler_data_size()
*  - isvce_get_downscaler_padding_dims()
*  - isvce_get_downscaler_normalized_filtered_pixel()
*  - isvce_horizontal_downscale_and_transpose()
*  - isvce_process_downscaler()
*  - isvce_initialize_downscaler()
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* system include files */
#include <stdio.h>
#include <stdlib.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "isvc_macros.h"
#include "ih264_platform_macros.h"
#include "iv2.h"
#include "isvc_defs.h"
#include "isvce_defs.h"
#include "isvc_structs.h"
#include "isvc_structs.h"
#include "isvce_downscaler.h"
#include "isvce_downscaler_private_defs.h"

/**
******************************************************************************
* @brief  lanczos filter coefficients for 2x downscaling
* @remarks Though the length of the filter is 8, the
* same coefficients
* are replicated so that 2 rows can be processed at one
* go in SIMD
******************************************************************************
*/
static WORD8 gai1_lanczos_coefficients_2x[NUM_SCALER_FILTER_PHASES][NUM_SCALER_FILTER_TAPS * 2] = {
    {-7, 0, 39, 64, 39, 0, -7, 0, -7, 0, 39, 64, 39, 0, -7, 0},
    {-6, 0, 33, 62, 41, 4, -6, 0, -6, 0, 33, 62, 41, 4, -6, 0},
    {-5, -1, 29, 57, 45, 9, -5, -1, -5, -1, 29, 57, 45, 9, -5, -1},
    {-4, -2, 23, 55, 48, 14, -4, -2, -4, -2, 23, 55, 48, 14, -4, -2},
    {-3, -3, 18, 52, 52, 18, -3, -3, -3, -3, 18, 52, 52, 18, -3, -3},
    {-2, -4, 13, 49, 54, 24, -2, -4, -2, -4, 13, 49, 54, 24, -2, -4},
    {-1, -5, 9, 44, 58, 29, -1, -5, -1, -5, 9, 44, 58, 29, -1, -5},
    {0, -6, 3, 42, 61, 34, 0, -6, 0, -6, 3, 42, 61, 34, 0, -6}};

/**
******************************************************************************
* @brief  lanczos filter coefficients for 1.5x downscaling
* @remarks Though the length of the filter is 8, the same coefficients
* are replicated so that 2 rows can be processed at one go in SIMD.
******************************************************************************
*/
static WORD8 gai1_lanczos_coefficients_3by2x[NUM_SCALER_FILTER_PHASES][NUM_SCALER_FILTER_TAPS * 2] =
    {{0, -11, 32, 86, 32, -11, 0, 0, 0, -11, 32, 86, 32, -11, 0, 0},
     {0, -10, 26, 79, 39, -5, 0, 0, 0, -10, 26, 79, 39, -5, 0, 0},
     {0, -8, 21, 72, 46, 0, -2, 0, 0, -8, 21, 72, 46, 0, -2, 0},
     {0, -6, 15, 66, 52, 3, -3, 0, 0, -6, 15, 66, 52, 3, -3, 0},
     {0, -6, 10, 60, 60, 10, -6, 0, 0, -6, 10, 60, 60, 10, -6, 0},
     {0, -3, 3, 52, 66, 15, -6, 0, 0, -3, 3, 52, 66, 15, -6, 0},
     {0, -2, 0, 46, 72, 21, -8, 0, 0, -2, 0, 46, 72, 21, -8, 0},
     {0, 0, -5, 39, 79, 26, -10, 0, 0, 0, -5, 39, 79, 26, -10, 0}};

/**
*******************************************************************************
*
* @brief
*   gets the memory size required for downscaler
*
* @par Description:
*   returns the memory required by the downscaler context and state structs
*   for allocation.
*
* @returns
*
* @remarks
*
*
*******************************************************************************
*/

UWORD32 isvce_get_downscaler_data_size(UWORD8 u1_num_spatial_layers, DOUBLE d_scaling_factor,
                                       UWORD32 u4_width, UWORD32 u4_height)
{
    UWORD32 u4_size = 0;

    if(u1_num_spatial_layers > 1)
    {
        u4_size += sizeof(downscaler_state_t);

        u4_size +=
            (u4_height + NUM_SCALER_FILTER_TAPS * 2) * ((UWORD32) (u4_width / d_scaling_factor));
    }

    return u4_size;
}

/**
*******************************************************************************
*
* @brief
*   gets the padding size required for filtering
*
* @par Description:
*   gets the padding size required for filtering
*
* @returns
*
* @remarks
*
*
*******************************************************************************
*/

void isvce_get_downscaler_padding_dims(padding_dims_t *ps_pad_dims)
{
    ps_pad_dims->u1_left_pad_size = ALIGN8(NUM_SCALER_FILTER_TAPS / 2);
    ps_pad_dims->u1_right_pad_size = ALIGN8(NUM_SCALER_FILTER_TAPS / 2);
    ps_pad_dims->u1_top_pad_size = NUM_SCALER_FILTER_TAPS / 2;
    ps_pad_dims->u1_bottom_pad_size = NUM_SCALER_FILTER_TAPS / 2;
}

/**
*******************************************************************************
*
* @brief
*   processes downscaler
*
* @par Description:
*   calls the function for padding and scaling
*
* @param[in] ps_scaler
*  pointer to downdownscaler context
*
* @param[in] ps_src_buf_props
*  pointer to source buffer props struct
*
* @param[in] u4_blk_wd
*  width of the block to be processed
*
* @param[in] u4_blk_ht
*  height of the block to be processed
*
* @returns
*
* @remarks
*
*
*******************************************************************************
*/

void isvce_process_downscaler(downscaler_ctxt_t *ps_scaler, yuv_buf_props_t *ps_src_buf_props,
                              yuv_buf_props_t *ps_dst_buf_props, UWORD32 u4_blk_wd,
                              UWORD32 u4_blk_ht)
{
    buffer_container_t s_src_buf;
    buffer_container_t s_dst_buf;

    UWORD32 u4_scaled_block_size_x, u4_scaled_block_size_y;

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    ASSERT(ps_src_buf_props->e_color_format == IV_YUV_420SP_UV);

    u4_scaled_block_size_x = (UWORD32) (u4_blk_wd / ps_scaler->d_scaling_factor);
    u4_scaled_block_size_y = (UWORD32) (u4_blk_ht / ps_scaler->d_scaling_factor);

    /* luma */
    s_src_buf = ps_src_buf_props->as_component_bufs[Y];
    s_src_buf.pv_data = ((UWORD8 *) s_src_buf.pv_data) - (NUM_SCALER_FILTER_TAPS / 2) -
                        (NUM_SCALER_FILTER_TAPS / 2) * s_src_buf.i4_data_stride;

    s_dst_buf.pv_data = ps_scaler_state->pv_scratch_buf;
    s_dst_buf.i4_data_stride = u4_blk_ht + NUM_SCALER_FILTER_TAPS;

    ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
                                   u4_scaled_block_size_x, u4_blk_ht + NUM_SCALER_FILTER_TAPS, 0);

    s_src_buf = s_dst_buf;
    s_dst_buf = ps_dst_buf_props->as_component_bufs[Y];

    ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
                                   u4_scaled_block_size_y, u4_scaled_block_size_x, 0);

    /* chroma */
    u4_blk_ht /= 2;
    u4_scaled_block_size_y /= 2;

    s_src_buf = ps_src_buf_props->as_component_bufs[U];
    s_src_buf.pv_data = ((UWORD8 *) s_src_buf.pv_data) - NUM_SCALER_FILTER_TAPS -
                        (NUM_SCALER_FILTER_TAPS / 2) * s_src_buf.i4_data_stride;

    s_dst_buf.pv_data = ps_scaler_state->pv_scratch_buf;
    s_dst_buf.i4_data_stride = u4_blk_ht + NUM_SCALER_FILTER_TAPS;

    ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
                                   u4_scaled_block_size_x, u4_blk_ht + NUM_SCALER_FILTER_TAPS, 1);

    s_src_buf = s_dst_buf;
    s_dst_buf = ps_dst_buf_props->as_component_bufs[U];

    ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
                                   u4_scaled_block_size_y, u4_scaled_block_size_x, 0);
}

/**
*******************************************************************************
*
* @brief
*   normalized dot product computer for downscaler
*
* @par Description:
*   Given the downscaler filter coefficients, source buffer, the function
*   calculates the dot product between them, adds an offset and normalizes it
*
* @param[in] ps_scaler
*  pointer to src buf
*
* @param[in] pi1_filter
*  pointer to filter coefficients
*
* @returns
*
* @remarks
*
*******************************************************************************
*/

static UWORD8 isvce_get_downscaler_normalized_filtered_pixel(UWORD8 *pu1_src, WORD8 *pi1_filter)
{
    WORD32 i;
    WORD32 i4_norm_dot_product;
    UWORD8 u1_out_pixel;
    WORD32 i4_dot_product_sum = 0;
    WORD32 i4_rounding_offset = 1 << (FILTER_COEFF_Q - 1);
    WORD32 i4_normalizing_factor = 1 << FILTER_COEFF_Q;

    for(i = 0; i < NUM_SCALER_FILTER_TAPS; i++)
    {
        i4_dot_product_sum += (pu1_src[i] * pi1_filter[i]);
    }

    i4_norm_dot_product = ((i4_dot_product_sum + i4_rounding_offset) / i4_normalizing_factor);
    u1_out_pixel = (UWORD8) CLIP_U8(i4_norm_dot_product);

    return u1_out_pixel;
}

/**
*******************************************************************************
*
* @brief
*   horizontal scaler function
*
* @par Description:
*   Does horizontal scaling for the given block
*
* @param[in] ps_scaler
*  pointer to downscaler context
*
* @param[in] ps_src
*  pointer to source buffer container
*
* @param[in] ps_dst
*  pointer to destination buffer container
*
* @param[in] pai1_filters
*  pointer to array of downscaler filters
*
* @param[in] u4_blk_wd
*  width of the block after horizontal scaling (output block width)
*
* @param[in] u4_blk_ht
*  height of the current block (input block height)
*
* @param[in] u1_is_chroma
*  flag suggesting whether the buffer is luma or chroma
*
*
* @returns
*
* @remarks
*  The same function is used for vertical scaling too as
*  the horizontally scaled input in stored in transpose fashion.
*
*******************************************************************************
*/

static void isvce_horizontal_downscale_and_transpose(
    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
{
    WORD32 i, j, k;
    UWORD8 u1_phase;
    UWORD8 u1_filtered_out_pixel;
    UWORD8 *pu1_src_j, *pu1_dst_j;
    UWORD8 u1_filtered_out_u_pixel, u1_filtered_out_v_pixel;
    UWORD8 *pu1_in_pixel;
    UWORD8 *pu1_out_pixel;
    WORD8 *pi1_filter_grid;
    UWORD16 u2_full_pixel_inc;
    UWORD8 au1_temp_u_buff[NUM_SCALER_FILTER_TAPS];
    UWORD8 au1_temp_v_buff[NUM_SCALER_FILTER_TAPS];

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD32 u4_in_stride = ps_src->i4_data_stride;
    UWORD8 *pu1_dst = ps_dst->pv_data;
    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;

    /* Offset the input so that the input pixel to be processed
    co-incides with the centre of filter (4th coefficient)*/
    pu1_src += (1 + u1_is_chroma);

    ASSERT((1 << DOWNSCALER_Q) == ps_scaler_state->u4_vert_increment);

    if(!u1_is_chroma)
    {
        for(j = 0; j < (WORD32) u4_blk_ht; j++)
        {
            pu1_src_j = pu1_src + (j * u4_in_stride);
            pu1_dst_j = pu1_dst + j;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /* Doing the Calculation for current Loop Count  */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                u1_filtered_out_pixel =
                    isvce_get_downscaler_normalized_filtered_pixel(pu1_in_pixel, pi1_filter_grid);
                *pu1_out_pixel = u1_filtered_out_pixel;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }
    }
    else
    {
        for(j = 0; j < (WORD32) u4_blk_ht; j++)
        {
            pu1_src_j = pu1_src + (j * u4_in_stride);
            pu1_dst_j = pu1_dst + j;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /*Doing the Calculation for current Loop Count  */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                for(k = 0; k < NUM_SCALER_FILTER_TAPS; k++)
                {
                    au1_temp_u_buff[k] = *(pu1_in_pixel + (2 * k));
                    au1_temp_v_buff[k] = *(pu1_in_pixel + ((2 * k) + 1));
                }

                u1_filtered_out_u_pixel = isvce_get_downscaler_normalized_filtered_pixel(
                    au1_temp_u_buff, pi1_filter_grid);
                u1_filtered_out_v_pixel = isvce_get_downscaler_normalized_filtered_pixel(
                    au1_temp_v_buff, pi1_filter_grid);
                *pu1_out_pixel = u1_filtered_out_u_pixel;
                *(pu1_out_pixel + u4_out_stride) = u1_filtered_out_v_pixel;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }
    }
}

void isvce_downscaler_function_selector(downscaler_state_t *ps_scaler_state, IV_ARCH_T e_arch)
{
    switch(e_arch)
    {
#if defined(X86)
        case ARCH_X86_SSE42:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_sse42;

            break;
        }
#elif defined(ARMV8)
        case ARCH_ARM_A53:
        case ARCH_ARM_A57:
        case ARCH_ARM_V8_NEON:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_neon;

            break;
        }
#elif defined(ARM) && !defined(DISABLE_NEON)
        case ARCH_ARM_A9Q:
        case ARCH_ARM_A9A:
        case ARCH_ARM_A9:
        case ARCH_ARM_A7:
        case ARCH_ARM_A5:
        case ARCH_ARM_A15:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_neon;

            break;
        }
#endif
        default:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose;

            break;
        }
    }
}

/**
*******************************************************************************
*
* @brief
*   initializes the downscaler context
*
* @par Description:
*   initializes the downscaler context for the given scaling factor
*   with padding size, filter size, etc.
*
* @param[in] ps_scaler
*   pointer downscaler context
*
* @param[in] ps_mem_rec
*   pointer to memory allocated to downscaler process
*
* @param[in] d_scaling_factor
*   scaling reatio of width/ height between two consecutive SVC layers
*
* @param[in] u1_num_spatial_layers
*   scaling reatio of width/ height between two consecutive SVC layers
*
* @param[in] u4_wd
*   width of the input
*
* @param[in] u4_ht
*   height of the input
*
* @param[in] e_arch
*   architecure type
*
* @returns
*
* @remarks
*  when ARM intrinsics are added, update should be done here
*
*******************************************************************************
*/

void isvce_initialize_downscaler(downscaler_ctxt_t *ps_scaler, iv_mem_rec_t *ps_mem_rec,
                                 DOUBLE d_scaling_factor, UWORD8 u1_num_spatial_layers,
                                 UWORD32 u4_in_width, UWORD32 u4_in_height, IV_ARCH_T e_arch)
{
    if(u1_num_spatial_layers > 1)
    {
        downscaler_state_t *ps_scaler_state;

        UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base;

        ps_scaler_state = (downscaler_state_t *) pu1_buf;
        pu1_buf += sizeof(ps_scaler_state[0]);

        ps_scaler_state->pv_scratch_buf = pu1_buf;
        ps_scaler_state->u4_in_wd = u4_in_width;
        ps_scaler_state->u4_in_ht = u4_in_height;

        ps_scaler->pv_scaler_state = ps_scaler_state;
        ps_scaler->d_scaling_factor = d_scaling_factor;
        ps_scaler->u1_num_spatial_layers = u1_num_spatial_layers;

        isvce_downscaler_function_selector(ps_scaler_state, e_arch);

        ps_scaler_state->u4_horz_increment = (UWORD32) (d_scaling_factor * (1 << DOWNSCALER_Q));

        ps_scaler_state->u4_vert_increment = (1 << DOWNSCALER_Q);
        ps_scaler_state->i4_init_offset = 0;
        ps_scaler_state->pai1_filters = (d_scaling_factor == 2.0) ? gai1_lanczos_coefficients_2x
                                                                  : gai1_lanczos_coefficients_3by2x;
    }
}