from typing import Dict, List, Any

import torch as torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

import gradio as gr
import subprocess
import numpy as np
import time

import pandas as pd

from datasets import Audio, Dataset


class EndpointHandler():
    """Custom inference handler that transcribes audio with Whisper.

    The handler loads the ``openai/whisper-large`` processor and model once at
    construction time, forces Dutch (``nl``) transcription through the decoder
    prompt ids, and transcribes one audio input per call.
    """

    # Rate (Hz) the audio column is resampled to before feature extraction.
    SAMPLING_RATE = 16_000

    def __init__(self, path=""):
        """Load the processor and model and place the model on a device.

        Args:
            path: Accepted for interface compatibility but not used; the
                checkpoint name is hard-coded to ``openai/whisper-large``.
        """
        # Use the first GPU when one is available, otherwise stay on the CPU.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-large")
        self.model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
        # The selected device was previously computed but never applied.
        self.model.to(self.device)
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(language="nl", task="transcribe")

    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """Transcribe the audio carried by a request payload.

        Args:
            data: Request payload. The audio is taken from ``data["inputs"]``;
                when that key is absent the whole payload is used instead.
                The ``"inputs"`` key is removed from ``data`` as a side effect.
                NOTE(review): the value is presumably a file path or raw
                audio bytes that the ``datasets`` ``Audio`` feature can
                decode -- confirm against the caller.

        Returns:
            The list produced by ``processor.batch_decode`` for the generated
            token ids. ``skip_special_tokens`` is not passed, so the decoder's
            default applies.
        """
        print("request")
        print(data)
        # Read the input with a fallback *before* touching it, so a payload
        # without an "inputs" key no longer fails with KeyError.
        inputs = data.pop("inputs", data)
        print("here comes text")
        print(inputs)

        # Wrap the single input in a one-row dataset so the Audio feature
        # handles decoding and resampling.
        frame = pd.DataFrame([inputs], columns=['audio'])
        ds = Dataset.from_pandas(frame)
        ds = ds.cast_column("audio", Audio(sampling_rate=self.SAMPLING_RATE))
        input_speech = next(iter(ds))["audio"]["array"]

        # Tell the feature extractor the actual rate of the resampled audio.
        input_features = self.processor(
            input_speech,
            sampling_rate=self.SAMPLING_RATE,
            return_tensors="pt",
        ).input_features
        # Features must live on the same device as the model weights.
        input_features = input_features.to(self.device)

        predicted_ids = self.model.generate(
            input_features,
            forced_decoder_ids=self.model.config.forced_decoder_ids,
        )
        transcription = self.processor.batch_decode(predicted_ids)
        print("this is the description")
        print(transcription)
        return transcription