import yaml
import random
import inspect
import numpy as np
import typing as tp
from abc import ABC

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from tools.torch_tools import wav_to_fbank

from diffusers.utils.torch_utils import randn_tensor
from transformers import HubertModel
from libs.rvq.descript_quantize3 import ResidualVectorQuantize

from models_gpt.models.gpt2_rope2_time_new_correct_mask_noncasual_reflow import GPT2Model
from models_gpt.models.gpt2_config import GPT2Config
from our_MERT_BESTRQ.mert_fairseq.models.musicfm.musicfm_model import MusicFMModel, MusicFMConfig

from torch.cuda.amp import autocast


class HubertModelWithFinalProj(HubertModel):
    """HuBERT model that additionally owns a ``final_proj`` linear layer.

    The extra layer maps ``config.hidden_size`` to
    ``config.classifier_proj_size``.  It is not applied anywhere in this
    class; per https://github.com/auspicious3000/contentvec/issues/6 it is
    kept only for backward compatibility.
    """

    def __init__(self, config):
        super().__init__(config)

        in_features = config.hidden_size
        out_features = config.classifier_proj_size
        print("hidden_size:",in_features)
        print("classifier_proj_size:",out_features)
        # Compatibility-only projection head (see class docstring).
        self.final_proj = nn.Linear(in_features, out_features)


class SampleProcessor(torch.nn.Module):
    """Identity mapping between sample space and diffusion space.

    Subclasses override both methods to apply an invertible transform.
    """

    def project_sample(self, x: torch.Tensor):
        """Project the original sample to the 'space' where the diffusion will happen."""
        return x

    def return_sample(self, z: torch.Tensor):
        """Project back from diffusion space to the actual sample space."""
        return z

class Feature1DProcessor(SampleProcessor):
    """Per-channel normaliser driven by running statistics.

    Inputs must be 3-D; dim 0 is the batch, dim 1 the channel axis (it is
    matched against ``dim``) and dim 2 the frame axis.  While fewer than
    ``num_samples`` items have been seen, every ``project_sample`` call
    also accumulates first and second moments from the first
    ``cal_num_frames`` frames of its input.

    Args:
        dim: number of channels tracked by the statistics buffers.
        power_std: exponent applied to the std ratio when rescaling.
        num_samples: stop accumulating once ``counts`` reaches this value.
        cal_num_frames: number of leading frames used for the statistics.
    """

    def __init__(self, dim: int = 100, power_std = 1., \
                 num_samples: int = 100_000, cal_num_frames: int = 600):
        super().__init__()

        self.dim = dim
        self.power_std = power_std
        self.num_samples = num_samples
        self.cal_num_frames = cal_num_frames

        # Registration order is unchanged so state_dict ordering is stable.
        # 'sum_target_x2' is never read in this class but is still
        # registered, so it remains part of the module's state_dict.
        self.register_buffer('counts', torch.zeros(1))
        for stat_name in ('sum_x', 'sum_x2', 'sum_target_x2'):
            self.register_buffer(stat_name, torch.zeros(dim))
        self.counts: torch.Tensor
        self.sum_x: torch.Tensor
        self.sum_x2: torch.Tensor

    @property
    def mean(self):
        """Running per-channel mean; zeros until at least 10 items were seen."""
        if self.counts < 10:
            return torch.zeros_like(self.sum_x)
        return self.sum_x / self.counts

    @property
    def std(self):
        """Running per-channel std; ones until at least 10 items were seen."""
        if self.counts < 10:
            return torch.ones_like(self.sum_x2)
        variance = self.sum_x2 / self.counts - self.mean ** 2
        # Clamp guards against tiny negative variances from rounding.
        return variance.clamp(min=0).sqrt()

    @property
    def target_std(self):
        """Standard deviation the normalised features are scaled to."""
        return 1

    def project_sample(self, x: torch.Tensor):
        """Normalise ``x`` and, while still calibrating, update the statistics."""
        assert x.dim() == 3
        if self.counts.item() < self.num_samples:
            head = x[:, :, 0:self.cal_num_frames]
            # In-place updates keep the registered buffers themselves.
            self.counts += len(x)
            self.sum_x += head.mean(dim=(2,)).sum(dim=0)
            self.sum_x2 += head.pow(2).mean(dim=(2,)).sum(dim=0)
        gain = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std
        centred = x - self.mean.view(1, -1, 1)
        return centred * gain.view(1, -1, 1)

    def return_sample(self, x: torch.Tensor):
        """Invert ``project_sample`` using the current statistics."""
        assert x.dim() == 3
        gain = (self.std / self.target_std) ** self.power_std
        return x * gain.view(1, -1, 1) + self.mean.view(1, -1, 1)

def pad_or_tunc_tolen(prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds, len_size=77):
    """Zero-pad or truncate text features to exactly ``len_size`` positions.

    Dim 1 of both tensors is the axis that is padded/truncated.  Each tensor
    is sized from its own shape, dtype and device, so a mask whose length
    differs from the hidden states still yields ``len_size`` positions for
    both outputs.

    Args:
        prior_text_encoder_hidden_states: 3-D tensor, length on dim 1.
        prior_text_mask: 2-D tensor, length on dim 1.
        prior_prompt_embeds: passed through untouched.
        len_size: target length.

    Returns:
        ``(hidden_states, mask, prior_prompt_embeds)`` where the hidden
        states have been permuted to ``(dim0, dim2, len_size)`` and made
        contiguous, and the mask is ``(dim0, len_size)``.
    """

    def _fit_length(tensor):
        # Pad with zeros on dim 1 when short, otherwise cut to len_size.
        current = tensor.shape[1]
        if current >= len_size:
            return tensor[:, 0:len_size]
        pad_shape = list(tensor.shape)
        pad_shape[1] = len_size - current
        padding = torch.zeros(pad_shape, device=tensor.device, dtype=tensor.dtype)
        return torch.cat([tensor, padding], 1)

    prior_text_encoder_hidden_states = _fit_length(prior_text_encoder_hidden_states)
    prior_text_mask = _fit_length(prior_text_mask)
    prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.permute(0, 2, 1).contiguous()
    return prior_text_encoder_hidden_states, prior_text_mask, prior_prompt_embeds

class BASECFM(torch.nn.Module, ABC):
    """Conditional flow-matching wrapper around a sequence estimator.

    Tensors handled here are 3-D: frames are sliced on dim 1 and the
    conditioning streams are concatenated with the noisy latent on dim 2;
    the estimator output is cut back to the latent width by keeping its
    last features on dim 2.

    Args:
        estimator: callable taking ``inputs_embeds``, ``attention_mask`` and
            ``time_step`` (and ``output_hidden_states`` during training) and
            returning an object with ``last_hidden_state`` /
            ``hidden_states``.
        mlp: module applied to the hidden state selected by ``ssl_layer``.
        ssl_layer: index into the estimator's ``hidden_states``.
    """

    def __init__(
        self,
        estimator,
        mlp,
        ssl_layer
    ):
        super().__init__()
        self.sigma_min = 1e-4

        self.estimator = estimator
        self.mlp = mlp
        self.ssl_layer = ssl_layer

    @torch.inference_mode()
    def forward(self, mu, n_timesteps, temperature=1.0):
        """Draw noise shaped like ``mu`` and hand it to ``solve_euler``.

        NOTE(review): this passes only ``z`` and ``t_span`` to
        ``solve_euler``, which has six further required parameters, so
        calling ``forward`` raises ``TypeError``.  The intended wiring is not
        visible in this file; call ``solve_euler`` directly.

        Args:
            mu (torch.Tensor): conditioning tensor; only its shape and device
                are used here.
            n_timesteps (int): number of Euler steps.
            temperature (float, optional): scale applied to the initial
                noise. Defaults to 1.0.
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        return self.solve_euler(z, t_span=t_span)

    def solve_euler(self, x, latent_mask_input,incontext_x, incontext_length, t_span, mu,attention_mask, guidance_scale):
        """Integrate the learned velocity field with fixed-step Euler.

        Args:
            x (torch.Tensor): initial noise; left unmodified.
            latent_mask_input (torch.Tensor): mask embedding stream.
            incontext_x (torch.Tensor): known latents used for the first
                ``incontext_length`` frames.
            incontext_length: number of leading frames (dim 1) that are
                re-imposed before every step.
            t_span (torch.Tensor): time grid, shape ``(n_timesteps + 1,)``.
            mu (torch.Tensor): conditioning stream; replaced by zeros for the
                unconditional half when guidance is active.
            attention_mask: forwarded to the estimator unchanged.
                NOTE(review): it is not duplicated when the batch is doubled
                for guidance -- confirm the estimator broadcasts it.
            guidance_scale: classifier-free guidance is used when ``> 1.0``.

        Returns:
            torch.Tensor: the state after the last step.
        """
        t, dt = t_span[0], t_span[1] - t_span[0]
        # `noise` stays pristine; all in-place writes go to a private copy so
        # the caller's tensor is never modified.
        noise = x
        x = x.clone()

        use_guidance = guidance_scale > 1.0
        # The conditioning streams do not change between steps: build once.
        if use_guidance:
            cond_prefix = torch.cat([
                torch.cat([latent_mask_input, latent_mask_input], 0),
                torch.cat([incontext_x, incontext_x], 0),
                torch.cat([torch.zeros_like(mu), mu], 0),
            ], 2)
        else:
            cond_prefix = torch.cat([latent_mask_input, incontext_x, mu], 2)

        last_step = len(t_span) - 1
        for step in range(1, len(t_span)):
            # Pin the in-context frames to the flow path between their noise
            # and the known latents at the current time.
            x[:, 0:incontext_length, :] = (
                (1 - (1 - self.sigma_min) * t) * noise[:, 0:incontext_length, :]
                + t * incontext_x[:, 0:incontext_length, :]
            )
            if use_guidance:
                # Batch layout is [unconditional, conditional].
                model_input = torch.cat([cond_prefix, torch.cat([x, x], 0)], 2)
                timestep = t.unsqueeze(-1).repeat(2)
                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
                dphi_dt_uncond, dphi_dt_cond = dphi_dt.chunk(2, 0)
                dphi_dt = dphi_dt_uncond + guidance_scale * (dphi_dt_cond - dphi_dt_uncond)
            else:
                model_input = torch.cat([cond_prefix, x], 2)
                timestep = t.unsqueeze(-1)
                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state

            # Only the trailing features correspond to the latent stream.
            dphi_dt = dphi_dt[:, :, -x.shape[2]:]

            x = x + dt * dphi_dt
            t = t + dt
            if step < last_step:
                dt = t_span[step + 1] - t

        return x

    def projection_loss(self,hidden_proj, bestrq_emb):
        """Return ``1 - mean cosine similarity`` over the last dimension."""
        hidden_proj_normalized = F.normalize(hidden_proj, dim=-1)
        bestrq_emb_normalized = F.normalize(bestrq_emb, dim=-1)

        cosine = (hidden_proj_normalized * bestrq_emb_normalized).sum(dim=-1)
        return 1 - cosine.mean()

    def compute_loss(self, x1, mu,  latent_masks,attention_mask,wav2vec_embeds, validation_mode=False):
        """Compute the flow-matching loss plus the feature-alignment loss.

        Args:
            x1 (torch.Tensor): target latents.
            mu: sequence of conditioning tensors, concatenated with the noisy
                latent on dim 2; ``mu[0]`` supplies batch size, device and
                dtype for the sampled time.
            latent_masks (torch.Tensor): per-frame labels.  Frames ``> 1.5``
                get weight 1, frames ``< 0.5`` get weight 0.01, all others
                weight 0 in the reconstruction loss.
            attention_mask: forwarded to the estimator unchanged.
            wav2vec_embeds (torch.Tensor): target for the projected hidden
                state selected by ``ssl_layer``.
            validation_mode (bool): use the fixed time 0.5 instead of a
                uniform random time.

        Returns:
            tuple: ``(loss, loss_re, loss_cos)`` with
            ``loss = loss_re + 0.5 * loss_cos``.
        """
        b = mu[0].shape[0]
        len_x = x1.shape[2]
        if validation_mode:
            t = torch.ones([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype) * 0.5
        else:
            t = torch.rand([b, 1, 1], device=mu[0].device, dtype=mu[0].dtype)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        # Point on the path at time t and the velocity the model regresses.
        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        model_input = torch.cat([*mu, y], 2)
        t = t.squeeze(-1).squeeze(-1)
        out = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=t,output_hidden_states=True)
        hidden_proj = self.mlp(out.hidden_states[self.ssl_layer])

        out = out.last_hidden_state[:, :, -len_x:]

        frame_weight = (latent_masks > 1.5).float() + (latent_masks < 0.5).float() * 0.01
        weight = frame_weight.unsqueeze(-1).expand(-1, -1, out.shape[-1])
        # A batch whose frames are all in-context has zero total weight; the
        # clamp turns the resulting 0/0 into a zero loss instead of NaN.
        loss_re = F.mse_loss(out * weight, u * weight, reduction="sum") / weight.sum().clamp(min=1e-8)
        loss_cos = self.projection_loss(hidden_proj, wav2vec_embeds)
        loss = loss_re + loss_cos * 0.5
        return loss, loss_re, loss_cos

class PromptCondAudioDiffusion(nn.Module):
    def __init__(
        self,
        num_channels,
        unet_model_name=None,
        unet_model_config_path=None,
        snr_gamma=None,
        hubert_layer=None,
        ssl_layer=None,
        uncondition=True,
    ):
        """Build the feature extractors, quantiser and flow-matching estimator.

        Args:
            num_channels: stored on the instance; read by ``inference_codes``.
            unet_model_name: stored only; at least one of this and
                ``unet_model_config_path`` must be given.
            unet_model_config_path: stored only (the estimator below is built
                from a hard-coded ``GPT2Config``, not from this path).
            snr_gamma: stored only.
            hubert_layer: index of the HuBERT hidden state used by
                ``extract_wav2vec_embeds``.
            ssl_layer: index of the estimator hidden state handed to
                ``BASECFM`` for the alignment loss.
            uncondition: when true, ``forward`` randomly zeroes the
                conditioning of some batch items.

        NOTE(review): ``self.hubert`` is not created here (its construction
        is commented out) although ``extract_wav2vec_embeds`` reads it;
        likewise ``clap_embd_extractor``, ``xvecmodel``, ``spk_linear``,
        ``energy_embedding``, ``wav2vec`` and ``noise_scheduler`` are read by
        other methods but never assigned in this class.  Presumably they are
        attached externally or those code paths are unused -- confirm.
        """
        super().__init__()

        assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"

        self.unet_model_name = unet_model_name
        self.unet_model_config_path = unet_model_config_path
        self.snr_gamma = snr_gamma
        self.uncondition = uncondition
        self.num_channels = num_channels
        self.hubert_layer = hubert_layer
        self.ssl_layer = ssl_layer

        # Running-statistics normaliser for the 64-channel latents.
        self.normfeat = Feature1DProcessor(dim=64)

        self.sample_rate = 48000
        # Samples per 20 ms segment at 48 kHz (used by extract_energy_bar).
        self.num_samples_perseg = self.sample_rate * 20 // 1000
        # Resamplers from 48 kHz to the rate each extractor is fed with.
        self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
        self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
        self.bestrq = MusicFMModel(MusicFMConfig())
        self.rsq48tobestrq = torchaudio.transforms.Resample(48000, 24000)
        self.rsq48tohubert = torchaudio.transforms.Resample(48000, 16000)
        # Single-codebook quantiser over 1024-d features; its parameters are
        # frozen right below.
        self.rvq_bestrq_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
        for v in self.rvq_bestrq_emb.parameters():v.requires_grad = False
        # Learned 1024-d vector substituted for the conditioning on frames
        # whose mask value is below 0.5.
        self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
        config = GPT2Config(n_positions=1000,n_layer=39,n_head=30,n_embd=1200)
        unet = GPT2Model(config)
        # Projects an estimator hidden state (width 1200) to 768 features for
        # the cosine alignment loss in BASECFM.compute_loss.
        mlp =  nn.Sequential(
            nn.Linear(1200, 1024), 
            nn.SiLU(),                  
            nn.Linear(1024, 1024),      
            nn.SiLU(),                 
            nn.Linear(1024, 768)  
        )
        self.set_from = "random"
        self.cfm_wrapper = BASECFM(unet, mlp,self.ssl_layer)
        # One 48-d embedding per mask value (0, 1, 2).
        self.mask_emb = torch.nn.Embedding(3, 48)
        # NOTE(review): no checkpoint is loaded in this method despite the
        # message below.
        print("Transformer initialized from pretrain.")
        torch.cuda.empty_cache()

    def compute_snr(self, timesteps):
        """
        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
        """
        alphas_cumprod = self.noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = alphas_cumprod**0.5
        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5

        # Expand the tensors.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)

        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)

        # Compute SNR.
        snr = (alpha / sigma) ** 2
        return snr

    def preprocess_audio(self, input_audios, threshold=0.9):
        assert len(input_audios.shape) == 2, input_audios.shape
        norm_value = torch.ones_like(input_audios[:,0])
        max_volume = input_audios.abs().max(dim=-1)[0]
        norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
        return input_audios/norm_value.unsqueeze(-1)

    def extract_wav2vec_embeds(self, input_audios,output_len):
        wav2vec_stride = 2

        wav2vec_embeds = self.hubert(self.rsq48tohubert(input_audios), output_hidden_states=True).hidden_states # 1, 4096, 1024
        # print(wav2vec_embeds)
        # print("audio.shape:",input_audios.shape)
        wav2vec_embeds_last=wav2vec_embeds[self.hubert_layer]
        # print("wav2vec_embeds_last.shape:",wav2vec_embeds_last.shape)
        wav2vec_embeds_last=torch.nn.functional.interpolate(wav2vec_embeds_last.permute(0, 2, 1), size=output_len, mode='linear', align_corners=False).permute(0, 2, 1)
        return wav2vec_embeds_last

    def extract_mert_embeds(self, input_audios):
        prompt_stride = 3
        inputs = self.clap_embd_extractor.mulan.audio.processor(self.rsp48toclap(input_audios), sampling_rate=self.clap_embd_extractor.mulan.audio.sr, return_tensors="pt")
        input_values = inputs['input_values'].squeeze(0).to(input_audios.device, dtype = input_audios.dtype)
        prompt_embeds = self.clap_embd_extractor.mulan.audio.model(input_values, output_hidden_states=True).hidden_states # batch_size, Time steps, 1024 
        mert_emb= prompt_embeds[-1]
        mert_emb = torch.nn.functional.interpolate(mert_emb.permute(0, 2, 1), size=500, mode='linear', align_corners=False).permute(0, 2, 1)

        return mert_emb
    
    def extract_bestrq_embeds(self, input_audio_0,input_audio_1,layer):
        self.bestrq.eval()
        # print("audio shape:",input_audio_0.shape)
        input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
        # print("input_wav_mean.shape:",input_wav_mean.shape)
        # input_wav_mean = torch.randn(2,1720320*2).to(input_audio_0.device)
        input_wav_mean = self.bestrq(self.rsq48tobestrq(input_wav_mean), features_only = True)
        layer_results = input_wav_mean['layer_results']
        # print("layer_results.shape:",layer_results[layer].shape)
        bestrq_emb = layer_results[layer]
        bestrq_emb = bestrq_emb.permute(0,2,1).contiguous()
        #[b,t,1024] t=t/960
        #35.84s->batch,896,1024
        return bestrq_emb


    def extract_spk_embeds(self, input_audios):
        spk_embeds = self.xvecmodel(self.rsq48towav2vec(input_audios))
        spk_embeds = self.spk_linear(spk_embeds).reshape(spk_embeds.shape[0], 16, 1, 32)
        return spk_embeds

    def extract_lyric_feats(self, lyric):
        with torch.no_grad():
            try:
                text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = lyric, return_one=False)
            except:
                text_encoder_hidden_states, text_mask, text_prompt_embeds = self.clap_embd_extractor(texts = [""] * len(lyric), return_one=False)
            text_encoder_hidden_states = text_encoder_hidden_states.to(self.device)
            text_mask = text_mask.to(self.device)
            text_encoder_hidden_states, text_mask, text_prompt_embeds = \
                pad_or_tunc_tolen(text_encoder_hidden_states, text_mask, text_prompt_embeds)
            text_encoder_hidden_states = text_encoder_hidden_states.permute(0,2,1).contiguous()
            return text_encoder_hidden_states, text_mask

    def extract_energy_bar(self, input_audios):
        if(input_audios.shape[-1] % self.num_samples_perseg > 0):
            energy_bar = input_audios[:,:-1 * (input_audios.shape[-1] % self.num_samples_perseg)].reshape(input_audios.shape[0],-1,self.num_samples_perseg)
        else:
            energy_bar = input_audios.reshape(input_audios.shape[0],-1,self.num_samples_perseg)
        energy_bar = (energy_bar.pow(2.0).mean(-1).sqrt() + 1e-6).log10() * 20 # B T
        energy_bar = (energy_bar / 2.0 + 16).clamp(0,16).int()
        energy_embedding = self.energy_embedding(energy_bar)
        energy_embedding = energy_embedding.view(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 2, 32).reshape(energy_embedding.shape[0], energy_embedding.shape[1] // 2, 64).permute(0,2,1) # b 128 t
        return energy_embedding

    def forward(self, input_audios, lyric, latents, latent_masks, validation_mode=False, \
        additional_feats = ['spk', 'lyric'], \
        train_rvq=True, train_ssl=False,layer=5):
        if not hasattr(self,"device"):
            self.device = input_audios.device
        if not hasattr(self,"dtype"):
            self.dtype = input_audios.dtype
        device = self.device
        input_audio_0 = input_audios[:,0,:]
        input_audio_1 = input_audios[:,1,:]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)
        input_audios_wav2vec = (input_audio_0 + input_audio_1) / 2.0
        # energy_embedding = self.extract_energy_bar(input_audios)
        # print("energy_embedding.shape:",energy_embedding.shape)
        # with autocast(enabled=False):
        if(train_ssl):
            self.wav2vec.train()
            wav2vec_embeds = self.extract_wav2vec_embeds(input_audios)
            self.clap_embd_extractor.train()
            prompt_embeds = self.extract_mert_embeds(input_audios)
            if('spk' in additional_feats):
                self.xvecmodel.train()
                spk_embeds = self.extract_spk_embeds(input_audios).repeat(1,1,prompt_embeds.shape[-1]//2,1)
        else:
            with torch.no_grad():
                with autocast(enabled=False):
                    bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
                    # mert_emb = self.extract_mert_embeds(input_audios_mert)
                    
                    wav2vec_embeds = self.extract_wav2vec_embeds(input_audios_wav2vec,bestrq_emb.shape[2])

                bestrq_emb = bestrq_emb.detach()
        if('lyric' in additional_feats):
            text_encoder_hidden_states, text_mask = self.extract_lyric_feats(lyric)
        else:
            text_encoder_hidden_states, text_mask = None, None

        # prompt_embeds_13 = torch.cat([mert_emb_13, energy_embedding], 1)
        # print("prompt_embes.shape:",prompt_embeds.shape)
        #prompt_embes.shape:  torch.Size([3, 1088, 896])
        # print("wav2vec_embeds.shape:",wav2vec_embeds.shape)
        #wav2vec_embeds.shape:torch.Size([3, 1024, 896])
        if(train_rvq):
            quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
        else:
            bestrq_emb = bestrq_emb.float()
            self.rvq_bestrq_emb.eval()
            # with autocast(enabled=False):
            quantized_bestrq_emb, _, _, commitment_loss_bestrq_emb, codebook_loss_bestrq_emb,_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t
            commitment_loss_bestrq_emb = commitment_loss_bestrq_emb.detach()
            codebook_loss_bestrq_emb = codebook_loss_bestrq_emb.detach()
            quantized_bestrq_emb = quantized_bestrq_emb.detach()

        commitment_loss = commitment_loss_bestrq_emb
        codebook_loss = codebook_loss_bestrq_emb


        alpha=1
        quantized_bestrq_emb = quantized_bestrq_emb * alpha + bestrq_emb * (1-alpha)

        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
        # print("latent_masks.shape:",latent_masks.shape)
        # quantized_bestrq_emb = torch.nn.functional.interpolate(quantized_bestrq_emb, size=(int(quantized_bestrq_emb.shape[-1]/999*937),), mode='linear', align_corners=True)

        
        scenario = np.random.choice(['start_seg', 'other_seg'])
        if(scenario == 'other_seg'):
            for binx in range(input_audios.shape[0]):
                # latent_masks[binx,0:64] = 1
                latent_masks[binx,0:random.randint(64,128)] = 1
        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
        # print("quantized_bestrq_emb1.shape:",quantized_bestrq_emb.shape)
        # print("latent_masks.shape:",latent_masks.shape)
        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)


        if self.uncondition:
            mask_indices = [k for k in range(quantized_bestrq_emb.shape[0]) if random.random() < 0.1]
            if len(mask_indices) > 0:
                quantized_bestrq_emb[mask_indices] = 0
        # print("latents.shape:",latents.shape)
        latents = latents.permute(0,2,1).contiguous()
        latents = self.normfeat.project_sample(latents)
        latents = latents.permute(0,2,1).contiguous()
        incontext_latents = latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
        attention_mask=(latent_masks > 0.5)
        B, L = attention_mask.size()
        attention_mask = attention_mask.view(B, 1, L)
        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
        attention_mask = attention_mask.unsqueeze(1)
        # print("incontext_latents.shape:",incontext_latents.shape)
        # print("quantized_bestrq_emb.shape:",quantized_bestrq_emb.shape)
        latent_mask_input = self.mask_emb(latent_masks)
        #64+48+64+1024
        loss,loss_re, loss_cos = self.cfm_wrapper.compute_loss(latents, [latent_mask_input,incontext_latents, quantized_bestrq_emb],  latent_masks,attention_mask,wav2vec_embeds, validation_mode=validation_mode)
        return loss,loss_re, loss_cos, commitment_loss.mean(), codebook_loss.mean()

    def init_device_dtype(self, device, dtype):
        self.device = device
        self.dtype = dtype

    @torch.no_grad()
    def fetch_codes(self, input_audios, additional_feats,layer):
        input_audio_0 = input_audios[[0],:]
        input_audio_1 = input_audios[[1],:]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
        # bestrq_middle = bestrq_middle.detach()
        # bestrq_last = bestrq_last.detach()
        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
        bestrq_emb = bestrq_emb.detach()

        # self.rvq_bestrq_middle.eval()
        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
        # self.rvq_bestrq_last.eval()
        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t

        self.rvq_bestrq_emb.eval()
        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t


        if('spk' in additional_feats):
            self.xvecmodel.eval()
            spk_embeds = self.extract_spk_embeds(input_audios)
        else:
            spk_embeds = None

        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
        return [codes_bestrq_emb], [bestrq_emb], spk_embeds
        # return [codes_prompt_13, codes_wav2vec_last], [prompt_embeds_13, wav2vec_embeds_last], spk_embeds
    

    @torch.no_grad()
    def fetch_codes_batch(self, input_audios, additional_feats,layer):
        input_audio_0 = input_audios[:,0,:]
        input_audio_1 = input_audios[:,1,:]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        # bestrq_middle,bestrq_last = self.extract_bestrq_embeds(input_audios)
        # bestrq_middle = bestrq_middle.detach()
        # bestrq_last = bestrq_last.detach()
        bestrq_emb = self.extract_bestrq_embeds(input_audio_0,input_audio_1,layer)
        bestrq_emb = bestrq_emb.detach()

        # self.rvq_bestrq_middle.eval()
        # quantized_bestrq_middle, codes_bestrq_middle, *_ = self.rvq_bestrq_middle(bestrq_middle) # b,d,t
        # self.rvq_bestrq_last.eval()
        # quantized_bestrq_last, codes_bestrq_last, *_ = self.rvq_bestrq_last(bestrq_last) # b,d,t

        self.rvq_bestrq_emb.eval()
        quantized_bestrq_emb, codes_bestrq_emb, *_ = self.rvq_bestrq_emb(bestrq_emb) # b,d,t


        if('spk' in additional_feats):
            self.xvecmodel.eval()
            spk_embeds = self.extract_spk_embeds(input_audios)
        else:
            spk_embeds = None

        # return [codes_prompt, codes_wav2vec], [prompt_embeds, wav2vec_embeds], spk_embeds
        # return [codes_prompt_7, codes_prompt_13, codes_prompt_20, codes_wav2vec_half, codes_wav2vec_last], [prompt_embeds_7, prompt_embeds_13, prompt_embeds_20, wav2vec_embeds_half, wav2vec_embeds_last], spk_embeds
        # return [codes_bestrq_middle, codes_bestrq_last], [bestrq_middle, bestrq_last], spk_embeds
        return [codes_bestrq_emb], [bestrq_emb], spk_embeds

    @torch.no_grad()
    def inference_codes(self, codes, spk_embeds, true_latents, latent_length, additional_feats, incontext_length=127,
                  guidance_scale=2, num_steps=20,
                  disable_progress=True, scenario='start_seg'):
        """Sample latents from quantiser codes with the Euler solver.

        Args:
            codes: sequence whose first entry holds the codes decoded by
                ``self.rvq_bestrq_emb.from_codes``.
            spk_embeds: speaker embeddings, used only when ``'spk'`` is in
                ``additional_feats``.
            true_latents: reference latents supplying the in-context frames.
            latent_length: number of leading frames to generate (mask 2).
            additional_feats: collection of feature names.
            incontext_length: in ``'other_seg'`` mode, number of leading
                frames marked in-context (mask 1).  The value is then
                replaced by the count measured from the first batch item's
                mask.
            guidance_scale: forwarded to ``solve_euler``.
            num_steps: number of Euler steps.
            disable_progress: unused in this method.
            scenario: ``'other_seg'`` enables the in-context prefix.

        Returns:
            Latents after ``normfeat.return_sample``, with dims 1 and 2
            swapped relative to the solver output (the final permute back is
            commented out).
        """
        # NOTE(review): classifier_free_guidance and num_channels_latents are
        # assigned but never read.
        classifier_free_guidance = guidance_scale > 1.0
        device = self.device
        dtype = self.dtype
        codes_bestrq_emb = codes[0]
        batch_size = codes_bestrq_emb.shape[0]
        quantized_bestrq_emb,_,_=self.rvq_bestrq_emb.from_codes(codes_bestrq_emb)
        quantized_bestrq_emb = quantized_bestrq_emb.permute(0,2,1).contiguous()
        if('spk' in additional_feats):
            spk_embeds = spk_embeds.repeat(1,1,quantized_bestrq_emb.shape[-2],1).detach()
        num_frames = quantized_bestrq_emb.shape[1]
        num_channels_latents = self.num_channels
        shape = (batch_size,  num_frames, 64)
        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
        # Mask values: 0 = beyond latent_length, 1 = in-context, 2 = generate.
        latent_masks = torch.zeros(latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device)
        latent_masks[:,0:latent_length] = 2
        if(scenario=='other_seg'):
            latent_masks[:,0:incontext_length] = 1

        # Frames with mask 0 get the learned placeholder conditioning.
        quantized_bestrq_emb = (latent_masks > 0.5).unsqueeze(-1) * quantized_bestrq_emb \
            + (latent_masks < 0.5).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,1,1024)
        true_latents = true_latents.permute(0,2,1).contiguous()
        # NOTE(review): project_sample also accumulates running statistics
        # while its sample budget is not exhausted, so this call can change
        # the normaliser's buffers during inference -- confirm intended.
        true_latents = self.normfeat.project_sample(true_latents)
        true_latents = true_latents.permute(0,2,1).contiguous()
        incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
        # Count of in-context frames, taken from the first batch item.
        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]

        # Pairwise mask: position i may attend to j only if both are valid.
        attention_mask=(latent_masks > 0.5)
        B, L = attention_mask.size()
        attention_mask = attention_mask.view(B, 1, L)
        attention_mask = attention_mask * attention_mask.transpose(-1, -2)
        attention_mask = attention_mask.unsqueeze(1)
        latent_mask_input = self.mask_emb(latent_masks)

        if('spk' in additional_feats):
            # NOTE(review): quantized_bestrq_emb is 3-D here while spk_embeds
            # was repeated with four dims above; torch.cat requires matching
            # ranks -- this branch looks stale, confirm before use.
            additional_model_input = torch.cat([quantized_bestrq_emb, spk_embeds],1)
        else:
            additional_model_input = torch.cat([quantized_bestrq_emb],1)

        temperature = 1.0
        t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_bestrq_emb.device)
        latents = self.cfm_wrapper.solve_euler(latents * temperature, latent_mask_input,incontext_latents, incontext_length, t_span, additional_model_input,attention_mask,  guidance_scale)

        # Restore the known in-context frames exactly.
        latents[:,0:incontext_length,:] = incontext_latents[:,0:incontext_length,:]
        latents = latents.permute(0,2,1).contiguous()
        latents = self.normfeat.return_sample(latents)
        return latents

    @torch.no_grad()
    def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
                  disable_progress=True,layer=5,scenario='start_seg'):
        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)

        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
            guidance_scale=guidance_scale, num_steps=num_steps, \
            disable_progress=disable_progress,scenario=scenario)
        return latents
    
    @torch.no_grad()
    def inference_rtf(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
                  disable_progress=True,layer=5,scenario='start_seg'):
        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
        import time
        start = time.time()
        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
            guidance_scale=guidance_scale, num_steps=num_steps, \
            disable_progress=disable_progress,scenario=scenario)
        return latents,time.time()-start

    def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
        """Draw Gaussian latents of shape ``(batch, channels, frames, 32)``.

        When ``num_frames`` is not a multiple of 4 it is first replaced by
        ``round(num_frames / 4) * 4``.

        NOTE(review): Python's ``round`` rounds halves to the nearest even
        integer, so e.g. 2 frames become 0 and 10 frames become 8 while 6
        frames become 8 -- confirm this is the intended rounding.
        """
        divisor = 4
        if num_frames % divisor > 0:
            num_frames = round(num_frames / float(divisor)) * divisor
        shape = (batch_size, num_channels_latents, num_frames, 32)
        return randn_tensor(shape, generator=None, device=device, dtype=dtype)