import { AutoTokenizer, CLIPTextModelWithProjection, env } from '@xenova/transformers';

// Turn off the library-wide `allowLocalModels` switch before any model is requested.
// NOTE(review): presumably this makes the library skip local-filesystem lookups so
// model files always come from the remote hub — confirm against the Transformers.js docs.
env.allowLocalModels = false;
/**
 * Lazily loads and caches the CLIP tokenizer and text model so that they are
 * requested at most once while a load is pending or has succeeded.
 *
 * The static `tokenizer` and `text_model` fields hold the *promises* returned
 * by `from_pretrained` (not the resolved objects); they are `undefined` until
 * the first call to {@link CLIPModelManager.getInstance}.
 */
export class CLIPModelManager {
  /** Model identifier passed to both `from_pretrained` calls. */
  static model_id = 'Xenova/clip-vit-base-patch32';
  /** Cached promise for the tokenizer, or `undefined` when no load is cached. */
  static tokenizer;
  /** Cached promise for the text model, or `undefined` when no load is cached. */
  static text_model;

  /**
   * Returns the tokenizer and text model, starting their loads on first use.
   *
   * Concurrent callers share the same in-flight promises. If a load rejects,
   * its cached promise is discarded so that a later call retries instead of
   * replaying the stored failure forever.
   *
   * @returns {Promise<Array>} Resolves to `[tokenizer, text_model]`; rejects
   *   if either load fails.
   */
  static async getInstance() {
    // Start `load()` only when `slot` is empty, and forget the promise again if it rejects.
    const loadOnce = (slot, load) => {
      if (this[slot] === undefined) {
        const pending = load();
        this[slot] = pending;
        pending.catch(() => {
          // Only clear our own attempt; a newer retry may already occupy the slot.
          if (this[slot] === pending) {
            this[slot] = undefined;
          }
        });
      }
      return this[slot];
    };

    const tokenizer = loadOnce('tokenizer', () => AutoTokenizer.from_pretrained(this.model_id));
    const textModel = loadOnce('text_model', () =>
      CLIPTextModelWithProjection.from_pretrained(this.model_id, {
        quantized: false,
      }),
    );

    return Promise.all([tokenizer, textModel]);
  }
}

/**
 * Runs a single text through the shared CLIP tokenizer and text model.
 *
 * The text is tokenized as a one-element batch with padding and truncation
 * enabled, and the first row of the model's `text_embeds` output is returned.
 *
 * @param {string} text - The text to embed.
 * @returns {Promise<*>} The first entry of `text_embeds.tolist()` —
 *   presumably the embedding vector for `text` as a plain array.
 */
export async function getClipTextEmbeddings(text) {
  const [encode, encoder] = await CLIPModelManager.getInstance();

  // Wrap the text in an array: the tokenizer is called with a batch of one.
  const batch = encode([text], { padding: true, truncation: true });
  const output = await encoder(batch);

  const rows = output.text_embeds.tolist();
  return rows[0];
}
