| import sys |
| import teradataml as tdml |
| from tabulate import tabulate |
|
|
| import json |
|
|
|
|
# --- Configuration & connection setup --------------------------------------
# Read the settings produced by the model-conversion step. Expected keys:
# model_id, number_of_generated_embeddings, precision_to_filename_map.
with open('conversion_config.json') as json_file:
    conversion_config = json.load(json_file)

model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

# Credentials come from the command line: <script> HOST USERNAME PASSWORD.
# Fail with a usage message instead of an IndexError traceback when missing.
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")
host = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]

print("Setting up connection to teradata...")
tdml.create_context(host=host, username=username, password=password)
print("Done\n\n")
| |
| |
# --- Tokenizer deployment ---------------------------------------------------
# Drop any previous tokenizer table (best-effort: it is absent on a first
# run), then store tokenizer.json in-database via BYOM.
print("Deploying tokenizer...")
try:
    tdml.db_drop_table('tokenizer_table')
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate; the drop is expected to fail when the table does not exist.
    print("Can't drop tokenizers table - it's not existing")
tdml.save_byom('tokenizer',
               'tokenizer.json',
               'tokenizer_table')
print("Done\n\n")
| |
# --- Model deployment & smoke test ------------------------------------------
# For each exported precision variant: deploy the ONNX file into model_table,
# compute embeddings for the emails.emails table, then run a cosine-similarity
# semantic search against email id 3 and print the top matches.
print("Testing models...")
try:
    tdml.db_drop_table('model_table')
except Exception:
    # Narrowed from a bare `except:`; failing to drop a non-existent table
    # is the expected first-run condition.
    print("Can't drop models table - it's not existing")

for precision, file_name in precision_to_filename_map.items():
    print(f"Deploying {precision} model...")
    # The precision label doubles as the BYOM model_id inside model_table.
    tdml.save_byom(precision,
                   file_name,
                   'model_table')
    print(f"Model {precision} is deployed\n")

    print(f"Calculating embeddings with {precision} model...")
    try:
        tdml.db_drop_table('emails_embeddings_store')
    except Exception:
        print("Can't drop embeddings table - it's not existing")

    # Materialize embeddings into a volatile table so the similarity query
    # below can scan them repeatedly without re-running the model.
    tdml.execute_sql(f"""
    create volatile table emails_embeddings_store as (
        select
            *
        from mldb.ONNXEmbeddings(
                on emails.emails as InputTable
                on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
                on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION

                using
                    Accumulate('id', 'txt')
                    ModelOutputTensor('sentence_embedding')
                    EnableMemoryCheck('false')
                    OutputFormat('FLOAT32({number_of_generated_embeddings})')
                    OverwriteCachedModel('true')
            ) a
    ) with data on commit preserve rows

    """)
    print("Embeddings calculated")
    # Fixed typo: "similiarity" -> "similarity" (print message and SQL alias).
    print(f"Testing semantic search with cosine similarity on the output of the model with precision '{precision}'...")
    tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
    # Email id 3 is the search target; all other rows are the reference set.
    tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]
    tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

    # TD_VECTORDISTANCE returns a distance; 1 - distance converts cosine
    # distance back into a similarity score for display.
    cos_sim_pd = tdml.DataFrame.from_query(f"""
    SELECT
        dt.target_id,
        dt.reference_id,
        e_tgt.txt as target_txt,
        e_ref.txt as reference_txt,
        (1.0 - dt.distance) as similarity
    FROM
        TD_VECTORDISTANCE (
            ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
            ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
            USING
                TargetIDColumn('id')
                TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                RefIDColumn('id')
                RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                DistanceMeasure('cosine')
                topk(3)
        ) AS dt
    JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
    JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
    """).to_pandas()
    print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
    print("Done\n\n")
| |

# Tear down the Teradata connection opened by tdml.create_context() above.
tdml.remove_context()