| import pandas as pd |
| import os |
| import subprocess |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
| |
def modify_sequences(sequence):
    """Return ``sequence`` upper-cased, wrapped, and prefixed with a marker line.

    The sequence is upper-cased and split into consecutive pieces of at most
    60 characters, one piece per line. A line containing ``<|endoftext|>`` is
    placed in front of the wrapped text.

    Args:
        sequence: The sequence string to format.

    Returns:
        The marker line, a newline, and then the wrapped upper-case sequence.
        An empty input yields just the marker line followed by a newline.
    """
    line_width = 60
    upper = sequence.upper()

    # Cut the sequence into fixed-width pieces; the last one may be shorter.
    pieces = []
    for start in range(0, len(upper), line_width):
        pieces.append(upper[start:start + line_width])

    header = "<|endoftext|>"
    return header + "\n" + "\n".join(pieces)
|
|
| |
def to_txt_file(df, filename):
    """Write each entry of ``df['Sequence']`` to ``filename``.

    The file is opened in write mode, so an existing file is overwritten.
    Every entry is written followed by a single newline character.

    Args:
        df: Object indexable by ``'Sequence'`` that yields strings when
            iterated (the script passes pandas DataFrames).
        filename: Path of the text file to create.
    """
    with open(filename, 'w') as out:
        out.writelines(entry + '\n' for entry in df['Sequence'])
|
|
|
|
| |
# Root directory that the data and benchmark locations below are built from.
path = "/workspace/sg666/MDpLM"

# Load the three membrane splits from CSV.
train = pd.read_csv(f"{path}/data/membrane/train.csv")
val = pd.read_csv(f"{path}/data/membrane/val.csv")
test = pd.read_csv(f"{path}/data/membrane/test.csv")

# The validation rows are appended to the training rows; only the combined
# training frame and the test frame are written out below.
train = pd.concat([train, val])

# Replace each raw sequence with its formatted text record.
train = train.assign(Sequence=train['Sequence'].apply(modify_sequences))
test = test.assign(Sequence=test['Sequence'].apply(modify_sequences))

# Dump the formatted records to plain-text files, one file per split.
to_txt_file(train, f"{path}/benchmarks/Generation/ProtGPT2/protgpt2_train.txt")
to_txt_file(test, f"{path}/benchmarks/Generation/ProtGPT2/protgpt2_test.txt")
|
|
|
|
# Hub identifier shared by the tokenizer and the model.
protgpt2_checkpoint = "nferruz/ProtGPT2"

# NOTE(review): neither object is referenced again in the visible part of this
# script (fine-tuning is launched as a separate process below) -- confirm
# whether these loads are still needed.
tokenizer = AutoTokenizer.from_pretrained(protgpt2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(protgpt2_checkpoint)
|
|
# Directory holding the ProtGPT2 benchmark text files and fine-tuned output.
protgpt2_dir = "/workspace/sg666/MDpLM/benchmarks/Generation/ProtGPT2"

# Argument vector for the fine-tuning run (presumably the Hugging Face
# `run_clm.py` causal-LM example script -- TODO confirm).
#
# The train/validation files are given as absolute paths so the command finds
# them regardless of the working directory the script is started from; they
# were previously bare file names while --output_dir was already absolute.
# NOTE(review): "run_clm.py" and the "test" logging directory are still
# resolved relative to the current working directory.
# NOTE(review): the validation file is the *test* split text file.
finetune_protgpt2_command = [
    "python", "run_clm.py",
    "--model_name_or_path", "nferruz/ProtGPT2",
    "--train_file", f"{protgpt2_dir}/protgpt2_train.txt",
    "--validation_file", f"{protgpt2_dir}/protgpt2_test.txt",
    "--tokenizer_name", "nferruz/ProtGPT2",
    "--num_train_epochs", "10",
    "--logging_steps", "1",
    "--logging_dir", "test",
    "--do_train",
    "--do_eval",
    "--output_dir", f"{protgpt2_dir}/finetuned_models",
    "--overwrite_output_dir",
    "--learning_rate", "3e-04",
    "--per_device_train_batch_size", "2",
    "--evaluation_strategy", "epoch",
]
|
|
# Launch fine-tuning as a child process. Its stdout/stderr are captured as
# text, so they are only shown here if the command exits with a non-zero
# status (check=True turns that into CalledProcessError).
try:
    result = subprocess.run(finetune_protgpt2_command, check=True, text=True, capture_output=True)
except subprocess.CalledProcessError as e:
    print("Command failed with the following error:")
    print(e.stderr)
    print("Command output:")
    print(e.stdout)
    # Propagate the child's exit status instead of finishing with status 0,
    # so shells and job schedulers can tell that fine-tuning failed.
    raise SystemExit(e.returncode)
|
|
|
|