| |
| import numpy as np |
| import pandas as pd |
|
|
| import tensorflow |
| import keras |
| import torch |
|
|
| import re |
|
|
| from transformers import BertTokenizer |
| from sklearn.preprocessing import LabelEncoder |
|
|
| |
def read_data(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or None when the file does not exist
        (best-effort behavior preserved from the original).
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Fixed typo in the original message ("File not exsists") and made
        # the None return explicit instead of falling off the function.
        print(f"File not found: {path}")
        return None
| |
# Load the transaction dataset from a hard-coded local path.
# read_data returns None when the file is missing, so guard before printing.
# NOTE(review): absolute Windows path — presumably developer-local; consider
# making it configurable (CLI arg / env var). TODO confirm.
data=read_data(r"E:\transactify\Dataset\transaction_data.csv")
if data is not None:
    # Quick sanity preview of the first 15 rows.
    print(data.head(15))
| |
| |
def clean_text(text):
    """Normalize a transaction description prior to tokenization.

    Lowercases the text, replaces each run of digits and each punctuation
    character with a space, and strips leading/trailing whitespace.
    """
    lowered = text.lower()
    without_digits = re.sub(r"\d+", " ", lowered)
    without_punct = re.sub(r"[^\w\s]", " ", without_digits)
    return without_punct.strip()
|
|
def preprocessing_data(df, max_length=20):
    """Tokenize transaction descriptions and encode their category labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Transaction Description" and "Category" columns.
    max_length : int, optional
        Maximum token sequence length (default 20). Longer texts are
        truncated; shorter ones are padded to this length.

    Returns
    -------
    tuple
        (input_ids, attention_masks, labels, labelencoder): the first three
        are torch tensors with one row per KEPT description; labelencoder is
        the fitted sklearn LabelEncoder.

    Raises
    ------
    ValueError
        If no descriptions survive cleaning.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    input_ids = []
    attention_masks = []
    # BUG FIX: the original encoded labels from ALL of df["Category"] while
    # skipping empty-after-cleaning descriptions, so labels misaligned with
    # input_ids whenever any row was skipped. Track kept rows' categories so
    # the label tensor stays row-aligned with the input tensors.
    kept_categories = []

    for description, category in zip(df["Transaction Description"], df["Category"]):
        cleaned_text = clean_text(description)

        # Debug trace (kept from original); consider switching to logging.
        print(f"Original Description: {description}")
        print(f"Cleaned Text: {cleaned_text}")

        if not cleaned_text:
            print("Cleaned text is empty, skipping...")
            continue

        encoded_dict = tokenizer.encode_plus(
            cleaned_text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",  # replaces deprecated pad_to_max_length=True
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        kept_categories.append(category)

    print(f"Total input_ids collected: {len(input_ids)}")
    print(f"Total attention_masks collected: {len(attention_masks)}")

    if not input_ids:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    # Stack per-row (1, max_length) tensors into (n_kept, max_length).
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    labelencoder = LabelEncoder()
    labels = torch.tensor(labelencoder.fit_transform(kept_categories))

    return input_ids, attention_masks, labels, labelencoder
|
|
# Run the full preprocessing pipeline at import time on the loaded dataset.
# NOTE(review): if the CSV was missing, `data` is None and this call will
# raise (preprocessing_data indexes df["..."]) — consider guarding like the
# head() preview above. TODO confirm intended.
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
|
|