| import gradio as gr |
| import pandas as pd |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
# Default values for the Gradio input controls below.
DEFAULT_CHUNK_SIZE = 100     # characters per chunk
DEFAULT_CHUNK_OVERLAP = 0    # characters shared between consecutive chunks
DEFAULT_NUM_CHUNKS = 10      # maximum number of chunks shown in the table
|
|
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split *text* into chunks with the selected method and summarize them.

    Parameters
    ----------
    method : str
        Tokenization method; currently only "RecursiveCharacterTextSplitter"
        is supported. Any other value yields an empty table.
    text : str
        Raw input text to split.
    chunk_size, chunk_overlap, num_chunks : int | float
        Splitter parameters. Gradio ``Number`` widgets deliver floats, so
        all three are coerced to ``int`` before use.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with columns
        ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'].
        Blank input or an unrecognized method returns an empty frame
        with the same columns.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']

    # gr.Number returns floats; the splitter expects integral sizes.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    num_chunks = int(num_chunks)

    if not text.strip():
        return pd.DataFrame(columns=columns)

    output = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        for i, chunk in enumerate(text_splitter.split_text(text)[:num_chunks]):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                # Crude whitespace token count, not a model tokenizer count.
                'Token Count': len(chunk.split()),
            })

    # Passing columns= keeps the schema stable even with zero rows (the
    # original returned a headerless empty frame for unknown methods).
    return pd.DataFrame(output, columns=columns)
|
|
# Wire the tokenizer into a simple Gradio form: method picker, text box,
# three numeric parameters in, a results table out.
input_widgets = [
    gr.Dropdown(
        label="Select Tokenization Method",
        choices=["RecursiveCharacterTextSplitter"],
    ),
    gr.Textbox(
        label="Enter Text",
        lines=10,
        placeholder="Type or paste text here.",
    ),
    gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
    gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
    gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
]

output_widget = gr.Dataframe(
    headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"],
    height=900,
)

iface = gr.Interface(
    fn=tokenize_text,
    inputs=input_widgets,
    outputs=output_widget,
    title="Text Tokenization Tool",
    description="A tool for tokenizing text using different methods.",
)

iface.launch()
|
|
|
|