| | import requests |
| | import logging |
| | import json |
| | import tiktoken |
| | import gradio as gr |
| | from typing import Any, List |
| | from langchain.schema import Document |
| | from langchain.document_loaders import UnstructuredPDFLoader |
| | from langchain.text_splitter import RecursiveCharacterTextSplitter |
| |
|
| | from utils import json_validator, fetch_chat |
| |
|
| |
|
| | class LatexTextSplitter(RecursiveCharacterTextSplitter): |
| | """Attempts to split the text along Latex-formatted layout elements.""" |
| |
|
| | def __init__(self, **kwargs: Any): |
| | """Initialize a LatexTextSplitter.""" |
| | separators = [ |
| | |
| | "\chapter{", |
| | "\section{", |
| | "\subsection{", |
| | "\subsubsection{", |
| |
|
| | |
| | "\begin{" |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | " ", |
| | "", |
| | ] |
| | super().__init__(separators=separators, **kwargs) |
| | |
| |
|
| | class Suggest(): |
| | |
| | def __init__(self, max_ideas: int, model: str = "gpt-3.5-turbo"): |
| | self.max_ideas = max_ideas |
| | self.encoder = tiktoken.encoding_for_model(model) |
| | self.model = model |
| | self.idea_list = [] |
| | with open("./sample/sample.tex", "r") as f: |
| | self.sample_content = f.read() |
| | |
| | def split_chunk(self, latex_whole_document: str, chunk_size: int = 2000, retry: int = 5) -> List[Document]: |
| | |
| | chunk_size = min(chunk_size, len(latex_whole_document)) |
| |
|
| | for _ in range(retry): |
| | try: |
| | latex_splitter = LatexTextSplitter( |
| | chunk_size=chunk_size, |
| | chunk_overlap=0, |
| | ) |
| | docs = latex_splitter.create_documents([latex_whole_document]) |
| | return docs |
| | except: |
| | chunk_size = chunk_size // 2 |
| |
|
| | raise Exception("Latex document split check failed.") |
| |
|
| | def analyze(self, latex_whole_document: str, openai_key: str, progress: gr.Progress): |
| |
|
| | logging.info("start analysis") |
| | docs = self.split_chunk(latex_whole_document) |
| | progress(0.05) |
| |
|
| | output_format = """ |
| | |
| | ```json |
| | [ |
| | \\ Potential point for improvement 1 |
| | {{ |
| | "title": string \\ What this modification is about |
| | "thought": string \\ The reason why this should be improved |
| | "action": string \\ how to make improvement |
| | "original": string \\ the original latex snippet that can be improved |
| | "improved": string \\ the improved latex snippet which address your point |
| | }}, |
| | {{}} |
| | ] |
| | ``` |
| | """ |
| |
|
| | ideas = [] |
| | for doc in progress.tqdm(docs): |
| |
|
| | prompt = f""" |
| | I'm a computer science student. |
| | You are my editor. |
| | Your goal is to improve my paper quality at your best. |
| | |
| | |
| | ``` |
| | {doc.page_content} |
| | ``` |
| | The above is a segment of my research paper. If the end of the segment is not complete, just ignore it. |
| | Point out the parts that can be improved. |
| | Focus on grammar, writing, content, section structure. |
| | Ignore comments and those that are outside the document environment. |
| | List out all the points with a latex snippet which is the improved version addressing your point. |
| | Same paragraph should be only address once. |
| | Output the response in the following valid json format: |
| | {output_format} |
| | |
| | """ |
| |
|
| | idea = fetch_chat(prompt, openai_key, model=self.model) |
| | idea = json_validator(idea, openai_key) |
| | if isinstance(idea, list): |
| | ideas += idea |
| | if len(ideas) >= self.max_ideas: |
| | break |
| | else: |
| | |
| | continue |
| |
|
| | if not ideas: |
| | raise gr.Error('No suggestions generated.') |
| |
|
| | logging.info('complete analysis') |
| | return ideas |
| |
|
| | def read_file(self, f: str): |
| | if f is None: |
| | return "" |
| | elif f.name.endswith('pdf'): |
| | loader = UnstructuredPDFLoader(f.name) |
| | pages = loader.load_and_split() |
| | return "\n".join([p.page_content for p in pages]) |
| | elif f.name.endswith('tex'): |
| | with open(f.name, "r") as f: |
| | return f.read() |
| | else: |
| | return "Only support .tex & .pdf" |
| |
|
| | def generate(self, txt: str, openai_key: str, progress=gr.Progress()): |
| | |
| | if not openai_key: |
| | raise gr.Error("Please provide openai key !") |
| | |
| | try: |
| | idea_list = self.analyze(txt, openai_key, progress) |
| | self.idea_list = idea_list |
| | k = min(len(idea_list), self.max_ideas) |
| |
|
| | idea_buttons = [ |
| | gr.Button.update(visible=True, value=i['title']) |
| | for e, i in enumerate(idea_list[:self.max_ideas]) |
| | ] |
| | idea_buttons += [ |
| | gr.Button.update(visible=False) |
| | ] * (self.max_ideas - len(idea_buttons)) |
| |
|
| | idea_details = [ |
| | gr.Textbox.update(value="", label="thought", visible=True), |
| | gr.Textbox.update(value="", label="action", visible=True), |
| | gr.Textbox.update(value="", label="original", visible=True, max_lines=5, lines=5), |
| | gr.Textbox.update(value="", label="improved", visible=True, max_lines=5, lines=5), |
| | ] |
| |
|
| | return [ |
| | gr.Textbox.update("Suggestions", interactive=False, show_label=False), |
| | gr.Button.update(visible=True, value="Analyze") |
| | ] + idea_details + idea_buttons |
| | except Exception as e: |
| | raise gr.Error(str(e)) |