import streamlit as st


def show_home_page():
    st.title("Natural Language Processing (NLP)")
    st.markdown(
        """
        ### Welcome to the NLP Guide
        Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between
        computers and humans through natural language. It enables machines to read, understand, and respond to human
        language in a way that is both meaningful and useful. NLP powers a wide range of applications like chatbots,
        translation tools, sentiment analysis, and search engines.

        Use the menu in the sidebar to explore each topic in detail.
        """
    )


def show_page(page):
    if page == "NLP Terminologies":
        st.title("NLP Terminologies")
        st.markdown(
            """
            ### NLP Terminologies (Detailed Explanation)

            - **Tokenization**: Breaking text into smaller units like words or sentences.
            - **Stop Words**: Commonly used words (e.g., "the", "is") often removed during preprocessing.
            - **Stemming**: Reducing words to their root forms by crude suffix stripping (e.g., "studies" -> "studi").
            - **Lemmatization**: Converting words to their dictionary base forms (e.g., "studies" -> "study").
            - **Corpus**: A large collection of text used for NLP training and analysis.
            - **Vocabulary**: The set of all unique words in a corpus.
            - **n-grams**: Contiguous sequences of n words/characters from text.
            - **POS Tagging**: Assigning parts of speech to words.
            - **NER (Named Entity Recognition)**: Identifying names, places, organizations, etc.
            - **Parsing**: Analyzing the grammatical structure of text.
            """
        )
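        # A minimal sketch contrasting stemming and lemmatization; assumes the
        # nltk package is installed (WordNet data is downloaded on first run).
        st.markdown("A minimal sketch of stemming vs. lemmatization, assuming NLTK is installed:")
        st.code(
            '''
import nltk
nltk.download("wordnet")  # dictionary data used by the lemmatizer

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem("studies"))          # 'studi'  (crude suffix stripping)
print(lemmatizer.lemmatize("studies"))  # 'study'  (dictionary base form)
''',
            language="python",
        )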
| | elif page == "One-Hot Vectorization": |
| | st.title("One-Hot Vectorization") |
| | st.markdown( |
| | """ |
| | ### One-Hot Vectorization |
| | |
| | A simple representation where each word in the vocabulary is represented as a binary vector. |
| | |
| | #### How It Works: |
| | - Each unique word in the corpus is assigned an index. |
| | - The vector for a word is all zeros except for a 1 at the index corresponding to that word. |
| | |
| | #### Example: |
| | Vocabulary: ["cat", "dog", "bird"] |
| | - "cat" -> [1, 0, 0] |
| | - "dog" -> [0, 1, 0] |
| | - "bird" -> [0, 0, 1] |
| | |
| | #### Advantages: |
| | - Simple to implement. |
| | |
| | #### Limitations: |
| | - High dimensionality for large vocabularies. |
| | - Does not capture semantic relationships (e.g., "cat" and "kitten" are unrelated). |
| | |
| | #### Applications: |
| | - Useful for small datasets and when computational simplicity is prioritized. |
| | """ |
| | ) |
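        # Hand-rolled one-hot encoding; pure Python, no extra libraries needed.
        st.markdown("A minimal sketch of one-hot encoding in plain Python:")
        st.code(
            '''
vocab = ["cat", "dog", "bird"]
index = {word: i for i, word in enumerate(vocab)}

def one_hot(word):
    vec = [0] * len(vocab)
    vec[index[word]] = 1
    return vec

print(one_hot("dog"))  # [0, 1, 0]
''',
            language="python",
        )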
| | elif page == "Bag of Words": |
| | st.title("Bag of Words (BoW)") |
| | st.markdown( |
| | """ |
| | ### Bag of Words (BoW) |
| | |
| | Bag of Words is a method of representing text data as word frequency counts without considering word order. |
| | |
| | #### How It Works: |
| | 1. Create a vocabulary of all unique words in the text. |
| | 2. Count the frequency of each word in a document. |
| | |
| | #### Example: |
| | Given two sentences: |
| | - "I love NLP." |
| | - "I love programming." |
| | |
| | Vocabulary: ["I", "love", "NLP", "programming"] |
| | - Sentence 1: [1, 1, 1, 0] |
| | - Sentence 2: [1, 1, 0, 1] |
| | |
| | #### Advantages: |
| | - Simple to implement. |
| | |
| | #### Limitations: |
| | - High dimensionality for large vocabularies. |
| | - Does not consider word order or semantic meaning. |
| | - Sensitive to noise and frequent terms. |
| | |
| | #### Applications: |
| | - Text classification and clustering. |
| | """ |
| | ) |
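        # Bag-of-words via scikit-learn (assumes scikit-learn is installed).
        # Note: CountVectorizer lowercases and, by default, drops one-letter
        # tokens such as "I", so its vocabulary differs slightly from the
        # hand-worked example above.
        st.markdown("A minimal sketch using scikit-learn's `CountVectorizer`, assuming scikit-learn is installed:")
        st.code(
            '''
from sklearn.feature_extraction.text import CountVectorizer

docs = ["I love NLP.", "I love programming."]
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # ['love' 'nlp' 'programming']
print(matrix.toarray())                    # rows: [1 1 0] and [1 0 1]
''',
            language="python",
        )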
| | elif page == "TF-IDF Vectorizer": |
| | st.title("TF-IDF Vectorizer") |
| | st.markdown( |
| | """ |
| | ### TF-IDF Vectorizer |
| | |
| | Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure that evaluates the importance of a word in a document relative to a collection of documents (corpus). |
| | |
| | #### Formula: |
| | \[ \text{TF-IDF} = \text{TF} \times \text{IDF} \] |
| | |
| | - **Term Frequency (TF)**: Number of times a term appears in a document divided by total terms in the document. |
| | - **Inverse Document Frequency (IDF)**: Logarithm of total documents divided by the number of documents containing the term. |
| | |
| | #### Advantages: |
| | - Reduces the weight of common words. |
| | - Highlights unique and important words. |
| | |
| | #### Example: |
| | For the corpus: |
| | - Doc1: "NLP is amazing." |
| | - Doc2: "NLP is fun and amazing." |
| | |
| | TF-IDF highlights words like "fun" and "amazing" over commonly occurring words like "is". |
| | |
| | #### Applications: |
| | - Search engines, information retrieval, and document classification. |
| | """ |
| | ) |
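        # TF-IDF via scikit-learn (assumes scikit-learn is installed); the
        # library uses a smoothed IDF variant plus L2 normalization, so exact
        # values differ slightly from the plain formula above.
        st.markdown("A minimal sketch using scikit-learn's `TfidfVectorizer`, assuming scikit-learn is installed:")
        st.code(
            '''
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["NLP is amazing.", "NLP is fun and amazing."]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())
print(matrix.toarray().round(2))  # "fun" and "and" get the highest weights in Doc2
''',
            language="python",
        )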
| | elif page == "Word2Vec": |
| | st.title("Word2Vec") |
| | st.markdown( |
| | """ |
| | ### Word2Vec |
| | |
| | Word2Vec is a neural network-based technique for creating dense vector representations of words, capturing their semantic relationships. |
| | |
| | #### Key Concepts: |
| | - **CBOW (Continuous Bag of Words)**: Predicts the target word from its context. |
| | - **Skip-gram**: Predicts the context from the target word. |
| | |
| | #### Advantages: |
| | - Captures semantic meaning (e.g., "king" - "man" + "woman" ≈ "queen"). |
| | - Efficient for large datasets. |
| | |
| | #### Applications: |
| | - Text classification, sentiment analysis, and recommendation systems. |
| | |
| | #### Limitations: |
| | - Requires significant computational resources. |
| | """ |
| | ) |
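        # Word2Vec via gensim (assumes the gensim package is installed); the
        # toy corpus is only for illustration, real training needs far more text.
        st.markdown("A minimal sketch using gensim's `Word2Vec`, assuming gensim is installed:")
        st.code(
            '''
from gensim.models import Word2Vec

sentences = [
    ["nlp", "is", "fun"],
    ["nlp", "is", "amazing"],
    ["machine", "learning", "is", "fun"],
]
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)  # sg=1 -> skip-gram
print(model.wv["nlp"][:5])           # first 5 dimensions of the dense vector
print(model.wv.most_similar("nlp"))  # nearest neighbours by cosine similarity
''',
            language="python",
        )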
| | elif page == "FastText": |
| | st.title("FastText") |
| | st.markdown( |
| | """ |
| | ### FastText |
| | |
| | FastText is an extension of Word2Vec that represents words as a combination of character n-grams. |
| | |
| | #### Advantages: |
| | - Handles rare and out-of-vocabulary words. |
| | - Captures subword information (e.g., prefixes and suffixes). |
| | |
| | #### Example: |
| | The word "playing" might be represented by n-grams like "pla", "lay", "ayi", "ing". |
| | |
| | #### Applications: |
| | - Multilingual text processing. |
| | - Handling noisy and incomplete data. |
| | |
| | #### Limitations: |
| | - Higher computational cost compared to Word2Vec. |
| | """ |
| | ) |
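        # FastText via gensim (assumes gensim is installed); note the vector
        # lookup for a word never seen in training, built from its n-grams.
        st.markdown("A minimal sketch using gensim's `FastText`, assuming gensim is installed:")
        st.code(
            '''
from gensim.models import FastText

sentences = [
    ["playing", "played", "plays"],
    ["running", "ran", "runs"],
]
model = FastText(sentences, vector_size=50, window=2, min_count=1)
# Subword n-grams let FastText build a vector even for an unseen word:
print(model.wv["playful"][:5])
''',
            language="python",
        )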
| | elif page == "Tokenization": |
| | st.title("Tokenization") |
| | st.markdown( |
| | """ |
| | ### Tokenization |
| | |
| | Tokenization is the process of breaking text into smaller units (tokens) such as words, phrases, or sentences. |
| | |
| | #### Types of Tokenization: |
| | - **Word Tokenization**: Splits text into words. |
| | - **Sentence Tokenization**: Splits text into sentences. |
| | |
| | #### Libraries for Tokenization: |
| | - NLTK, SpaCy, and Hugging Face Transformers. |
| | |
| | #### Example: |
| | Sentence: "NLP is exciting." |
| | - Word Tokens: ["NLP", "is", "exciting", "."] |
| | |
| | #### Applications: |
| | - Preprocessing for machine learning models. |
| | |
| | #### Challenges: |
| | - Handling complex text like abbreviations and multilingual data. |
| | """ |
| | ) |
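        # Tokenization via NLTK (assumes nltk is installed; the punkt model is
        # downloaded on first run -- newer NLTK releases may need "punkt_tab").
        st.markdown("A minimal sketch using NLTK's tokenizers, assuming NLTK is installed:")
        st.code(
            '''
import nltk
nltk.download("punkt")  # sentence tokenizer model (newer NLTK: "punkt_tab")

from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLP is exciting. It powers chatbots and search engines."
print(sent_tokenize(text))  # ['NLP is exciting.', 'It powers chatbots and search engines.']
print(word_tokenize(text))  # ['NLP', 'is', 'exciting', '.', 'It', ...]
''',
            language="python",
        )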
| | elif page == "Stop Words": |
| | st.title("Stop Words") |
| | st.markdown( |
| | """ |
| | ### Stop Words |
| | |
| | Stop words are commonly used words in a language that are often removed during text preprocessing. |
| | |
| | #### Examples of Stop Words: |
| | - English: "is", "the", "and", "in". |
| | - Spanish: "es", "el", "y", "en". |
| | |
| | #### Why Remove Stop Words? |
| | - To reduce noise in text data. |
| | |
| | #### Applications: |
| | - Sentiment analysis, text classification, and search engines. |
| | |
| | #### Challenges: |
| | - Some stop words might carry context-specific importance. |
| | """ |
| | ) |
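        # Stop-word removal via NLTK (assumes nltk is installed; the stopwords
        # list is downloaded on first run).
        st.markdown("A minimal sketch of stop-word removal with NLTK, assuming NLTK is installed:")
        st.code(
            '''
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
tokens = ["the", "movie", "was", "surprisingly", "good"]
print([t for t in tokens if t not in stop_words])  # ['movie', 'surprisingly', 'good']
''',
            language="python",
        )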


st.sidebar.title("NLP Topics")
menu_options = [
    "Home",
    "NLP Terminologies",
    "One-Hot Vectorization",
    "Bag of Words",
    "TF-IDF Vectorizer",
    "Word2Vec",
    "FastText",
    "Tokenization",
    "Stop Words",
]
selected_page = st.sidebar.radio("Select a topic", menu_options)


if selected_page == "Home":
    show_home_page()
else:
    show_page(selected_page)