"""Text Processing Plugin"""
import re
from typing import List


class TextProcessor:
    """Clean and process text data.

    All methods are stateless: they read only their ``text`` argument and
    the precompiled patterns below.
    """

    # Compiled once at class-creation time so the methods do not have to
    # go through the ``re`` module's pattern lookup on every call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _URL_RE = re.compile(r'https?://[^\s]+')

    def clean_text(self, text: str) -> str:
        """Normalize whitespace in *text*.

        Every run of whitespace (spaces, tabs, newlines, ...) is collapsed
        to a single space, and leading/trailing whitespace is removed.
        No other characters are altered or removed.

        Args:
            text: The string to normalize.

        Returns:
            The whitespace-normalized string.
        """
        collapsed = self._WHITESPACE_RE.sub(' ', text)
        return collapsed.strip()

    def extract_emails(self, text: str) -> List[str]:
        """Extract email addresses from *text*.

        Args:
            text: The string to scan.

        Returns:
            All non-overlapping matches of the email pattern, in order of
            appearance; an empty list when there are none.
        """
        return self._EMAIL_RE.findall(text)

    def extract_urls(self, text: str) -> List[str]:
        """Extract ``http://`` and ``https://`` URLs from *text*.

        A match runs from the scheme up to the next whitespace character,
        so punctuation directly following a URL (for example a
        sentence-ending period) is part of the returned string.

        Args:
            text: The string to scan.

        Returns:
            All matched URLs in order of appearance; an empty list when
            there are none.
        """
        return self._URL_RE.findall(text)

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into lowercase, whitespace-delimited tokens.

        Args:
            text: The string to tokenize.

        Returns:
            The lowercased tokens; an empty list for empty or
            whitespace-only input.
        """
        return text.lower().split()