| """ |
| String processing pipeline functions for testing function analysis. |
| """ |
|
|
import re
from collections import Counter
from typing import List
|
|
|
|
def normalize_whitespace(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a
    single space and trim leading/trailing whitespace."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
|
|
|
def remove_special_characters(text, keep_chars=""):
    """Strip every character that is not alphanumeric or whitespace.

    Characters listed in *keep_chars* are escaped and allowed through.
    """
    # Build the same character class as before, just by concatenation.
    allowed = re.escape(keep_chars)
    pattern = "[^a-zA-Z0-9\\s" + allowed + "]"
    return re.sub(pattern, '', text)
|
|
|
|
def convert_to_lowercase(text):
    """Return *text* with every cased character folded to lowercase."""
    return text.lower()
|
|
|
|
def remove_stopwords(text, stopwords=None):
    """Drop common English stopwords from *text*.

    Matching is case-insensitive; the original casing of surviving words
    is preserved. A custom *stopwords* collection may be supplied.
    """
    if stopwords is None:
        # Default set of frequent English function words.
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }

    kept = (w for w in text.split() if w.lower() not in stopwords)
    return ' '.join(kept)
|
|
|
|
def extract_keywords(text, min_length=3):
    """Extract keywords: words whose length is at least *min_length*.

    Note: the comparison is inclusive (``>=``), so words of exactly
    *min_length* characters are kept. The previous docstring incorrectly
    said "longer than min_length".
    """
    return [word for word in text.split() if len(word) >= min_length]
|
|
|
|
def count_word_frequency(text):
    """Count occurrences of each whitespace-separated word in *text*.

    Returns a plain ``dict`` mapping word -> count, in order of first
    appearance (insertion order), matching the original behavior.
    """
    # Counter tallies at C speed; convert back to dict to keep the
    # original return type for callers that test `type(...) is dict`.
    return dict(Counter(text.split()))
|
|
|
|
def capitalize_words(text, exceptions=None):
    """Title-case *text*, leaving small connector words lowercase.

    The first word is always capitalized; any later word found in
    *exceptions* (compared case-insensitively) is lowercased instead.
    """
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    pieces = []
    for idx, token in enumerate(text.split()):
        # Only non-leading exception words stay lowercase.
        skip_cap = idx != 0 and token.lower() in exceptions
        pieces.append(token.lower() if skip_cap else token.capitalize())

    return ' '.join(pieces)
|
|
|
|
def truncate_text(text, max_length=100, suffix="..."):
    """Truncate *text* to at most *max_length* characters, appending *suffix*.

    Text already within the limit is returned unchanged. Otherwise the
    text is cut to leave room for *suffix*, preferring a word boundary
    when one falls late enough in the allowance.
    """
    if len(text) <= max_length:
        return text

    # Bug fix: when max_length <= len(suffix) the original computed a
    # negative slice index (text[:max_length - len(suffix)]), returning
    # nearly the whole string plus the suffix — far over the limit.
    # Fall back to a hard cut with no suffix in that degenerate case.
    if max_length <= len(suffix):
        return text[:max_length]

    truncated = text[:max_length - len(suffix)]

    # Prefer breaking at the last space, but only when it lies in the
    # final 20% of the allowance so we don't discard too much text.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]

    return truncated + suffix
|
|
|
|
def text_processing_pipeline(text, operations=None):
    """Run *text* through the named operations in order.

    Returns a tuple ``(final_text, steps)`` where *steps* is a list of
    dicts recording each applied operation with before/after snippets
    (clipped to 50 characters). Unknown operation names are skipped.
    """
    if operations is None:
        operations = [
            'normalize_whitespace',
            'remove_special_characters',
            'convert_to_lowercase',
            'remove_stopwords'
        ]

    operation_map = {
        'normalize_whitespace': normalize_whitespace,
        'remove_special_characters': remove_special_characters,
        'convert_to_lowercase': convert_to_lowercase,
        'remove_stopwords': remove_stopwords,
        'capitalize_words': capitalize_words,
        'truncate_text': truncate_text
    }

    def _clip(snippet):
        # Shorten long snippets for the step log, exactly as before.
        return snippet[:50] + "..." if len(snippet) > 50 else snippet

    result = text
    processing_steps = []

    for op_name in operations:
        func = operation_map.get(op_name)
        if func is None:
            # Unrecognized names are silently ignored (original behavior).
            continue
        previous = result
        result = func(previous)
        processing_steps.append({
            'operation': op_name,
            'before': _clip(previous),
            'after': _clip(result)
        })

    return result, processing_steps
|
|
|
|
def analyze_text_statistics(text):
    """Compute simple summary statistics for *text*.

    Returns a dict with character/word/sentence counts, the average word
    length, and the longest/shortest words (empty values when *text*
    contains no words).
    """
    words = text.split()
    total_words = len(words)

    if words:
        avg_len = sum(map(len, words)) / total_words
        longest = max(words, key=len)
        shortest = min(words, key=len)
    else:
        avg_len, longest, shortest = 0, "", ""

    return {
        'character_count': len(text),
        'word_count': total_words,
        # A run of '.', '!' or '?' counts as a single sentence terminator.
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': avg_len,
        'longest_word': longest,
        'shortest_word': shortest
    }
|
|
|
|
if __name__ == "__main__":
    # Demo: run the default pipeline over a messy sample and print a report.
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters, and
    needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    cleaned, trace = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in trace:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {cleaned}")

    stats = analyze_text_statistics(cleaned)
    print(f"\nText statistics: {stats}")
|
|