| """ |
| String processing pipeline functions for testing function analysis. |
| """ |
|
|
import re
from collections import Counter
from typing import List
|
|
|
|
def normalize_whitespace(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a
    single space and trim leading/trailing whitespace."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
|
|
|
def remove_special_characters(text, keep_chars=""):
    """Strip every character that is not alphanumeric or whitespace.

    Characters listed in *keep_chars* are escaped and allowed through.
    """
    # Build the same character class as before, just by concatenation.
    allowed = re.escape(keep_chars)
    pattern = "[^a-zA-Z0-9\\s" + allowed + "]"
    return re.sub(pattern, '', text)
|
|
|
|
def convert_to_lowercase(text):
    """Return *text* with every cased character folded to lowercase."""
    return text.lower()
|
|
|
|
def remove_stopwords(text, stopwords=None):
    """Drop common English stopwords from *text*.

    Matching is case-insensitive; the original casing of surviving words
    is preserved. A custom *stopwords* collection may be supplied.
    """
    if stopwords is None:
        # Default set of frequent English function words.
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }

    kept = (w for w in text.split() if w.lower() not in stopwords)
    return ' '.join(kept)
|
|
|
|
def extract_keywords(text, min_length=3):
    """Extract keywords: words whose length is at least *min_length*.

    Note: the comparison is inclusive (``>=``), so words of exactly
    *min_length* characters are kept. The previous docstring incorrectly
    said "longer than min_length".
    """
    return [word for word in text.split() if len(word) >= min_length]
|
|
|
|
def count_word_frequency(text):
    """Count occurrences of each whitespace-separated word in *text*.

    Returns a plain ``dict`` mapping word -> count, in order of first
    appearance (insertion order), matching the original behavior.
    """
    # Counter tallies at C speed; convert back to dict to keep the
    # original return type for callers that test `type(...) is dict`.
    return dict(Counter(text.split()))
|
|
|
|
def capitalize_words(text, exceptions=None):
    """Title-case *text*, leaving small connector words lowercase.

    The first word is always capitalized; any later word found in
    *exceptions* (compared case-insensitively) is lowercased instead.
    """
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    pieces = []
    for idx, token in enumerate(text.split()):
        # Only non-leading exception words stay lowercase.
        skip_cap = idx != 0 and token.lower() in exceptions
        pieces.append(token.lower() if skip_cap else token.capitalize())

    return ' '.join(pieces)
|
|
|
|
def truncate_text(text, max_length=100, suffix="..."):
    """Truncate *text* to at most *max_length* characters, appending *suffix*.

    Text already within the limit is returned unchanged. Otherwise the
    text is cut to leave room for *suffix*, preferring a word boundary
    when one falls late enough in the allowance.
    """
    if len(text) <= max_length:
        return text

    # Bug fix: when max_length <= len(suffix) the original computed a
    # negative slice index (text[:max_length - len(suffix)]), returning
    # nearly the whole string plus the suffix — far over the limit.
    # Fall back to a hard cut with no suffix in that degenerate case.
    if max_length <= len(suffix):
        return text[:max_length]

    truncated = text[:max_length - len(suffix)]

    # Prefer breaking at the last space, but only when it lies in the
    # final 20% of the allowance so we don't discard too much text.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]

    return truncated + suffix
|
|
|
|
def text_processing_pipeline(text, operations=None):
    """Run *text* through the named operations in order.

    Returns a tuple ``(final_text, steps)`` where *steps* is a list of
    dicts recording each applied operation with before/after snippets
    (clipped to 50 characters). Unknown operation names are skipped.
    """
    if operations is None:
        operations = [
            'normalize_whitespace',
            'remove_special_characters',
            'convert_to_lowercase',
            'remove_stopwords'
        ]

    operation_map = {
        'normalize_whitespace': normalize_whitespace,
        'remove_special_characters': remove_special_characters,
        'convert_to_lowercase': convert_to_lowercase,
        'remove_stopwords': remove_stopwords,
        'capitalize_words': capitalize_words,
        'truncate_text': truncate_text
    }

    def _clip(snippet):
        # Shorten long snippets for the step log, exactly as before.
        return snippet[:50] + "..." if len(snippet) > 50 else snippet

    result = text
    processing_steps = []

    for op_name in operations:
        func = operation_map.get(op_name)
        if func is None:
            # Unrecognized names are silently ignored (original behavior).
            continue
        previous = result
        result = func(previous)
        processing_steps.append({
            'operation': op_name,
            'before': _clip(previous),
            'after': _clip(result)
        })

    return result, processing_steps
|
|
|
|
def analyze_text_statistics(text):
    """Compute simple summary statistics for *text*.

    Returns a dict with character/word/sentence counts, the average word
    length, and the longest/shortest words (empty values when *text*
    contains no words).
    """
    words = text.split()
    total_words = len(words)

    if words:
        avg_len = sum(map(len, words)) / total_words
        longest = max(words, key=len)
        shortest = min(words, key=len)
    else:
        avg_len, longest, shortest = 0, "", ""

    return {
        'character_count': len(text),
        'word_count': total_words,
        # A run of '.', '!' or '?' counts as a single sentence terminator.
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': avg_len,
        'longest_word': longest,
        'shortest_word': shortest
    }
|
|
|
|
if __name__ == "__main__":
    # Demo: run the default pipeline over a messy sample and print a report.
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters, and
    needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    cleaned, trace = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in trace:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {cleaned}")

    stats = analyze_text_statistics(cleaned)
    print(f"\nText statistics: {stats}")
|
|