Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / pyproject.toml

seanpedrickcase

Sync: Merge pull request #174 from seanpedrick-case/dev

ae79dd2 about 18 hours ago

raw

history blame contribute delete

5.75 kB

	[build-system]
	# `project.license` in this file is a bare SPDX expression string
	# ("AGPL-3.0-only", PEP 639). setuptools only accepts that form from the
	# 77.x series onwards; earlier releases require a `{ text = ... }` or
	# `{ file = ... }` table and fail validation on a plain string, so the
	# lower bound must not admit them.
	requires = ["setuptools>=77.0.3", "wheel"]
	build-backend = "setuptools.build_meta"

	[project]
	name = "doc_redaction"
	version = "2.2.3"
	description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
	readme = "README_PYPI.md"
	authors = [
		{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
	]
	maintainers = [
		{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
	]
	# This licence type is required in order to use PyMuPDF.
	license = "AGPL-3.0-only"
	keywords = [
		"redaction",
		"pdf",
		"nlp",
		"documents",
		"document-processing",
		"gradio",
		"pii",
		"pii-detection",
	]
	classifiers = [
		"Development Status :: 5 - Production/Stable",
		"Intended Audience :: Developers",
		"Intended Audience :: Legal Industry",
		"Topic :: Text Processing :: General",
		"Topic :: Security :: Cryptography",
		"Programming Language :: Python :: 3",
		"Programming Language :: Python :: 3.10",
		"Programming Language :: Python :: 3.11",
		"Programming Language :: Python :: 3.12",
		"Programming Language :: Python :: 3.13",
	]
	requires-python = ">=3.10"
	# Core runtime requirements. Each entry sets an upper version bound only,
	# except gradio_image_annotation_redaction, which is pinned to an exact version.
	dependencies = [
		"pdfminer.six<=20260107",
		"pdf2image<=1.17.0",
		"pymupdf<=1.27.1",
		"bleach<=6.3.0",
		"opencv-python<=4.13.0.92",
		"presidio_analyzer<=2.2.362",
		"presidio_anonymizer<=2.2.362",
		"presidio-image-redactor<=0.0.58",
		"pikepdf<=10.3.0",
		"pandas<=2.3.3",
		"scikit-learn<=1.8.0",
		"spacy<=3.8.14",
		"gradio<=6.10.0",
		"boto3<=1.42.91",
		"pyarrow<=23.0.1",
		"openpyxl<=3.1.5",
		"Faker<=40.8.0",
		"python-levenshtein<=0.27.3",
		"spaczz<=0.6.1",
		"gradio_image_annotation_redaction==0.5.5",
		"rapidfuzz<=3.14.5",
		"python-dotenv<=1.2.2",
		"awslambdaric<=3.1.1",
		"python-docx<=1.2.0",
		"polars<=1.38.1",
		"defusedxml<=0.7.1",
		"numpy<=2.4.4",
		"spaces<=0.48.3",
		"google-genai<=1.73.0",
		"openai<=2.31.0",
		"markdown<=3.10.2",
		"tabulate<=0.10.0",
	]

	[project.optional-dependencies]
	# Extras are selected at install time. For example, to install the app with
	# PaddleOCR and VLM support, run one of the following from the base folder
	# in the correct Python environment:
	#   pip install .[paddle,vlm]
	#   uv pip install .[paddle,vlm]
	# Note: GPU use needs the GPU build of Torch; see the `vlm` extra below.

	# For testing
	dev = ["pytest"]
	test = ["pytest", "pytest-cov"]

	# Extra dependencies for PaddleOCR.
	# This installs the CPU version of PaddleOCR. For the GPU-accelerated
	# version, pip install the relevant paddlepaddle-gpu==3.2.1 wheel from:
	# https://www.paddlepaddle.org.cn/packages/stable/cu129/
	paddle = [
		"protobuf<=7.34.0",
		"paddlepaddle>=3.0.0,<=3.2.1",
		"paddleocr<=3.3.0",
		"pycocotools<=2.0.10",
	]

	# Extra dependencies for VLM models.
	# This installs the CPU-compatible version of PyTorch. For CUDA support,
	# run the following manually after installation:
	#   pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu129
	vlm = [
		"torch<=2.9.1",
		"torchvision<=0.24.1",
		"transformers<=5.5.4",
		"accelerate<=1.13.0",
		# Needed for on-the-fly quantisation in transformers.
		"bitsandbytes<=0.49.2",
		# Needed for PaddleOCRVL.
		"sentencepiece<=0.2.1",
		# Optional, so commented out: needed for GPTQ quantised models in transformers.
		#"optimum<=2.1.0",
		# Needed for GPTQ quantised models in transformers. Highly advised to
		# install from a wheel from https://github.com/ModelCloud/GPTQModel
		#"GPTQModel<=5.8.0",
		# Faster inference with transformers. Highly recommended to install
		# from a wheel at https://github.com/Dao-AILab/flash-attention
		#"flash_attn<=2.8.3",
	]

	# Run Gradio as an MCP server.
	mcp = [
		"gradio[mcp]<=6.10.0",
	]

	# Project links published as part of the package metadata.
	[project.urls]
	Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
	Repository = "https://github.com/seanpedrick-case/doc_redaction"

	# Console commands created at install time; each value is "module.path:callable".
	# NOTE(review): `cli_redact` points at doc_redaction.cli_redact, while
	# [tool.setuptools] py-modules also lists a top-level `cli_redact` module.
	# Confirm that both modules exist and that this is the intended target.
	[project.scripts]
	cli_redact = "doc_redaction.cli_redact:main"
	mcp_doc_redaction = "mcp_doc_redaction.server:main"
	doc_redaction_install_deps = "doc_redaction.install_deps:main"

	[tool.setuptools]
	include-package-data = true
	# Single-file top-level modules shipped in addition to the discovered packages.
	py-modules = [
		"app",
		"agent_routes",
		"cli_redact",
		"lambda_entrypoint",
		"load_dynamo_logs",
		"load_s3_logs",
	]

	[tool.setuptools.packages.find]
	where = ["."]
	# Patterns are matched against the full dotted package name, so a bare
	# "pkg" selects only the top-level package. The explicit "pkg.*" entries
	# make sure any subpackages of doc_redaction and tools are shipped as well
	# instead of being silently left out of the build.
	include = [
		"doc_redaction",
		"doc_redaction.*",
		"tools",
		"tools.*",
		"mcp_doc_redaction*",
	]
	# Directories in the repository root that must never be packaged.
	exclude = [
		"test*",
		"skills*",
		"cdk*",
		"src*",
		"example_data*",
	]

	# Configuration for the Ruff linter:
	[tool.ruff]
	line-length = 88

	[tool.ruff.lint]
	select = ["E", "F", "I"]
	ignore = [
		# line-too-long (handled with Black)
		"E501",
		# module-import-not-at-top-of-file (sometimes needed for conditional imports)
		"E402",
	]

	[tool.ruff.lint.per-file-ignores]
	# Allow unused imports in __init__.py
	"__init__.py" = ["F401"]

	# Configuration for the Black formatter:
	[tool.black]
	line-length = 88
	target-version = ["py310"]

	# Configuration for pytest:
	[tool.pytest.ini_options]
	markers = [
		"integration: optional slow tests (CLI PDF smoke; set PYTEST_CLI_INTEGRATION=1 where needed)",
	]
	# Each filter has the form action:message:category:module:lineno.
	filterwarnings = [
		"ignore::DeprecationWarning:click.parser",
		"ignore::DeprecationWarning:weasel.util.config",
		# "builtin type" is the start of a warning message, not a module name,
		# so it goes in the message field; in the module field it never matched.
		"ignore:builtin type:DeprecationWarning",
		"ignore::DeprecationWarning:websockets.legacy",
		"ignore::DeprecationWarning:websockets.server",
		"ignore::DeprecationWarning:spacy.cli._util",
		"ignore::DeprecationWarning:importlib._bootstrap",
	]
	testpaths = ["test"]
	# Glob patterns for test modules. Without the wildcards only files literally
	# named "test_.py" or "_test.py" would be collected.
	python_files = ["test_*.py", "*_test.py"]
	python_classes = ["Test*"]
	python_functions = ["test_*"]
	addopts = [
		"-v",
		"--tb=short",
		"--strict-markers",
		"--disable-warnings",
		# Deselect tests marked `integration` by default.
		"-m",
		"not integration",
	]