# src/streamlit_app.py — DOI / Crossref XML generator for DORA records.
# (Provenance note: file originated from a Hugging Face Space by
# andrehoffmann80, commit a57f2bb, "Update src/streamlit_app.py".)
import datetime
from urllib.parse import quote
import requests
from lxml import etree
import streamlit as st
# =====================================================================
# Namespaces
# =====================================================================
# XML namespace URIs used when building the Crossref deposit and when
# reading MODS records fetched from DORA.
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"  # Crossref deposit schema (default namespace)
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"  # for xsi:schemaLocation
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"  # JATS, used for chapter abstracts
XML_NS = "http://www.w3.org/XML/1998/namespace"  # built-in xml: namespace
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"  # Crossref AccessIndicators (license info)
MODS_NS = "http://www.loc.gov/mods/v3"  # MODS records served by DORA
# Clark-notation attribute name for xml:lang (i.e. "{http://...}lang")
XML_LANG = f"{{{XML_NS}}}lang"
# =====================================================================
# Hilfsfunktionen
# =====================================================================
def clean_text(text: str) -> str:
    """Normalize text extracted from MODS/PDF sources.

    Removes soft hyphens (U+00AD) left over from PDF hyphenation and
    replaces line breaks with spaces, then strips surrounding whitespace.
    All other characters are left unchanged.

    Args:
        text: Raw text; falsy values (None, "") yield "".

    Returns:
        The cleaned string.
    """
    if not text:
        return ""
    return (
        text.replace("\u00AD", "")  # soft hyphen (escape form)
        .replace("­", "")           # literal soft hyphen pasted into source
        .replace("\r\n", "\n")      # bug fix: normalize CRLF so no stray \r survives
        .replace("\r", "\n")        # lone CR (old-Mac style) treated as a line break too
        .replace("\n", " ")
    ).strip()
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matched by *xpath*, or ''.

    Empty elements (no text) and missing elements both yield "".
    """
    found = node.find(xpath, namespaces=ns)
    if found is None or not found.text:
        return ""
    return clean_text(found.text)
def build_dora_mods_url(base_url: str, repo_code: str, object_or_url: str) -> str:
    """Build the MODS download URL for a DORA ID such as 'wsl:41900'.

    If *object_or_url* is already an http(s) URL it is returned untouched.
    By default admin.dora.lib4ri.ch serves the datastream download.
    """
    if object_or_url.startswith(("http://", "https://")):
        return object_or_url
    object_id = quote(object_or_url, safe="")
    root = base_url.rstrip("/")
    return f"{root}/{repo_code}/islandora/object/{object_id}/datastream/MODS/download"
def build_persistent_url(repo_code: str, object_id: str) -> str:
    """Return the public persistent item URL.

    Format: https://www.dora.lib4ri.ch/{repo}/item/{id} — the public domain
    is always used for persistent links, regardless of the admin base URL.
    """
    return "/".join(["https://www.dora.lib4ri.ch", repo_code, "item", object_id])
def fetch_mods_xml(mods_url: str) -> etree._Element:
    """Download a MODS record from *mods_url* and return the parsed root element.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.

    A recovering parser is used because some DORA records contain
    malformed XML (e.g. unescaped HTML inside note fields).
    """
    # Bug fix: a missing timeout could hang the Streamlit worker forever.
    resp = requests.get(mods_url, timeout=30)
    resp.raise_for_status()
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    return etree.fromstring(resp.content, parser=parser)
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Args:
        book_root: Parsed MODS root element (lxml).
        repo_base_url: Base URL ending in the repo code; its last path
            segment is the fallback repository code when the local ID
            carries none.

    Returns:
        dict with title/series/publisher/date/identifier fields plus
        ``editors`` and ``authors`` lists of {'given', 'family'} dicts.
    """
    ns = book_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    # Book title
    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series (optional)
    series_title = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title",
        ns
    )
    series_issn = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']",
        ns
    )

    # Editors and authors (authors are used at book level for monographs)
    editors = []
    authors = []
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is None:
            continue
        # Bug fix: an empty <roleTerm/> has text None — the original called
        # .lower() unconditionally and raised AttributeError.
        role_text = (role.text or "").strip().lower()
        if role_text not in ("editor", "author"):
            continue
        person = {
            "given": get_text(name, "mods:namePart[@type='given']", ns),
            "family": get_text(name, "mods:namePart[@type='family']", ns),
        }
        (editors if role_text == "editor" else authors).append(person)

    # Publisher
    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Publication year: prefer the explicitly keyed w3cdtf date
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    # DOI & persistent URL
    # (persistent format example: https://www.dora.lib4ri.ch/psi/item/psi:84778)
    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id:
        # Fallback: take the DOI suffix as the ID if possible
        book_id = book_doi.split("/")[-1] if book_doi else ""
    # Repo code comes from the ID itself (e.g. 'psi' from 'psi:84778')
    current_repo = book_id.split(":")[0] if ":" in book_id else repo_base_url.split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # ISBN: records without one are flagged for <noisbn reason="archive_volume"/>
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "archive_volume" if not isbn_val else None

    # Month/day default to today (the online publication date is "now")
    today = datetime.date.today()
    return {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        # Only the first four chars are used so 'YYYY-MM-DD' values work too
        "pub_year": int(pub_year[:4]) if pub_year else today.year,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason or "",
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": editors,
        "authors": authors,
    }
def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
    """Convert a chapter MODS record into a Crossref <content_item>.

    Returns:
        (content_item element, numeric first page used as sort key;
        999999 when the start page is missing or non-numeric).
    """
    ns = mods_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
    year = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(mods_root, ".//mods:abstract", ns)
    first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)

    # Authors — consistency fix: guard against empty <roleTerm/> and compare
    # case-insensitively, matching parse_book_mods.
    authors = []
    for name in mods_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is not None and (role.text or "").strip().lower() == "author":
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            authors.append((given, family))

    ci = etree.Element("content_item", component_type="chapter")

    # Contributors
    contribs = etree.SubElement(ci, "contributors")
    for idx, (given, family) in enumerate(authors):
        pn = etree.SubElement(
            contribs,
            "person_name",
            sequence="first" if idx == 0 else "additional",
            contributor_role="author",
        )
        etree.SubElement(pn, "given_name").text = given
        etree.SubElement(pn, "surname").text = family

    # Title
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS) — bug fix: only emit when present; an empty
    # <jats:abstract> is rejected by Crossref schema validation.
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date
    pub = etree.SubElement(ci, "publication_date", media_type="online")
    if year:
        etree.SubElement(pub, "year").text = year[:4]

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # License information (AccessIndicators) — must come before doi_data
    ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
    license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
    license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
    license_ref.set("applies_to", "vor")
    if year:
        # Bug fix: the original set start_date="" when no year was found,
        # which is invalid against the Crossref schema.
        license_ref.set("start_date", year[:4] + "-01-01")

    # DOI data with the new persistent resource URL
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi
        chapter_id = doi.split("/")[-1] if "/" in doi else doi
        repo_code_extracted = chapter_id.split(":")[0] if ":" in chapter_id else repo_base_url.split("/")[-1]
        etree.SubElement(doi_data, "resource").text = build_persistent_url(
            repo_code_extracted, chapter_id
        )

    # Sort key: numeric first page, otherwise push to the end
    try:
        page_number = int(first_page)
    except (TypeError, ValueError):
        page_number = 999999
    return ci, page_number
def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """Build a complete Crossref <doi_batch> deposit document.

    Args:
        book_meta: Book-level metadata (shape produced by parse_book_mods).
        depositor_meta: Must contain 'doi_batch_id', 'depositor_name',
            'depositor_email' and 'registrant'.
        chapter_items: (content_item element, first-page sort key) tuples.
            NOTE: the list is sorted in place by page number.
        book_type: 'edited_book', 'monograph', or 'report-paper'
            (internal flag selecting <book> vs <report-paper> structure).

    Returns:
        Pretty-printed UTF-8 XML bytes including the XML declaration.
    """
    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        }
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd"
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
    # Fix: timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted timestamp is identical.
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts
    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]
    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY — structure depends on the publication type
    body = etree.SubElement(doi_batch, "body")
    if book_type == "report-paper":
        report_paper = etree.SubElement(body, "report-paper")
        # report-paper_series_metadata requires <series_metadata> first
        has_series = (book_meta.get("series_title") or book_meta.get("series_issn"))
        if has_series:
            metadata_root = etree.SubElement(report_paper, "report-paper_series_metadata")
            series_metadata = etree.SubElement(metadata_root, "series_metadata")
            if book_meta.get("series_title"):
                stitles = etree.SubElement(series_metadata, "titles")
                etree.SubElement(stitles, "title").text = book_meta["series_title"]
            if book_meta.get("series_issn"):
                etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]
        else:
            metadata_root = etree.SubElement(report_paper, "report-paper_metadata")
    else:
        # Edited book or monograph: use book_series_metadata when a series exists
        book = etree.SubElement(body, "book", book_type=book_type)
        if book_meta.get("series_title") or book_meta.get("series_issn"):
            metadata_root = etree.SubElement(book, "book_series_metadata")
            series_metadata = etree.SubElement(metadata_root, "series_metadata")
            if book_meta.get("series_title"):
                stitles = etree.SubElement(series_metadata, "titles")
                etree.SubElement(stitles, "title").text = book_meta["series_title"]
            if book_meta.get("series_issn"):
                etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]
        else:
            metadata_root = etree.SubElement(book, "book_metadata")

    # CONTRIBUTORS — authors for monographs/reports, editors for edited books
    if book_type in ["monograph", "report-paper"]:
        contributors_list = book_meta.get("authors", [])
        role = "author"
    else:
        contributors_list = book_meta.get("editors", [])
        role = "editor"
    if contributors_list:
        contribs = etree.SubElement(metadata_root, "contributors")
        for idx, person in enumerate(contributors_list):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role=role
            )
            etree.SubElement(pn, "given_name").text = person["given"]
            etree.SubElement(pn, "surname").text = person["family"]

    # TITLES
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # PUBLICATION DATE — month/day only emitted when parseable as integers
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    if book_meta.get("pub_month") and book_meta.get("pub_month").strip():
        try:
            etree.SubElement(pub, "month").text = f"{int(book_meta['pub_month']):02d}"
        except ValueError:
            pass
    if book_meta.get("pub_day") and book_meta.get("pub_day").strip():
        try:
            etree.SubElement(pub, "day").text = f"{int(book_meta['pub_day']):02d}"
        except ValueError:
            pass
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # NOISBN — books only (not valid inside report-paper metadata)
    if book_type != "report-paper" and book_meta.get("noisbn_reason"):
        etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # PUBLISHER
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # PUBLISHER ITEM (report number) — report-paper only
    if book_type == "report-paper" and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(publisher_item, "identifier", id_type="report-number").text = book_meta["report_number"]

    # DOI DATA
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # CHAPTERS — sorted by first page, appended to <report-paper> or <book>.
    # (A dead no-op if/pass block from the original was removed here.)
    chapter_items.sort(key=lambda x: x[1])
    for ci, _page in chapter_items:
        if book_type == "report-paper":
            report_paper.append(ci)
        else:
            book.append(ci)

    return etree.tostring(
        doi_batch,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True
    )
class CrossrefSchemaResolver(etree.Resolver):
    """lxml resolver that fetches included XSD schemas from Crossref and W3C."""

    # Schemas whose canonical location is not under crossref.org/schemas/
    _KNOWN_SCHEMAS = {
        'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
        'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
        'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
        'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
    }

    def resolve(self, url, id, context):
        """Resolve *url* to remotely fetched schema bytes, or None on failure."""
        # Absolute URLs are fetched as-is; known filenames map to W3C,
        # everything else is assumed to live under crossref.org/schemas/.
        if url.startswith(("http://", "https://")):
            target = url
        else:
            target = self._KNOWN_SCHEMAS.get(url, f"https://www.crossref.org/schemas/{url}")
        try:
            reply = requests.get(target, timeout=15)
            reply.raise_for_status()
        except Exception:
            # On fetch failure fall back to lxml's default resolution
            return None
        return self.resolve_string(reply.content, context)
def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """Validate Crossref XML against the official XSD schema.

    The compiled schema is cached under st.session_state['crossref_schema'],
    so only the first validation in a session downloads it.

    Returns:
        tuple: (is_valid, error_messages)
    """
    errors = []
    try:
        # Parse the XML to validate
        doc = etree.fromstring(xml_bytes)
        # Official Crossref XSD schema URL
        schema_url = "https://www.crossref.org/schemas/crossref4.4.2.xsd"
        # Download schema (cached in session state for performance)
        if 'crossref_schema' not in st.session_state:
            try:
                # Parser with custom resolver so xsd:include/import references
                # (e.g. the MathML schemas) can be fetched remotely
                parser = etree.XMLParser()
                parser.resolvers.add(CrossrefSchemaResolver())
                # Download the main schema document
                schema_resp = requests.get(schema_url, timeout=30)
                schema_resp.raise_for_status()
                # Parse schema with the resolver attached
                schema_doc = etree.fromstring(schema_resp.content, parser)
                st.session_state.crossref_schema = etree.XMLSchema(schema_doc)
            except Exception as e:
                errors.append(f"Fehler beim Laden des XSD Schemas: {e}")
                return False, errors
        schema = st.session_state.crossref_schema
        # Validate and collect line-numbered error messages
        is_valid = schema.validate(doc)
        if not is_valid:
            for error in schema.error_log:
                errors.append(f"Zeile {error.line}: {error.message}")
        return is_valid, errors
    except etree.XMLSyntaxError as e:
        errors.append(f"XML Syntax Fehler: {e}")
        return False, errors
    except Exception as e:
        errors.append(f"Unerwarteter Fehler: {e}")
        return False, errors
# =====================================================================
# REPOSITORY CONFIGURATION
# =====================================================================
# Per-repository Crossref deposit settings, keyed by DORA repo code:
#   publisher/registrant: institution names placed in the generated XML
#   prefix:               the repository's DOI prefix
#   role:                 Crossref role, appended to the login id as 'user/role'
REPO_CONFIG = {
    "wsl": {
        "publisher": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "registrant": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "prefix": "10.55419",
        "role": "wslx"
    },
    "psi": {
        "publisher": "Paul Scherrer Institute, PSI",
        "registrant": "Paul Scherrer Institute, PSI",
        "prefix": "10.55402",
        "role": "psit"
    },
    "empa": {
        "publisher": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "registrant": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "prefix": "10.55368",
        "role": "empa"
    },
    "eawag": {
        "publisher": "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
        "registrant": "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
        "prefix": "10.55408",
        "role": "eawa"
    }
}
def main():
    """Streamlit entry point.

    Flow: configure repository & type → load book MODS from DORA →
    edit metadata → load chapter MODS records → build and validate the
    Crossref doi_batch XML → download or upload the deposit.
    """
    st.title("Crossref XML Generator/Uploader")
    st.markdown(
        "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
        "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
    )
    st.subheader("Konfiguration & Quelle")
    col_config, col_source = st.columns(2)
    with col_config:
        st.markdown("#### Verbindung & Typ")
        base_url = st.text_input(
            "DORA Basis-URL",
            value="https://admin.dora.lib4ri.ch"
        )
        repo_list = list(REPO_CONFIG.keys())
        repo_code = st.selectbox(
            "Repository-Code",
            options=repo_list,
            index=0,
            format_func=lambda x: x.upper()
        )
        repo_config = REPO_CONFIG[repo_code]
        repo_base_url = f"{base_url.rstrip('/')}/{repo_code}"
        pub_type = st.radio(
            "Publikationstyp",
            ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
            horizontal=False
        )
        # Map the UI label onto the internal Crossref book_type / report flag
        cr_book_type = "edited_book"
        if "Monograph" in pub_type:
            cr_book_type = "monograph"
        elif "Paper Series" in pub_type:
            cr_book_type = "report-paper"
    with col_source:
        st.markdown("#### MODS-Quelle")
        # Dynamic default ID based on the selected repo
        default_id = "41891"
        if repo_code == "psi":
            default_id = "84057"
        book_id_or_url = st.text_input(
            "DORA-ID oder MODS-URL",
            value=f"{repo_code}:{default_id}",
            help="Beispiel: wsl:41900 oder komplette URL"
        )
        st.write("")  # Spacer
        if st.button("Metadaten laden", type="primary"):
            try:
                mods_url = build_dora_mods_url(base_url, repo_code, book_id_or_url)
                st.info(f"Lade MODS von: {mods_url}")
                book_root = fetch_mods_xml(mods_url)
                meta = parse_book_mods(book_root, repo_base_url)
                # --- Attempt to extract the report number from the MODS record ---
                ns = book_root.nsmap.copy()
                if "mods" not in ns:
                    ns["mods"] = MODS_NS
                report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
                if not report_num:
                    report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
                if not report_num:
                    # Also check <note type="report number">
                    report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
                if report_num:
                    meta["report_number"] = report_num
                    st.info(f"Report Number gefunden: {report_num}")
                # --------------------------------------------------
                # Push the flat fields into session state so the widgets
                # below show the loaded values after st.rerun()
                for k, v in meta.items():
                    if k in ["book_title", "series_title", "series_issn", "publisher_name",
                             "pub_year", "pub_month", "pub_day", "noisbn_reason",
                             "book_doi", "book_resource", "report_number"]:
                        st.session_state[k] = v
                        st.session_state.book_meta[k] = v
                # The persons text area holds one "given;family" pair per line
                if cr_book_type in ["monograph", "report-paper"]:
                    current_list = meta.get("authors", [])
                else:
                    current_list = meta.get("editors", [])
                st.session_state["persons_input"] = "\n".join(f"{e['given']};{e['family']}" for e in current_list)
                st.session_state.book_meta_loaded = True
                st.success("Metadaten erfolgreich geladen.")
                st.rerun()
            except Exception as e:
                st.error(f"Fehler beim Laden der MODS: {e}")
                import traceback
                st.text(traceback.format_exc())
    # Session state init. NOTE(review): this runs after the button code in
    # source order; it relies on Streamlit's rerun model so the keys exist
    # before the button can ever be clicked — confirm for the first render.
    if "book_meta_loaded" not in st.session_state:
        st.session_state.book_meta_loaded = False
    # Current date for defaults
    today = datetime.date.today()
    # Initialize session state keys for widgets if not present
    if "book_title" not in st.session_state:
        st.session_state.book_title = ""
    if "series_title" not in st.session_state:
        st.session_state.series_title = ""
    if "series_issn" not in st.session_state:
        st.session_state.series_issn = ""
    if "publisher_name" not in st.session_state:
        st.session_state.publisher_name = repo_config["publisher"]
    if "pub_year" not in st.session_state:
        st.session_state.pub_year = today.year
    if "pub_month" not in st.session_state:
        st.session_state.pub_month = str(today.month)
    if "pub_day" not in st.session_state:
        st.session_state.pub_day = str(today.day)
    if "noisbn_reason" not in st.session_state:
        st.session_state.noisbn_reason = ""
    if "book_doi" not in st.session_state:
        st.session_state.book_doi = ""
    if "book_resource" not in st.session_state:
        st.session_state.book_resource = ""
    if "report_number" not in st.session_state:
        st.session_state.report_number = ""
    if "persons_input" not in st.session_state:
        st.session_state.persons_input = ""
    if "book_meta" not in st.session_state:
        st.session_state.book_meta = {
            "book_title": "",
            "series_title": "",
            "series_issn": "",
            "publisher_name": repo_config["publisher"],
            "pub_year": today.year,
            "pub_month": str(today.month),
            "pub_day": str(today.day),
            "noisbn_reason": "",
            "book_doi": "",
            "book_resource": "",
            "report_number": "",
            "editors": [],
            "authors": [],
        }
    # CHECK: has the repo code changed since the last run?
    if "last_repo_code" not in st.session_state:
        st.session_state.last_repo_code = repo_code
        st.session_state.registrant = repo_config["registrant"]
        st.session_state.cr_role = repo_config.get("role", "")
    if st.session_state.last_repo_code != repo_code:
        # Repo changed: overwrite the repo-dependent defaults so the bound
        # input widgets pick up the new values
        st.session_state.publisher_name = repo_config["publisher"]
        st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
        st.session_state.registrant = repo_config["registrant"]
        st.session_state.cr_role = repo_config.get("role", "")
        st.session_state.last_repo_code = repo_code
    st.markdown("---")
    st.subheader("Metadaten & Inhalte")
    # Expander keeps the metadata editing UI compact
    with st.expander("Metadaten bearbeiten", expanded=True):
        bm = st.session_state.book_meta
        col_b1, col_b2 = st.columns(2)
        with col_b1:
            st.text_input("Titel", key="book_title")
            st.text_input("Serientitel", key="series_title")
            st.text_input("Serien-ISSN", key="series_issn")
            st.text_input("Publisher Name", key="publisher_name")
            if cr_book_type == "report-paper":
                st.text_input("Report Number", key="report_number")
        with col_b2:
            c_y, c_m, c_d = st.columns(3)
            with c_y:
                st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
            with c_m:
                st.text_input("Monat", key="pub_month")
            with c_d:
                st.text_input("Tag", key="pub_day")
            if cr_book_type != "report-paper":
                st.text_input("noisbn reason", key="noisbn_reason")
        st.markdown("##### Identifikatoren")
        col_id1, col_id2 = st.columns(2)
        with col_id1:
            st.text_input("DOI", key="book_doi")
        with col_id2:
            st.text_input("Resource URL", key="book_resource")
        st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")
        st.markdown("##### Mitwirkende")
        # Label depends on whether this type takes authors or editors
        if cr_book_type in ["monograph", "report-paper"]:
            st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
            label = "Autoren"
        else:
            st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
            label = "Editoren"
        persons_text = st.text_area(label, key="persons_input", height=100)
        # Parse "given;family" lines and write them back into book_meta
        new_persons = []
        for line in persons_text.splitlines():
            line = line.strip()
            if not line:
                continue
            parts = [p.strip() for p in line.split(";")]
            if len(parts) == 2:
                new_persons.append({"given": parts[0], "family": parts[1]})
        if cr_book_type in ["monograph", "report-paper"]:
            bm["authors"] = new_persons
        else:
            bm["editors"] = new_persons
    st.markdown("---")
    st.subheader("Depositor & Batch Info")
    with st.expander("Depositor Details", expanded=False):
        col_d1, col_d2 = st.columns(2)
        with col_d1:
            depositor_name = st.text_input(
                "Depositor Name",
                value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL"
            )
        with col_d2:
            depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")
        # Batch id default: type prefix + current timestamp
        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        batch_prefix = "book"
        if cr_book_type == "report-paper":
            batch_prefix = "report"
        elif cr_book_type == "monograph":
            batch_prefix = "monograph"
        doi_batch_id = st.text_input(
            "DOI Batch ID",
            value=f"{batch_prefix}_{ts}",
            help="Wird im XML-Header verwendet. Sollte eindeutig sein."
        )
        if "registrant" not in st.session_state:
            st.session_state.registrant = repo_config["registrant"]
        registrant = st.text_input("Registrant", value=st.session_state.registrant)
        st.session_state.registrant = registrant
    depositor_meta = {
        "depositor_name": depositor_name,
        "depositor_email": depositor_email,
        "registrant": st.session_state.registrant,
        "doi_batch_id": doi_batch_id
    }
    st.subheader("Kapitel / Inhalte")
    st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")
    st.markdown(
        "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
        "pro Zeile ein."
    )
    chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")
    st.markdown("---")
    st.subheader("XML Generierung")
    if st.button("Crossref XML generieren", type="primary"):
        try:
            # Fetch and convert every listed chapter MODS record
            chapter_items = []
            for line in chapters_text.splitlines():
                line = line.strip()
                if not line:
                    continue
                mods_url = build_dora_mods_url(base_url, repo_code, line)
                st.write(f"Lade Kapitel-MODS von: {mods_url}")
                mods_root = fetch_mods_xml(mods_url)
                ci, page_no = mods_to_content_item(mods_root, repo_base_url)
                chapter_items.append((ci, page_no))
            if not chapter_items and cr_book_type == "edited_book":
                st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")
            # Assemble book_meta from the widget-bound session state values
            book_meta = {
                "book_title": st.session_state.book_title,
                "series_title": st.session_state.series_title,
                "series_issn": st.session_state.series_issn,
                "publisher_name": st.session_state.publisher_name,
                "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
                "pub_month": st.session_state.pub_month,
                "pub_day": st.session_state.pub_day,
                "noisbn_reason": st.session_state.get("noisbn_reason", ""),
                "book_doi": st.session_state.book_doi,
                "book_resource": st.session_state.book_resource,
                "report_number": st.session_state.get("report_number", ""),
                "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
                "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
            }
            xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)
            # Keep the result in session state so download/upload survive reruns
            st.session_state.crossref_xml = xml_bytes
            st.session_state.crossref_filename = "crossref_edited_book.xml"
            st.success("Crossref XML erfolgreich erzeugt!")
            # Validate against the official Crossref XSD schema
            st.subheader("XML Validierung")
            with st.spinner("Validiere XML gegen Crossref Schema..."):
                is_valid, validation_errors = validate_crossref_xml(xml_bytes)
            if is_valid:
                st.success("✓ XML ist valide und bereit für Crossref!")
            else:
                st.error("✗ XML Validierung fehlgeschlagen:")
                for error in validation_errors:
                    st.error(f"  • {error}")
                st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")
        except Exception as e:
            st.error(f"Fehler bei der Erzeugung des XML: {e}")
            import traceback
            st.text(traceback.format_exc())
    # Show download & upload controls whenever an XML exists in session state
    if "crossref_xml" in st.session_state:
        xml_bytes = st.session_state.crossref_xml
        # Download button
        st.download_button(
            label="XML herunterladen",
            data=xml_bytes,
            file_name=st.session_state.crossref_filename,
            mime="application/xml"
        )
        # ---------------------------------------------------------
        # Crossref upload section
        # ---------------------------------------------------------
        st.markdown("---")
        st.subheader("Automatischer Upload zu Crossref")
        # Default role derived from the last selected repository
        if "cr_role" not in st.session_state:
            st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")
        col_u1, col_u2 = st.columns(2)
        with col_u1:
            cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
            # key= binds the widget to the session-state default above
            cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
        with col_u2:
            cr_pass = st.text_input("Crossref Password", type="password")
        if st.button("Upload to Crossref"):
            if not cr_user or not cr_pass:
                st.error("Bitte Username und Passwort für Crossref angeben.")
            else:
                with st.spinner("Lade zu Crossref hoch..."):
                    res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)
                    # upload_to_crossref returns either a Response or an
                    # "Exception: ..." string on transport failure
                    if isinstance(res, str) and res.startswith("Exception"):
                        st.error(f"Upload fehlgeschlagen: {res}")
                    else:
                        # Crossref returns 200 even on some logic errors;
                        # the response text carries the details
                        if res.status_code == 200:
                            if "successfully received" in res.text:
                                st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
                                with st.expander("Server-Antwort ansehen"):
                                    st.text(res.text)
                            else:
                                st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
                                with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
                                    st.text(res.text)
                        else:
                            st.error(f"HTTP Fehler: {res.status_code}")
                            st.text(res.text)
def upload_to_crossref(xml_content, username, password, role=None):
    """POST a metadata deposit to the Crossref deposit servlet.

    Args:
        xml_content: The doi_batch XML bytes/string to deposit.
        username: Crossref account name.
        password: Crossref account password.
        role: Optional role; when given, the login id becomes 'username/role'.

    Returns:
        The requests.Response on completion, or an "Exception: ..." string
        if the HTTP request itself failed.
    """
    url = "https://doi.crossref.org/servlet/deposit"
    # Login id format is 'username' or 'username/role'
    if role and role.strip():
        login_id = f"{username}/{role.strip()}"
    else:
        login_id = username
    # Multipart form expected by the servlet:
    #   operation=doMDUpload, login_id, login_passwd, fname=<file>
    files = {
        'fname': ('crossref_submission.xml', xml_content, 'application/xml')
    }
    data = {
        'operation': 'doMDUpload',
        'login_id': login_id,
        'login_passwd': password
    }
    try:
        return requests.post(url, files=files, data=data, timeout=60)
    except Exception as e:
        return f"Exception: {e}"
# Script entry point (Streamlit re-executes the module on every rerun).
if __name__ == "__main__":
    main()