import datetime
from urllib.parse import quote

import requests
from lxml import etree
import streamlit as st

# =====================================================================
# Namespaces
# =====================================================================
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"
MODS_NS = "http://www.loc.gov/mods/v3"
XML_LANG = f"{{{XML_NS}}}lang"


# =====================================================================
# Helper functions
# =====================================================================
def clean_text(text: str) -> str:
    """Remove soft hyphens, PDF hyphenation artifacts and newlines; otherwise unchanged."""
    if not text:
        return ""
    return (
        text.replace("\u00AD", "")  # soft hyphen
        .replace("­", "")           # literal soft-hyphen character (same code point, kept defensively)
        .replace("\n", " ")
    ).strip()


def get_text(node, xpath: str, ns: dict) -> str:
    """Return the cleaned text of the first element matching *xpath* under *node*, or ""."""
    elem = node.find(xpath, namespaces=ns)
    return clean_text(elem.text) if elem is not None and elem.text else ""


def _mods_namespaces(root: etree._Element) -> dict:
    """Build a namespace map for MODS lookups.

    lxml's find()/findall() reject a ``None`` prefix (default namespace),
    so drop it and guarantee a ``mods`` prefix instead.
    """
    ns = {prefix: uri for prefix, uri in root.nsmap.items() if prefix}
    if "mods" not in ns:
        ns["mods"] = MODS_NS
    return ns


def build_dora_mods_url(base_url: str, repo_code: str, object_or_url: str) -> str:
    """
    Build the MODS download URL from a DORA ID such as 'wsl:41900'.

    If a full http(s) URL is passed it is returned unchanged.
    By default admin.dora.lib4ri.ch is used for the download.
    """
    if object_or_url.startswith("http://") or object_or_url.startswith("https://"):
        return object_or_url
    encoded = quote(object_or_url, safe="")
    base_url = base_url.rstrip("/")
    return f"{base_url}/{repo_code}/islandora/object/{encoded}/datastream/MODS/download"


def build_persistent_url(repo_code: str, object_id: str) -> str:
    """
    Build the new persistent URL in the format:
    https://www.dora.lib4ri.ch/{repo}/item/{id}
    """
    # Force public domain for persistent links
    public_base = "https://www.dora.lib4ri.ch"
    return f"{public_base}/{repo_code}/item/{object_id}"


def fetch_mods_xml(mods_url: str) -> etree._Element:
    """Download a MODS file from a URL and return its root element."""
    # Timeout so a stalled DORA server cannot hang the Streamlit app forever.
    resp = requests.get(mods_url, timeout=30)
    resp.raise_for_status()
    # Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    return etree.fromstring(resp.content, parser=parser)


def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Returns a flat dict of strings plus ``editors``/``authors`` lists of
    ``{"given": ..., "family": ...}`` dicts, ready for build_doi_batch_xml().
    """
    ns = _mods_namespaces(book_root)

    # Book title
    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series (if present)
    series_title = get_text(
        book_root, ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title", ns
    )
    series_issn = get_text(
        book_root, ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']", ns
    )

    # Editors (edited books) & authors (monographs)
    editors = []
    authors = []
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        # Guard role.text: an empty <roleTerm/> would crash .lower()
        if role is None or not role.text:
            continue
        role_text = role.text.lower()
        if role_text in ("editor", "author"):
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            person = {"given": given, "family": family}
            if role_text == "editor":
                editors.append(person)
            else:
                # Authors at book level (for monographs)
                authors.append(person)

    # Publisher
    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Publication year (online); prefer the w3cdtf key date, fall back to any dateIssued
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns,
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    # DOI & persistent URI
    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)

    # Persistent URL format, e.g. https://www.dora.lib4ri.ch/psi/item/psi:84778
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id:
        # Fallback: derive the ID from the DOI suffix if possible
        book_id = book_doi.split("/")[-1] if book_doi else ""

    # Derive repo code from the ID itself (e.g. 'psi' from 'psi:84778')
    current_repo = book_id.split(":")[0] if ":" in book_id else repo_base_url.split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # ISBN / noisbn
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "archive_volume" if not isbn_val else None

    # Default to current date when month/day are not provided by MODS
    today = datetime.date.today()

    meta = {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        "pub_year": int(pub_year[:4]) if pub_year else today.year,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason or "",
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": editors,
        "authors": authors,
    }
    return meta


def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
    """Convert a chapter MODS record into a Crossref <content_item>.

    Returns the element plus the first-page number (999999 when unknown),
    which the caller uses as a sort key.
    """
    ns = _mods_namespaces(mods_root)

    title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
    year = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(mods_root, ".//mods:abstract", ns)
    first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)

    # Authors (case-insensitive role match, consistent with parse_book_mods)
    authors = []
    for name in mods_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is not None and role.text and role.text.lower() == "author":
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            authors.append((given, family))

    ci = etree.Element("content_item", component_type="chapter")

    # Contributors
    contribs = etree.SubElement(ci, "contributors")
    for idx, (given, family) in enumerate(authors):
        pn = etree.SubElement(
            contribs,
            "person_name",
            sequence="first" if idx == 0 else "additional",
            contributor_role="author",
        )
        etree.SubElement(pn, "given_name").text = given
        etree.SubElement(pn, "surname").text = family

    # Title
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS) — only when present: an empty <jats:abstract> is schema-invalid
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        p = etree.SubElement(jats_abs, f"{{{JATS_NS}}}p")
        p.text = abstract

    # Publication date
    pub = etree.SubElement(ci, "publication_date", media_type="online")
    if year:
        etree.SubElement(pub, "year").text = year[:4]

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # License information (AccessIndicators) - must come before doi_data
    ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
    license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
    license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
    license_ref.set("applies_to", "vor")
    # Only set start_date when a year is known; an empty attribute is schema-invalid
    if year:
        license_ref.set("start_date", f"{year[:4]}-01-01")

    # DOI
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi
        # New persistent URL format derived from the DOI suffix (e.g. 'wsl:12345')
        chapter_id = doi.split("/")[-1] if "/" in doi else doi
        repo_code_extracted = chapter_id.split(":")[0] if ":" in chapter_id else repo_base_url.split("/")[-1]
        etree.SubElement(doi_data, "resource").text = build_persistent_url(
            repo_code_extracted, chapter_id
        )

    # Sort key: first_page as int, unknown pages sort last
    try:
        page_number = int(first_page)
    except Exception:
        page_number = 999999

    return ci, page_number


def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """
    Build the Crossref doi_batch XML.

    book_type: 'edited_book', 'monograph', or 'report-paper' (custom internal flag).
    Returns the serialized XML as UTF-8 bytes with declaration.
    """
    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        },
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd",
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
    # utcnow() is deprecated since Python 3.12; use an aware UTC datetime
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts
    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]
    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY
    body = etree.SubElement(doi_batch, "body")

    # Determine structure based on book_type
    if book_type == "report-paper":
        report_paper = etree.SubElement(body, "report-paper")
        # Decide between report-paper_metadata and report-paper_series_metadata
        has_series = book_meta.get("series_title") or book_meta.get("series_issn")
        if has_series:
            metadata_root = etree.SubElement(report_paper, "report-paper_series_metadata")
            # 1. SERIES METADATA (required inside report-paper_series_metadata)
            series_metadata = etree.SubElement(metadata_root, "series_metadata")
            if book_meta.get("series_title"):
                stitles = etree.SubElement(series_metadata, "titles")
                etree.SubElement(stitles, "title").text = book_meta["series_title"]
            if book_meta.get("series_issn"):
                etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]
        else:
            metadata_root = etree.SubElement(report_paper, "report-paper_metadata")
    else:
        # BOOK STRUCTURE (edited book or monograph)
        book = etree.SubElement(body, "book", book_type=book_type)
        # Use book_series_metadata when a series exists, book_metadata otherwise
        if book_meta.get("series_title") or book_meta.get("series_issn"):
            metadata_root = etree.SubElement(book, "book_series_metadata")
            series_metadata = etree.SubElement(metadata_root, "series_metadata")
            if book_meta.get("series_title"):
                stitles = etree.SubElement(series_metadata, "titles")
                etree.SubElement(stitles, "title").text = book_meta["series_title"]
            if book_meta.get("series_issn"):
                etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]
        else:
            metadata_root = etree.SubElement(book, "book_metadata")

    # 2. CONTRIBUTORS (editors for edited books, authors otherwise)
    if book_type in ["monograph", "report-paper"]:
        contributors_list = book_meta.get("authors", [])
        role = "author"
    else:
        contributors_list = book_meta.get("editors", [])
        role = "editor"

    if contributors_list:
        contribs = etree.SubElement(metadata_root, "contributors")
        for idx, person in enumerate(contributors_list):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role=role,
            )
            etree.SubElement(pn, "given_name").text = person["given"]
            etree.SubElement(pn, "surname").text = person["family"]

    # 3. TITLES
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # 4. PUBLICATION DATE (schema order: month?, day?, year)
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    if book_meta.get("pub_month") and book_meta.get("pub_month").strip():
        try:
            etree.SubElement(pub, "month").text = f"{int(book_meta['pub_month']):02d}"
        except ValueError:
            pass  # non-numeric month input: silently omit
    if book_meta.get("pub_day") and book_meta.get("pub_day").strip():
        try:
            etree.SubElement(pub, "day").text = f"{int(book_meta['pub_day']):02d}"
        except ValueError:
            pass  # non-numeric day input: silently omit
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # 5. NOISBN (only for books)
    if book_type != "report-paper":
        if book_meta.get("noisbn_reason"):
            etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # 6. PUBLISHER
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # 7. PUBLISHER ITEM (report number) - only for report-paper
    if book_type == "report-paper" and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(
            publisher_item, "identifier", id_type="report-number"
        ).text = book_meta["report_number"]

    # 8. DOI DATA
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # 9. CONTENT ITEMS (chapters), sorted by first page.
    # Chapters are children of <book> for books and of <report-paper> otherwise.
    # sorted() avoids mutating the caller's list.
    for ci, _page in sorted(chapter_items, key=lambda item: item[1]):
        if book_type == "report-paper":
            report_paper.append(ci)
        else:
            book.append(ci)

    xml_bytes = etree.tostring(
        doi_batch, pretty_print=True, encoding="UTF-8", xml_declaration=True
    )
    return xml_bytes


class CrossrefSchemaResolver(etree.Resolver):
    """Custom resolver to fetch included XSD schemas from Crossref and W3C."""

    def resolve(self, url, id, context):
        # Map of known schema locations (MathML parts are hosted at W3C)
        schema_map = {
            'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
            'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
            'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
            'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
        }

        # Determine the URL to fetch
        if url.startswith("http://") or url.startswith("https://"):
            schema_url = url
        elif url in schema_map:
            schema_url = schema_map[url]
        else:
            schema_url = f"https://www.crossref.org/schemas/{url}"

        try:
            response = requests.get(schema_url, timeout=15)
            response.raise_for_status()
            return self.resolve_string(response.content, context)
        except Exception:
            # If fetching fails, return None to use default behavior
            return None


def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """
    Validate Crossref XML against the official XSD schema.

    The compiled schema is cached in Streamlit session state for performance.

    Returns:
        tuple: (is_valid, error_messages)
    """
    errors = []

    try:
        # Parse XML
        doc = etree.fromstring(xml_bytes)

        # Crossref XSD schema URL
        schema_url = "https://www.crossref.org/schemas/crossref4.4.2.xsd"

        # Download schema once per session (cached in session state)
        if 'crossref_schema' not in st.session_state:
            try:
                # Create parser with custom resolver for included schemas
                parser = etree.XMLParser()
                parser.resolvers.add(CrossrefSchemaResolver())

                # Download main schema
                schema_resp = requests.get(schema_url, timeout=30)
                schema_resp.raise_for_status()

                # Parse schema with resolver
                schema_doc = etree.fromstring(schema_resp.content, parser)
                st.session_state.crossref_schema = etree.XMLSchema(schema_doc)
            except Exception as e:
                errors.append(f"Fehler beim Laden des XSD Schemas: {e}")
                return False, errors

        schema = st.session_state.crossref_schema

        # Validation
        is_valid = schema.validate(doc)
        if not is_valid:
            for error in schema.error_log:
                errors.append(f"Zeile {error.line}: {error.message}")

        return is_valid, errors

    except etree.XMLSyntaxError as e:
        errors.append(f"XML Syntax Fehler: {e}")
        return False, errors
    except Exception as e:
        errors.append(f"Unerwarteter Fehler: {e}")
        return False, errors


# =====================================================================
# REPOSITORY CONFIGURATION
# =====================================================================
REPO_CONFIG = {
    "wsl": {
        "publisher": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "registrant": "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
        "prefix": "10.55419",
        "role": "wslx",
    },
    "psi": {
        "publisher": "Paul Scherrer Institute, PSI",
        "registrant": "Paul Scherrer Institute, PSI",
        "prefix": "10.55402",
        "role": "psit",
    },
    "empa": {
        "publisher": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "registrant": "Swiss Federal Laboratories for Materials Science and Technology, Empa",
        "prefix": "10.55368",
        "role": "empa",
    },
    "eawag": {
        "publisher": "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
        "registrant": "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
        "prefix": "10.55408",
        "role": "eawa",
    },
}


def main():
    """Streamlit entry point: load MODS from DORA, edit metadata, generate and upload Crossref XML."""
    st.title("Crossref XML Generator/Uploader")
    st.markdown(
        "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
        "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
    )

    st.subheader("Konfiguration & Quelle")
    col_config, col_source = st.columns(2)

    with col_config:
        st.markdown("#### Verbindung & Typ")
        base_url = st.text_input(
            "DORA Basis-URL", value="https://admin.dora.lib4ri.ch"
        )
        repo_list = list(REPO_CONFIG.keys())
        repo_code = st.selectbox(
            "Repository-Code",
            options=repo_list,
            index=0,
            format_func=lambda x: x.upper(),
        )
        repo_config = REPO_CONFIG[repo_code]
        repo_base_url = f"{base_url.rstrip('/')}/{repo_code}"

        pub_type = st.radio(
            "Publikationstyp",
            ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
            horizontal=False,
        )
        # Map UI choice onto the internal Crossref book_type flag
        cr_book_type = "edited_book"
        if "Monograph" in pub_type:
            cr_book_type = "monograph"
        elif "Paper Series" in pub_type:
            cr_book_type = "report-paper"

    with col_source:
        st.markdown("#### MODS-Quelle")
        # Dynamic default ID based on repo
        default_id = "41891"
        if repo_code == "psi":
            default_id = "84057"
        book_id_or_url = st.text_input(
            "DORA-ID oder MODS-URL",
            value=f"{repo_code}:{default_id}",
            help="Beispiel: wsl:41900 oder komplette URL",
        )
        st.write("")  # Spacer
        if st.button("Metadaten laden", type="primary"):
            try:
                mods_url = build_dora_mods_url(base_url, repo_code, book_id_or_url)
                st.info(f"Lade MODS von: {mods_url}")
                book_root = fetch_mods_xml(mods_url)
                meta = parse_book_mods(book_root, repo_base_url)

                # --- Attempt to extract report number from MODS ---
                ns = _mods_namespaces(book_root)
                report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
                if not report_num:
                    report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
                if not report_num:
                    # Check notes as a last resort
                    report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
                if report_num:
                    meta["report_number"] = report_num
                    st.info(f"Report Number gefunden: {report_num}")
                # --------------------------------------------------

                # Update flat fields in session state for widgets
                for k, v in meta.items():
                    if k in ["book_title", "series_title", "series_issn", "publisher_name",
                             "pub_year", "pub_month", "pub_day", "noisbn_reason",
                             "book_doi", "book_resource", "report_number"]:
                        st.session_state[k] = v
                        st.session_state.book_meta[k] = v

                # Special handling for persons text area
                if cr_book_type in ["monograph", "report-paper"]:
                    current_list = meta.get("authors", [])
                else:
                    current_list = meta.get("editors", [])
                st.session_state["persons_input"] = "\n".join(
                    f"{e['given']};{e['family']}" for e in current_list
                )

                st.session_state.book_meta_loaded = True
                st.success("Metadaten erfolgreich geladen.")
                st.rerun()
            except Exception as e:
                st.error(f"Fehler beim Laden der MODS: {e}")
                import traceback
                st.text(traceback.format_exc())

    # Session state init. NOTE(review): this runs after the load-button code
    # above; it works because Streamlit reruns the whole script and the keys
    # persist from the first render, but the ordering is fragile.
    if "book_meta_loaded" not in st.session_state:
        st.session_state.book_meta_loaded = False

    # Current date for defaults
    today = datetime.date.today()

    # Initialize session state keys for widgets if not present
    if "book_title" not in st.session_state:
        st.session_state.book_title = ""
    if "series_title" not in st.session_state:
        st.session_state.series_title = ""
    if "series_issn" not in st.session_state:
        st.session_state.series_issn = ""
    if "publisher_name" not in st.session_state:
        st.session_state.publisher_name = repo_config["publisher"]
    if "pub_year" not in st.session_state:
        st.session_state.pub_year = today.year
    if "pub_month" not in st.session_state:
        st.session_state.pub_month = str(today.month)
    if "pub_day" not in st.session_state:
        st.session_state.pub_day = str(today.day)
    if "noisbn_reason" not in st.session_state:
        st.session_state.noisbn_reason = ""
    if "book_doi" not in st.session_state:
        st.session_state.book_doi = ""
    if "book_resource" not in st.session_state:
        st.session_state.book_resource = ""
    if "report_number" not in st.session_state:
        st.session_state.report_number = ""
    if "persons_input" not in st.session_state:
        st.session_state.persons_input = ""
    if "book_meta" not in st.session_state:
        st.session_state.book_meta = {
            "book_title": "",
            "series_title": "",
            "series_issn": "",
            "publisher_name": repo_config["publisher"],
            "pub_year": today.year,
            "pub_month": str(today.month),
            "pub_day": str(today.day),
            "noisbn_reason": "",
            "book_doi": "",
            "book_resource": "",
            "report_number": "",
            "editors": [],
            "authors": [],
        }

    # CHECK: has the repo code changed since last run?
    if "last_repo_code" not in st.session_state:
        st.session_state.last_repo_code = repo_code
        st.session_state.registrant = repo_config["registrant"]
        st.session_state.cr_role = repo_config.get("role", "")

    if st.session_state.last_repo_code != repo_code:
        # Repo changed: refresh repo-dependent defaults so the widgets pick them up
        st.session_state.publisher_name = repo_config["publisher"]
        st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
        st.session_state.registrant = repo_config["registrant"]
        st.session_state.cr_role = repo_config.get("role", "")
        st.session_state.last_repo_code = repo_code

    st.markdown("---")
    st.subheader("Metadaten & Inhalte")

    # Use expander for metadata editing to keep UI clean
    with st.expander("Metadaten bearbeiten", expanded=True):
        bm = st.session_state.book_meta
        col_b1, col_b2 = st.columns(2)
        with col_b1:
            st.text_input("Titel", key="book_title")
            st.text_input("Serientitel", key="series_title")
            st.text_input("Serien-ISSN", key="series_issn")
            st.text_input("Publisher Name", key="publisher_name")
            if cr_book_type == "report-paper":
                st.text_input("Report Number", key="report_number")
        with col_b2:
            c_y, c_m, c_d = st.columns(3)
            with c_y:
                st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
            with c_m:
                st.text_input("Monat", key="pub_month")
            with c_d:
                st.text_input("Tag", key="pub_day")
            if cr_book_type != "report-paper":
                st.text_input("noisbn reason", key="noisbn_reason")

        st.markdown("##### Identifikatoren")
        col_id1, col_id2 = st.columns(2)
        with col_id1:
            st.text_input("DOI", key="book_doi")
        with col_id2:
            st.text_input("Resource URL", key="book_resource")
        st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")

        st.markdown("##### Mitwirkende")
        # Decide label based on type
        if cr_book_type in ["monograph", "report-paper"]:
            st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
            label = "Autoren"
        else:
            st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
            label = "Editoren"

        persons_text = st.text_area(label, key="persons_input", height=100)

        # Parse "given;family" lines and save back into the metadata dict
        new_persons = []
        for line in persons_text.splitlines():
            line = line.strip()
            if not line:
                continue
            parts = [p.strip() for p in line.split(";")]
            if len(parts) == 2:
                new_persons.append({"given": parts[0], "family": parts[1]})

        if cr_book_type in ["monograph", "report-paper"]:
            bm["authors"] = new_persons
        else:
            bm["editors"] = new_persons

    st.markdown("---")
    st.subheader("Depositor & Batch Info")

    with st.expander("Depositor Details", expanded=False):
        col_d1, col_d2 = st.columns(2)
        with col_d1:
            depositor_name = st.text_input(
                "Depositor Name",
                value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL",
            )
        with col_d2:
            depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")

        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        batch_prefix = "book"
        if cr_book_type == "report-paper":
            batch_prefix = "report"
        elif cr_book_type == "monograph":
            batch_prefix = "monograph"

        doi_batch_id = st.text_input(
            "DOI Batch ID",
            value=f"{batch_prefix}_{ts}",
            help="Wird im XML-Header verwendet. Sollte eindeutig sein.",
        )
        if "registrant" not in st.session_state:
            st.session_state.registrant = repo_config["registrant"]
        registrant = st.text_input("Registrant", value=st.session_state.registrant)
        st.session_state.registrant = registrant

    depositor_meta = {
        "depositor_name": depositor_name,
        "depositor_email": depositor_email,
        "registrant": st.session_state.registrant,
        "doi_batch_id": doi_batch_id,
    }

    st.subheader("Kapitel / Inhalte")
    st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")
    st.markdown(
        "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
        "pro Zeile ein."
    )
    chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")

    st.markdown("---")
    st.subheader("XML Generierung")

    if st.button("Crossref XML generieren", type="primary"):
        try:
            chapter_items = []
            for line in chapters_text.splitlines():
                line = line.strip()
                if not line:
                    continue
                mods_url = build_dora_mods_url(base_url, repo_code, line)
                st.write(f"Lade Kapitel-MODS von: {mods_url}")
                mods_root = fetch_mods_xml(mods_url)
                ci, page_no = mods_to_content_item(mods_root, repo_base_url)
                chapter_items.append((ci, page_no))

            if not chapter_items and cr_book_type == "edited_book":
                st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")

            # Assemble book_meta from session state / widgets
            book_meta = {
                "book_title": st.session_state.book_title,
                "series_title": st.session_state.series_title,
                "series_issn": st.session_state.series_issn,
                "publisher_name": st.session_state.publisher_name,
                "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
                "pub_month": st.session_state.pub_month,
                "pub_day": st.session_state.pub_day,
                "noisbn_reason": st.session_state.get("noisbn_reason", ""),
                "book_doi": st.session_state.book_doi,
                "book_resource": st.session_state.book_resource,
                "report_number": st.session_state.get("report_number", ""),
                "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
                "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
            }

            xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)

            # Store in session state
            st.session_state.crossref_xml = xml_bytes
            st.session_state.crossref_filename = "crossref_edited_book.xml"

            st.success("Crossref XML erfolgreich erzeugt!")

            # Validate against the official Crossref XSD schema
            st.subheader("XML Validierung")
            with st.spinner("Validiere XML gegen Crossref Schema..."):
                is_valid, validation_errors = validate_crossref_xml(xml_bytes)

            if is_valid:
                st.success("✓ XML ist valide und bereit für Crossref!")
            else:
                st.error("✗ XML Validierung fehlgeschlagen:")
                for error in validation_errors:
                    st.error(f" • {error}")
                st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")

        except Exception as e:
            st.error(f"Fehler bei der Erzeugung des XML: {e}")
            import traceback
            st.text(traceback.format_exc())

    # Display download and upload controls if XML exists in session state
    if "crossref_xml" in st.session_state:
        xml_bytes = st.session_state.crossref_xml

        # Download button
        st.download_button(
            label="XML herunterladen",
            data=xml_bytes,
            file_name=st.session_state.crossref_filename,
            mime="application/xml",
        )

        # ---------------------------------------------------------
        # Crossref Upload Section
        # ---------------------------------------------------------
        st.markdown("---")
        st.subheader("Automatischer Upload zu Crossref")

        # Determine default role if not in session state
        if "cr_role" not in st.session_state:
            st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")

        col_u1, col_u2 = st.columns(2)
        with col_u1:
            cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
            # Use key to bind to session state
            cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
        with col_u2:
            cr_pass = st.text_input("Crossref Password", type="password")

        if st.button("Upload to Crossref"):
            if not cr_user or not cr_pass:
                st.error("Bitte Username und Passwort für Crossref angeben.")
            else:
                with st.spinner("Lade zu Crossref hoch..."):
                    res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)

                if isinstance(res, str) and res.startswith("Exception"):
                    st.error(f"Upload fehlgeschlagen: {res}")
                else:
                    # Crossref returns 200 even on some logic errors, text contains details
                    if res.status_code == 200:
                        if "successfully received" in res.text:
                            st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
                            with st.expander("Server-Antwort ansehen"):
                                st.text(res.text)
                        else:
                            st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
                            with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
                                st.text(res.text)
                    else:
                        st.error(f"HTTP Fehler: {res.status_code}")
                        st.text(res.text)


def upload_to_crossref(xml_content, username, password, role=None):
    """POST the XML to the Crossref deposit endpoint.

    Returns the requests.Response on success, or an "Exception: ..." string
    on transport failure (the caller distinguishes by type).
    """
    url = "https://doi.crossref.org/servlet/deposit"

    # Construct login_id with role if provided (format: username/role)
    login_id = username
    if role and role.strip():
        login_id = f"{username}/{role.strip()}"

    # Multipart form data:
    #   'operation':    'doMDUpload'
    #   'login_id':     username (or username/role)
    #   'login_passwd': password
    #   'fname':        (filename, file_content, content_type)
    files = {
        'fname': ('crossref_submission.xml', xml_content, 'application/xml')
    }
    data = {
        'operation': 'doMDUpload',
        'login_id': login_id,
        'login_passwd': password,
    }

    try:
        response = requests.post(url, files=files, data=data, timeout=60)
        return response
    except Exception as e:
        return f"Exception: {e}"


if __name__ == "__main__":
    main()