import re

import spacy
from spacy.tokenizer import Tokenizer
|
|
| """ |
| python -m spacy package ref_he packages -c /Users/nss/sefaria/project/sefaria/spacy_function_registry.py -b wheel,sdist -n ref_ner -v 1.0.0 |
| python -m spacy huggingface-hub push packages/en_torah_ner-1.0.0/dist/en_torah_ner-1.0.0-py3-none-any.whl -o Sefaria |
| """ |
|
|
@spacy.registry.tokenizers("inner_punct_tokenizer")
def inner_punct_tokenizer_factory():
    """Registered factory returning a tokenizer that also splits on inner punctuation."""
    def inner_punct_tokenizer(nlp):
        # Treat these punctuation marks as infixes so they are split into their own
        # tokens even when they appear inside a token (e.g. in references like "1:1-2:3").
        infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''')
        # Keep the language's default prefix and suffix handling unchanged.
        prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

        return Tokenizer(nlp.vocab,
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=None)
    return inner_punct_tokenizer
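

# Quick check (illustrative sketch, not part of the packaged pipeline): build a blank
# English pipeline and attach the custom tokenizer directly. The sample text below is
# an assumption chosen only to show the inner-punctuation splitting.
if __name__ == "__main__":
    nlp = spacy.blank("en")
    nlp.tokenizer = inner_punct_tokenizer_factory()(nlp)
    doc = nlp("Genesis 1:1-2:3")
    # Expected to yield roughly: ['Genesis', '1', ':', '1', '-', '2', ':', '3']
    print([t.text for t in doc])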