| from yargy import ( |
| Parser, |
| rule, |
| and_, or_ |
| ) |
| from yargy.interpretation import fact |
| from yargy.predicates import ( |
| eq, gte, lte, length_eq, |
| dictionary, normalized, |
| ) |
| import pandas as pd |
|
|
|
|
# Fact schema produced by the DATE_RANGE grammar: one (day, month, year)
# triple for the start of the range and one for its end.
DateRange = fact(
    'DateRange',
    ['start_day', 'start_month', 'start_year', 'stop_day', 'stop_month', 'stop_year']
)


class DateRange(DateRange):
    """Date-range fact with gap filling and ISO-like string rendering."""

    # Shared by ALL instances on purpose (class attribute): every explicit
    # start year seen by `normalized` is appended here, and a fact without a
    # start year falls back to the most recently appended one. Seeded with
    # 1900 so the fallback always has a value.
    years_collection = [1900]

    @property
    def normalized(self):
        """Fill in the missing fields in place and return ``self``.

        * ``start_year``: if present it is appended to ``years_collection``;
          if missing it is taken from the last item of ``years_collection``.
        * ``start_day`` / ``start_month``: default to ``0`` when missing.
        * ``stop_*``: each missing field is copied from the matching
          ``start_*`` field.

        Note that this property has side effects: it mutates the instance
        and the class-level ``years_collection``.
        """
        if self.start_year is not None:
            self.years_collection.append(self.start_year)
        else:
            self.start_year = self.years_collection[-1]

        if self.start_day is None:
            self.start_day = 0

        if self.start_month is None:
            self.start_month = 0

        if self.stop_year is None:
            self.stop_year = self.start_year

        if self.stop_month is None:
            self.stop_month = self.start_month

        if self.stop_day is None:
            self.stop_day = self.start_day

        return self

    @staticmethod
    def _format_date(year, month, day):
        """Render ``year-MM-DD``; month and day are zero-padded to 2 chars."""
        return f'{year}-{str(month).zfill(2)}-{str(day).zfill(2)}'

    @property
    def get_start_date(self):
        """Start of the range as a ``year-MM-DD`` string."""
        return self._format_date(self.start_year, self.start_month, self.start_day)

    @property
    def get_stop_date(self):
        """End of the range as a ``year-MM-DD`` string."""
        return self._format_date(self.stop_year, self.stop_month, self.stop_day)
|
|
# Russian month names (dictionary form, lower case) mapped to the month
# number 1..12. Used both as the word list for matching and as the lookup
# table that turns a matched, normalized month name into its number.
MONTHS = {
    'январь': 1,
    'февраль': 2,
    'март': 3,
    'апрель': 4,
    'май': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12,
}
|
|
|
|
# Months written as upper-case Roman numerals (Latin letters I, V, X),
# mapped to the month number 1..12.
MONTHS_LATIN = {
    'I': 1,
    'II': 2,
    'III': 3,
    'IV': 4,
    'V': 5,
    'VI': 6,
    'VII': 7,
    'VIII': 8,
    'IX': 9,
    'X': 10,
    'XI': 11,
    'XII': 12,
}
|
|
# Day of month (1..31) that opens the range; stored as int in `start_day`.
DAY_START = and_(
    gte(1),
    lte(31)
).interpretation(
    DateRange.start_day.custom(int)
)


# Day of month (1..31) that closes the range; stored as int in `stop_day`.
DAY_STOP = and_(
    gte(1),
    lte(31)
).interpretation(
    DateRange.stop_day.custom(int)
)
|
|
# Month given as a Russian word. The matched word is normalized first and
# then looked up in MONTHS to obtain the month number.
MONTH_NAME_START = dictionary(MONTHS).interpretation(
    DateRange.start_month.normalized().custom(MONTHS.__getitem__)
)


MONTH_NAME_STOP = dictionary(MONTHS).interpretation(
    DateRange.stop_month.normalized().custom(MONTHS.__getitem__)
)


# Month given as a Roman numeral. No normalization here: the matched text is
# looked up in MONTHS_LATIN as is.
# NOTE(review): `dictionary` is a morphology-based predicate; confirm that it
# actually fires on upper-case Latin tokens such as 'IV' in the yargy version
# in use (otherwise a plain membership predicate may be needed).
MONTH_LATIN_NAME_START = dictionary(MONTHS_LATIN).interpretation(
    DateRange.start_month.custom(MONTHS_LATIN.__getitem__)
)


MONTH_LATIN_NAME_STOP = dictionary(MONTHS_LATIN).interpretation(
    DateRange.stop_month.custom(MONTHS_LATIN.__getitem__)
)
|
|
# Month given as a number 1..12; stored as int in `start_month`.
MONTH_START = and_(
    gte(1),
    lte(12)
).interpretation(
    DateRange.start_month.custom(int)
)


# Month given as a number 1..12; stored as int in `stop_month`.
MONTH_STOP = and_(
    gte(1),
    lte(12)
).interpretation(
    DateRange.stop_month.custom(int)
)
|
|
|
|
# Full year in the range 1800..2100; stored as int in `start_year`.
YEAR_START = and_(
    gte(1800),
    lte(2100)
).interpretation(
    DateRange.start_year.custom(int)
)


# Full year in the range 1800..2100; stored as int in `stop_year`.
YEAR_STOP = and_(
    gte(1800),
    lte(2100)
).interpretation(
    DateRange.stop_year.custom(int)
)


# Two-character year 00..99; always expanded into the 1900s (e.g. 43 -> 1943).
YEAR_SHORT_START = and_(
    length_eq(2),
    gte(0),
    lte(99)
).interpretation(
    DateRange.start_year.custom(lambda value: 1900 + int(value))
)


# Same as YEAR_SHORT_START, but stored in `stop_year`.
YEAR_SHORT_STOP = and_(
    length_eq(2),
    gte(0),
    lte(99)
).interpretation(
    DateRange.stop_year.custom(lambda value: 1900 + int(value))
)
|
|
# The word that may follow a year: the abbreviation 'г' with an optional
# trailing dot, or any form of the word 'год'.
YEAR_WORD = or_(
    rule('г', eq('.').optional()),
    rule(normalized('год'))
)
|
|
# Separator between the start and the end of a range: ASCII hyphen-minus,
# en dash or em dash. (The em dash used to be listed twice, which left the
# en dash -- a common range separator -- unmatched.)
PUNCT_DIVISION_DATES = or_(
    rule('-'),
    rule('–'),
    rule('—')
)
|
|
# Separator between the parts of a single date: dot or slash.
PUNCT = or_(
    rule('.'),
    rule('/')
)
|
|
# Top-level grammar: every alternative below yields one DateRange fact.
# Notation used in the comments: DAY/MONTH/YEAR are the predicates defined
# above, SEP is PUNCT ('.' or '/'), DASH is PUNCT_DIVISION_DATES, [x] is
# optional, and "year word" is YEAR_WORD. Fields a rule does not capture stay
# unset and are filled in later by `DateRange.normalized`.
DATE_RANGE = or_(

    # DAY DASH DAY [SEP] MONTH [SEP] [YEAR] [year word]
    # Two days sharing one month (and, optionally, one year).
    rule(
        DAY_START,
        PUNCT_DIVISION_DATES,
        DAY_STOP,
        PUNCT.optional(),
        or_(
            MONTH_NAME_START,
            MONTH_START,
            MONTH_LATIN_NAME_START
        ),
        PUNCT.optional(),
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ).optional(),
        YEAR_WORD.optional()
    ),

    # DAY [SEP] MONTH DASH DAY [SEP] MONTH [SEP] [YEAR] [year word]
    # Two day+month pairs; the single trailing year is captured as the
    # START year.
    rule(
        DAY_START,
        PUNCT.optional(),
        or_(
            MONTH_NAME_START,
            MONTH_START,
            MONTH_LATIN_NAME_START
        ),
        PUNCT_DIVISION_DATES,
        DAY_STOP,
        PUNCT.optional(),
        or_(
            MONTH_NAME_STOP,
            MONTH_STOP,
            MONTH_LATIN_NAME_STOP
        ),
        PUNCT.optional(),
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ).optional(),
        YEAR_WORD.optional()
    ),

    # DAY [SEP] MONTH [SEP] YEAR DASH DAY [SEP] MONTH [SEP] YEAR [year word]
    # Two complete dates.
    rule(
        DAY_START,
        PUNCT.optional(),
        or_(
            MONTH_NAME_START,
            MONTH_START,
            MONTH_LATIN_NAME_START
        ),
        PUNCT.optional(),
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ),
        PUNCT_DIVISION_DATES,
        DAY_STOP,
        PUNCT.optional(),
        or_(
            MONTH_NAME_STOP,
            MONTH_STOP,
            MONTH_LATIN_NAME_STOP
        ),
        PUNCT.optional(),
        or_(
            YEAR_STOP,
            YEAR_SHORT_STOP
        ),
        YEAR_WORD.optional()
    ),

    # DAY SEP ROMAN-MONTH '-' YEAR DASH DAY SEP ROMAN-MONTH '-' YEAR [year word]
    # Two complete dates with a Roman-numeral month and a literal hyphen
    # before the year.
    rule(
        DAY_START,
        PUNCT,
        MONTH_LATIN_NAME_START,
        '-',
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ),
        PUNCT_DIVISION_DATES,
        DAY_STOP,
        PUNCT,
        MONTH_LATIN_NAME_STOP,
        '-',
        or_(
            YEAR_STOP,
            YEAR_SHORT_STOP
        ),
        YEAR_WORD.optional()
    ),

    # DAY [SEP] MONTH [SEP] [YEAR] [year word]
    # A single date; the year may be omitted.
    rule(
        DAY_START,
        PUNCT.optional(),
        or_(
            MONTH_START,
            MONTH_NAME_START,
            MONTH_LATIN_NAME_START
        ),
        PUNCT.optional(),
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ).optional(),
        YEAR_WORD.optional()
    ),

    # DAY SEP ROMAN-MONTH '-' YEAR [year word]
    # A single date with a Roman-numeral month and a literal hyphen before
    # the year.
    rule(
        DAY_START,
        PUNCT,
        MONTH_LATIN_NAME_START,
        '-',
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ),
        YEAR_WORD.optional()
    ),

    # YEAR [year word]
    # A bare full year.
    rule(
        YEAR_START,
        YEAR_WORD.optional()
    ),

    # MONTH-NAME YEAR [year word]
    # A month word followed by a year.
    rule(
        MONTH_NAME_START,
        or_(
            YEAR_START,
            YEAR_SHORT_START
        ),
        YEAR_WORD.optional()
    ),
).interpretation(
    DateRange
)
|
|
|
|
def date_extractor_for_diary(text, max_date_offset=3):
    """Split diary text into entries, one per paragraph that starts with a date.

    The text is split on ``'\\n'``. A paragraph opens a new entry when the
    DATE_RANGE grammar finds a match whose first span starts at a character
    offset below ``max_date_offset``. Every paragraph (including the one
    holding the date) is added to the current entry, followed by ``'\\n'``.

    Args:
        text: The whole diary as one string.
        max_date_offset: A match counts as an entry header only if it starts
            at an offset strictly below this value (default 3, i.e. offsets
            0, 1 and 2).

    Returns:
        A ``pandas.DataFrame`` with the columns ``date_start``, ``date_stop``
        (strings produced by ``DateRange.get_start_date`` /
        ``get_stop_date``) and ``text``. Text that precedes the first dated
        paragraph is kept as a row whose dates are ``None``; if that leading
        text is whitespace only, it is dropped.
    """
    # Building the parser does not depend on the paragraph, so do it once.
    parser = Parser(DATE_RANGE)

    # Each item is [date_start, date_stop, lines]. The first item collects
    # whatever comes before the first dated paragraph and so has no dates;
    # keeping it as a row keeps all three columns the same length.
    entries = [[None, None, []]]

    for paragraph in text.split('\n'):
        for match in parser.findall(paragraph):
            record = match.fact
            # Check the position BEFORE normalizing: `normalized` records the
            # year as the fallback for later entries, and a date mentioned in
            # the middle of a paragraph must not change that fallback.
            if record.spans[0].start < max_date_offset:
                record = record.normalized
                entries.append(
                    [record.get_start_date, record.get_stop_date, []]
                )
                break
        entries[-1][2].append(paragraph + '\n')

    res = {
        'date_start': [],
        'date_stop': [],
        'text': [],
    }
    for date_start, date_stop, lines in entries:
        entry = ''.join(lines)
        # Only the undated leading chunk can be blank; skip it in that case.
        if date_start is None and not entry.strip():
            continue
        res['date_start'].append(date_start)
        res['date_stop'].append(date_stop)
        res['text'].append(entry)

    return pd.DataFrame(res)
|
|
def normalize_dates(start, stop):
    """Collapse a date range into one label.

    Returns ``start`` unchanged when both ends are equal, otherwise the
    string ``'<start> - <stop>'``.
    """
    if start == stop:
        return start
    return f'{start} - {stop}'
|
|
| |
|
|
| |
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
|
|
| |
|
|
| |
|
|
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |