# hugo-page/scripts/read-competition-notification/competitionNotificationReader/competitionParser.py
#
# 193 lines
# 7.3 KiB
# Python

import bs4
import logging
import re
import os
import jinja2
class ParsingFailedEception(Exception):
    """Raised when a part of a notification mail cannot be parsed.

    The (misspelled) class name is kept unchanged because code outside this
    block may reference it.  No constructor is defined: the inherited
    ``Exception`` constructor already accepts the message arguments.
    """
class CompetitionParser:
    """Extract the fields of a competition notification mail and render them.

    ``parseMail`` reads the couple's names from the first ``<h2>`` element of
    the mail and the remaining fields from the ``<td>`` cells of the first
    ``<table>``.  The parsed values are stored in private attributes which
    ``getFilename`` and ``getContent`` turn into a file path and file content.
    """

    # Lower-cased WDSF age-group names -> abbreviation stored in ``_group``.
    # Names that are not listed are stored unchanged.
    _WDSF_GROUPS = {
        'juvenile i': 'Kin',
        'juvenile ii': 'Kin',
        'junior i': 'Jun 1',
        'junior ii': 'Jun 2',
        'youth': 'Jug',
        'adult': 'Hgr',
        'senior i': 'Mas I',
        'senior ii': 'Mas II',
        'senior iii': 'Mas III',
        'senior iv': 'Mas IV',
        'senior v': 'Mas V',
    }

    # Lower-cased WDSF section names -> abbreviation stored in ``_section``.
    _WDSF_SECTIONS = {
        'standard': 'Std',
        'latin': 'Lat',
    }

    # Transliteration applied by ``_cleanName`` to the (lower-cased) name.
    _UMLAUTS = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})

    def __init__(self):
        """Create a parser with empty fields and the compiled patterns."""
        self._l = logging.getLogger(__name__)

        # Parsed fields; all stay empty strings until a mail has been parsed.
        self._partner = ''
        self._partnerin = ''
        self._date = ''
        self._title = ''
        self._number = ''
        self._group = ''
        self._class = ''
        self._section = ''
        self._ort = ''
        self._verein = ''
        self._telefon = ''

        # Header line: the two names separated by " / ".
        self._reName = re.compile(r'Neue Meldung für (.*) / (.*)!')
        # Date written as day.month.year.
        self._reDate = re.compile(r'([0-9]+)\.([0-9]+)\.([0-9]+)')
        self._reNumber = re.compile(r'Turnier: ([0-9]+)')
        self._rePhone = re.compile(r'Telefon: (\+?[0-9 /-]+)')
        # First group is stored as ``_verein``, second group as ``_ort``.
        self._rePlace = re.compile(r'Ort: (.*), (.*)')
        # "<group> <class letter> <section>", e.g. "Hgr II D Std".
        self._reCompetition = re.compile(r'(.*) ([A-ES]) ((?:Std)|(?:Lat)|(?:Kombi))')
        # "WDSF Open <group> <section>" with an optional " - ..." suffix.
        self._reWDSFCompetition = re.compile(
            r'WDSF Open ([a-zA-Z0-9 ]*) ((?:Standard)|(?:Latin))(?: *-.*)?')
        # Same as above, but with section and group swapped.
        self._reWDSFCompetitionReversed = re.compile(
            r'WDSF Open ((?:Standard)|(?:Latin)) ([a-zA-Z0-9 ]*)(?: *-.*)?')
        # Everything that may not appear in a cleaned name.
        self._reCleaningString = re.compile(r'[^a-z0-9-]')
        self._reDashes = re.compile(r'-+')

    def parseMail(self, body: str):
        """Parse the HTML *body* of a notification mail into the fields.

        Raises:
            ParsingFailedEception: if the header or a table field is missing
                or does not have the expected form.
        """
        soup = bs4.BeautifulSoup(body, 'html.parser')
        self._getNames(soup.h2)
        self._parseTable(soup.table)

    def _getNames(self, h2):
        """Read both names from the header element *h2*.

        Raises:
            ParsingFailedEception: if *h2* is missing, has no plain string
                content or does not match the expected header text.
        """
        # ``h2`` is None when the mail has no header; ``.string`` is None when
        # the header contains nested markup.  Both count as a parsing failure
        # instead of surfacing as AttributeError/TypeError.
        text = getattr(h2, 'string', None)
        matcher = None if text is None else self._reName.match(text)
        if matcher is None:
            self._l.error('Parsing of header "%s" failed.', h2)
            raise ParsingFailedEception('Header could not be successfully parsed')
        self._partner = matcher.group(1)
        self._partnerin = matcher.group(2)

    def _parseTable(self, table):
        """Read date, title, number, competition, place and phone from *table*.

        The first six ``<td>`` cells are used, in that order.

        Raises:
            ParsingFailedEception: if the table is missing, has fewer than six
                cells, or one of the cells cannot be parsed.
        """
        if table is None:
            raise ParsingFailedEception('No table found in mail')
        cells = table('td')
        if len(cells) < 6:
            raise ParsingFailedEception(
                f'Expected at least 6 table cells but found {len(cells)}')

        self._parseDate(self._cellText(cells[0]))
        self._title = self._cellText(cells[1])
        self._parseNumber(self._cellText(cells[2]))
        self._parseCompetition(self._cellText(cells[3]))
        self._parsePlace(self._cellText(cells[4]))
        self._parsePhone(self._cellText(cells[5]))

    @staticmethod
    def _cellText(cell):
        """Return the stripped string content of a table cell."""
        text = cell.string
        if text is None:
            # bs4 yields None for cells that do not hold exactly one string.
            raise ParsingFailedEception('Table cell has no plain text content')
        return text.strip()

    def _parseDate(self, date):
        """Store a ``day.month.year`` date as ``year-month-day``."""
        match = self._reDate.fullmatch(date)
        if match is None:
            raise ParsingFailedEception('Cannot parse date %s in mail' % date)
        day, month, year = match.groups()
        # Zero-pad day and month so that e.g. "5.3.2023" becomes "2023-03-05"
        # and the dates used in file names sort chronologically.
        self._date = f'{year}-{int(month):02d}-{int(day):02d}'

    def _parseNumber(self, content):
        """Store the number from a ``Turnier: <number>`` cell."""
        match = self._reNumber.fullmatch(content)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the turnier number in field {content}')
        self._number = match.group(1)

    def _parseCompetition(self, competition):
        """Store group, class and section of the competition line.

        The line is first tried as "<group> <class> <section>" and then as a
        "WDSF Open" line.

        Raises:
            ParsingFailedEception: if neither form matches.
        """
        if self._parseDTVCompetition(competition):
            return
        if self._parseWDSFCompetition(competition):
            return
        raise ParsingFailedEception(f'No more matchers for the competition line "{competition}" were left.')

    def _parseDTVCompetition(self, competition):
        """Try the "<group> <class> <section>" form; return True on success."""
        match = self._reCompetition.fullmatch(competition)
        if match is None:
            return False
        self._group, self._class, self._section = match.groups()
        return True

    def _parseWDSFCompetition(self, competition):
        """Try both "WDSF Open" forms; return True on success."""
        line = competition.strip()
        forward = self._reWDSFCompetition.fullmatch(line)
        if forward is not None:
            group, section = forward.groups()
        else:
            reverse = self._reWDSFCompetitionReversed.fullmatch(line)
            if reverse is None:
                return False
            section, group = reverse.groups()

        # The group pattern allows spaces, so the capture may carry
        # surrounding blanks; strip them for both forms before the lookup.
        group = group.strip()
        self._group = self._WDSF_GROUPS.get(group.lower(), group)
        self._class = 'WDSF Open'
        self._section = self._WDSF_SECTIONS.get(section.lower(), section)
        return True

    def _parsePlace(self, place):
        """Store both parts of an ``Ort: <verein>, <ort>`` cell."""
        match = self._rePlace.fullmatch(place)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the place entry {place}')
        self._verein = match.group(1)
        self._ort = match.group(2)

    def _parsePhone(self, phone):
        """Store the number from a ``Telefon: <number>`` cell."""
        match = self._rePhone.fullmatch(phone)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the phone line {phone}')
        self._telefon = match.group(1)

    def _cleanName(self, name: str) -> str:
        """Return *name* reduced to lower-case ``[a-z0-9-]`` characters.

        Umlauts and ``ß`` are transliterated, every other disallowed
        character becomes a dash, and runs of dashes are collapsed to one.
        """
        cleanedName = name.lower().translate(self._UMLAUTS)
        cleanedName = self._reCleaningString.sub('-', cleanedName)
        return self._reDashes.sub('-', cleanedName)

    def getFilename(self, prefix: str) -> str:
        """Return the path of the markdown file for the parsed notification.

        The path is ``<prefix>/<first four characters of the date>/<name>.md``
        where the name is built from date, place, both names and the
        competition.
        """
        namePartner = self._cleanName(self._partner)
        namePartnerin = self._cleanName(self._partnerin)
        competitionName = self._cleanName(f'{self._group} {self._class} {self._section}')
        ort = self._cleanName(self._ort)

        filename = f'{self._date}-{ort}-{namePartner}-{namePartnerin}-{competitionName}.md'
        return os.path.join(
            prefix,
            # NOTE(review): assumes the date starts with a four-digit year.
            self._date[0:4],
            # Empty fields would otherwise leave doubled dashes in the name.
            self._reDashes.sub('-', filename),
        )

    def getContent(self) -> str:
        """Render ``contenttemplate.md.tmpl`` (next to this module) with the parsed fields."""
        templatePath = os.path.join(os.path.dirname(__file__), 'contenttemplate.md.tmpl')
        # NOTE(review): the template is assumed to be UTF-8 encoded; the
        # explicit encoding makes reading it independent of the locale.
        with open(templatePath, encoding='utf-8') as fp:
            template = jinja2.Template(fp.read())

        context = {
            'date': self._date,
            'partner': self._partner,
            'partnerin': self._partnerin,
            'verein': self._verein,
            'ort': self._ort,
            'telefon': self._telefon,
            'group': self._group,
            'class': self._class,
            'section': self._section,
            'title': self._title,
            'number': self._number,
        }
        return template.render(**context)