Make the parsing compatible with the new structure of HTML exports

This commit is contained in:
Christian Wolf 2024-03-14 10:56:00 +01:00
parent 08a9f7bd0e
commit 5c4b0106fc
2 changed files with 101 additions and 22 deletions

View File

@ -15,6 +15,11 @@ class IncompleteRoundException(Exception):
super(IncompleteRoundException, self).__init__(*args) super(IncompleteRoundException, self).__init__(*args)
class CannotParseRowException(Exception):
    """Signal that a row parser does not recognise the table row it was given."""

    def __init__(self, *args):
        # Zero-argument super() is the Python 3 form of the explicit two-argument call.
        super().__init__(*args)
class HtmlParser: class HtmlParser:
def __init__(self, text: str, fileName: str = None): def __init__(self, text: str, fileName: str = None):
self.l = logging.getLogger("solo_turnier.html_parser") self.l = logging.getLogger("solo_turnier.html_parser")
@ -56,46 +61,119 @@ class HtmlParser:
def parseResult(self) -> HtmlResultImport: def parseResult(self) -> HtmlResultImport:
participants = {} participants = {}
def __parseRows(rows, finalist: bool): nameRegex = re.compile("(.*) \\(([0-9]+)\\)")
def __parseRow(row): def __parseNameAndId(string: str, tds) -> tuple[str, str]:
tds = row.find_all("td") match = nameRegex.fullmatch(string)
if match is None:
self.l.error("Could not match %s to regex search pattern", str(tds))
raise CannotParseRowException(f"Could not match {tds} to regex search pattern for 'name (id)'")
name = match.group(1)
number = match.group(2)
return name, number
if len(tds) != 2: def __parseRows(rows, parsers):
return def parseRow(row):
for parser in parsers:
try:
parser(row('td'))
return
except CannotParseRowException:
pass
if tds[1].contents[0].startswith("Alle Starter weiter genommen."): # No parser was found if we get here.
self.l.info("No excluded starters found.") self.l.error('Cannot parse row in table.')
return
regex = re.compile("(.*) \\(([0-9]+)\\)") for row in rows:
parseRow(row)
def __ensureLength(tds, length):
    """Raise CannotParseRowException unless ``tds`` has exactly ``length`` entries."""
    actual = len(tds)
    if actual == length:
        return
    raise CannotParseRowException('The row has %d entries but %d are expected.' % (actual, length))
def __parseFormationRowGeneric(tds, finalist):
__ensureLength(tds, 2)
place = tds[0].contents[0] place = tds[0].contents[0]
name, number = __parseNameAndId(tds[1].contents[0], tds)
match = regex.fullmatch(tds[1].contents[0])
if match is None:
self.l.error("Could not match %s to regex search pattern", str(tds))
raise Exception(f"Could not match {tds} to regex search pattern")
name = match.group(1)
number = match.group(2)
participant = HtmlParticipant(name, number) participant = HtmlParticipant(name, number)
participant.finalist = finalist participant.finalist = finalist
participant.club = ''
participants[participant] = place participants[participant] = place
for row in rows:
__parseRow(row)
def __parseFirstTable(table): def __parseFirstTable(table):
roundName = table.tr.td.contents[0] roundName = table.tr.td.contents[0]
if roundName != "Endrunde": if roundName != "Endrunde":
self.l.warning("Found table with round name %s.", roundName) self.l.warning("Found table with round name %s.", roundName)
raise IncompleteRoundException("Could not parse HTML file") raise IncompleteRoundException("Could not parse HTML file")
__parseRows(table.find_all("tr")[2:], True) def __parseFormationRow(tds):
__parseFormationRowGeneric(tds, True)
def __parsePairRow(tds):
    """Parse one row of the final-round table in the four-cell pair layout.

    The first cell supplies the place. The second cell supplies the
    ``name (id)`` text and, inside an ``<i>`` element, the club. The
    resulting participant is stored in ``participants`` as a finalist.

    Raises:
        CannotParseRowException: if the row does not have four cells, has
            no ``<i>`` element in its second cell, or the name text does
            not match the ``name (id)`` pattern.
    """
    __ensureLength(tds, 4)

    place = tds[0].contents[0]

    tdNameClub = tds[1]
    clubTag = tdNameClub.i
    if clubTag is None:
        # Without this guard the missing tag surfaces as an AttributeError,
        # which the row-parser chain does not catch.
        raise CannotParseRowException('The row has no <i> element holding the club.')
    # Detach the club element; presumably this leaves only the "name (id)"
    # text in the cell - confirm against a current HTML export.
    tdClub = clubTag.extract()
    name, number = __parseNameAndId(tdNameClub.contents[0], tds)

    participant = HtmlParticipant(name, number)
    participant.finalist = True
    participant.club = tdClub.contents[0]
    participants[participant] = place
__parseRows(table.find_all("tr")[2:], [__parsePairRow, __parseFormationRow,])
def __parseRemainingTables(tables): def __parseRemainingTables(tables):
def __parseFormationRow(tds):
    """Parse a formation row and record its participant as a non-finalist."""
    __parseFormationRowGeneric(tds, finalist=False)
def __parsePairRow(tds):
    """Parse one three-cell pair row: place, ``name (id)`` and club.

    The resulting participant is stored in ``participants``.

    Raises:
        CannotParseRowException: if the row does not have three cells or
            the name text does not match the ``name (id)`` pattern.
    """
    __ensureLength(tds, 3)

    place = tds[0].contents[0]
    name, number = __parseNameAndId(tds[1].contents[0], tds)

    participant = HtmlParticipant(name, number)
    # These tables follow the final-round table, so their rows are not
    # finalists; the formation row parser for the same tables also passes False.
    participant.finalist = False
    participant.club = tds[2].contents[0]
    participants[participant] = place
def __parseSeparatorRow(tds):
    """Accept a single-cell row whose cell contains no text; reject anything else."""
    __ensureLength(tds, 1)

    hasText = any(True for _ in tds[0].stripped_strings)
    if hasText:
        raise CannotParseRowException('No empty string')
# Raw string: "\." is an invalid escape sequence in an ordinary string
# literal. The compiled pattern is the same as before.
regexZwischenRunde = re.compile(r'[1-9]\. Zwischenrunde')

def __parseRoundHeading(tds):
    """Accept a single-cell row that is a round heading.

    A heading is text starting with ``Vorrunde`` or with a digit 1-9
    followed by ``. Zwischenrunde``. Nothing is recorded for such a row.

    Raises:
        CannotParseRowException: if the row does not have exactly one cell
            or its text is not one of the headings above.
    """
    __ensureLength(tds, 1)

    s = ''.join(tds[0].stripped_strings)
    if s.startswith('Vorrunde'):
        return
    if regexZwischenRunde.match(s) is not None:
        return

    raise CannotParseRowException('Kein Header einer Runde gefunden.')
def __parseAllSolosQualifiedFormation(tds):
    """Accept a two-cell row whose second cell starts with the 'all starters advanced' notice."""
    __ensureLength(tds, 2)

    firstContent = tds[1].contents[0]
    if not firstContent.startswith("Alle Starter weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Starter weiter genommen"')
def __parseAllSolosQualifiedPair(tds):
    """Accept a three-cell row whose second cell starts with the 'all teams advanced' notice."""
    __ensureLength(tds, 3)

    firstContent = tds[1].contents[0]
    if not firstContent.startswith("Alle Mannschaften weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Mannschaften weiter genommen"')
for table in tables: for table in tables:
__parseRows(table.find_all("tr"), False) __parseRows(table.find_all("tr"), [__parseAllSolosQualifiedFormation, __parseAllSolosQualifiedPair, __parsePairRow, __parseFormationRow, __parseSeparatorRow, __parseRoundHeading])
tables = self.soup.find("div", class_="extract").find_all("table") tables = self.soup.find("div", class_="extract").find_all("table")

View File

@ -3,6 +3,7 @@ class HtmlParticipant:
self.name = name self.name = name
self.id = id self.id = id
self.finalist = None self.finalist = None
self.club = None
def __eq__(self, o): def __eq__(self, o):
if type(o) != HtmlParticipant: if type(o) != HtmlParticipant: