From 5c4b0106fc9ec258f0aae767eaeff5257b2454fd Mon Sep 17 00:00:00 2001
From: Christian Wolf
Date: Thu, 14 Mar 2024 10:56:00 +0100
Subject: [PATCH] Make the parsing compatible with the new structure of HTML exports

---
Review notes (this region is ignored when the patch is applied):
- Rows are now parsed by trying a list of row parsers in order; a parser
  signals "not my kind of row" by raising CannotParseRowException.
- Pair rows found in the non-final tables are marked finalist = False,
  matching the formation rows handled by the same function.
- The "Zwischenrunde" pattern is written as a raw string so that the
  backslash-dot is not an invalid string escape.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy; verify the context lines against the target tree before applying.

 src/solo_turnier/html_parser.py           | 122 ++++++++++++++++++----
 src/solo_turnier/types/htmlParticipant.py |   1 +
 2 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/src/solo_turnier/html_parser.py b/src/solo_turnier/html_parser.py
index cb17e51..d826415 100644
--- a/src/solo_turnier/html_parser.py
+++ b/src/solo_turnier/html_parser.py
@@ -15,6 +15,11 @@ class IncompleteRoundException(Exception):
         super(IncompleteRoundException, self).__init__(*args)
 
 
+class CannotParseRowException(Exception):
+    def __init__(self, *args):
+        super(CannotParseRowException, self).__init__(*args)
+
+
 class HtmlParser:
     def __init__(self, text: str, fileName: str = None):
         self.l = logging.getLogger("solo_turnier.html_parser")
@@ -56,46 +61,119 @@ class HtmlParser:
     def parseResult(self) -> HtmlResultImport:
         participants = {}
 
-        def __parseRows(rows, finalist: bool):
-            def __parseRow(row):
-                tds = row.find_all("td")
+        nameRegex = re.compile("(.*) \\(([0-9]+)\\)")
+        def __parseNameAndId(string: str, tds) -> tuple[str, str]:
+            match = nameRegex.fullmatch(string)
+            if match is None:
+                self.l.error("Could not match %s to regex search pattern", str(tds))
+                raise CannotParseRowException(f"Could not match {tds} to regex search pattern for 'name (id)'")
+            name = match.group(1)
+            number = match.group(2)
+            return name, number
 
-                if len(tds) != 2:
-                    return
+        def __parseRows(rows, parsers):
+            def parseRow(row):
+                for parser in parsers:
+                    try:
+                        parser(row('td'))
+                        return
+                    except CannotParseRowException:
+                        pass
+
+                # No parser was found if we get here.
+                self.l.error('Cannot parse row in table.')
 
-                if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
-                    self.l.info("No excluded starters found.")
-                    return
-
-                regex = re.compile("(.*) \\(([0-9]+)\\)")
+            for row in rows:
+                parseRow(row)
+
+        def __ensureLength(tds, length):
+            if len(tds) != length:
+                raise CannotParseRowException('The row has %d entries but %d are expected.' % (len(tds), length))
+
+        def __parseFormationRowGeneric(tds, finalist):
+                __ensureLength(tds, 2)
 
                 place = tds[0].contents[0]
-
-                match = regex.fullmatch(tds[1].contents[0])
-                if match is None:
-                    self.l.error("Could not match %s to regex search pattern", str(tds))
-                    raise Exception(f"Could not match {tds} to regex search pattern")
-                name = match.group(1)
-                number = match.group(2)
+                name, number = __parseNameAndId(tds[1].contents[0], tds)
 
                 participant = HtmlParticipant(name, number)
                 participant.finalist = finalist
+                participant.club = ''
                 participants[participant] = place
 
-            for row in rows:
-                __parseRow(row)
-
         def __parseFirstTable(table):
             roundName = table.tr.td.contents[0]
             if roundName != "Endrunde":
                 self.l.warning("Found table with round name %s.", roundName)
                 raise IncompleteRoundException("Could not parse HTML file")
 
-            __parseRows(table.find_all("tr")[2:], True)
+            def __parseFormationRow(tds):
+                __parseFormationRowGeneric(tds, True)
+
+            def __parsePairRow(tds):
+                __ensureLength(tds, 4)
+
+                place = tds[0].contents[0]
+                tdNameClub = tds[1]
+                tdClub = tdNameClub.i.extract()
+                name, number = __parseNameAndId(tdNameClub.contents[0], tds)
+
+                participant = HtmlParticipant(name, number)
+                participant.finalist = True
+                participant.club = tdClub.contents[0]
+
+                participants[participant] = place
+
+            __parseRows(table.find_all("tr")[2:], [__parsePairRow, __parseFormationRow,])
 
         def __parseRemainingTables(tables):
+
+            def __parseFormationRow(tds):
+                __parseFormationRowGeneric(tds, False)
+
+            def __parsePairRow(tds):
+                __ensureLength(tds, 3)
+
+                place = tds[0].contents[0]
+                name, number = __parseNameAndId(tds[1].contents[0], tds)
+
+                participant = HtmlParticipant(name, number)
+                participant.finalist = False
+                participant.club = tds[2].contents[0]
+
+                participants[participant] = place
+
+            def __parseSeparatorRow(tds):
+                __ensureLength(tds, 1)
+                if len(list(tds[0].stripped_strings)) == 0:
+                    return
+                raise CannotParseRowException('No empty string')
+
+            regexZwischenRunde = re.compile(r'[1-9]\. Zwischenrunde')
+            def __parseRoundHeading(tds):
+                __ensureLength(tds, 1)
+                s = ''.join(tds[0].stripped_strings)
+                if s.startswith('Vorrunde'):
+                    return
+                if regexZwischenRunde.match(s) is not None:
+                    return
+                raise CannotParseRowException('Kein Header einer Runde gefunden.')
+
+            def __parseAllSolosQualifiedFormation(tds):
+                __ensureLength(tds, 2)
+                if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
+                    return
+                raise CannotParseRowException('Not found the text "Alle Starter weiter genommen"')
+
+            def __parseAllSolosQualifiedPair(tds):
+                __ensureLength(tds, 3)
+                if tds[1].contents[0].startswith("Alle Mannschaften weiter genommen."):
+                    return
+                raise CannotParseRowException('Not found the text "Alle Mannschaften weiter genommen"')
+
+
             for table in tables:
-                __parseRows(table.find_all("tr"), False)
+                __parseRows(table.find_all("tr"), [__parseAllSolosQualifiedFormation, __parseAllSolosQualifiedPair, __parsePairRow, __parseFormationRow, __parseSeparatorRow, __parseRoundHeading])
 
         tables = self.soup.find("div", class_="extract").find_all("table")
 
diff --git a/src/solo_turnier/types/htmlParticipant.py b/src/solo_turnier/types/htmlParticipant.py
index 8502d6e..dae8c52 100644
--- a/src/solo_turnier/types/htmlParticipant.py
+++ b/src/solo_turnier/types/htmlParticipant.py
@@ -3,6 +3,7 @@ class HtmlParticipant:
         self.name = name
         self.id = id
         self.finalist = None
+        self.club = None
 
     def __eq__(self, o):
         if type(o) != HtmlParticipant: