Make the parsing compatible with the new structure of HTML exports
This commit is contained in:
parent
08a9f7bd0e
commit
5c4b0106fc
@ -15,6 +15,11 @@ class IncompleteRoundException(Exception):
|
||||
super(IncompleteRoundException, self).__init__(*args)
|
||||
|
||||
|
||||
class CannotParseRowException(Exception):
    """Signal that a row parser cannot handle the table row it was given.

    Row parsers raise this exception when a row does not have the layout they
    expect; ``parseRow`` catches it and tries the next parser in its list.
    """

    def __init__(self, *args):
        # All arguments are forwarded unchanged to ``Exception``.
        super().__init__(*args)
|
||||
|
||||
|
||||
class HtmlParser:
|
||||
def __init__(self, text: str, fileName: str = None):
|
||||
self.l = logging.getLogger("solo_turnier.html_parser")
|
||||
@ -56,46 +61,119 @@ class HtmlParser:
|
||||
def parseResult(self) -> HtmlResultImport:
|
||||
participants = {}
|
||||
|
||||
def __parseRows(rows, finalist: bool):
|
||||
def __parseRow(row):
|
||||
tds = row.find_all("td")
|
||||
nameRegex = re.compile("(.*) \\(([0-9]+)\\)")
|
||||
def __parseNameAndId(string: str, tds) -> tuple[str, str]:
    """Split a cell text of the form ``name (id)`` into name and id.

    ``string`` must match ``nameRegex`` completely. ``tds`` is only used to
    describe the offending row in the log and exception messages.

    Returns the tuple ``(name, id)``; both parts are strings.
    Raises ``CannotParseRowException`` if ``string`` does not match.
    """
    parsed = nameRegex.fullmatch(string)
    if parsed is not None:
        return parsed.group(1), parsed.group(2)

    self.l.error("Could not match %s to regex search pattern", str(tds))
    raise CannotParseRowException(
        f"Could not match {tds} to regex search pattern for 'name (id)'"
    )
|
||||
|
||||
if len(tds) != 2:
|
||||
return
|
||||
def __parseRows(rows, parsers):
|
||||
def parseRow(row):
|
||||
for parser in parsers:
|
||||
try:
|
||||
parser(row('td'))
|
||||
return
|
||||
except CannotParseRowException:
|
||||
pass
|
||||
|
||||
# No parser was found if we get here.
|
||||
self.l.error('Cannot parse row in table.')
|
||||
|
||||
if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
|
||||
self.l.info("No excluded starters found.")
|
||||
return
|
||||
|
||||
regex = re.compile("(.*) \\(([0-9]+)\\)")
|
||||
for row in rows:
|
||||
parseRow(row)
|
||||
|
||||
def __ensureLength(tds, length):
    """Check that the row consists of exactly ``length`` cells.

    Raises ``CannotParseRowException`` if ``len(tds)`` differs from ``length``.
    """
    actual = len(tds)
    if actual == length:
        return
    raise CannotParseRowException(
        'The row has %d entries but %d are expected.' % (actual, length)
    )
|
||||
|
||||
def __parseFormationRowGeneric(tds, finalist):
|
||||
__ensureLength(tds, 2)
|
||||
|
||||
place = tds[0].contents[0]
|
||||
|
||||
match = regex.fullmatch(tds[1].contents[0])
|
||||
if match is None:
|
||||
self.l.error("Could not match %s to regex search pattern", str(tds))
|
||||
raise Exception(f"Could not match {tds} to regex search pattern")
|
||||
name = match.group(1)
|
||||
number = match.group(2)
|
||||
name, number = __parseNameAndId(tds[1].contents[0], tds)
|
||||
|
||||
participant = HtmlParticipant(name, number)
|
||||
participant.finalist = finalist
|
||||
participant.club = ''
|
||||
participants[participant] = place
|
||||
|
||||
for row in rows:
|
||||
__parseRow(row)
|
||||
|
||||
def __parseFirstTable(table):
|
||||
roundName = table.tr.td.contents[0]
|
||||
if roundName != "Endrunde":
|
||||
self.l.warning("Found table with round name %s.", roundName)
|
||||
raise IncompleteRoundException("Could not parse HTML file")
|
||||
|
||||
__parseRows(table.find_all("tr")[2:], True)
|
||||
def __parseFormationRow(tds):
    """Parse a formation row of the first table; the participant is flagged as finalist."""
    __parseFormationRowGeneric(tds, finalist=True)
|
||||
|
||||
def __parsePairRow(tds):
    """Parse a four-cell pair row of the first table and record the participant.

    The first cell holds the place. The second cell holds the ``name (id)``
    text together with an ``<i>`` element whose first child is stored as the
    club. The participant is flagged as finalist and stored in
    ``participants`` with the place as value.

    Raises ``CannotParseRowException`` if the row does not have four cells or
    the name cell does not match the ``name (id)`` pattern.
    """
    __ensureLength(tds, 4)

    rank = tds[0].contents[0]
    nameCell = tds[1]
    # Take the <i> element out of the cell before the remaining cell content
    # is read as the name.
    clubTag = nameCell.i.extract()
    pairName, pairId = __parseNameAndId(nameCell.contents[0], tds)

    entry = HtmlParticipant(pairName, pairId)
    entry.finalist = True
    entry.club = clubTag.contents[0]

    participants[entry] = rank
|
||||
|
||||
__parseRows(table.find_all("tr")[2:], [__parsePairRow, __parseFormationRow,])
|
||||
|
||||
def __parseRemainingTables(tables):
|
||||
|
||||
def __parseFormationRow(tds):
    """Parse a formation row of a later table; the participant is flagged as non-finalist."""
    __parseFormationRowGeneric(tds, finalist=False)
|
||||
|
||||
def __parsePairRow(tds):
    """Parse a three-cell pair row of a later table and record the participant.

    The cells are read as place, ``name (id)`` and club. The participant is
    stored in ``participants`` with the place as value.

    Raises ``CannotParseRowException`` if the row does not have three cells or
    the name cell does not match the ``name (id)`` pattern.
    """
    __ensureLength(tds, 3)

    place = tds[0].contents[0]
    name, number = __parseNameAndId(tds[1].contents[0], tds)

    participant = HtmlParticipant(name, number)
    # This parser is only used for the tables after the first one. The
    # formation rows of these tables are flagged as non-finalists, so pair
    # rows must be flagged the same way (previously they were set to True).
    participant.finalist = False
    participant.club = tds[2].contents[0]

    participants[participant] = place
|
||||
|
||||
def __parseSeparatorRow(tds):
    """Accept a single-cell row that contains no non-whitespace text.

    Nothing is recorded for such a row.

    Raises ``CannotParseRowException`` if the row does not have exactly one
    cell or the cell contains any text.
    """
    __ensureLength(tds, 1)

    cellTexts = list(tds[0].stripped_strings)
    if cellTexts:
        raise CannotParseRowException('No empty string')
|
||||
|
||||
# Raw string: "\." is not a valid escape in a normal string literal and
# triggers a warning on current Python versions. The pattern is unchanged.
regexZwischenRunde = re.compile(r'[1-9]\. Zwischenrunde')

def __parseRoundHeading(tds):
    """Accept a single-cell row that is the heading of a round.

    The cell text (all stripped strings joined) must start with ``Vorrunde``
    or with a digit 1-9 followed by ``. Zwischenrunde``. Nothing is recorded
    for such a row.

    Raises ``CannotParseRowException`` if the row does not have exactly one
    cell or the text is not a round heading.
    """
    __ensureLength(tds, 1)

    s = ''.join(tds[0].stripped_strings)
    if s.startswith('Vorrunde'):
        return
    if regexZwischenRunde.match(s) is not None:
        return
    raise CannotParseRowException('Kein Header einer Runde gefunden.')
|
||||
|
||||
def __parseAllSolosQualifiedFormation(tds):
    """Accept a two-cell row whose second cell starts with "Alle Starter weiter genommen.".

    Nothing is recorded for such a row.

    Raises ``CannotParseRowException`` if the row does not have two cells or
    the second cell starts with a different text.
    """
    __ensureLength(tds, 2)

    cellText = tds[1].contents[0]
    if not cellText.startswith("Alle Starter weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Starter weiter genommen"')
|
||||
|
||||
def __parseAllSolosQualifiedPair(tds):
    """Accept a three-cell row whose second cell starts with "Alle Mannschaften weiter genommen.".

    Nothing is recorded for such a row.

    Raises ``CannotParseRowException`` if the row does not have three cells or
    the second cell starts with a different text.
    """
    __ensureLength(tds, 3)

    cellText = tds[1].contents[0]
    if not cellText.startswith("Alle Mannschaften weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Mannschaften weiter genommen"')
|
||||
|
||||
|
||||
for table in tables:
|
||||
__parseRows(table.find_all("tr"), False)
|
||||
__parseRows(table.find_all("tr"), [__parseAllSolosQualifiedFormation, __parseAllSolosQualifiedPair, __parsePairRow, __parseFormationRow, __parseSeparatorRow, __parseRoundHeading])
|
||||
|
||||
tables = self.soup.find("div", class_="extract").find_all("table")
|
||||
|
||||
|
@ -3,6 +3,7 @@ class HtmlParticipant:
|
||||
self.name = name
|
||||
self.id = id
|
||||
self.finalist = None
|
||||
self.club = None
|
||||
|
||||
def __eq__(self, o):
|
||||
if type(o) != HtmlParticipant:
|
||||
|
Loading…
Reference in New Issue
Block a user