Make the parsing compatible with the new structure of HTML exports
This commit is contained in:
parent
08a9f7bd0e
commit
5c4b0106fc
@ -15,6 +15,11 @@ class IncompleteRoundException(Exception):
|
|||||||
super(IncompleteRoundException, self).__init__(*args)
|
super(IncompleteRoundException, self).__init__(*args)
|
||||||
|
|
||||||
|
|
||||||
|
class CannotParseRowException(Exception):
    """Signal that a table row does not match the layout a row parser expects."""

    def __init__(self, *args):
        # All positional arguments are handed to Exception unchanged.
        super().__init__(*args)
|
||||||
|
|
||||||
|
|
||||||
class HtmlParser:
|
class HtmlParser:
|
||||||
def __init__(self, text: str, fileName: str = None):
|
def __init__(self, text: str, fileName: str = None):
|
||||||
self.l = logging.getLogger("solo_turnier.html_parser")
|
self.l = logging.getLogger("solo_turnier.html_parser")
|
||||||
@ -56,46 +61,119 @@ class HtmlParser:
|
|||||||
def parseResult(self) -> HtmlResultImport:
|
def parseResult(self) -> HtmlResultImport:
|
||||||
participants = {}
|
participants = {}
|
||||||
|
|
||||||
def __parseRows(rows, finalist: bool):
|
nameRegex = re.compile("(.*) \\(([0-9]+)\\)")
|
||||||
def __parseRow(row):
|
def __parseNameAndId(string: str, tds) -> tuple[str, str]:
|
||||||
tds = row.find_all("td")
|
match = nameRegex.fullmatch(string)
|
||||||
|
|
||||||
if len(tds) != 2:
|
|
||||||
return
|
|
||||||
|
|
||||||
if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
|
|
||||||
self.l.info("No excluded starters found.")
|
|
||||||
return
|
|
||||||
|
|
||||||
regex = re.compile("(.*) \\(([0-9]+)\\)")
|
|
||||||
|
|
||||||
place = tds[0].contents[0]
|
|
||||||
|
|
||||||
match = regex.fullmatch(tds[1].contents[0])
|
|
||||||
if match is None:
|
if match is None:
|
||||||
self.l.error("Could not match %s to regex search pattern", str(tds))
|
self.l.error("Could not match %s to regex search pattern", str(tds))
|
||||||
raise Exception(f"Could not match {tds} to regex search pattern")
|
raise CannotParseRowException(f"Could not match {tds} to regex search pattern for 'name (id)'")
|
||||||
name = match.group(1)
|
name = match.group(1)
|
||||||
number = match.group(2)
|
number = match.group(2)
|
||||||
|
return name, number
|
||||||
|
|
||||||
|
def __parseRows(rows, parsers):
    """Offer every row to the given parsers, in order, until one accepts it.

    Each parser is called with the row's ``td`` cells, which are looked up
    again for every attempt. A parser rejects a row by raising
    CannotParseRowException; any other exception propagates. If every parser
    rejects the row, an error is logged and the next row is processed.
    """
    for row in rows:
        for candidate in parsers:
            try:
                candidate(row('td'))
            except CannotParseRowException:
                # This parser does not understand the row, try the next one.
                continue
            # The row was handled, no further parser is consulted.
            break
        else:
            # No parser was found if we get here.
            self.l.error('Cannot parse row in table.')
|
||||||
|
|
||||||
|
def __ensureLength(tds, length):
    """Raise CannotParseRowException unless ``tds`` holds exactly ``length`` entries."""
    actual = len(tds)
    if actual == length:
        return
    raise CannotParseRowException('The row has %d entries but %d are expected.' % (actual, length))
|
||||||
|
|
||||||
|
def __parseFormationRowGeneric(tds, finalist):
    """Record a participant from a two-cell row.

    The first child of the first cell is stored as the place; the first child
    of the second cell is parsed as "name (id)". The participant gets the
    given ``finalist`` flag and an empty club, and is added to
    ``participants`` with the place as value.

    Raises CannotParseRowException if the row does not have exactly two cells
    or the name cell does not match the "name (id)" pattern.
    """
    __ensureLength(tds, 2)

    placeCell = tds[0]
    nameCell = tds[1]
    place = placeCell.contents[0]
    name, number = __parseNameAndId(nameCell.contents[0], tds)

    entry = HtmlParticipant(name, number)
    entry.finalist = finalist
    entry.club = ''
    participants[entry] = place
|
||||||
|
|
||||||
for row in rows:
|
|
||||||
__parseRow(row)
|
|
||||||
|
|
||||||
def __parseFirstTable(table):
|
def __parseFirstTable(table):
|
||||||
roundName = table.tr.td.contents[0]
|
roundName = table.tr.td.contents[0]
|
||||||
if roundName != "Endrunde":
|
if roundName != "Endrunde":
|
||||||
self.l.warning("Found table with round name %s.", roundName)
|
self.l.warning("Found table with round name %s.", roundName)
|
||||||
raise IncompleteRoundException("Could not parse HTML file")
|
raise IncompleteRoundException("Could not parse HTML file")
|
||||||
|
|
||||||
__parseRows(table.find_all("tr")[2:], True)
|
def __parseFormationRow(tds):
    """Parse a two-cell row of this table and mark the participant as finalist."""
    __parseFormationRowGeneric(tds, finalist=True)
|
||||||
|
|
||||||
|
def __parsePairRow(tds):
    """Record a finalist from a four-cell row whose second cell holds name and club.

    Raises CannotParseRowException if the row does not have exactly four cells
    or the name text does not match the "name (id)" pattern.
    """
    __ensureLength(tds, 4)

    place = tds[0].contents[0]

    # Detach the <i> element from the second cell before reading that cell's
    # first child as "name (id)"; the first child of the detached element is
    # used as the club below. Note that this modifies the parsed document.
    nameCell = tds[1]
    clubTag = nameCell.i.extract()
    name, number = __parseNameAndId(nameCell.contents[0], tds)

    entry = HtmlParticipant(name, number)
    entry.finalist = True
    entry.club = clubTag.contents[0]

    participants[entry] = place
|
||||||
|
|
||||||
|
__parseRows(table.find_all("tr")[2:], [__parsePairRow, __parseFormationRow,])
|
||||||
|
|
||||||
def __parseRemainingTables(tables):
|
def __parseRemainingTables(tables):
|
||||||
|
|
||||||
|
def __parseFormationRow(tds):
    """Parse a two-cell row of these tables and mark the participant as non-finalist."""
    __parseFormationRowGeneric(tds, finalist=False)
|
||||||
|
|
||||||
|
def __parsePairRow(tds):
    """Record a participant from a three-cell row (place, "name (id)", club).

    The first child of each cell is used. The participant is added to
    ``participants`` with the place as value.

    Raises CannotParseRowException if the row does not have exactly three
    cells or the name cell does not match the "name (id)" pattern.
    """
    __ensureLength(tds, 3)

    place = tds[0].contents[0]
    name, number = __parseNameAndId(tds[1].contents[0], tds)

    participant = HtmlParticipant(name, number)
    # This parser is only used for the remaining (non-final) tables. The
    # sibling formation parser for the same tables records finalist=False,
    # so the pair rows must not be flagged as finalists either.
    participant.finalist = False
    participant.club = tds[2].contents[0]

    participants[participant] = place
|
||||||
|
|
||||||
|
def __parseSeparatorRow(tds):
    """Accept a single-cell row that contains no non-whitespace text.

    Raises CannotParseRowException if the row does not have exactly one cell
    or the cell contains any text.
    """
    __ensureLength(tds, 1)
    texts = list(tds[0].stripped_strings)
    if texts:
        raise CannotParseRowException('No empty string')
|
||||||
|
|
||||||
|
# Matches headings such as "1. Zwischenrunde". A raw string is used so that
# "\." reaches the regex engine without being an invalid string escape.
regexZwischenRunde = re.compile(r'[1-9]\. Zwischenrunde')

def __parseRoundHeading(tds):
    """Accept a single-cell row that is the heading of a round.

    The cell's stripped text fragments are joined; the row is accepted if the
    result starts with "Vorrunde" or starts with "<digit 1-9>. Zwischenrunde".

    Raises CannotParseRowException if the row does not have exactly one cell
    or the text is not such a heading.
    """
    __ensureLength(tds, 1)
    heading = ''.join(tds[0].stripped_strings)
    if heading.startswith('Vorrunde'):
        return
    if regexZwischenRunde.match(heading) is not None:
        return
    raise CannotParseRowException('Kein Header einer Runde gefunden.')
|
||||||
|
|
||||||
|
def __parseAllSolosQualifiedFormation(tds):
    """Accept a two-cell row whose second cell starts with "Alle Starter weiter genommen.".

    Nothing is recorded for such a row. Raises CannotParseRowException if the
    row does not have exactly two cells or the text is different.
    """
    __ensureLength(tds, 2)
    notice = tds[1].contents[0]
    if not notice.startswith("Alle Starter weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Starter weiter genommen"')
|
||||||
|
|
||||||
|
def __parseAllSolosQualifiedPair(tds):
    """Accept a three-cell row whose second cell starts with "Alle Mannschaften weiter genommen.".

    Nothing is recorded for such a row. Raises CannotParseRowException if the
    row does not have exactly three cells or the text is different.
    """
    __ensureLength(tds, 3)
    notice = tds[1].contents[0]
    if not notice.startswith("Alle Mannschaften weiter genommen."):
        raise CannotParseRowException('Not found the text "Alle Mannschaften weiter genommen"')
|
||||||
|
|
||||||
|
|
||||||
for table in tables:
|
for table in tables:
|
||||||
__parseRows(table.find_all("tr"), False)
|
__parseRows(table.find_all("tr"), [__parseAllSolosQualifiedFormation, __parseAllSolosQualifiedPair, __parsePairRow, __parseFormationRow, __parseSeparatorRow, __parseRoundHeading])
|
||||||
|
|
||||||
tables = self.soup.find("div", class_="extract").find_all("table")
|
tables = self.soup.find("div", class_="extract").find_all("table")
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ class HtmlParticipant:
|
|||||||
self.name = name
|
self.name = name
|
||||||
self.id = id
|
self.id = id
|
||||||
self.finalist = None
|
self.finalist = None
|
||||||
|
self.club = None
|
||||||
|
|
||||||
def __eq__(self, o):
|
def __eq__(self, o):
|
||||||
if type(o) != HtmlParticipant:
|
if type(o) != HtmlParticipant:
|
||||||
|
Loading…
Reference in New Issue
Block a user