2022-11-13 16:01:44 +00:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import re
|
|
|
|
|
2023-09-26 10:35:35 +00:00
|
|
|
from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
|
2023-09-13 13:23:04 +00:00
|
|
|
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
|
|
|
|
from .group import GroupParser
|
|
|
|
from .competition_class import CompetitionClassParser
|
2022-11-15 13:38:59 +00:00
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
|
2023-10-06 15:52:52 +00:00
|
|
|
class IncompleteRoundException(Exception):
    """Raised when the first result table does not hold a final round.

    Used as an internal control-flow signal while parsing result pages;
    the caller catches it and continues with a partial result.
    """

    # NOTE: the former __init__ only forwarded *args to Exception and has
    # been removed; construction behavior is unchanged.
|
|
|
|
|
2022-11-13 16:01:44 +00:00
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
class HtmlParser:
    """Parses TopTurnier-style HTML pages (results and preparation rounds)."""

    def __init__(self, text: str, fileName: str = None):
        """Build a parser around the given raw HTML text.

        :param text: the HTML document to parse
        :param fileName: origin of the text, used for diagnostics only
        """
        self.fileName = fileName
        self.l = logging.getLogger("solo_turnier.html_parser")
        # Parse eagerly; all accessor methods work off this soup.
        self.soup = BeautifulSoup(text, "html.parser")
        # Helper parsers for competition group / class strings.
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()
|
2022-12-03 13:29:01 +00:00
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
if self.fileName is None:
|
2023-11-19 16:07:20 +00:00
|
|
|
return "HtmlParser(direct text)"
|
2022-12-03 13:29:01 +00:00
|
|
|
else:
|
2023-11-19 16:07:20 +00:00
|
|
|
return f"HtmlParser({self.fileName})"
|
|
|
|
|
2022-11-26 07:43:15 +00:00
|
|
|
def getEventTitle(self):
|
2023-11-19 16:07:20 +00:00
|
|
|
return self.soup.find("div", class_="eventhead").table.tr.td.contents[0]
|
2022-11-26 07:43:15 +00:00
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
def guessDataFromHtmlTitle(self, title=None):
|
2022-11-26 07:43:15 +00:00
|
|
|
if title is None:
|
|
|
|
title = self.getEventTitle()
|
2023-11-19 16:07:20 +00:00
|
|
|
|
2023-11-19 12:19:32 +00:00
|
|
|
match = re.compile('.*?ETW, Solos (.*?)(?: ".*")?').fullmatch(title)
|
2022-11-26 07:43:15 +00:00
|
|
|
if match is None:
|
2023-11-19 12:19:32 +00:00
|
|
|
self.l.error('Cannot parse html title "%s". Possible bug?', title)
|
2022-11-26 07:43:15 +00:00
|
|
|
raise Exception(f'Cannot parse title "{title}"')
|
2023-11-19 16:07:20 +00:00
|
|
|
|
2022-11-26 07:43:15 +00:00
|
|
|
rest = match.group(1)
|
2023-11-19 16:07:20 +00:00
|
|
|
rawGroup, rawClass, dance = rest.split(" ", 2)
|
2022-11-26 07:43:15 +00:00
|
|
|
|
|
|
|
return {
|
2023-11-19 16:07:20 +00:00
|
|
|
"dance": dance.strip(),
|
|
|
|
"class_": str(self.classParser.parseClass(rawClass, True)),
|
|
|
|
"group": str(self.groupParser.parseClass(rawGroup)),
|
2022-11-26 07:43:15 +00:00
|
|
|
}
|
|
|
|
|
2023-09-13 13:23:04 +00:00
|
|
|
def parseResult(self):
|
2022-11-13 17:04:49 +00:00
|
|
|
participants = {}
|
2022-11-13 16:01:44 +00:00
|
|
|
|
|
|
|
def __parseRows(rows, finalist: bool):
|
|
|
|
def __parseRow(row):
|
2023-11-19 16:07:20 +00:00
|
|
|
tds = row.find_all("td")
|
2023-09-13 13:23:04 +00:00
|
|
|
|
|
|
|
if len(tds) != 2:
|
|
|
|
return
|
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
|
|
|
|
self.l.info("No excluded starters found.")
|
2023-11-08 19:00:01 +00:00
|
|
|
return
|
2023-11-08 19:45:44 +00:00
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
regex = re.compile("(.*) \\(([0-9]+)\\)")
|
|
|
|
|
2022-11-13 16:01:44 +00:00
|
|
|
place = tds[0].contents[0]
|
2023-11-19 16:07:20 +00:00
|
|
|
|
2022-11-13 16:01:44 +00:00
|
|
|
match = regex.fullmatch(tds[1].contents[0])
|
|
|
|
if match is None:
|
2023-11-19 16:07:20 +00:00
|
|
|
self.l.error("Could not match %s to regex search pattern", str(tds))
|
|
|
|
raise Exception(f"Could not match {tds} to regex search pattern")
|
2022-11-13 16:01:44 +00:00
|
|
|
name = match.group(1)
|
|
|
|
number = match.group(2)
|
|
|
|
|
2023-09-13 13:23:04 +00:00
|
|
|
participant = HtmlParticipant(name, number)
|
|
|
|
participant.finalist = finalist
|
|
|
|
participants[participant] = place
|
2023-11-19 16:07:20 +00:00
|
|
|
|
2022-11-13 16:01:44 +00:00
|
|
|
for row in rows:
|
|
|
|
__parseRow(row)
|
|
|
|
|
|
|
|
def __parseFirstTable(table):
|
|
|
|
roundName = table.tr.td.contents[0]
|
2023-11-19 16:07:20 +00:00
|
|
|
if roundName != "Endrunde":
|
|
|
|
self.l.warning("Found table with round name %s.", roundName)
|
|
|
|
raise IncompleteRoundException("Could not parse HTML file")
|
|
|
|
|
|
|
|
__parseRows(table.find_all("tr")[2:], True)
|
2022-11-13 16:01:44 +00:00
|
|
|
|
|
|
|
def __parseRemainingTables(tables):
|
|
|
|
for table in tables:
|
2023-11-19 16:07:20 +00:00
|
|
|
__parseRows(table.find_all("tr"), False)
|
|
|
|
|
|
|
|
tables = self.soup.find("div", class_="extract").find_all("table")
|
2022-11-13 16:01:44 +00:00
|
|
|
|
2023-10-06 15:52:52 +00:00
|
|
|
try:
|
|
|
|
if len(tables) > 0:
|
|
|
|
__parseFirstTable(tables[0])
|
2022-11-13 16:01:44 +00:00
|
|
|
|
2023-10-06 15:52:52 +00:00
|
|
|
__parseRemainingTables(tables[1:])
|
|
|
|
except IncompleteRoundException:
|
|
|
|
pass
|
2022-11-13 16:01:44 +00:00
|
|
|
|
2023-09-13 13:23:04 +00:00
|
|
|
# title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
|
2022-11-13 17:04:49 +00:00
|
|
|
|
2023-09-13 13:23:04 +00:00
|
|
|
# ret = HtmlImport(title, participants)
|
2023-11-19 16:07:20 +00:00
|
|
|
ret = HtmlResultImport(participants)
|
2022-11-13 16:01:44 +00:00
|
|
|
return ret
|
2022-11-15 13:19:10 +00:00
|
|
|
|
2022-11-26 07:43:15 +00:00
|
|
|
    def parsePreparationRound(self):
        """Extract the raw data table of a preparation-round page.

        Returns {"title": <event title>, "data": {"titles": <row captions
        from the first column>, "table": <list of rows, each a list of
        {"text", "meta"} cell dicts>}}. When several tables (class
        "tab1") are present, the later ones are treated as additional
        column groups and merged into the first row-wise.
        """
        title = self.soup.find("div", class_="eventhead").table.tr.td.contents[0]
        tableData = []
        rowTitles = []

        # Replace <br> tags with newlines so td.string yields one flat string.
        def __mapBr(td):
            for br in td.find_all("br"):
                br.replace_with("\n")
            td.smooth()
            return td

        # The first cell of every row (header row skipped) holds the caption.
        def __extractTitles(table):
            for row in table.find_all("tr")[1:]:
                rowTitles.append(__mapBr(row.td).string)

        # Collect every non-caption cell of the table as {"text", "meta"}.
        def __extractColumns(table):
            content = []

            def __extractContent(td):
                for br in td.find_all("br"):
                    br.replace_with("\n")

                # A nested <span> carries auxiliary info; extract it so it
                # does not end up inside the cell text.
                span = td.span
                if span is not None:
                    span = span.extract()
                    meta = span.string
                else:
                    meta = None

                td.smooth()

                # Non-breaking spaces are normalized to plain spaces.
                return {"text": td.string.replace("\xa0", " ").strip(), "meta": meta}

            def __extractRow(row):
                entries = []
                for entry in row.find_all("td")[1:]:
                    entries.append(__extractContent(entry))
                return entries

            for row in table.find_all("tr")[1:]:
                content.append(__extractRow(row))

            return content

        # Concatenate the cell lists of two tables row by row.
        def __mergeColumns(columns1, columns2):
            return list(map(lambda x, y: x + y, columns1, columns2))

        extract = self.soup.find("div", class_="extract")
        tables = extract.find_all("table", class_="tab1")

        __extractTitles(tables[0])
        tableData = __extractColumns(tables[0])

        for table in tables[1:]:
            tableData = __mergeColumns(tableData, __extractColumns(table))

        data = {"titles": rowTitles, "table": tableData}

        return {"title": title, "data": data}
|
2022-11-19 06:38:22 +00:00
|
|
|
|
|
|
|
def cleanPreparationRoundImport(self, data):
|
|
|
|
def __cleanTable(table):
|
|
|
|
def __cleanText(s: str):
|
2022-11-27 08:10:17 +00:00
|
|
|
# print("cleaning string ", s)
|
2023-11-19 16:07:20 +00:00
|
|
|
return s.strip(" \n\xa0")
|
|
|
|
|
2022-11-19 06:38:22 +00:00
|
|
|
def __cleanEntry(entry):
|
2023-11-19 16:07:20 +00:00
|
|
|
entry["text"] = __cleanText(entry["text"])
|
|
|
|
if entry["meta"] is not None:
|
|
|
|
entry["meta"] = __cleanText(entry["meta"])
|
|
|
|
|
2022-11-19 06:38:22 +00:00
|
|
|
for row in table:
|
|
|
|
for entry in row:
|
2022-11-27 08:10:17 +00:00
|
|
|
# print(entry)
|
2022-11-19 06:38:22 +00:00
|
|
|
__cleanEntry(entry)
|
|
|
|
|
2023-11-19 16:07:20 +00:00
|
|
|
data["title"] = data["title"].strip()
|
|
|
|
__cleanTable(data["data"]["table"])
|
2022-11-19 06:38:22 +00:00
|
|
|
|
2023-09-26 10:35:35 +00:00
|
|
|
    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Collect per-participant results from all tables of the page.

        For every start id found, records
        participants[(competitionGroup, competitionClass, dance, id)] =
        (place string, class or None, group or None) and returns them
        wrapped in an HtmlResultTotalTable.
        """
        participants = {}

        def __parseTable(table):
            rows = table.find_all("tr")

            # Start ids live in the second row, one per column; empty cells
            # (padding columns) are skipped.
            def __getIds():
                row = rows[1]
                entries = row("td")
                entries = entries[1:]
                entries = [x for x in entries if len(x.contents[0].strip()) > 0]
                return [x.contents[0].strip() for x in entries]

            ids = __getIds()
            numIds = len(ids)
            # Level 5 is below logging.DEBUG (10) — an extra-verbose trace level.
            self.l.log(5, "Found ids in dataset: %s", ids)

            # Index of the first row whose caption cell starts with prefixStr,
            # or None when no row matches.
            def findRowIndex(prefixStr):
                def isRowMatchingCriteria(row):
                    if row.td.contents[0].startswith(prefixStr):
                        return True
                    return False

                l = list(map(isRowMatchingCriteria, rows))

                if True not in l:
                    return None
                return l.index(True)

            # Place strings from the "Platz von" row, one per id column.
            def getPlaces():
                placeRowIdx = findRowIndex("Platz von")
                placeTags = rows[placeRowIdx]("td")[1 : (numIds + 1)]

                def getSinglePlaceStr(tag):
                    # <br> separates place ranges; join them with "-".
                    for br in tag("br"):
                        br.replace_with("-")
                    tag.smooth()
                    rawStr = tag.contents[0].strip()
                    # A trailing "-" (cell ended with a <br>) is dropped.
                    if rawStr.endswith("-"):
                        rawStr = rawStr[:-1]
                    return rawStr

                places = list(map(getSinglePlaceStr, placeTags))
                return places

            places = getPlaces()
            self.l.log(5, "Found places: %s", places)

            # Per-id class strings from the optional "Startklasse" row.
            def getClass():
                classRow = findRowIndex("Startklasse")
                if classRow is not None:
                    classTags = rows[classRow]("td")[1 : (numIds + 1)]
                    return list(map(lambda x: x.contents[0], classTags))
                return None

            classes = getClass()
            self.l.log(5, "Classes: %s", classes)

            # Per-id group strings from the optional "Startgruppe" row.
            def getGroups():
                groupRow = findRowIndex("Startgruppe")
                if groupRow is not None:
                    classTags = rows[groupRow]("td")[1 : (numIds + 1)]
                    return list(map(lambda x: x.contents[0], classTags))
                return None

            groups = getGroups()
            self.l.log(5, "Groups: %s", groups)

            for idx, id in enumerate(ids):
                cls = classes[idx] if classes is not None else None
                grp = groups[idx] if groups is not None else None

                tup = (competitionGroup, competitionClass, dance, id)
                participants[tup] = (places[idx], cls, grp)

        tables = self.soup.find("div", class_="extract").find_all("table")
        for table in tables:
            __parseTable(table)

        return HtmlResultTotalTable(participants)
|