import logging
import re

from bs4 import BeautifulSoup


class HtmlParticipant:
    """A single participant parsed from an HTML result export.

    Attributes:
        name: The participant's display name.
        place: The place reached, taken verbatim from the HTML cell
            (shape not normalized here — callers treat it opaquely).
        finalist: True if the participant took part in the final round.
    """

    def __init__(self, name: str, place, finalist: bool):
        self.name = name
        self.place = place
        self.finalist = finalist

    def __str__(self) -> str:
        # Human-readable form used in logs/output; format kept stable.
        return f'{self.name} (with place {self.place})'

    def __repr__(self) -> str:
        # Debug-friendly representation; __str__ is unchanged for callers.
        return (f'{type(self).__name__}(name={self.name!r}, '
                f'place={self.place!r}, finalist={self.finalist!r})')


class HtmlImport:
    """Result of parsing one HTML export: the event title plus all
    participants found in its tables."""

    def __init__(self, title: str, participants: dict[int, HtmlParticipant]):
        # Mapping of start number to the parsed participant entry.
        self.participants = participants
        # Competition title as shown in the page header.
        self.title = title


class HtmlParser:
    """Parses the HTML result export of a solo dance competition.

    Extracts the event title and all participants (with place and
    finalist status) from the tables inside the ``extract`` div.
    """

    # Pattern for a participant cell: "<name> (<start number>)".
    # Hoisted to class level so it is compiled once, not once per row.
    _PARTICIPANT_RE = re.compile(r'(.*) \(([0-9]+)\)')

    # Pattern splitting the event title after the "ETW, Solos" marker.
    _TITLE_RE = re.compile('.*?ETW, Solos (.*)')

    # Abbreviations of the class names; unknown names pass through as-is.
    _CLASS_MAP = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }

    # Abbreviations of the age-group names; unknown names pass through.
    _GROUP_MAP = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    def __init__(self):
        # Attribute name 'l' kept for backward compatibility with callers.
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse a full HTML document into an HtmlImport.

        :param text: The raw HTML source.
        :returns: HtmlImport with the page title and the participants,
            keyed by start number.
        :raises Exception: If the expected table layout is not found or a
            participant cell does not match the name/number pattern.
        """
        soup = BeautifulSoup(text, 'html.parser')

        participants = {}

        def __parseRows(rows, finalist: bool):
            def __parseRow(row):
                tds = row.find_all('td')
                # First cell holds the place, second "<name> (<number>)".
                place = tds[0].contents[0]
                match = self._PARTICIPANT_RE.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')
                name = match.group(1)
                number = match.group(2)
                # NOTE(review): number is a *string* here although HtmlImport
                # annotates the mapping as dict[int, ...] — kept as-is so
                # existing callers keep seeing the same key type.
                participants[number] = HtmlParticipant(name, place, finalist)

            for row in rows:
                __parseRow(row)

        def __parseFirstTable(table):
            # The first table must be the final round ("Endrunde").
            roundName = table.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')

            # Skip the two header rows; these rows are the finalists.
            __parseRows(table.find_all('tr')[2:], True)

        def __parseRemainingTables(tables):
            # Every further table holds non-finalists.
            for table in tables:
                __parseRows(table.find_all('tr')[2:], False)

        tables = soup.find('div', class_='extract').find_all('table')
        if tables:
            __parseFirstTable(tables[0])
            __parseRemainingTables(tables[1:])

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        return HtmlImport(title, participants)

    def guessDataFromHtmlTitle(self, title):
        """Derive dance, class and group from the page title.

        :param title: Title like '... ETW, Solos <group> <class> <dance>'.
        :returns: dict with keys 'dance', 'class_' and 'group'; class and
            group are abbreviated when known, otherwise passed through.
        :raises Exception: If the title lacks the 'ETW, Solos' marker.
        """
        match = self._TITLE_RE.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        rest = match.group(1)
        # The dance name may itself contain spaces, so split at most twice.
        rawGroup, rawClass, dance = rest.split(' ', 2)

        return {
            'dance': dance.strip(),
            'class_': self._CLASS_MAP.get(rawClass, rawClass),
            'group': self._GROUP_MAP.get(rawGroup, rawGroup),
        }