2022-11-13 16:01:44 +00:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import re
|
|
|
|
|
|
|
|
class HtmlParser:
    """Parse a solo competition result page and interpret its title.

    ``parseString`` reads the participant tables and the event title from
    an HTML document; ``guessDataFromHtmlTitle`` splits such a title into
    group, class and dance.
    """

    # Matches a participant cell of the form 'Some Name (123)'; group 1 is
    # the name, group 2 the digits inside the parentheses.
    _PARTICIPANT_RE = re.compile(r'(.*) \(([0-9]+)\)')

    # Captures everything following 'ETW, Solos ' in an event title.
    _TITLE_RE = re.compile(r'.*?ETW, Solos (.*)')

    # Long names as they appear in titles -> abbreviations returned by
    # guessDataFromHtmlTitle. Unknown names are passed through unchanged.
    _CLASS_MAP = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }
    _GROUP_MAP = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    def __init__(self):
        """Set up the logger used by this parser."""
        self.l = logging.getLogger('solo_turnier.html_parser')

    def _parseRows(self, rows, finalist: bool) -> dict:
        """Convert table rows into participant records.

        Args:
            rows: Iterable of ``<tr>`` elements. The first cell of each row
                is taken as the place, the second must read
                ``'Name (number)'``.
            finalist: Value stored under ``'finalist'`` for every row.

        Returns:
            Mapping of the number (as a string) to a dict with the keys
            ``'name'``, ``'place'`` and ``'finalist'``.

        Raises:
            Exception: If the second cell does not match the expected
                ``'Name (number)'`` pattern.
        """
        parsed = {}
        for row in rows:
            tds = row.find_all('td')
            place = tds[0].contents[0]

            match = self._PARTICIPANT_RE.fullmatch(tds[1].contents[0])
            if match is None:
                raise Exception(f'Could not match {tds} to regex search pattern')
            name, number = match.groups()

            parsed[number] = {
                'name': name,
                'place': place,
                'finalist': finalist,
            }
        return parsed

    def parseString(self, text: str):
        """Parse the HTML of a result page.

        All tables inside ``<div class="extract">`` are read. The first
        one must be labelled ``'Endrunde'`` in its first cell and its
        participants are marked as finalists; participants of every
        further table are marked as non-finalists. The first two rows of
        each table are skipped (presumably header rows -- confirm against
        the page layout).

        Args:
            text: The HTML document as a string.

        Returns:
            A dict with ``'participants'`` (number -> participant dict, a
            number seen again in a later table replaces the earlier
            entry) and ``'title'`` (first content node of the first cell
            of the table inside ``<div class="eventhead">``).

        Raises:
            Exception: If one of the expected ``div`` containers is
                missing, the first table is not the ``'Endrunde'`` table,
                or a participant cell cannot be parsed.
        """
        soup = BeautifulSoup(text, 'html.parser')

        extract = soup.find('div', class_='extract')
        if extract is None:
            # find() returns None when nothing matches; fail with the same
            # error the method uses for other structural mismatches rather
            # than an AttributeError on NoneType.
            raise Exception('Could not parse HTML file')

        participants = {}
        tables = extract.find_all('table')
        if tables:
            firstTable = tables[0]
            roundName = firstTable.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')
            participants.update(
                self._parseRows(firstTable.find_all('tr')[2:], True)
            )

            for table in tables[1:]:
                participants.update(
                    self._parseRows(table.find_all('tr')[2:], False)
                )

        eventHead = soup.find('div', class_='eventhead')
        if eventHead is None:
            raise Exception('Could not parse HTML file')
        title = eventHead.table.tr.td.contents[0]

        return {
            'participants': participants,
            'title': title,
        }

    def guessDataFromHtmlTitle(self, title):
        """Derive group, class and dance from an event title.

        The text after ``'ETW, Solos '`` is split at the first two spaces
        into group, class and dance, so the dance may itself contain
        spaces. Known group and class names are abbreviated; unknown ones
        are returned as found.

        Args:
            title: Title string containing ``'ETW, Solos <group> <class>
                <dance>'``.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

        Raises:
            Exception: If the title does not contain ``'ETW, Solos '``.
            ValueError: If fewer than three space separated parts follow
                ``'ETW, Solos '``.
        """
        match = self._TITLE_RE.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        parts = match.group(1).split(' ', 2)
        if len(parts) != 3:
            # Plain tuple unpacking would raise a ValueError that does not
            # mention the title; keep the type but name the input.
            raise ValueError(f'Cannot parse title "{title}"')
        rawGroup, rawClass, dance = parts

        return {
            'dance': dance,
            'class_': self._CLASS_MAP.get(rawClass, rawClass),
            'group': self._GROUP_MAP.get(rawGroup, rawGroup),
        }
|