Written basic code to parse existing exports successfully

This commit is contained in:
Christian Wolf 2023-09-13 15:23:04 +02:00
parent d39a8d590e
commit eafca2e9aa
10 changed files with 407 additions and 42 deletions

24
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,24 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Remote Attach",
"type": "python",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
},
"pathMappings": [
{
"localRoot": "${workspaceFolder:code}",
"remoteRoot": "."
}
],
"justMyCode": true
}
]
}

View File

@ -5,7 +5,8 @@
"name": "code"
},
{
"path": "../../../../../nextcloud/Documents/Projekte/SLT/Auswertungsskript Solo"
"path": "../files",
"name": "files"
}
],
"settings": {

View File

@ -3,6 +3,7 @@ beautifulsoup4==4.11.1
colorama==0.4.6
coloredlogs==15.0.1
coverage==6.5.0
debugpy==1.6.7
exceptiongroup==1.0.1
humanfriendly==10.0
iniconfig==1.1.1

View File

@ -2,7 +2,7 @@
_dir="$(dirname "$0")"
if [ -n "$PTHONPATH" ]; then
if [ -n "$PYTHONPATH" ]; then
PYTHONPATH="$PYTHONPATH:$_dir/src"
else
PYTHONPATH="$_dir/src"

View File

@ -181,21 +181,25 @@ class BatchWorker:
htmlCandidatesPreview = locator.findPreviewRoundCandidates(self.config.importHtmlPath())
self.l.debug('Found HTML file candidates for preview rounds: %s', htmlCandidatesPreview)
worker = solo_turnier.worker.Worker()
worker.collectAllData(htmlCandidatesPreview, self.config.importCSVPath())
htmlResultFiles = locator.findCandidates(self.config.importHtmlPath())
self.l.debug('Using HTML result files for result extraction: %s', htmlResultFiles)
# csvReader = solo_turnier.reader.AllResultReader(self.config.importCSVPath())
worker = solo_turnier.worker.Worker()
importedData = worker.collectAllData(htmlCandidatesPreview, self.config.importCSVPath(), htmlResultFiles)
worker.combineData(importedData)
# csvReader = solo_turnier.reader.CSVResultReader(self.config.importCSVPath())
# self.l.info('Loading the total result CSV file %s', self.config.importCSVPath())
# csvData = csvReader.readFile()
# self.l.info('CSV file has been read')
# csvExtractor = solo_turnier.worker.CSVExtractor()
# csvExtractor = solo_turnier.reader.CSVExtractor()
# self.l.info('Importing CSV data into internal structures')
# csvRows = csvExtractor.mapCSVImport(csvData)
# worker = solo_turnier.worker.DataWorker()
# /////
# self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath())
# htmlCandidates = locator.findCandidates(self.config.importHtmlPath())

View File

@ -1,6 +1,8 @@
import argparse
import logging
import debugpy
class Cli:
def __init__(self, l: logging.Logger):
parser = argparse.ArgumentParser()
@ -11,8 +13,13 @@ class Cli:
parser.add_argument('-o', '--output', help='Set the output path of the script', nargs=1, default=[None])
parser.add_argument('-v', '--verbose', help='Increase verbosity', action='count', default=0)
parser.add_argument('-d', '--debug', action='store_true', help='Activate debugging during startup')
self.__args = parser.parse_args()
if self.__args.debug:
debugpy.listen(5678)
debugpy.wait_for_client()
map = {
0: logging.ERROR,
1: logging.WARN,

View File

@ -78,3 +78,19 @@ class GroupParser:
def isPureClass(self, cls: str) -> bool:
    """Tell whether ``cls`` parses (via ``parseClass``) to a ``Group`` instance."""
    return isinstance(self.parseClass(cls), Group)
def getGroups(self) -> list[Group]:
    """Return every group constant of ``GroupParser`` in a fixed order.

    The order of this list is what ``getGroupsAsSortedList`` uses for sorting.
    """
    orderedGroups = [
        GroupParser.KIN,
        GroupParser.JUN,
        GroupParser.JUG,
        GroupParser.HGR,
        GroupParser.MAS1,
        GroupParser.MAS2,
        GroupParser.MAS3,
        GroupParser.MAS4,
        GroupParser.MAS5,
    ]
    return orderedGroups
def getGroupsAsSortedList(self, groups) -> list[Group]:
    """Return the members of ``groups`` ordered like ``getGroups()``.

    Only groups known to ``getGroups()`` can appear in the result; anything
    else contained in ``groups`` is dropped.
    """
    ordered = []
    for candidate in self.getGroups():
        if candidate in groups:
            ordered.append(candidate)
    return ordered

View File

@ -3,8 +3,10 @@ from bs4 import BeautifulSoup
import logging
import re
from .types import HtmlPreviewParticipant as HtmlParticipant
from .types import HtmlPreviewImport as HtmlImport
from .types import HtmlPreviewParticipant, HtmlParticipant
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser
class HtmlParser:
@ -12,6 +14,8 @@ class HtmlParser:
self.l = logging.getLogger('solo_turnier.html_parser')
self.soup = BeautifulSoup(text, 'html.parser')
self.fileName = fileName
self.groupParser = GroupParser()
self.classParser = CompetitionClassParser()
def __repr__(self):
if self.fileName is None:
@ -33,32 +37,22 @@ class HtmlParser:
rest = match.group(1)
rawGroup, rawClass, dance = rest.split(' ', 2)
classMap = {
'Newcomer': 'Newc.',
'Beginner': 'Beg.',
'Advanced': 'Adv.'
}
groupMap = {
'Kinder': 'Kin.',
'Junioren': 'Jun.',
'Jugend': 'Jug.',
}
return {
'dance': dance.strip(),
'class_': classMap.get(rawClass, rawClass),
'group': groupMap.get(rawGroup, rawGroup)
'class_': str(self.classParser.parseClass(rawClass, True)),
'group': str(self.groupParser.parseClass(rawGroup))
}
def parseString(self, text: str):
soup = BeautifulSoup(text, 'html.parser')
def parseResult(self):
participants = {}
def __parseRows(rows, finalist: bool):
def __parseRow(row):
tds = row.find_all('td')
if len(tds) != 2:
return
regex = re.compile('(.*) \\(([0-9]+)\\)')
place = tds[0].contents[0]
@ -69,8 +63,9 @@ class HtmlParser:
name = match.group(1)
number = match.group(2)
participant = HtmlParticipant(name, place, finalist)
participants[number] = participant
participant = HtmlParticipant(name, number)
participant.finalist = finalist
participants[participant] = place
for row in rows:
__parseRow(row)
@ -84,17 +79,18 @@ class HtmlParser:
def __parseRemainingTables(tables):
for table in tables:
__parseRows(table.find_all('tr')[2:], False)
__parseRows(table.find_all('tr'), False)
tables = soup.find('div', class_='extract').find_all('table')
tables = self.soup.find('div', class_='extract').find_all('table')
if len(tables) > 0:
__parseFirstTable(tables[0])
__parseRemainingTables(tables[1:])
title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
# title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
ret = HtmlImport(title, participants)
# ret = HtmlImport(title, participants)
ret = HtmlResultImport(participants)
return ret
def parsePreparationRound(self):

View File

@ -21,10 +21,11 @@ class CSVResultRow:
return f'{self.name} ({self.id}, {self.club}) is in {self.group} {self.class_} and danced the {self.dance} in {self.competitionGroup} {self.competitionClass} getting place {self.place}-{self.placeTo}'
class HtmlPreviewParticipant:
def __init__(self, name, id, participant_group):
def __init__(self, name, id, group_):
self.name = name
self.id = id
self.group = group.GroupParser().parseClass(participant_group)
groupParser = group.GroupParser()
self.group = groupParser.parseClass(group_)
def __eq__(self, o):
if type(o) != HtmlPreviewParticipant:
@ -33,14 +34,52 @@ class HtmlPreviewParticipant:
return all(map(lambda x, y: x == y, (self.name, self.id, self.group), (o.name, o.id, o.group)))
def __repr__(self):
return f'{self.id}: {self.name} ({self.group})'
return f'{self.id} ({self.name}, {self.group})'
class HtmlPreviewImport:
def __init__(self, participants: dict[int, HtmlPreviewParticipant]):
self.participants = participants
def __hash__(self):
return hash((self.id, self.name, self.group))
class HtmlParticipant:
    """Participant entry read from an HTML result table.

    Equality and hashing are both based on ``id`` and ``name`` so that
    instances can be used reliably as dictionary keys.
    """

    def __init__(self, name, id):
        self.name = name
        self.id = id
        # Unknown at construction time; callers assign it afterwards.
        self.finalist = None

    def __eq__(self, o):
        # Compare against this very class and only the fields it owns.
        # (The previous check tested for HtmlPreviewParticipant and read a
        # non-existing ``group`` attribute.)
        if type(o) is not HtmlParticipant:
            return False

        return (self.name, self.id) == (o.name, o.id)

    def __repr__(self):
        return f'{self.id}: {self.name}'

    def __hash__(self):
        # Must stay in sync with the fields compared in __eq__.
        return hash((self.id, self.name))
# class PreviewParticipationData:
# def __init__(self, dance: str, class_: competition_class.CompetitionClass):
# self.class_ = class_
# self.dance = dance
class HtmlPreviewImport:
    """Container for the data imported from the preview round HTML files.

    ``participants`` maps an id to the list of participants found under that
    id; ``results`` maps a participant to a per-dance mapping.
    """

    def __init__(
        self,
        participants: dict[int, list[HtmlPreviewParticipant]],
        results: dict[HtmlPreviewParticipant, dict[str, competition_class.CompetitionClass]]
    ):
        self.participants = participants
        self.results = results

    def __repr__(self):
        # __repr__ has to return a str; returning a tuple raises TypeError
        # as soon as the object is printed or logged.
        return f'({self.participants}, {self.results})'
class HtmlResultImport:
    """Holds the results parsed from one HTML result file.

    ``results`` maps each participant to the place string found for it.
    """

    def __init__(self, results: dict[HtmlParticipant, str]):
        self.results = results

    def __repr__(self):
        return f'{self.results}'
class HtmlCompetitionResultRow:
def __init__(self, name, id, dance, group, class_, place, placeTo, finalist):
@ -84,6 +123,17 @@ class HtmlSingleCompetitionResult:
self.placeTo = placeTo
self.finalist = finalist
def __repr__(self):
    """Render as ``Res(<name>[ [F]], placed <place>[-<placeTo>])``."""
    # A missing placeTo means a single place rather than a range.
    place = self.place if self.placeTo is None else f'{self.place}-{self.placeTo}'
    finalistTag = ' [F]' if self.finalist else ''
    return f'Res({self.name}{finalistTag}, placed {place})'
class HtmlCompetitionTotalResults:
def __init__(self):
self.results = {}
@ -94,12 +144,65 @@ class HtmlCompetitionTotalResults:
def get(self, group: group.Group_t, class_: competition_class.Class_t, dance: str, id: int) -> list[HtmlSingleCompetitionResult]:
    """Return the results stored for the given group, class, dance and id.

    Raises ``KeyError`` if nothing has been stored under that combination.
    """
    key = self.__getTuple(group, class_, dance, id)
    return self.results[key]
def getById(self, id: int) -> dict[tuple[str, group.Group_t, competition_class.Class_t], list[HtmlSingleCompetitionResult]]:
    """Collect all stored results that belong to the participant ``id``.

    The stored keys carry the id at index 3 (compared after ``int()``
    conversion). The returned dictionary is keyed by
    ``(dance, group, class)`` and its values are the stored result lists.
    """
    return {
        # Re-order the stored key into (dance, group, class).
        (k[2], k[0], k[1]): entries
        for k, entries in self.results.items()
        if int(k[3]) == id
    }
def add(self, group, class_, dance, id, result: HtmlSingleCompetitionResult):
    """Append ``result`` to the list stored for this group/class/dance/id.

    A new list is started when nothing has been stored under that key yet.
    """
    key = self.__getTuple(group, class_, dance, id)
    self.results.setdefault(key, []).append(result)
class SingleParticipantResult:
    """Result of one participant in one dance of one competition class."""

    def __init__(
        self,
        competitionClass: competition_class.Class_t,
        dance: str,
        finalist: bool,
        place: int,
        placeTo: int|None
    ):
        self.competitionClass = competitionClass
        self.dance = dance
        self.finalist = finalist
        self.place = place
        # A range that collapses to a single place is stored as "no range".
        self.placeTo = None if placeTo == place else placeTo

    def __repr__(self):
        if self.placeTo is None:
            placeText = f'{self.place}'
        else:
            placeText = f'{self.place}-{self.placeTo}'

        suffix = ' as finalist' if self.finalist else ''
        return f'SR[{placeText} in {self.dance} {self.competitionClass}{suffix}]'
class TotalGroupResult:
    """Combined results of one group.

    ``dances`` is the list of dances for the group; ``results`` maps each
    participant to a list of per-dance results.
    """

    def __init__(self, dances: list[str], results: dict[HtmlPreviewParticipant, list[SingleParticipantResult]]):
        self.dances = dances
        self.results = results

    def __repr__(self):
        # Readable representation so that logging/pformat output of this
        # object shows its content instead of a bare object address.
        return f'TotalGroupResult(dances={self.dances!r}, results={self.results!r})'
class State4:
    """Final state: the total result per group plus the sorted group list."""

    def __init__(
        self,
        resultPerGroup: dict[group.Group, TotalGroupResult]
    ):
        self.results = resultPerGroup
        # Keep the groups in the order defined by the group parser.
        self.groups = group.GroupParser().getGroupsAsSortedList(resultPerGroup.keys())
class State3:
def __init__(
self,

View File

@ -1,11 +1,14 @@
import logging
from pprint import pformat
import re
import solo_turnier
from solo_turnier import html_parser
from .reader import ResultRow
from .types import HtmlCompetitionResultRow as CompetitionResult
from . import types
from . import competition_class
class HtmlPerson:
def __init__(self, name, id, group):
@ -72,6 +75,7 @@ class PreviewWorker:
def __init__(self):
self.l = logging.getLogger('solo_turnier.worker.PreviewWorker')
self.participants = {}
self.previewResults = {}
def filterFilesPreview(self, files: list[str]) -> ParserList_t:
self.l.debug('Filtering the list of parsers by removing all non preview entries.')
@ -101,6 +105,12 @@ class PreviewWorker:
parser.cleanPreparationRoundImport(imported)
data = imported['data']
headerData = parser.guessDataFromHtmlTitle()
dance = headerData['dance']
def getRowIndexOfClass():
return data['titles'].index('Platz von\nPlatz bis')
self.l.log(5, data)
if data['titles'][0] != 'Wertungsrichter':
@ -115,6 +125,8 @@ class PreviewWorker:
group = parser.guessDataFromHtmlTitle(imported['title'])['group']
extractGroup = False
classRowIndex = getRowIndexOfClass()
for index, e in enumerate(data['table'][0]):
if e['text'] == '':
# Skip empty columns
@ -126,6 +138,9 @@ class PreviewWorker:
if extractGroup:
group = data['table'][-1][index]['text']
# dance =
class_ = data['table'][classRowIndex][index]['text']
participant = types.HtmlPreviewParticipant(name, id, group)
l = self.participants.get(id, [])
@ -134,6 +149,10 @@ class PreviewWorker:
l.append(participant)
self.participants[id] = l
results = self.previewResults.get(participant, {})
results[dance] = class_
self.previewResults[participant] = results
def importAllData(self, parsers: ParserList_t) -> types.HtmlPreviewImport:
self.participants = {}
@ -141,7 +160,79 @@ class PreviewWorker:
parser = parsers[file]
self.__extractPersonsFromSinglePreview(parser)
return types.HtmlPreviewImport(self.participants)
return types.HtmlPreviewImport(self.participants, self.previewResults)
class ResultExtractor:
    """Extracts the placements of all participants from HTML result files."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.ResultExtractor')
        # Place strings are matched after all dots have been removed.
        self.rePlaceSingle = re.compile(' *([0-9]+) *')
        self.rePlaceDouble = re.compile(' *([0-9]+) *- *([0-9]+) *')

    def getAllParsers(self, files: list[str]) -> ParserList_t:
        """Create an HTML parser for every usable file in ``files``.

        A file is skipped (with an error log entry) when its title cannot be
        analysed or the class found in the title cannot be parsed.

        Returns a dictionary mapping the file name to its parser.
        """
        ret = {}
        classParser = competition_class.CompetitionClassParser()

        for file in files:
            with open(file, 'r') as fp:
                text = fp.read()
            parser = html_parser.HtmlParser(text, file)

            # Catch Exception rather than everything, so that Ctrl-C and
            # interpreter shutdown are not silently swallowed.
            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                self.l.error('Cannot parse HTML file %s to check if it is a valid result. Check manually.', file)
                continue

            try:
                guessedClass = classParser.parseClass(data['class_'])
            except Exception:
                self.l.error('Issue parsing class of file %s. Check manually.', file)
                continue

            self.l.debug('Fetched result data: %s, guessed class %s', data, guessedClass)
            ret[file] = parser

        return ret

    def _extractPlace(self, placeStr: str):
        """Parse a place string into a ``(place, placeTo)`` tuple.

        Dots are stripped first. A single number yields ``(place, None)``, a
        range ``a-b`` yields ``(a, b)``.

        Raises ``Exception`` if the string matches neither form.
        """
        s = placeStr.replace('.', '')

        matches = self.rePlaceSingle.fullmatch(s)
        if matches is not None:
            return (int(matches.group(1)), None)

        matches = self.rePlaceDouble.fullmatch(s)
        if matches is not None:
            return (int(matches.group(1)), int(matches.group(2)))

        self.l.error('Could not parse place string "%s"', placeStr)
        raise Exception('Place cannot be parsed')

    def _analyzeSingleParser(self, parser: html_parser.HtmlParser, results: types.HtmlCompetitionTotalResults):
        """Add all results found by ``parser`` to ``results``.

        Group, class and dance are taken from the HTML title of the file.
        """
        data = parser.guessDataFromHtmlTitle()
        competitionClass = data['class_']
        competitionGroup = data['group']
        dance = data['dance']

        result = parser.parseResult()
        self.l.log(5, 'Raw data extracted: %s', result)

        for person, placeStr in result.results.items():
            place, placeTo = self._extractPlace(placeStr)
            competitionResult = types.HtmlSingleCompetitionResult(person.name, place, placeTo, person.finalist)
            results.add(competitionGroup, competitionClass, dance, person.id, competitionResult)

    def extractAllData(self, parsers: ParserList_t) -> types.HtmlCompetitionTotalResults:
        """Run all given parsers and merge their results into one object."""
        ret = types.HtmlCompetitionTotalResults()

        for fileName in parsers:
            self.l.debug('Extracting data from file %s', fileName)
            self._analyzeSingleParser(parsers[fileName], ret)

        return ret
class DataWorker:
def __init__(self):
@ -296,24 +387,146 @@ class DataWorker:
class Worker:
def __init__(self):
    """Set up the logger and the ordered list of all registered dances."""
    self.l = logging.getLogger('solo_turnier.worker.Worker')

    latinDances = ['Samba', 'Cha Cha', 'Rumba', 'Paso Doble', 'Jive']
    standardDances = ['Langs. Walzer', 'Tango', 'Wiener Walzer', 'Slowfox', 'Quickstep']
    # The order of this list defines the order of dances in the output.
    self._allDances = latinDances + standardDances
def collectAllData(
    self,
    htmlCandidatesPreview: list[str],
    csvFile: str,
    htmlResultsFileNames: list[str]
) -> types.State3:
    """Import all raw data needed for the evaluation.

    Reads the preview round HTML files, the total result CSV file and the
    HTML result files.

    Returns a ``State3`` built from the CSV rows, the preview import and the
    extracted HTML results.
    """
    previewWorker = PreviewWorker()
    self.l.info('Filtering for pure preview rounds.')
    parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview)
    self.l.debug('Remaining files: %s', list(parsers.keys()))

    self.l.info('Extracting person data from the preview rounds.')
    previewImport = previewWorker.importAllData(parsers)
    self.l.debug('Total preview imported participants: %s', pformat(previewImport.participants))
    self.l.log(5, 'Total preview results: %s', pformat(previewImport.results))

    csvReader = solo_turnier.reader.CSVResultReader(csvFile)
    self.l.info('Loading the total result CSV file %s', csvFile)
    csvRows = csvReader.extractResult()

    resultExtractor = ResultExtractor()
    resultParsers = resultExtractor.getAllParsers(htmlResultsFileNames)
    htmlResults = resultExtractor.extractAllData(resultParsers)
    self.l.info('Overall result data extracted: %s', pformat(htmlResults.results))

    return types.State3(csvRows, previewImport, htmlResults)
def combineData(self, importedData: types.State3):
    """Build the total result per group from the imported raw data.

    For every group found in the preview import, the dances and the
    participants of that group are determined and the results of each
    participant are collected. Returns a ``State4`` wrapping the per-group
    results.
    """
    self.l.info('Starting to build data sets.')
    groups = self._extractGroups(importedData)
    self.l.debug('Found groups in the dataset: %s', groups)

    resultPerGroup = {}

    for group in groups:
        self.l.debug('Collecting data for total result of group %s', group)

        dances = self._extractDancesPerGroup(importedData, group)
        self.l.log(5, 'Found dances in group %s: %s', group, dances)

        participants = self._extractParticipantsPerGroup(importedData.previewImport, group)
        self.l.log(5, 'Related participants %s', participants)

        groupResults = {}
        for participant in participants:
            self.l.log(5, 'Collecting data for %s', participant)
            participantResult = self._getResultOfSingleParticipant(
                participant,
                group,
                importedData.previewImport,
                importedData.htmlResults,
                dances
            )
            self.l.log(5, 'Obtained result %s', participantResult)
            groupResults[participant] = participantResult

        resultPerGroup[group] = types.TotalGroupResult(dances, groupResults)

    self.l.log(5, 'Total result of all groups: %s', pformat(resultPerGroup))

    return types.State4(resultPerGroup)
def _extractGroups(self, data: types.State3):
    """Return the groups of all preview participants as a sorted list."""
    activeGroups = set()
    for participantList in data.previewImport.participants.values():
        activeGroups.update(entry.group for entry in participantList)

    self.l.log(5, 'Set of active groups: %s', activeGroups)

    # The group parser defines the order in which groups are reported.
    return solo_turnier.group.GroupParser().getGroupsAsSortedList(activeGroups)
def _extractDancesPerGroup(self, data: types.State3, group: solo_turnier.group.Group):
    """Return the dances found in the preview results as an ordered list.

    Registered dances come first, in the order of ``self._allDances``;
    unregistered ones follow in alphabetical order and trigger a warning.
    """
    # NOTE(review): ``group`` is not used here, the dances of all
    # participants are collected -- confirm whether filtering by the
    # participant's group is intended.
    registered = set()
    unregistered = set()

    for danceMap in data.previewImport.results.values():
        found = set(danceMap.keys())
        registered |= found.intersection(self._allDances)
        unregistered |= found.difference(self._allDances)

    if unregistered:
        self.l.warning('There were dances found, that are not registered. A bug? The dances were: %s', unregistered)

    orderedRegistered = [dance for dance in self._allDances if dance in registered]
    return orderedRegistered + sorted(unregistered)
def _extractParticipantsPerGroup(
    self,
    previewData: types.HtmlPreviewImport,
    group: solo_turnier.group.Group
) -> list[types.HtmlPreviewParticipant]:
    """Return all preview participants whose group equals ``group``."""
    return [
        participant
        for participantList in previewData.participants.values()
        for participant in participantList
        if participant.group == group
    ]
def _getResultOfSingleParticipant(
    self,
    participant: types.HtmlPreviewParticipant,
    nominalGroup: solo_turnier.group.Group,
    previewResults: types.HtmlPreviewImport,
    totalResults: types.HtmlCompetitionTotalResults,
    allDances: list[str]
) -> list[types.SingleParticipantResult|None]:
    """Build the list of results of one participant, one entry per dance.

    The returned list is parallel to ``allDances``; a dance without a result
    for the participant yields ``None``.

    Raises ``Exception`` if the result list stored for a matching key does
    not contain exactly one entry.
    """
    rawResults = totalResults.getById(participant.id)
    self.l.log(5, 'Found result data (raw): %s', rawResults)

    def lookup(dance: str) -> types.SingleParticipantResult|None:
        # Keys are (dance, group, class); the first key of the dance wins.
        for key, entries in rawResults.items():
            if key[0] != dance:
                continue

            if len(entries) != 1:
                raise Exception('Multiple results found with same key')
            entry = entries[0]

            return types.SingleParticipantResult(
                key[2], dance, entry.finalist,
                entry.place, entry.placeTo
            )
        return None

    return [lookup(dance) for dance in allDances]