From eafca2e9aa5c9c38fd78cfbb1c508507a21db5ab Mon Sep 17 00:00:00 2001 From: Christian Wolf Date: Wed, 13 Sep 2023 15:23:04 +0200 Subject: [PATCH] Written basic code to parse existsing exports successfully --- .vscode/launch.json | 24 ++++ auswertung.code-workspace | 3 +- requiremnts.txt | 1 + solo_runner.sh | 2 +- src/solo_turnier/batch.py | 14 +- src/solo_turnier/cli.py | 7 + src/solo_turnier/group.py | 16 +++ src/solo_turnier/html_parser.py | 46 +++---- src/solo_turnier/types.py | 113 +++++++++++++++- src/solo_turnier/worker.py | 223 +++++++++++++++++++++++++++++++- 10 files changed, 407 insertions(+), 42 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..72281d1 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,24 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Remote Attach", + "type": "python", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + }, + "pathMappings": [ + { + "localRoot": "${workspaceFolder:code}", + "remoteRoot": "." + } + ], + "justMyCode": true + } + ] +} diff --git a/auswertung.code-workspace b/auswertung.code-workspace index 207029b..f7266e4 100644 --- a/auswertung.code-workspace +++ b/auswertung.code-workspace @@ -5,7 +5,8 @@ "name": "code" }, { - "path": "../../../../../nextcloud/Documents/Projekte/SLT/Auswertungsskript Solo" + "path": "../files", + "name": "files" } ], "settings": { diff --git a/requiremnts.txt b/requiremnts.txt index f4c322c..ae84fcc 100644 --- a/requiremnts.txt +++ b/requiremnts.txt @@ -3,6 +3,7 @@ beautifulsoup4==4.11.1 colorama==0.4.6 coloredlogs==15.0.1 coverage==6.5.0 +debugpy==1.6.7 exceptiongroup==1.0.1 humanfriendly==10.0 iniconfig==1.1.1 diff --git a/solo_runner.sh b/solo_runner.sh index a3a60f1..dea163a 100755 --- a/solo_runner.sh +++ b/solo_runner.sh @@ -2,7 +2,7 @@ _dir="$(dirname "$0")" -if [ -n "$PTHONPATH" ]; then +if [ -n "$PYTHONPATH" ]; then PYTHONPATH="$PYTHONPATH:$_dir/src" else PYTHONPATH="$_dir/src" diff --git a/src/solo_turnier/batch.py b/src/solo_turnier/batch.py index e54f853..30702b9 100644 --- a/src/solo_turnier/batch.py +++ b/src/solo_turnier/batch.py @@ -181,21 +181,25 @@ class BatchWorker: htmlCandidatesPreview = locator.findPreviewRoundCandidates(self.config.importHtmlPath()) self.l.debug('Found HTML file candidates for preview rounds: %s', htmlCandidatesPreview) - worker = solo_turnier.worker.Worker() - worker.collectAllData(htmlCandidatesPreview, self.config.importCSVPath()) + htmlResultFiles = locator.findCandidates(self.config.importHtmlPath()) + self.l.debug('Using HTML result files for result extraction: %s', htmlResultFiles) - # csvReader = solo_turnier.reader.AllResultReader(self.config.importCSVPath()) + worker = solo_turnier.worker.Worker() + importedData = worker.collectAllData(htmlCandidatesPreview, self.config.importCSVPath(), htmlResultFiles) + worker.combineData(importedData) + + # csvReader = solo_turnier.reader.CSVResultReader(self.config.importCSVPath()) # self.l.info('Loading the total result CSV file %s', self.config.importCSVPath()) # csvData = csvReader.readFile() # self.l.info('CSV file has been read') - # csvExtractor = solo_turnier.worker.CSVExtractor() + # csvExtractor = solo_turnier.reader.CSVExtractor() # self.l.info('Importing CSV data into internal structures') # csvRows = csvExtractor.mapCSVImport(csvData) # worker = solo_turnier.worker.DataWorker() - +# ///// # self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath()) # htmlCandidates = locator.findCandidates(self.config.importHtmlPath()) diff --git a/src/solo_turnier/cli.py b/src/solo_turnier/cli.py index be35e9a..cde610f 100644 --- a/src/solo_turnier/cli.py +++ b/src/solo_turnier/cli.py @@ -1,6 +1,8 @@ import argparse import logging +import debugpy + class Cli: def __init__(self, l: logging.Logger): parser = argparse.ArgumentParser() @@ -11,7 +13,12 @@ class Cli: parser.add_argument('-o', '--output', help='Set the output path of the script', nargs=1, default=[None]) parser.add_argument('-v', '--verbose', help='Increase verbosity', action='count', default=0) + parser.add_argument('-d', '--debug', action='store_true', help='Activate debugging during startup') self.__args = parser.parse_args() + + if self.__args.debug: + debugpy.listen(5678) + debugpy.wait_for_client() map = { 0: logging.ERROR, diff --git a/src/solo_turnier/group.py b/src/solo_turnier/group.py index e8112da..7e0f13d 100644 --- a/src/solo_turnier/group.py +++ b/src/solo_turnier/group.py @@ -78,3 +78,19 @@ class GroupParser: def isPureClass(self, cls: str) -> bool: parsedClass = self.parseClass(cls) return isinstance(parsedClass, Group) + + def getGroups(self) -> list[Group]: + return[ + GroupParser.KIN, + GroupParser.JUN, + GroupParser.JUG, + GroupParser.HGR, + GroupParser.MAS1, + GroupParser.MAS2, + GroupParser.MAS3, + GroupParser.MAS4, + GroupParser.MAS5 + ] + + def getGroupsAsSortedList(self, groups) -> list[Group]: + return [x for x in self.getGroups() if x in groups] diff --git a/src/solo_turnier/html_parser.py b/src/solo_turnier/html_parser.py index 39d8b1b..98139c1 100644 --- a/src/solo_turnier/html_parser.py +++ b/src/solo_turnier/html_parser.py @@ -3,8 +3,10 @@ from bs4 import BeautifulSoup import logging import re -from .types import HtmlPreviewParticipant as HtmlParticipant -from .types import HtmlPreviewImport as HtmlImport +from .types import HtmlPreviewParticipant, HtmlParticipant +from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport +from .group import GroupParser +from .competition_class import CompetitionClassParser class HtmlParser: @@ -12,6 +14,8 @@ class HtmlParser: self.l = logging.getLogger('solo_turnier.html_parser') self.soup = BeautifulSoup(text, 'html.parser') self.fileName = fileName + self.groupParser = GroupParser() + self.classParser = CompetitionClassParser() def __repr__(self): if self.fileName is None: @@ -33,32 +37,22 @@ class HtmlParser: rest = match.group(1) rawGroup, rawClass, dance = rest.split(' ', 2) - classMap = { - 'Newcomer': 'Newc.', - 'Beginner': 'Beg.', - 'Advanced': 'Adv.' - } - - groupMap = { - 'Kinder': 'Kin.', - 'Junioren': 'Jun.', - 'Jugend': 'Jug.', - } - return { 'dance': dance.strip(), - 'class_': classMap.get(rawClass, rawClass), - 'group': groupMap.get(rawGroup, rawGroup) + 'class_': str(self.classParser.parseClass(rawClass, True)), + 'group': str(self.groupParser.parseClass(rawGroup)) } - def parseString(self, text: str): - soup = BeautifulSoup(text, 'html.parser') - + def parseResult(self): participants = {} def __parseRows(rows, finalist: bool): def __parseRow(row): tds = row.find_all('td') + + if len(tds) != 2: + return + regex = re.compile('(.*) \\(([0-9]+)\\)') place = tds[0].contents[0] @@ -69,8 +63,9 @@ class HtmlParser: name = match.group(1) number = match.group(2) - participant = HtmlParticipant(name, place, finalist) - participants[number] = participant + participant = HtmlParticipant(name, number) + participant.finalist = finalist + participants[participant] = place for row in rows: __parseRow(row) @@ -84,17 +79,18 @@ class HtmlParser: def __parseRemainingTables(tables): for table in tables: - __parseRows(table.find_all('tr')[2:], False) + __parseRows(table.find_all('tr'), False) - tables = soup.find('div', class_='extract').find_all('table') + tables = self.soup.find('div', class_='extract').find_all('table') if len(tables) > 0: __parseFirstTable(tables[0]) __parseRemainingTables(tables[1:]) - title = soup.find('div', class_='eventhead').table.tr.td.contents[0] + # title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0] - ret = HtmlImport(title, participants) + # ret = HtmlImport(title, participants) + ret = HtmlResultImport(participants) return ret def parsePreparationRound(self): diff --git a/src/solo_turnier/types.py b/src/solo_turnier/types.py index 5fd8e92..a32754e 100644 --- a/src/solo_turnier/types.py +++ b/src/solo_turnier/types.py @@ -21,10 +21,11 @@ class CSVResultRow: return f'{self.name} ({self.id}, {self.club}) is in {self.group} {self.class_} and danced the {self.dance} in {self.competitionGroup} {self.competitionClass} getting place {self.place}-{self.placeTo}' class HtmlPreviewParticipant: - def __init__(self, name, id, participant_group): + def __init__(self, name, id, group_): self.name = name self.id = id - self.group = group.GroupParser().parseClass(participant_group) + groupParser = group.GroupParser() + self.group = groupParser.parseClass(group_) def __eq__(self, o): if type(o) != HtmlPreviewParticipant: @@ -33,15 +34,53 @@ class HtmlPreviewParticipant: return all(map(lambda x, y: x == y, (self.name, self.id, self.group), (o.name, o.id, o.group))) def __repr__(self): - return f'{self.id}: {self.name} ({self.group})' + return f'{self.id} ({self.name}, {self.group})' + + def __hash__(self): + return hash((self.id, self.name, self.group)) + +class HtmlParticipant: + def __init__(self, name, id): + self.name = name + self.id = id + self.finalist = None + + def __eq__(self, o): + if type(o) != HtmlPreviewParticipant: + return False + + return all(map(lambda x, y: x == y, (self.name, self.id, self.group), (o.name, o.id, o.group))) + + def __repr__(self): + return f'{self.id}: {self.name}' + + def __hash__(self): + return hash((self.id, self.name)) + +# class PreviewParticipationData: +# def __init__(self, dance: str, class_: competition_class.CompetitionClass): +# self.class_ = class_ +# self.dance = dance class HtmlPreviewImport: - def __init__(self, participants: dict[int, HtmlPreviewParticipant]): + def __init__( + self, + participants: dict[int, list[HtmlPreviewParticipant]], + results: dict[HtmlPreviewParticipant, dict[str, competition_class.CompetitionClass]] + ): self.participants = participants + self.results = results def __repr__(self): - return str(self.participants) + return (str(self.participants), str(self.results)) +class HtmlResultImport: + def __init__(self, results: dict[HtmlParticipant, str]): + self.results = results + + def __repr__(self): + return str(self.results) + class HtmlCompetitionResultRow: def __init__(self, name, id, dance, group, class_, place, placeTo, finalist): self.dance = dance @@ -83,6 +122,17 @@ class HtmlSingleCompetitionResult: self.place = place self.placeTo = placeTo self.finalist = finalist + + def __repr__(self): + if self.placeTo is None: + place = self.place + else: + place = f'{self.place}-{self.placeTo}' + + if self.finalist: + return f'Res({self.name} [F], placed {place})' + else: + return f'Res({self.name}, placed {place})' class HtmlCompetitionTotalResults: def __init__(self): @@ -94,12 +144,65 @@ class HtmlCompetitionTotalResults: def get(self, group: group.Group_t, class_: competition_class.Class_t, dance: str, id: int) -> list[HtmlSingleCompetitionResult]: return self.results[self.__getTuple(group, class_, dance, id)] + def getById(self, id: int) -> dict[tuple[str, group.Group_t, competition_class.Class_t], HtmlSingleCompetitionResult]: + ret = {} + + for k in self.results: + if int(k[3]) != id: + continue + # ret = ret + self.results[k] + # Dance, Group, Class + key = (k[2], k[0], k[1]) + ret[key] = self.results[k] + + return ret + def add(self, group, class_, dance, id, result: HtmlSingleCompetitionResult): tup = self.__getTuple(group, class_, dance, id) l = self.results.get(tup, []) l.append(result) self.results[tup] = l +class SingleParticipantResult: + def __init__( + self, + competitionClass: competition_class.Class_t, + dance: str, + finalist: bool, + place: int, + placeTo: int|None + ): + self.competitionClass = competitionClass + self.dance = dance + self.finalist = finalist + self.place = place + self.placeTo = placeTo + + if placeTo == place: + self.placeTo = None + + def __repr__(self): + asFinalist = ' as finalist' if self.finalist else '' + + if self.placeTo is None: + return f'SR[{self.place} in {self.dance} {self.competitionClass}{asFinalist}]' + + return f'SR[{self.place}-{self.placeTo} in {self.dance} {self.competitionClass}{asFinalist}]' + +class TotalGroupResult: + def __init__(self, dances: list[str], results: dict[HtmlPreviewParticipant, list[SingleParticipantResult]]): + self.dances = dances + self.results = results + +class State4: + def __init__( + self, + resultPerGroup: dict[group.Group, TotalGroupResult] + ): + parser = group.GroupParser() + self.groups = parser.getGroupsAsSortedList(resultPerGroup.keys()) + self.results = resultPerGroup + class State3: def __init__( self, diff --git a/src/solo_turnier/worker.py b/src/solo_turnier/worker.py index aa8644a..fe85e77 100644 --- a/src/solo_turnier/worker.py +++ b/src/solo_turnier/worker.py @@ -1,11 +1,14 @@ import logging from pprint import pformat +import re + import solo_turnier from solo_turnier import html_parser from .reader import ResultRow from .types import HtmlCompetitionResultRow as CompetitionResult from . import types +from . import competition_class class HtmlPerson: def __init__(self, name, id, group): @@ -72,6 +75,7 @@ class PreviewWorker: def __init__(self): self.l = logging.getLogger('solo_turnier.worker.PreviewWorker') self.participants = {} + self.previewResults = {} def filterFilesPreview(self, files: list[str]) -> ParserList_t: self.l.debug('Filtering the list of parsers by removing all non preview entries.') @@ -100,6 +104,12 @@ class PreviewWorker: imported = parser.parsePreparationRound() parser.cleanPreparationRoundImport(imported) data = imported['data'] + + headerData = parser.guessDataFromHtmlTitle() + dance = headerData['dance'] + + def getRowIndexOfClass(): + return data['titles'].index('Platz von\nPlatz bis') self.l.log(5, data) @@ -115,6 +125,8 @@ class PreviewWorker: group = parser.guessDataFromHtmlTitle(imported['title'])['group'] extractGroup = False + classRowIndex = getRowIndexOfClass() + for index, e in enumerate(data['table'][0]): if e['text'] == '': # Skip empty columns @@ -125,6 +137,9 @@ class PreviewWorker: id = int(e['text']) if extractGroup: group = data['table'][-1][index]['text'] + + # dance = + class_ = data['table'][classRowIndex][index]['text'] participant = types.HtmlPreviewParticipant(name, id, group) @@ -134,6 +149,10 @@ class PreviewWorker: l.append(participant) self.participants[id] = l + results = self.previewResults.get(participant, {}) + results[dance] = class_ + self.previewResults[participant] = results + def importAllData(self, parsers: ParserList_t) -> types.HtmlPreviewImport: self.participants = {} @@ -141,7 +160,79 @@ class PreviewWorker: parser = parsers[file] self.__extractPersonsFromSinglePreview(parser) - return types.HtmlPreviewImport(self.participants) + return types.HtmlPreviewImport(self.participants, self.previewResults) + +class ResultExtractor: + + def __init__(self): + self.l = logging.getLogger('solo_turnier.worker.ResultExtractor') + self.rePlaceSingle = re.compile(' *([0-9]+) *') + self.rePlaceDouble = re.compile(' *([0-9]+) *- *([0-9]+) *') + + def getAllParsers(self, files: list[str]) -> ParserList_t: + ret = {} + classParser = competition_class.CompetitionClassParser() + + for file in files: + with open(file, 'r') as fp: + text = fp.read() + parser = html_parser.HtmlParser(text, file) + + try: + data = parser.guessDataFromHtmlTitle() + except: + self.l.error('Cannot parse HTML file %s to check if it is a valid result. Check manually.', file) + continue + + try: + guessedClass = classParser.parseClass(data['class_']) + except: + self.l.error('Issue parsing class of file %s. Check manually.', file) + continue + + self.l.debug('Fetched result data: %s, guessed class %s', data, guessedClass) + ret[file] = parser + + return ret + + def _extractPlace(self, placeStr: str): + s = placeStr.replace('.', '') + + matches = self.rePlaceSingle.fullmatch(s) + if matches is not None: + return (int(matches.group(1)), None) + + matches = self.rePlaceDouble.fullmatch(s) + if matches is not None: + return (int(matches.group(1)), int(matches.group(2))) + + self.l.error('Could not parse place string "%s"', placeStr) + raise Exception('Place cannot be parsed') + + def _analyzeSingleParser(self, parser: html_parser.HtmlParser, results: types.HtmlCompetitionTotalResults): + data = parser.guessDataFromHtmlTitle() + competitionClass = data['class_'] + competitionGroup = data['group'] + dance = data['dance'] + + result = parser.parseResult() + self.l.log(5, 'Raw data extracted: %s', result) + + for person in result.results.keys(): + placeStr = result.results[person] + place, placeTo = self._extractPlace(placeStr) + competitionResult = types.HtmlSingleCompetitionResult(person.name, place, placeTo, person.finalist) + results.add(competitionGroup, competitionClass, dance, person.id, competitionResult) + # + + def extractAllData(self, parsers: ParserList_t) -> types.HtmlCompetitionTotalResults: + ret = types.HtmlCompetitionTotalResults() + + for fileName in parsers: + self.l.debug('Extracting data from file %s', fileName) + self._analyzeSingleParser(parsers[fileName], ret) + + return ret class DataWorker: def __init__(self): @@ -296,24 +387,146 @@ class DataWorker: class Worker: def __init__(self): self.l = logging.getLogger('solo_turnier.worker.Worker') + self._allDances = ( + ['Samba', 'Cha Cha', 'Rumba', 'Paso Doble', 'Jive'] + + ['Langs. Walzer', 'Tango', 'Wiener Walzer', 'Slowfox', 'Quickstep'] + ) def collectAllData( self, htmlCandidatesPreview: list[str], - csvFile: str + csvFile: str, + htmlResultsFileNames: list[str] ) -> types.State3: previewWorker = PreviewWorker() self.l.info('Filtering for pure preview rounds.') parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview) - self.l.debug('Remaining files: %s', parsers.keys()) + self.l.debug('Remaining files: %s', list(parsers.keys())) self.l.info('Extracting person data from the preview rounds.') previewImport = previewWorker.importAllData(parsers) - self.l.debug('Total preview import: %s', previewImport) + self.l.debug('Total preview imported participants: %s', pformat(previewImport.participants)) + self.l.log(5, 'Total preview results: %s', pformat(previewImport.results)) csvReader = solo_turnier.reader.CSVResultReader(csvFile) self.l.info('Loading the total result CSV file %s', csvFile) csvRows = csvReader.extractResult() + + resultExtractor = ResultExtractor() + resultParsers = resultExtractor.getAllParsers(htmlResultsFileNames) + htmlResults = resultExtractor.extractAllData(resultParsers) + self.l.info('Overall result data extracted: %s', pformat(htmlResults.results)) - return None + return types.State3(csvRows, previewImport, htmlResults) + + def combineData(self, importedData: types.State3): + self.l.info('Starting to build data sets.') + groups = self._extractGroups(importedData) + self.l.debug('Found groups in the dataset: %s', groups) + + totalResult = {} + + for group in groups: + self.l.debug('Collecting data for total result of group %s', group) + + dances = self._extractDancesPerGroup(importedData, group) + self.l.log(5, 'Found dances in group %s: %s', group, dances) + + participants = self._extractParticipantsPerGroup(importedData.previewImport, group) + self.l.log(5, 'Related participants %s', participants) + + results = {} + + for participant in participants: + self.l.log(5, 'Collecting data for %s', participant) + resultsOfParticipant = self._getResultOfSingleParticipant( + participant, group, importedData.previewImport, importedData.htmlResults, dances + ) + self.l.log(5, 'Obtained result %s', resultsOfParticipant) + results[participant] = resultsOfParticipant + + totalResult[group] = types.TotalGroupResult(dances, results) + + self.l.log(5, 'Total result of all groups: %s', pformat(totalResult)) + + ret = types.State4(totalResult) + return ret + + + def _extractGroups(self, data: types.State3): + groupSet = set([]) + for id in data.previewImport.participants: + participants = data.previewImport.participants[id] + for participant in participants: + groupSet.add(participant.group) + + self.l.log(5, 'Set of active groups: %s', groupSet) + groupParser = solo_turnier.group.GroupParser() + groups = groupParser.getGroupsAsSortedList(groupSet) + return groups + + def _extractDancesPerGroup(self, data: types.State3, group: solo_turnier.group.Group): + dances = set() + additionalDances = set() + for part in data.previewImport.results.keys(): + allFoundDances = set(data.previewImport.results[part].keys()) + dances.update(allFoundDances.intersection(self._allDances)) + additionalDances.update(allFoundDances.difference(self._allDances)) + + if len(additionalDances) > 0: + self.l.warning('There were dances found, that are not registered. A bug? The dances were: %s', additionalDances) + + dancesList = [x for x in self._allDances if x in dances] + additionalDancesList = list(additionalDances) + additionalDancesList.sort() + return dancesList + additionalDancesList + + def _extractParticipantsPerGroup( + self, + previewData: types.HtmlPreviewImport, + group: solo_turnier.group.Group + ) -> list[types.HtmlPreviewParticipant]: + ret = [] + for id in previewData.participants: + participantList = previewData.participants[id] + for participant in participantList: + if participant.group == group: + ret.append(participant) + return ret + + def _getResultOfSingleParticipant( + self, + participant: types.HtmlPreviewParticipant, + nominalGroup: solo_turnier.group.Group, + previewResults: types.HtmlPreviewImport, + totalResults: types.HtmlCompetitionTotalResults, + allDances: list[str] + ) -> list[types.SingleParticipantResult|None]: + rawResults = totalResults.getById(participant.id) + self.l.log(5, 'Found result data (raw): %s', rawResults) + + results = [None for x in allDances] + + for danceIdx, dance in enumerate(allDances): + # self.l.log(5, '%s %s', dance, danceIdx) + def getResult() -> types.SingleParticipantResult|None: + for key in rawResults: + if key[0] != dance: + continue + rawResult = rawResults[key] + + if len(rawResult) != 1: + raise Exception('Multiple results found with same key') + rawResult = rawResult[0] + + # self.l.log(5, 'Result %s => %s', key, rawResult) + return types.SingleParticipantResult( + key[2], dance, rawResult.finalist, + rawResult.place, rawResult.placeTo + ) + return None + + results[danceIdx] = getResult() + + return results