solo-auswertung/src/solo_turnier/worker.py

678 lines
26 KiB
Python
Raw Normal View History

2022-11-14 19:01:32 +00:00
import logging
2022-12-03 13:29:35 +00:00
from pprint import pformat
2022-11-14 19:01:32 +00:00
import re
2022-12-03 20:46:19 +00:00
import solo_turnier
from solo_turnier import html_parser
2022-12-03 13:29:35 +00:00
from .reader import ResultRow
2022-12-03 20:46:19 +00:00
from .types import HtmlCompetitionResultRow as CompetitionResult
from . import types
from . import competition_class
2022-11-27 08:10:17 +00:00
class HtmlPerson:
    """Person described by a name, an id and a group.

    Equality and hashing are both derived from the textual representation,
    so two instances with the same name, id and group are interchangeable.
    """

    def __init__(self, name, id, group):
        self.name, self.id, self.group = name, id, group

    def __repr__(self):
        return f'{self.name} ({self.id}, {self.group})'

    def __eq__(self, o):
        # Anything that is not an HtmlPerson is never equal.
        return isinstance(o, HtmlPerson) and str(self) == str(o)

    def __hash__(self):
        return hash(str(self))
2022-11-27 08:10:17 +00:00
2022-11-14 19:01:32 +00:00
class ResultPerson:
    """A person taken from the result rows, described by name and club.

    ``id`` and ``group`` are optional on construction and may be assigned
    afterwards (``DataWorker.checkUniqueIds`` and
    ``DataWorker.consolidateGroups`` do so).
    """

    def __init__(self, firstName, lastName, club, id = None, group = None):
        self.firstName = firstName
        self.lastName = lastName
        self.name = f'{firstName} {lastName}'
        self.club = club
        self.id = id
        self.group = group

    @staticmethod
    def extractFromResultRow(row: ResultRow):
        """Create a person (without id and group) from the name and club of ``row``."""
        return ResultPerson(
            firstName=row.firstName,
            lastName=row.lastName,
            club=row.club
        )

    def __eq__(self, o):
        """Persons are equal if first name, last name, club and id all match."""
        if not isinstance(o, ResultPerson):
            return False
        return (
            self.firstName == o.firstName and
            self.lastName == o.lastName and
            self.club == o.club and
            self.id == o.id
        )

    def __repr__(self):
        if self.id is None:
            return f'{self.name} ({self.club})'
        else:
            return f'{self.name} ({self.club}) [{self.id}]'

    def __hash__(self):
        # The hash intentionally ignores ``id``: the id is assigned while the
        # person is already used as a dict key, and a hash that changes after
        # insertion makes every later ``data[person]`` lookup fail.
        # Equal persons still hash equally because __eq__ requires the three
        # fields used here to match.
        return hash((self.firstName, self.lastName, self.club))
2022-11-14 19:01:32 +00:00
2022-12-03 13:29:35 +00:00
class ImportNotParsableException(Exception):
    """Raised when an imported file does not have the expected structure."""
2022-11-15 09:48:50 +00:00
2022-12-03 20:46:19 +00:00
# Mapping from a file name to the HtmlParser created from that file's content.
ParserList_t = dict[str, html_parser.HtmlParser]
2022-11-27 08:10:17 +00:00
class PreviewWorker:
    """Collect participants and their classes per dance from preview HTML files."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.PreviewWorker')
        # id -> list of participants seen with that id
        self.participants = {}
        # participant -> {dance: parsed class}
        self.previewResults = {}

    def filterFilesPreview(self, files: list[str]) -> ParserList_t:
        """Return parsers for those files whose title class is 'Sichtung'.

        Files whose title cannot be interpreted are logged and skipped.
        """
        self.l.debug('Filtering the list of parsers by removing all non preview entries.')

        ret = {}
        for file in files:
            with open(file, 'r') as fp:
                text = fp.read()

            parser = html_parser.HtmlParser(text, file)

            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                # Only real errors are skipped; KeyboardInterrupt/SystemExit
                # must still be able to stop the program.
                self.l.error(f'Unable to parse html file in {file}. Please check manually.')
                continue

            if data['class_'] == 'Sichtung':
                self.l.debug(f"Found candidate in {file}. Adding to the list.")
                ret[file] = parser
            else:
                self.l.debug(f'Rejecting file {file} as the name {data["class_"]} did not match.')

        return ret

    def __extractPersonsFromSinglePreview(self, parser: html_parser.HtmlParser):
        """Add the participants and classes of one preview file to the state.

        Raises ImportNotParsableException if the first title of the parsed
        table is not 'Wertungsrichter'.
        """
        imported = parser.parsePreparationRound()
        parser.cleanPreparationRoundImport(imported)
        data = imported['data']

        dance = parser.guessDataFromHtmlTitle()['dance']
        classParser = solo_turnier.competition_class.CompetitionClassParser()

        self.l.log(5, data)

        if data['titles'][0] != 'Wertungsrichter':
            self.l.critical('Cannot parse the parsed content of the preview file.')
            raise ImportNotParsableException('Incompatible export file')

        if data['titles'][-1] == 'Startgruppe':
            self.l.debug('Combined competition found. Extracting group from table required.')
            extractGroup = True
        else:
            self.l.debug('Using group from the title.')
            group = parser.guessDataFromHtmlTitle(imported['title'])['group']
            extractGroup = False

        # Row of the table that carries the class of every participant.
        classRowIndex = data['titles'].index('Platz von\nPlatz bis')

        for index, e in enumerate(data['table'][0]):
            if e['text'] == '':
                # Skip empty columns
                continue

            # Extract data from column
            name = e['meta']
            participantId = int(e['text'])
            if extractGroup:
                # The last table row holds the group of each column.
                group = data['table'][-1][index]['text']

            class_ = classParser.parseClass(data['table'][classRowIndex][index]['text'])

            participant = types.HtmlPreviewParticipant(name, participantId, group)

            known = self.participants.setdefault(participantId, [])
            self.l.log(5, 'Checking for existence of %s in %s: %s', participant, known, participant in known)
            if participant not in known:
                known.append(participant)

            self.previewResults.setdefault(participant, {})[dance] = class_

    def importAllData(self, parsers: ParserList_t) -> types.HtmlPreviewImport:
        """Import all given preview parsers and return the collected data.

        The state of a previous import is discarded first.
        """
        self.participants = {}
        # Reset the results as well; otherwise a second import would return
        # fresh participants together with results of the previous run.
        self.previewResults = {}

        for parser in parsers.values():
            self.__extractPersonsFromSinglePreview(parser)

        return types.HtmlPreviewImport(self.participants, self.previewResults)
class ResultExtractor:
    """Extract places and individual results from pairs of result HTML files."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.ResultExtractor')
        # A single place such as "3" and a place range such as "3-5".
        self.rePlaceSingle = re.compile(' *([0-9]+) *')
        self.rePlaceDouble = re.compile(' *([0-9]+) *- *([0-9]+) *')

    def getAllParsers(
        self,
        files: list[tuple[str,str]]
    ) -> dict[tuple[str, str], tuple[html_parser.HtmlParser, html_parser.HtmlParser]]:
        """Create a pair of parsers for every pair of file names.

        The returned dict is keyed by the file name pair. Pairs whose first
        file has a title or class that cannot be parsed are logged and skipped.
        """
        ret = {}
        classParser = competition_class.CompetitionClassParser()

        for filePair in files:
            with open(filePair[0], 'r') as fp:
                text = fp.read()
            with open(filePair[1], 'r') as fp:
                textTab = fp.read()

            parser = html_parser.HtmlParser(text, filePair[0])
            parserTab = html_parser.HtmlParser(textTab, filePair[1])

            # Only real errors are skipped below; KeyboardInterrupt/SystemExit
            # must still be able to stop the program.
            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                self.l.error('Cannot parse HTML file %s to check if it is a valid result. Check manually.', filePair[0])
                continue

            try:
                guessedClass = classParser.parseClass(data['class_'])
            except Exception:
                self.l.error('Issue parsing class of file %s. Check manually.', filePair[0])
                continue

            self.l.debug('Fetched result data: %s, guessed class %s', data, guessedClass)
            ret[filePair] = (parser, parserTab)

        return ret

    def _extractPlace(self, placeStr: str):
        """Parse a place string into ``(place, placeTo)``.

        Dots are ignored. A single place yields ``(place, None)``, a range
        "a-b" yields ``(a, b)``. Raises Exception if the string matches
        neither form.
        """
        s = placeStr.replace('.', '')

        matches = self.rePlaceSingle.fullmatch(s)
        if matches is not None:
            return (int(matches.group(1)), None)

        matches = self.rePlaceDouble.fullmatch(s)
        if matches is not None:
            return (int(matches.group(1)), int(matches.group(2)))

        self.l.error('Could not parse place string "%s"', placeStr)
        raise Exception('Place cannot be parsed')

    def _analyzeSingleParser(self, parser: html_parser.HtmlParser, results: types.HtmlCompetitionTotalResults):
        """Add the places of every person in one result file to ``results``."""
        data = parser.guessDataFromHtmlTitle()
        competitionClass = data['class_']
        competitionGroup = data['group']
        dance = data['dance']

        result = parser.parseResult()
        self.l.log(5, 'Raw data extracted: %s', result)

        for person, placeStr in result.results.items():
            place, placeTo = self._extractPlace(placeStr)
            competitionResult = types.HtmlSingleCompetitionResult(person.name, place, placeTo, person.finalist)
            results.add(competitionGroup, competitionClass, dance, person.id, competitionResult)

    def _analyzeIndividualResults(self, parser: html_parser.HtmlParser, results: types.HtmlCompetitionTotalResults):
        """Merge the individual results of one table file into ``results.tabges``."""
        data = parser.guessDataFromHtmlTitle()
        competitionClass = data['class_']
        competitionGroup = data['group']
        dance = data['dance']

        result = parser.parseIndividualResult(competitionGroup, competitionClass, dance)
        self.l.log(5, 'Found individual results: %s', result.participants)
        results.tabges.update(result.participants)

    def extractAllData(
        self,
        parsers: dict[tuple[str, str], tuple[html_parser.HtmlParser, html_parser.HtmlParser]]
    ) -> types.HtmlCompetitionTotalResults:
        """Run both analyses on every parser pair and return the combined results."""
        ret = types.HtmlCompetitionTotalResults()

        for fileNameTuple, (parser, parserTab) in parsers.items():
            fileName = fileNameTuple[0]
            self.l.debug('Extracting data from file %s', fileName)
            self._analyzeSingleParser(parser, ret)

            self.l.debug('Fetching individual result of combined competitions in %s', fileName)
            self._analyzeIndividualResults(parserTab, ret)

        return ret
2022-11-27 08:10:17 +00:00
2022-11-15 09:48:50 +00:00
class DataWorker:
    """Helpers to group, consolidate and sort competition results per person."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.DataWorker')

    def combineRowsByPerson(self, rows: list[ResultRow]) -> dict[ResultPerson, list[CompetitionResult]]:
        """Group the results of all rows by person.

        Rows whose place or placeTo is '-' are ignored.
        """
        ret = {}
        for row in rows:
            result = CompetitionResult.extractFromResultRow(row)

            if result.place == '-' or result.placeTo == '-':
                continue

            person = ResultPerson.extractFromResultRow(row)
            ret.setdefault(person, []).append(result)
        return ret

    def checkUniqueIds(self, data: dict[ResultPerson, list[CompetitionResult]]) -> bool:
        """Assign the id to every person that has exactly one id in its results.

        Returns True only if this was possible for every person.

        NOTE(review): this mutates objects that are keys of ``data``. If the
        hash of the key type depends on ``id``, later ``data[person]`` lookups
        fail; the methods of this class therefore iterate ``data.items()``.
        """
        unique = True
        for person, competitions in data.items():
            ids = {c.id for c in competitions}
            if len(ids) == 1:
                person.id = next(iter(ids))
            else:
                unique = False

        return unique

    def consolidateGroups(self, data:dict[ResultPerson, list[CompetitionResult]]) -> tuple[bool, bool]:
        """Derive the group of every person from the groups of its results.

        Return a tuple
        The first one is True, if all persons could be unambiguously identified a group
        The second one is True if there was the need to override a group but it was possible to extract from other data
        The second one can be seen as a warning

        Raises Exception for unknown groups and for persons with more than
        one unambiguous group.
        """
        ambiguous = False
        warnChange = False

        unambiguousGroups = {'Kin.', 'Jun.', 'Jug.'}
        combinations = {'Kin./Jun.', 'Jun./Jug.'}

        for person, competitions in data.items():
            groupsRaw = {c.group for c in competitions}

            unknown = groupsRaw - unambiguousGroups - combinations
            if unknown:
                raise Exception(f'There were unknown groups found for {person}: {unknown}')

            certainGroups = groupsRaw & unambiguousGroups

            if len(certainGroups) == 0:
                if len(groupsRaw) == 2:
                    # Both combinations are present; 'Jun.' is the group they share.
                    warnChange = True
                    person.group = 'Jun.'
                else:
                    ambiguous = True
                    if len(groupsRaw) == 1:
                        person.group = next(iter(groupsRaw))

            elif len(certainGroups) == 1:
                if groupsRaw & combinations:
                    warnChange = True

                person.group = next(iter(certainGroups))

            else:
                raise Exception(f'{person} cannot have different groups.')

        return (not ambiguous, warnChange)

    def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
        """Build a lookup table (group, class, dance) -> import from the import titles."""
        ret = {}
        parser = html_parser.HtmlParser('')
        for imp in htmlImports:
            parsed = parser.guessDataFromHtmlTitle(imp.title)
            key = (parsed['group'], parsed['class_'], parsed['dance'])
            ret[key] = imp
            self.l.debug('LUT[%s] = %s', key, imp)
        self.l.debug('LUT completed')
        return ret

    def mergeHtmlData(self, data:dict[ResultPerson, list[CompetitionResult]], htmlImports: list[html_parser.HtmlImport]):
        """Copy the finalist flag from the HTML imports into every competition.

        A name mismatch between person and HTML participant is only logged.
        """
        lut = self._createHtmlLUT(htmlImports)

        for person, competitions in data.items():
            for competition in competitions:
                key = (competition.competitionGroup, competition.competitionClass, competition.dance)
                htmlImport = lut[key]
                participant = htmlImport.participants[str(competition.id)]
                if participant.name != person.name:
                    self.l.error(f'Names for {person} and participant in HTML import ({participant}) do not match. Please check carefully.')
                competition.finalist = participant.finalist

    def getAllDancesInCompetitions(self, data:dict[ResultPerson, list[CompetitionResult]]) -> list[str]:
        """Return the known dances that occur in ``data``, in canonical order."""
        allDances = [
            'Samba', 'Cha Cha', 'Rumba', 'Paso Doble', 'Jive',
            'Langs. Walzer', 'Tango', 'Wiener Walzer', 'Slowfox', 'Quickstep'
        ]
        dancesPresent = {
            competition.dance
            for competitions in data.values()
            for competition in competitions
        }
        return [d for d in allDances if d in dancesPresent]

    def collectPersonsInGroups(self, data:dict[ResultPerson, list[CompetitionResult]]) -> dict[str, list[ResultPerson]]:
        """Split the persons by group; everyone else ends up under 'Sonst'."""
        groups = {
            name: [p for p in data if p.group == name]
            for name in ('Kin.', 'Jun.', 'Jug.')
        }
        found = groups['Kin.'] + groups['Jun.'] + groups['Jug.']
        groups['Sonst'] = [p for p in data if p not in found]
        return groups

    def sortPersonsInGroup(self, persons: list[ResultPerson]) -> tuple[list[ResultPerson], bool]:
        """Sort persons by id, or by 'name (club)' if any person has no id.

        Returns the sorted list and a flag telling whether ids were used.
        """
        # Sort with a key function: sorting (key, person) tuples would compare
        # the person objects on equal keys, which they do not support.
        if any(p.id is None for p in persons):
            # We need to sort by name
            ordered = sorted(persons, key=lambda p: f'{p.name} ({p.club})')
            showIds = False
        else:
            ordered = sorted(persons, key=lambda p: p.id)
            showIds = True

        return (ordered, showIds)

    def mapPersonResultsToDanceList(self, results: list[CompetitionResult], dances: list[str]) -> list[CompetitionResult|None]:
        """Return one entry per dance: the matching result or None.

        Raises Exception if a dance has more than one result.
        """
        ret = []
        for dance in dances:
            competitions = [c for c in results if c.dance == dance]
            if len(competitions) == 0:
                ret.append(None)
            elif len(competitions) > 1:
                raise Exception(f'Multiple competitions with the same dance "{dance}" found.')
            else:
                ret.append(competitions[0])

        return ret
2022-11-27 08:10:17 +00:00
2022-12-03 20:46:19 +00:00
class Worker:
    """Import preview and result files and combine them into per-group results."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.Worker')
        # Known dances in the order in which they are reported.
        self._allDances = (
            ['Samba', 'Cha Cha', 'Rumba', 'Paso Doble', 'Jive'] +
            ['Langs. Walzer', 'Tango', 'Wiener Walzer', 'Slowfox', 'Quickstep']
        )

    def collectAllData(
        self,
        htmlCandidatesPreview: list[str],
        htmlResultsFileNames: list[tuple[str, str]]
    ) -> types.State3:
        """Import the preview files and the result file pairs.

        ``htmlResultsFileNames`` is passed on unchanged to
        ``ResultExtractor.getAllParsers``, which expects pairs of file names.
        """
        previewWorker = PreviewWorker()
        self.l.info('Filtering for pure preview rounds.')
        parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview)
        self.l.debug('Remaining files: %s', list(parsers.keys()))

        self.l.info('Extracting person data from the preview rounds.')
        previewImport = previewWorker.importAllData(parsers)
        self.l.debug('Total preview imported participants: %s', pformat(previewImport.participants))
        self.l.log(5, 'Total preview results: %s', pformat(previewImport.results))

        resultExtractor = ResultExtractor()
        resultParsers = resultExtractor.getAllParsers(htmlResultsFileNames)
        htmlResults = resultExtractor.extractAllData(resultParsers)
        self.l.info('Overall result data extracted: %s', pformat(htmlResults.results))

        return types.State3(previewImport, htmlResults)

    def combineData(self, importedData: types.State3):
        """Build the total result of every group found in the preview import."""
        self.l.info('Starting to build data sets.')
        groups = self._extractGroups(importedData)
        self.l.debug('Found groups in the dataset: %s', groups)

        totalResult = {}

        for group in groups:
            self.l.debug('Collecting data for total result of group %s', group)

            dances = self._extractDancesPerGroup(importedData, group)
            self.l.log(5, 'Found dances in group %s: %s', group, dances)

            participants = self._extractParticipantsPerGroup(importedData.previewImport, group)
            self.l.log(5, 'Related participants %s', participants)

            results = {}

            for participant in participants:
                self.l.log(5, 'Collecting data for %s', participant)
                resultsOfParticipant = self._getResultOfSingleParticipant(
                    participant, group, importedData.previewImport,
                    importedData.htmlResults, dances
                )
                self.l.log(5, 'Obtained result %s', resultsOfParticipant)
                results[participant] = resultsOfParticipant

            self.l.log(5, 'Result before native fixing: %s', (results))
            # self._fixNativePlaces(dances, results)
            self._fixNativePlacesFromTable(dances, results, importedData.htmlResults)
            # self.l.log(5, 'Result after native fixing: %s', pformat(results))
            self.l.log(5,'Data %s', results)

            totalResult[group] = types.TotalGroupResult(dances, results)

        self.l.log(5, 'Total result of all groups: %s', pformat(totalResult))

        ret = types.State4(totalResult)
        return ret

    def _extractGroups(self, data: types.State3):
        """Return the groups of all preview participants as a sorted list."""
        groupSet = set()
        for participants in data.previewImport.participants.values():
            for participant in participants:
                groupSet.add(participant.group)

        self.l.log(5, 'Set of active groups: %s', groupSet)
        groupParser = solo_turnier.group.GroupParser()
        groups = groupParser.getGroupsAsSortedList(groupSet)
        return groups

    def _extractDancesPerGroup(self, data: types.State3, group: solo_turnier.group.Group):
        """Return the dances found in the preview results.

        Known dances come first in canonical order, followed by the unknown
        ones in sorted order. Unknown dances trigger a warning.

        NOTE(review): ``group`` is not used; the dances of all participants
        are collected. Confirm whether filtering by group is intended.
        """
        dances = set()
        additionalDances = set()
        for participantResults in data.previewImport.results.values():
            allFoundDances = set(participantResults.keys())
            dances.update(allFoundDances.intersection(self._allDances))
            additionalDances.update(allFoundDances.difference(self._allDances))

        if len(additionalDances) > 0:
            self.l.warning('There were dances found, that are not registered. A bug? The dances were: %s', additionalDances)

        dancesList = [x for x in self._allDances if x in dances]
        return dancesList + sorted(additionalDances)

    def _extractParticipantsPerGroup(
        self,
        previewData: types.HtmlPreviewImport,
        group: solo_turnier.group.Group
    ) -> list[types.HtmlPreviewParticipant]:
        """Return all preview participants whose group equals ``group``."""
        return [
            participant
            for participantList in previewData.participants.values()
            for participant in participantList
            if participant.group == group
        ]

    def _findResultForDance(self, participant, dance, rawResults, previewResults):
        """Return the result of ``participant`` in ``dance`` or None if there is none.

        The first key of ``rawResults`` whose first element equals ``dance``
        decides. Raises Exception if that key does not hold exactly one result.
        """
        for key in rawResults:
            if key[0] != dance:
                continue

            rawResult = rawResults[key]
            if len(rawResult) != 1:
                raise Exception('Multiple results found with same key')
            rawResult = rawResult[0]

            nativeClass = previewResults.results[participant][dance]

            return types.SingleParticipantResult(
                key[2], nativeClass, dance, rawResult.finalist,
                rawResult.place, rawResult.placeTo
            )
        return None

    def _getResultOfSingleParticipant(
        self,
        participant: types.HtmlPreviewParticipant,
        nominalGroup: solo_turnier.group.Group,
        previewResults: types.HtmlPreviewImport,
        totalResults: types.HtmlCompetitionTotalResults,
        allDances: list[str]
    ) -> list[types.SingleParticipantResult|None]:
        """Return one entry per dance in ``allDances``: the result or None.

        ``nominalGroup`` is accepted but not used.
        """
        rawResults = totalResults.getById(participant.id)
        self.l.log(5, 'Found result data (raw): %s', rawResults)

        return [
            self._findResultForDance(participant, dance, rawResults, previewResults)
            for dance in allDances
        ]

    def _fixNativePlacesFromTable(
        self,
        dances: list[str],
        data: dict[types.HtmlPreviewParticipant, list[types.SingleParticipantResult]],
        importedData: types.HtmlCompetitionTotalResults
    ):
        """Set placeNative/placeNativeTo of every result from ``importedData.tabges``.

        The matching table entry is the first key whose third element is the
        dance and whose fourth element, converted to int, is the participant
        id. The first element of the entry is parsed as "a" or "a-b"; the
        parts are stored as strings (placeNativeTo is None for a single
        place). Entries that cannot be parsed are logged and skipped.

        Raises ValueError if no table entry exists for a result.
        ``dances`` is accepted but not used.
        """
        rePlace = re.compile('([0-9]+)(?:-([0-9]+))?')

        for participant, results in data.items():
            self.l.log(5, 'fixing participant %s', participant)

            for result in results:
                if result is None:
                    continue
                self.l.log(5, 'Looking at result set %s', result)

                tableKey = None
                for k in importedData.tabges.keys():
                    if k[2] == result.dance and int(k[3]) == participant.id:
                        tableKey = k
                        break
                if tableKey is None:
                    # Same exception type as a failed list.index(), but with
                    # a message that tells what is missing.
                    raise ValueError(
                        f'No table entry found for participant {participant} in dance {result.dance}'
                    )

                raw = importedData.tabges[tableKey]
                self.l.log(5,'Raw %s', raw)
                nativePlaceRaw = raw[0]
                matcher = rePlace.fullmatch(nativePlaceRaw)
                if matcher is None:
                    self.l.error('Cannot parse place string %s for participant %u (%s) in dance %s', nativePlaceRaw, participant.id, participant, result.dance)
                    continue
                self.l.log(5, 'Found strings by regex: %s', matcher.groups())
                result.placeNative = matcher.group(1)
                result.placeNativeTo = matcher.group(2)

    def _fixNativePlaces(
        self,
        dances: list[str],
        data: dict[types.HtmlPreviewParticipant, list[types.SingleParticipantResult]]
    ):
        """Recompute native places per class and dance from the overall places.

        For every class (iterated in reverse order of ``getAllClasses()``) and
        every dance, the participants whose native class is not better than
        that class are ranked by their place. Participants sharing a place
        receive a shared range placeNative..placeNativeTo.
        """
        classParser = solo_turnier.competition_class.CompetitionClassParser()
        # Iterate over a reversed copy so that a list owned by the parser is
        # not modified in place.
        allClasses = list(reversed(classParser.getAllClasses()))

        for class_ in allClasses:
            for danceIdx, dance in enumerate(dances):
                self.l.log(5, 'Fixing native places for class %s in dance %s', class_, dance)

                remainingParticipants = []

                for participant, results in data.items():
                    danceResult = results[danceIdx]

                    if danceResult is None:
                        continue

                    # self.l.log(5, 'Result of dance: %s', danceResult)

                    if classParser.isABetterThanB(danceResult.nativeClass, class_):
                        # self.l.log(5, 'Skipping %s as the native class is higher', participant)
                        continue

                    remainingParticipants.append((danceResult.place, participant.id, participant))

                # Order by place and id only; the participant objects
                # themselves are never compared.
                remainingParticipants.sort(key=lambda entry: entry[:2])
                # self.l.log(5, 'Remaining participants %s', remainingParticipants)

                place = 1
                start = 0
                total = len(remainingParticipants)
                while start < total:
                    # Collect the run of participants sharing the same place.
                    end = start
                    while end < total and remainingParticipants[end][0] == remainingParticipants[start][0]:
                        end += 1
                    samePlaced = remainingParticipants[start:end]

                    nextPlace = place + len(samePlaced)
                    placeTo = None if len(samePlaced) == 1 else nextPlace - 1

                    for entry in samePlaced:
                        danceResult = data[entry[2]][danceIdx]
                        danceResult.placeNative = place
                        danceResult.placeNativeTo = placeTo

                    place = nextPlace
                    start = end

                # self.l.log(5, '(Partially) fixed places: %s', (data))

    def filterOutFinalists(self, data: types.State4):
        """Remove every participant that is not a finalist in any dance.

        ``data`` is modified in place.
        """
        def isFinalistInDance(x: types.HtmlSingleCompetitionResult|None):
            if x is None:
                return False
            return x.finalist

        for group in data.results:
            self.l.debug('Cleaning up group %s', group.name)
            groupResults = data.results[group].results
            droppedParticipants = []

            for participant in groupResults.keys():
                self.l.debug('Checking %s', participant)

                mapped = list(map(isFinalistInDance, groupResults[participant]))
                finalist = True in mapped
                self.l.log(5,'Check for finalist (in dances %s): %s', mapped, finalist)

                if not finalist:
                    self.l.warning('Dropping %s from the output as no finalist', participant)
                    droppedParticipants.append(participant)

            # Removal happens after the loop so the dict is not changed while
            # it is being iterated.
            for droppedParticipant in droppedParticipants:
                groupResults.pop(droppedParticipant)