from data_process.data_functions import read_tei


class TEIFile(object):
    """Metadata and body text extracted from one parsed TEI document.

    The document is parsed by ``read_tei`` (presumably into a BeautifulSoup
    tree -- confirm against ``data_process.data_functions``). Metadata is read
    from the ``value`` attribute of ``<index type="...">`` elements; every
    metadata attribute defaults to ``''`` when the element or its ``value``
    attribute is absent.
    """

    def __init__(self, filename, textfilename):
        """Parse *filename* and populate the metadata and text attributes.

        Args:
            filename: Path handed to ``read_tei``; also stored on the instance.
            textfilename: Accepted for interface compatibility; it is not used
                by this constructor.
        """
        self.filename = filename
        self.soup = read_tei(filename)

        # One lookup per field; a missing element or a missing ``value``
        # attribute leaves the field as an empty string.
        self._Head = self._index_value('head')
        self._Objecttype = self._index_value('objecttype')
        self._attribution = self._index_value('attribution')
        self._Class = self._index_value('class')
        self._normclass = self._index_value('normclass')
        self._englishclass = self._index_value('englishclass')
        self._generatedclass = self._index_value('generatedclass')
        self._author = self._index_value('author')

        # The first <p> element is deliberately skipped; the text of the
        # remaining paragraphs is joined with single spaces.
        paragraphs = self.soup.find_all('p')
        self._text = ' '.join(p.getText() for p in paragraphs[1:])

    def _index_value(self, index_type):
        """Return the ``value`` of the first ``<index type=index_type>``.

        Returns ``''`` when no such element exists or when it carries no
        ``value`` attribute, so a sparsely annotated document does not raise
        ``KeyError`` during construction.
        """
        tag = self.soup.find('index', type=index_type)
        if not tag or not tag.has_attr('value'):
            return ''
        return tag['value']